In [1]:
import glob
import os

import cv2
import numpy as np
import pandas as pd
from IPython.display import display, clear_output

In [2]:
def format_condition(cond):
    """Map an EHL class-folder name to the lower-case label used in the frames.

    'Normal' becomes 'normal' and 'Pneumonia' becomes 'pneumonia'; any other
    value (for example 'COVID-19') is returned unchanged.
    """
    for folder_name, label in (('Normal', 'normal'), ('Pneumonia', 'pneumonia')):
        if cond == folder_name:
            return label
    return cond

def format_condition_back(cond):
    """Map a lower-case condition label back to its EHL class-folder name.

    'normal' becomes 'Normal' and 'pneumonia' becomes 'Pneumonia'; any other
    value (for example 'COVID-19') is returned unchanged.
    """
    for label, folder_name in (('normal', 'Normal'), ('pneumonia', 'Pneumonia')):
        if cond == label:
            return folder_name
    return cond

In [3]:
# Index every EHL image on disk: one row per .jpg/.jpeg/.png file found under
# ehl_data/<split>/<class>/. Rows are collected in a plain list and turned into
# a frame once at the end; DataFrame.append was removed in pandas 2.0 and
# copied the whole frame on every call.
ehl_records = []

for split in ['train', 'test']:
    for cond in ['COVID-19', 'Pneumonia', 'Normal']:
        for filename in glob.glob('ehl_data/{}/{}/*.*'.format(split, cond)):
            # Extension check is case-sensitive, so e.g. '.PNG' files are skipped.
            if not filename.endswith(('.jpg', '.jpeg', '.png')):
                continue
            image_name = os.path.basename(filename)
            ehl_records.append({'id': '{}_{}_{}'.format(split, cond, image_name),
                                'type': split,
                                'source': 'ehl',
                                'condition': format_condition(cond),
                                'path': image_name})

# Passing the columns explicitly keeps the schema stable even when no file matched.
df_ehl = pd.DataFrame(ehl_records, columns=['id', 'type', 'source', 'condition', 'path'], dtype=object)

In [6]:
# Number of EHL images per (split, condition) pair.
df_ehl.groupby(['type', 'condition'])[['id']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id
type,condition,Unnamed: 2_level_1
test,COVID-19,100
test,normal,1700
test,pneumonia,97
train,COVID-19,189
train,normal,198
train,pneumonia,21


In [4]:
# Load the COVIDx train/test manifests with one raw text line per row.
# The separator "GGGG" is presumably a string that never occurs in the files,
# so read_csv keeps each line in a single column. A multi-character separator
# is only supported by the python engine; naming it explicitly avoids the
# ParserWarning pandas emits when it has to fall back from the C engine.
data = pd.read_csv('labels/train_COVIDx8A.txt', sep="GGGG", header=None, engine='python')
data.columns = ["line"]
data['type'] = 'train'

data1 = pd.read_csv('labels/test_COVIDx8A.txt', sep="GGGG", header=None, engine='python')
data1.columns = ["line"]
data1['type'] = 'test'

# Row labels are kept as-is, so train and test rows share index values.
df_covidx = pd.concat([data, data1])

# Split every line once and read the last three space-separated fields from
# the right: ... <path> <condition> <source>.
line_fields = df_covidx['line'].str.split(' ')
df_covidx['source'] = line_fields.str[-1]
df_covidx['condition'] = line_fields.str[-2]
df_covidx['path'] = line_fields.str[-3]
# The id is the source name immediately followed by the file name (no separator).
df_covidx['id'] = df_covidx['source'].str.cat(df_covidx['path'])

  """Entry point for launching an IPython kernel.
  """
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [5]:
# Look up one known file name in the EHL frame.
df_ehl.loc[df_ehl['path'].eq('covid-19-pneumonia-22-day2-pa.png')]

Unnamed: 0,id,type,source,condition,path
3,train_COVID-19_covid-19-pneumonia-22-day2-pa.png,train,ehl,COVID-19,covid-19-pneumonia-22-day2-pa.png


In [6]:
# Look up the same file name in the COVIDx frame.
df_covidx.loc[df_covidx['path'].eq('covid-19-pneumonia-22-day2-pa.png')]

Unnamed: 0,line,type,source,condition,path,id
152,164 covid-19-pneumonia-22-day2-pa.png COVID-19...,train,cohen,COVID-19,covid-19-pneumonia-22-day2-pa.png,cohencovid-19-pneumonia-22-day2-pa.png


In [7]:
def get_resolutions_ehl(image):
    """Return the dimensions of an EHL image as a "<rows> <cols>" string.

    Parameters
    ----------
    image : mapping-like (e.g. a DataFrame row)
        Must provide 'type' (split folder), 'condition' (lower-case label,
        converted back to the folder name) and 'path' (file name).

    Returns
    -------
    str
        The first two entries of the loaded array's shape, space-separated.

    Raises
    ------
    OSError
        If OpenCV cannot read the file.
    """
    class_dir = format_condition_back(image['condition'])
    img_path = os.path.join('ehl_data/{}/{}'.format(image['type'], class_dir), image['path'])
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread does not raise on a missing or undecodable file; it returns
        # None, which would otherwise surface as an AttributeError on .shape.
        raise OSError('cv2.imread could not read image: {}'.format(img_path))
    return "{} {}".format(img.shape[0], img.shape[1])

def get_resolutions_covidx(image):
    """Return the dimensions of a COVIDx image as a "<rows> <cols>" string.

    Parameters
    ----------
    image : mapping-like (e.g. a DataFrame row)
        Must provide 'type' (split folder under data/) and 'path' (file name).

    Returns
    -------
    str
        The first two entries of the loaded array's shape, space-separated.

    Raises
    ------
    OSError
        If OpenCV cannot read the file.
    """
    img_path = os.path.join('data/{}'.format(image['type']), image['path'])
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread does not raise on a missing or undecodable file; it returns
        # None, which would otherwise surface as an AttributeError on .shape.
        raise OSError('cv2.imread could not read image: {}'.format(img_path))
    return "{} {}".format(img.shape[0], img.shape[1])

In [8]:
# Add a 'resolutions' string column to both frames, computed row by row.
# Each call loads the image with OpenCV, so this cell reads every EHL and
# every COVIDx image from disk once.
df_ehl['resolutions'] = df_ehl.apply(get_resolutions_ehl, axis=1)
df_covidx['resolutions'] = df_covidx.apply(get_resolutions_covidx, axis=1)

In [9]:
# Find EHL images whose pixel data is identical to a COVIDx image.
# Matching rows are collected in a list and turned into a frame once at the
# end; DataFrame.append was removed in pandas 2.0 and copied the frame on
# every call. Each match contributes two consecutive rows: the EHL row
# followed by the COVIDx row.
duplicate_rows = []
total_images = df_ehl.shape[0]

# Count progress by position so the message is right for any index labels.
for position, (_, row_ehl) in enumerate(df_ehl.iterrows(), start=1):
    ehl_path = os.path.join('ehl_data/{}/{}'.format(row_ehl['type'], format_condition_back(row_ehl['condition'])), row_ehl['path'])
    ehl_img = cv2.imread(ehl_path)
    # Only COVIDx images with the same resolution string are compared, which
    # avoids loading images that cannot be equal.
    candidates = df_covidx[df_covidx['resolutions'] == row_ehl['resolutions']]
    for _, row_covidx in candidates.iterrows():
        covidx_path = os.path.join('data/{}'.format(row_covidx['type']), row_covidx['path'])
        covidx_img = cv2.imread(covidx_path)
        if np.array_equal(ehl_img, covidx_img):
            duplicate_rows.append(row_ehl.to_dict())
            duplicate_rows.append(row_covidx.to_dict())
    clear_output(wait=True)
    display('Finished {}/{}. Duplicates found: {}.'.format(position, total_images, len(duplicate_rows) // 2))

# Keep the five core columns first (and present even when nothing matched),
# followed by whatever extra columns the matched rows carried.
base_columns = ['id', 'type', 'source', 'condition', 'path']
duplicates = pd.DataFrame(duplicate_rows)
duplicates = duplicates.reindex(
    columns=base_columns + [col for col in duplicates.columns if col not in base_columns]
)

'Finished 2305/2305. Duplicates found: 102.'

In [10]:
# File names of the EHL side of every duplicate pair, renumbered from zero.
is_ehl_row = duplicates['source'].eq('ehl')
out_ehl = duplicates.loc[is_ehl_row, ['path']].reset_index(drop=True)

In [11]:
# Spot-check a few duplicated EHL file names. The fixed seed makes the shown
# rows reproducible on re-run, and the size is capped so the cell still works
# when fewer than five duplicates were found.
out_ehl.sample(min(5, len(out_ehl)), random_state=0)

Unnamed: 0,path
90,yxppt-2020-02-19_00-51-27_287214-day8.jpg
24,kjr-21-e24-g001-l-a.jpg
37,ryct.2020200034.fig2.jpeg
42,radiol.2020201160.fig2c.jpeg
60,1.CXRCTThoraximagesofCOVID-19fromSingapore.pdf...


In [12]:
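# Export of the duplicated EHL file names is disabled; uncomment the next line
# to write them (one per line, no header) to ehl_cohen_images.txt.
#out_ehl.to_csv('ehl_cohen_images.txt', index=False, header=False)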

In [13]:
# Which sources appear among the duplicate rows.
pd.unique(duplicates['source'])

array(['ehl', 'cohen'], dtype=object)

In [14]:
# Inspect a single EHL row by file name.
df_ehl.loc[df_ehl['path'].eq('COVID-1912132020 (89).jpg')]

Unnamed: 0,id,type,source,condition,path,resolutions
58,train_COVID-19_COVID-1912132020 (89).jpg,train,ehl,COVID-19,COVID-1912132020 (89).jpg,2000 2000


In [15]:
# Split the 'resolutions' string into its two numbers. The string holds the
# array shape's first entry (rows) and then its second (columns), so x takes
# the second token and y the first. Splitting on any whitespace works whether
# the two numbers are separated by a plain space or a non-breaking space
# ('\xa0'); splitting on '\xa0' alone fails on plain-space strings.
resolution_tokens = df_ehl['resolutions'].str.split()
df_ehl['resolution_x'] = resolution_tokens.str[1]
df_ehl['resolution_y'] = resolution_tokens.str[0]

In [16]:
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from matplotlib.colors import LinearSegmentedColormap

# Colour ramp for the 2-D histogram: exactly zero is white, anything just
# above zero starts at yellow and runs through green/blue to dark purple.
# N (the number of colour levels) is the row count of `data`, the COVIDx
# train manifest frame loaded in an earlier cell.
# NOTE(review): a large N presumably keeps low-count bins from collapsing
# onto the white zero entry - confirm before replacing it with a constant.
colour_stops = [
    (0, '#ffffff'),
    (1e-20, '#fde624'),
    (0.2, '#78d151'),
    (0.4, '#21a784'),
    (0.6, '#2a788e'),
    (0.8, '#404388'),
    (1, '#440053'),
]
white_viridis = LinearSegmentedColormap.from_list('white_viridis', colour_stops, N=data.shape[0])

x = df_ehl['resolution_x'].astype(float).to_numpy()
y = df_ehl['resolution_y'].astype(float).to_numpy()

fig, ax = plt.subplots(figsize=(10, 7.5))

ax.set_title('E*HealthLine Image Resolution Distributions')
ax.set_xlabel('Width')
ax.set_ylabel('Height')

# hist2d returns (counts, xedges, yedges, image); the image drives the colorbar.
hist_result = ax.hist2d(x, y, bins=(50, 50), cmap=white_viridis)

ax.grid()
fig.colorbar(hist_result[3], ax=ax)

fig.savefig('ehl_img_pre_cleanup.png')
plt.show()

<Figure size 1000x750 with 2 Axes>

In [17]:
# Number of EHL images per resolution string, most common first.
df_ehl.groupby('resolutions')[['id']].count().sort_values('id', ascending=False)

Unnamed: 0_level_0,id
resolutions,Unnamed: 1_level_1
320 390,843
320 320,473
320 389,173
320 369,103
320 371,54
...,...
2771 2979,1
2800 3408,1
2840 2972,1
2880 2376,1


In [18]:
# Sanity check: the per-resolution counts add up to the number of EHL rows.
df_ehl.groupby('resolutions')['id'].count().sum()

2305

In [20]:
# First five rows of the EHL frame, now including the resolution columns.
df_ehl.iloc[:5]

Unnamed: 0,id,type,source,condition,path,resolutions,resolution_x,resolution_y
0,train_COVID-19_COVID-19 _02132021(8).jpg,train,ehl,COVID-19,COVID-19 _02132021(8).jpg,400 339,339,400
1,train_COVID-19_COVID-19 _02132021(10).jpg,train,ehl,COVID-19,COVID-19 _02132021(10).jpg,768 936,936,768
2,train_COVID-19_COVID-19 _02132021(47).jpg,train,ehl,COVID-19,COVID-19 _02132021(47).jpg,651 913,913,651
3,train_COVID-19_covid-19-pneumonia-22-day2-pa.png,train,ehl,COVID-19,covid-19-pneumonia-22-day2-pa.png,1395 1205,1205,1395
4,train_COVID-19_8FDE8DBA-CFBD-4B4C-B1A4-6F36A93...,train,ehl,COVID-19,8FDE8DBA-CFBD-4B4C-B1A4-6F36A93B7E87.jpeg,657 657,657,657


In [22]:
# Re-check the number of EHL images per (split, condition) pair.
df_ehl.groupby(['type', 'condition'])[['id']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id
type,condition,Unnamed: 2_level_1
test,COVID-19,100
test,normal,1700
test,pneumonia,97
train,COVID-19,189
train,normal,198
train,pneumonia,21


In [23]:
# (number of rows, number of columns) of the EHL frame.
(len(df_ehl.index), len(df_ehl.columns))

(2305, 8)

In [25]:
# create total ehl text file for inference
#
# One space-separated line per file: id, name, class, source, split.
# Records are collected in a list and turned into a frame once at the end;
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
# NOTE(review): unlike the df_ehl cell, no extension filter is applied here, so
# every entry in a class folder is listed - confirm that is intended.
inference_records = []

for split in ['train', 'test']:
    # Directory listings are sorted so the running index embedded in each id
    # does not depend on the order the filesystem happens to return entries in.
    for ehl_dir in sorted(os.listdir('ehl_data/{}'.format(split))):
        if ehl_dir not in ['COVID-19', 'Pneumonia', 'Normal']:
            continue
        path = 'ehl_data/{}/{}'.format(split, ehl_dir)
        images = sorted(os.listdir(path))
        for idx, image in enumerate(images):
            inference_records.append({
                'id': 'ehl_{}_{}_{}'.format(split, ehl_dir, idx),
                # Spaces are stripped from the name because the output file
                # uses a single space as its field separator.
                'name': image.replace(" ", ""),
                'class': format_condition(ehl_dir),
                'source': 'ehl',
                'split': split
            })

ehl = pd.DataFrame(inference_records, columns=['id', 'name', 'class', 'source', 'split'], dtype=object)

ehl.to_csv('labels/total_ehl.txt', sep=' ', index=False, header=False)