# MetaCheX

## Step 1: Data Pre-processing

In [180]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os
import cv2
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
import tensorflow as tf
from glob import glob
from pathlib import Path
from sklearn.metrics import roc_curve

PATH_TO_DATA_FOLDER = './data'
NIH_IMAGES = 'nih/images'
NIH_METADATA_PATH = 'nih/Data_Entry_2017.csv'
COVID_19_RADIOGRAPHY_IMAGES = 'COVID-19_Radiography_Dataset/images' ## note labels are in the filenames
COVID_CHESTXRAY_IMAGES = 'covid-chestxray-dataset/images'
COVID_CHESTXRAY_METADATA_PATH = 'covid-chestxray-dataset/metadata.csv'

# SIIM_IMAGE_LABELS = 'siim-covid19-detection/train_image_level.csv'
# SIIM_STUDY_LABELS = 'siim-covid19-detection/train_study_level.csv' ## ???

If labels.csv exists, read from it.

Otherwise, extract filenames and labels for:
- ChestX-ray14 (NIH) dataset
- COVID-19 Radiography Dataset
- covid-chestxray-dataset

and put in labels.csv

In [184]:
## For the SIIM dataset -- just to show that the labelling doesn't make sense (actual siim data not added to repo)
labels_path = os.path.join(PATH_TO_DATA_FOLDER, SIIM_IMAGE_LABELS)
df = pd.read_csv(labels_path)[['id', 'label']]
print(df.head(10))

                   id                                              label
0  000a312787f2_image  opacity 1 789.28836 582.43035 1815.94498 2499....
1  000c3a3f293f_image                                     none 1 0 0 1 1
2  0012ff7358bc_image  opacity 1 677.42216 197.97662 1545.21983 1197....
3  001398f4ff4f_image    opacity 1 2729 2181.33331 3677.00012 2785.33331
4  001bd15d1891_image  opacity 1 623.23328 1050 1337.23328 2156 opaci...
5  0022227f5adf_image  opacity 1 1857.2065 508.30565 2233.23384 907.8...
6  0023f02ae886_image                                     none 1 0 0 1 1
7  002e9b2128d0_image  opacity 1 276.72917 627.42968 1187.31776 2283....
8  00326161e51e_image  opacity 1 542.96057 364.37257 1133.32317999999...
9  0044e449aae1_image  opacity 1 1703.78756 938.29333 2614.8275599999...


In [170]:
labels_path = os.path.join(PATH_TO_DATA_FOLDER, 'labels.csv')
if os.path.isfile(labels_path): ## path exists
    df = pd.read_csv(labels_path)
else:
    df = pd.DataFrame(columns=['image_path', 'label'])

    ## NIH
    full_path = os.path.join(PATH_TO_DATA_FOLDER, NIH_METADATA_PATH)
    df_nih = pd.read_csv(full_path)[['Image Index', 'Finding Labels']]
    df_nih.rename(columns={'Image Index': 'image_path', 'Finding Labels': 'label'}, inplace=True)
    df_nih['label'] = df_nih['label'].str.split('|')
    df_nih['image_path'] = PATH_TO_DATA_FOLDER + '/' + NIH_IMAGES + '/' + df_nih['image_path']
    df = df.append(df_nih)

    ## COVID_CHESTXRAY
    full_path = os.path.join(PATH_TO_DATA_FOLDER, COVID_CHESTXRAY_METADATA_PATH)
    df_cc = pd.read_csv(full_path)[['filename', 'finding']]
    df_cc.rename(columns={'filename': 'image_path', 'finding': 'label'}, inplace=True)
    df_cc = df_cc.drop(df_cc[(df_cc['label'] == 'todo') | (df_cc['label'] == 'Unknown')].index).reset_index(drop=True)
    df_cc['label'] = df_cc['label'].str.split('/')
    ## Remove the label after 'Pneumonia' that specifies type of pneumonia if given
    for i in range(df_cc.shape[0]):
        label = df_cc.at[i, 'label']
        if 'Pneumonia' in label and len(label) > 1:
            p_idx = label.index('Pneumonia')
            label.pop(p_idx + 1)
            #sort the labels to be in alphabetical order
            df_cc.at[i, 'label'] = sorted(label)

    df_cc['image_path'] = PATH_TO_DATA_FOLDER + '/' + COVID_CHESTXRAY_IMAGES + '/' + df_cc['image_path']
    df = df.append(df_cc)

    ## COVID-19 Radiography
    full_path = os.path.join(PATH_TO_DATA_FOLDER, COVID_19_RADIOGRAPHY_IMAGES)
    df_cr = pd.DataFrame(columns=['image_path', 'label'])
    image_lst = sorted(list(glob(f"{full_path}/*"))) ## gets list of all image filepaths
    label_arr = np.array([f[f.rindex('/') + 1:f.rindex('-')] for f in image_lst])
    label_arr = np.where(label_arr == 'COVID', 'COVID-19', label_arr) ## replace COVID with COVID-19 for consistency
    label_arr = np.where(label_arr == 'Viral Pneumonia', 'Pneumonia', label_arr)
    label_arr = np.where(label_arr == 'Normal', 'No Finding', label_arr) ## replace 'Normal' with 'No Finding'
    df_cr['image_path'] = image_lst
    df_cr['label'] = label_arr
    df_cr['label'] = df_cr['label'].str.split(pat='.') ## makes each label a list (random sep so that no split on space)
    df = df.append(df_cr)

    df = df.reset_index(drop=True)
    df.to_csv(labels_path)

Get stats on the data: 
- Number of images with each label (individual)
- Number of unique labels (individual)
- Number of labels total (including combos)

In [164]:
unique_labels_dict = {} ## keys are str
unique_combos_dict = {} ## keys are tuples of str
for i in range(df.shape[0]):
    label = tuple(df.at[i, 'label'])
    for l in label:
        if l not in unique_labels_dict:
            unique_labels_dict[l] = 0
        unique_labels_dict[l] += 1
    
    if label not in unique_combos_dict:
        unique_combos_dict[label] = 0
    unique_combos_dict[label] += 1

df_label_nums = pd.DataFrame.from_dict(unique_labels_dict, orient='index', columns=['count']).sort_values(by=['count'], ascending=False)
df_combo_nums = pd.DataFrame.from_dict(unique_combos_dict, orient='index', columns=['count']).sort_values(by=['count'], ascending=False)
print("Number of total images: ", df_label_nums['count'].sum())
print("Number of total individual labels: ", df_label_nums.shape[0])
print("Number of total label combos (includes individual labels): ", df_combo_nums.shape[0])
print("****************************")
print("Number of images with each individual label (Top 20)")
print(df_label_nums.head(20))
print("\n")
print("Number of images with each combo label (Top 20)")
print(df_combo_nums.head(20))



Number of total images:  164298
Number of total individual labels:  35
Number of total label combos (includes individual labels):  856
****************************
Number of images with each individual label (Top 20)
                    count
No Finding          70575
Infiltration        19894
Effusion            13317
Atelectasis         11559
Nodule               6331
Lung_Opacity         6012
Mass                 5782
Pneumothorax         5302
Consolidation        4667
COVID-19             4200
Pneumonia            3602
Pleural_Thickening   3385
Cardiomegaly         2776
Emphysema            2516
Edema                2303
Fibrosis             1686
Hernia                227
Pneumocystis           30
Streptococcus          22
Tuberculosis           18


Number of images with each combo label (Top 20)
                                       count
(No Finding,)                          70575
(Infiltration,)                         9547
(Lung_Opacity,)                         6012
(Atelec