# MetaCheX

## Step 1: Data Pre-processing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os
import cv2
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
import tensorflow as tf
from glob import glob
from keras.utils.np_utils import to_categorical   
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import roc_curve

PATH_TO_DATA_FOLDER = './data'
NIH_IMAGES = 'nih/images'
NIH_METADATA_PATH = 'nih/Data_Entry_2017.csv'
COVID_19_RADIOGRAPHY_IMAGES = 'COVID-19_Radiography_Dataset/images' ## note labels are in the filenames
COVID_CHESTXRAY_IMAGES = 'covid-chestxray-dataset/images'
COVID_CHESTXRAY_METADATA_PATH = 'covid-chestxray-dataset/metadata.csv'

IMAGE_SIZE = 224

If labels.csv exists, read from it.

Otherwise, extract filenames and labels for:
- ChestX-ray14 (NIH) dataset
- COVID-19 Radiography Dataset
- covid-chestxray-dataset

and put in labels.csv

In [104]:
labels_path = os.path.join(PATH_TO_DATA_FOLDER, 'labels.csv')
if not os.path.isfile(labels_path): ## path exists
    df = pd.DataFrame(columns=['image_path', 'label'])

    ## NIH
    full_path = os.path.join(PATH_TO_DATA_FOLDER, NIH_METADATA_PATH)
    df_nih = pd.read_csv(full_path)[['Image Index', 'Finding Labels']]
    df_nih.rename(columns={'Image Index': 'image_path', 'Finding Labels': 'label'}, inplace=True)
    df_nih['label'] = df_nih['label'].str.strip().str.split('|')
    df_nih['image_path'] = PATH_TO_DATA_FOLDER + '/' + NIH_IMAGES + '/' + df_nih['image_path']
    df = df.append(df_nih)

    ## COVID_CHESTXRAY
    full_path = os.path.join(PATH_TO_DATA_FOLDER, COVID_CHESTXRAY_METADATA_PATH)
    df_cc = pd.read_csv(full_path)[['filename', 'finding']]
    df_cc.rename(columns={'filename': 'image_path', 'finding': 'label'}, inplace=True)
    df_cc = df_cc.drop(df_cc[(df_cc['label'] == 'todo') | (df_cc['label'] == 'Unknown')].index).reset_index(drop=True)
    df_cc['label'] = df_cc['label'].str.strip().str.split('/')
    ## Remove the label after 'Pneumonia' that specifies type of pneumonia if given
    for i in range(df_cc.shape[0]):
        label = df_cc.at[i, 'label']
        if 'Pneumonia' in label and len(label) > 1:
            p_idx = label.index('Pneumonia')
            label.pop(p_idx + 1)
            #sort the labels to be in alphabetical order
            df_cc.at[i, 'label'] = sorted(label)
    
    df_cc['image_path'] = PATH_TO_DATA_FOLDER + '/' + COVID_CHESTXRAY_IMAGES + '/' + df_cc['image_path']
    df = df.append(df_cc)

    ## COVID-19 Radiography
    full_path = os.path.join(PATH_TO_DATA_FOLDER, COVID_19_RADIOGRAPHY_IMAGES)
    df_cr = pd.DataFrame(columns=['image_path', 'label'])
    image_lst = sorted(list(glob(f"{full_path}/*"))) ## gets list of all image filepaths
    label_arr = np.array([f[f.rindex('/') + 1:f.rindex('-')] for f in image_lst])
    label_arr = np.where(label_arr == 'COVID', 'COVID-19', label_arr) ## replace COVID with COVID-19 for consistency
    label_arr = np.where(label_arr == 'Viral Pneumonia', 'Pneumonia', label_arr)
    label_arr = np.where(label_arr == 'Normal', 'No Finding', label_arr) ## replace 'Normal' with 'No Finding'
    df_cr['image_path'] = image_lst
    df_cr['label'] = label_arr
    df_cr['label'] = df_cr['label'].str.strip().str.split(pat='.') ## makes each label a list (random sep so that no split on space)
    df = df.append(df_cr)

    df = df.reset_index(drop=True)
    df['label'] = df['label'].sort_values().apply(lambda x: sorted(x)) ## final sort just in case
    df['label_str'] = df['label'].str.join('|')
    df.to_csv(labels_path, index=False, columns=['image_path', 'label_str'])
    
## Reads in csv file and adds label_num col (requires label to be str rather than list of str)
df = pd.read_csv(labels_path)
## Add back 'label' column
df['label'] = df['label_str'].str.split(pat='|')
## Get combo label (for multiclass classification)
df['label_num_multi'] = df.groupby(['label_str']).ngroup()

display(df.head(10))

Unnamed: 0,image_path,label_str,label,label_num_multi
0,./data/nih/images/00000001_000.png,Cardiomegaly,[Cardiomegaly],310
1,./data/nih/images/00000001_001.png,Cardiomegaly|Emphysema,"[Cardiomegaly, Emphysema]",387
2,./data/nih/images/00000001_002.png,Cardiomegaly|Effusion,"[Cardiomegaly, Effusion]",355
3,./data/nih/images/00000002_000.png,No Finding,[No Finding],802
4,./data/nih/images/00000003_000.png,Hernia,[Hernia],750
5,./data/nih/images/00000003_001.png,Hernia,[Hernia],750
6,./data/nih/images/00000003_002.png,Hernia,[Hernia],750
7,./data/nih/images/00000003_003.png,Hernia|Infiltration,"[Hernia, Infiltration]",751
8,./data/nih/images/00000003_004.png,Hernia,[Hernia],750
9,./data/nih/images/00000003_005.png,Hernia,[Hernia],750


Get stats on the data: 
- Number of images with each label (individual)
- Number of unique labels (individual)
- Number of labels total (including combos)

In [106]:
unique_labels_dict = {} ## keys are str
unique_combos_dict = {} ## keys are tuples of str
for i in range(df.shape[0]):
    labels = df.at[i, 'label']
    for l in labels:
        if l not in unique_labels_dict:
            unique_labels_dict[l] = 0
        unique_labels_dict[l] += 1
    
    label_str = df.at[i, 'label_str']
    if label_str not in unique_combos_dict:
        unique_combos_dict[label_str] = 0
    unique_combos_dict[label_str] += 1

df_label_nums = pd.DataFrame.from_dict(unique_labels_dict, orient='index', columns=['count']).sort_values(by=['count'], ascending=False)
df_combo_nums = pd.DataFrame.from_dict(unique_combos_dict, orient='index', columns=['count']).sort_values(by=['count'], ascending=False)
print("Number of total images: ", df_label_nums['count'].sum())
print("Number of total individual labels: ", df_label_nums.shape[0])
print("Number of total label combos (includes individual labels): ", df_combo_nums.shape[0])
print("****************************")
print("Number of images with each individual label (Top 20)")
display(df_label_nums)
print("\n")
print("Number of images with each combo label (Top 20)")
display(df_combo_nums.tail(20))
print("\n")

## Get number of labels with number of images in each range
## > 10k
## 1k - 10k
## 100 - 1k
## 10 - 100
## < 10
bins =  np.array([0, 10, 100, 1000, 10000, 100000])
df_combo_counts = pd.DataFrame(columns=['count interval', 'number of labels'])
df_combo_counts['count interval'] = ["< 5", "[5, 100)", "[100, 1k)", "[1k, 10k)", ">= 10k"]

df_combo_counts['number of labels'] = [
                               df_combo_nums[df_combo_nums['count'] < 5].size,
                               df_combo_nums[(df_combo_nums['count'] >= 5) & (df_combo_nums['count'] < 1e2)].size,
                               df_combo_nums[(df_combo_nums['count'] >= 1e2) & (df_combo_nums['count'] < 1e3)].size,
                               df_combo_nums[(df_combo_nums['count'] >= 1e3) & (df_combo_nums['count'] < 1e4)].size,
                               df_combo_nums[df_combo_nums['count'] >= 1e4].size
                              ]
    
display(df_combo_counts.head())

Number of total images:  164298
Number of total individual labels:  35
Number of total label combos (includes individual labels):  821
****************************
Number of images with each individual label (Top 20)


Unnamed: 0,count
No Finding,70575
Infiltration,19894
Effusion,13317
Atelectasis,11559
Nodule,6331
Lung_Opacity,6012
Mass,5782
Pneumothorax,5302
Consolidation,4667
COVID-19,4200




Number of images with each combo label (Top 20)


Unnamed: 0,count
Consolidation|Effusion|Infiltration|Mass|Pneumothorax,1
Atelectasis|Consolidation|Effusion|Nodule|Pneumothorax,1
Effusion|Emphysema|Infiltration|Mass,1
Atelectasis|Consolidation|Effusion|Emphysema|Mass|Pneumothorax,1
Cardiomegaly|Effusion|Pleural_Thickening|Pneumothorax,1
Atelectasis|Effusion|Fibrosis|Infiltration|Nodule,1
Effusion|Fibrosis|Mass|Pleural_Thickening,1
Cardiomegaly|Consolidation|Effusion|Infiltration|Mass|Pleural_Thickening,1
Atelectasis|Mass|Nodule|Pneumonia,1
Atelectasis|Emphysema|Fibrosis|Infiltration,1






Unnamed: 0,count interval,number of labels
0,< 5,493
1,"[5, 100)",266
2,"[100, 1k)",46
3,"[1k, 10k)",15
4,>= 10k,1


Remove all classes with less than 5 examples

In [114]:
df_combos_exclude = df_combo_nums[df_combo_nums['count'] < 5].reset_index().rename(columns={'index': 'label_str'})
display(df_combos_exclude.head(10))

df_condensed = df[~df['label_str'].isin(df_combos_exclude['label_str'])].reset_index(drop=True)
display(df_condensed.tail(10))

Unnamed: 0,label_str,count
0,Atelectasis|Emphysema|Pleural_Thickening,4
1,Atelectasis|Effusion|Mass|Pneumothorax,4
2,Infiltration|Pleural_Thickening|Pneumonia,4
3,Consolidation|Edema|Effusion|Infiltration|Mass,4
4,Consolidation|Fibrosis|Mass|Pleural_Thickening,4
5,Atelectasis|Consolidation|Edema|Effusion|Infil...,4
6,Emphysema|Fibrosis|Pneumothorax,4
7,Atelectasis|Cardiomegaly|Effusion|Mass,4
8,Atelectasis|Infiltration|Nodule|Pleural_Thicke...,4
9,Atelectasis|Effusion|Hernia,4


Unnamed: 0,image_path,label_str,label,label_num_multi
133308,./data/COVID-19_Radiography_Dataset/images/Vir...,Pneumonia,[Pneumonia],814
133309,./data/COVID-19_Radiography_Dataset/images/Vir...,Pneumonia,[Pneumonia],814
133310,./data/COVID-19_Radiography_Dataset/images/Vir...,Pneumonia,[Pneumonia],814
133311,./data/COVID-19_Radiography_Dataset/images/Vir...,Pneumonia,[Pneumonia],814
133312,./data/COVID-19_Radiography_Dataset/images/Vir...,Pneumonia,[Pneumonia],814
133313,./data/COVID-19_Radiography_Dataset/images/Vir...,Pneumonia,[Pneumonia],814
133314,./data/COVID-19_Radiography_Dataset/images/Vir...,Pneumonia,[Pneumonia],814
133315,./data/COVID-19_Radiography_Dataset/images/Vir...,Pneumonia,[Pneumonia],814
133316,./data/COVID-19_Radiography_Dataset/images/Vir...,Pneumonia,[Pneumonia],814
133317,./data/COVID-19_Radiography_Dataset/images/Vir...,Pneumonia,[Pneumonia],814


### Dataloader

In [None]:
## Not gonna do this
# def load_data(df):
#     """
#     df: DataFrame of image paths, label (list of str) and label num
#     """
#     image_arrays = []
#     image_labels = df['label_num'].to_numpy()
#     for index, row in df.iterrows():
#         image = load_img(row['image_path'])
#         image_arr = img_to_array(image)
#         image_arr_resized = cv2.resize(image_arr, (IMAGE_SIZE, IMAGE_SIZE), interpolation=cv2.INTER_LANCZOS4)
#         image_arr_norm = image_arr_resized / 255 ## pixels in [0, 255] -- normalize to [0, 1]
#         image_arrays.append(image_arr_norm)

#     all_images = np.stack(images)
#     all_labels = to_categorical(image_labels, num_classes=image_labels.max() + 1) # one-hot
#     return all_images, all_labels

## Step 2: Finetuned CheXNet Baseline

Note: CheXNet = DenseNet121 trained on ChestX-ray14 dataset (multi-task binary classification)

Pre-trained weights: https://github.com/brucechou1983/CheXNet-Keras

In [None]:
def load_chexnet_pretrained(class_names=np.arange(14), weights_path='chexnet_weights.h5', 
                            input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3)):

    img_input = tf.keras.layers.Input(shape=input_shape)
    base_model = tf.keras.applications.densenet.DenseNet121(include_top=False, weights=None, 
                                                            input_tensor=img_input, pooling='avg')


    x = base_model.output
    predictions = tf.keras.layers.Dense(len(class_names), activation="sigmoid", name="predictions")(x)
    model = tf.keras.models.Model(inputs=img_input, outputs=predictions)
    model.load_weights(weights_path)

    return model


def load_chexnet(output_dim):
    """
    output_dim: dimension of output
    """
    
    base_model_old = load_chexnet_pretrained()
    x = base_model_old.layers[-2].output ## remove old prediction layer
    
    ## The prediction head can be more complicated if you want
    predictions = tf.keras.layers.Dense(output_dim, activation='softmax', name='prediction')(x)
    chexnet = tf.keras.models.Model(inputs=base_model_old.inputs, outputs=predictions)
    return chexnet
    
chexnet = load_chexnet(34)
print(chexnet.summary())

### Class balancing -- for finetuned CheXNet baseline and finetuned CheXNet w/ supervised contrastive learning
Data augmentation (minor rotations, flips) and oversampling of minority classes (classes w/ < 1k examples); undersampling majority classes (classes w/ > 10k examples)

In [7]:
## TODO

### Data split -- note that the split on NIH data must coincide with pre-trained split (or else leakage)

In [None]:
## TODO

### Train baseline -- multi-task binary classification

In [None]:
train_x, train_y, val_x, val_y, test_x, test_y = train_val_test_split(balanced_data, balanced_labels)

output_dim = ..
chexnet_ce = load(output_dim)
chexnet_ce.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

batch_size = ..
epochs = ..
chexnet_ce.fit(x=train_x, y=train_y,
              validation_data=(val_x, val_y),
              batch_size=,
              epochs=)

### Evaluate CE baseline

In [None]:
print(chexnet_ce.evaluate(test_x, test_y))