# MetaCheX

## Step 1: Data Pre-processing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os
import cv2
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
import tensorflow as tf
from glob import glob
from keras.utils.np_utils import to_categorical   
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import roc_curve

PATH_TO_DATA_FOLDER = './data'
NIH_IMAGES = 'nih/images'
NIH_METADATA_PATH = 'nih/Data_Entry_2017.csv'
COVID_19_RADIOGRAPHY_IMAGES = 'COVID-19_Radiography_Dataset/images' ## note labels are in the filenames
COVID_CHESTXRAY_IMAGES = 'covid-chestxray-dataset/images'
COVID_CHESTXRAY_METADATA_PATH = 'covid-chestxray-dataset/metadata.csv'

IMAGE_SIZE = 224

If labels.csv exists, read from it.

Otherwise, extract filenames and labels for:
- ChestX-ray14 (NIH) dataset
- COVID-19 Radiography Dataset
- covid-chestxray-dataset

and put in labels.csv

In [2]:
labels_path = os.path.join(PATH_TO_DATA_FOLDER, 'labels.csv')
if os.path.isfile(labels_path): ## path exists
    df = pd.read_csv(labels_path)
    df['label_num'] = df.groupby(['label']).ngroup()
    df['label'] = df['label'].apply(lambda x: x[1:-1].split(', ')) ## convert to list of str
else:
    df = pd.DataFrame(columns=['image_path', 'label'])

    ## NIH
    full_path = os.path.join(PATH_TO_DATA_FOLDER, NIH_METADATA_PATH)
    df_nih = pd.read_csv(full_path)[['Image Index', 'Finding Labels']]
    df_nih.rename(columns={'Image Index': 'image_path', 'Finding Labels': 'label'}, inplace=True)
    df_nih['label'] = df_nih['label'].str.split('|')
    df_nih['image_path'] = PATH_TO_DATA_FOLDER + '/' + NIH_IMAGES + '/' + df_nih['image_path']
    df = df.append(df_nih)

    ## COVID_CHESTXRAY
    full_path = os.path.join(PATH_TO_DATA_FOLDER, COVID_CHESTXRAY_METADATA_PATH)
    df_cc = pd.read_csv(full_path)[['filename', 'finding']]
    df_cc.rename(columns={'filename': 'image_path', 'finding': 'label'}, inplace=True)
    df_cc = df_cc.drop(df_cc[(df_cc['label'] == 'todo') | (df_cc['label'] == 'Unknown')].index).reset_index(drop=True)
    df_cc['label'] = df_cc['label'].str.split('/')
    ## Remove the label after 'Pneumonia' that specifies type of pneumonia if given
    for i in range(df_cc.shape[0]):
        label = df_cc.at[i, 'label']
        if 'Pneumonia' in label and len(label) > 1:
            p_idx = label.index('Pneumonia')
            label.pop(p_idx + 1)
            #sort the labels to be in alphabetical order
            df_cc.at[i, 'label'] = sorted(label)

    df_cc['image_path'] = PATH_TO_DATA_FOLDER + '/' + COVID_CHESTXRAY_IMAGES + '/' + df_cc['image_path']
    df = df.append(df_cc)

    ## COVID-19 Radiography
    full_path = os.path.join(PATH_TO_DATA_FOLDER, COVID_19_RADIOGRAPHY_IMAGES)
    df_cr = pd.DataFrame(columns=['image_path', 'label'])
    image_lst = sorted(list(glob(f"{full_path}/*"))) ## gets list of all image filepaths
    label_arr = np.array([f[f.rindex('/') + 1:f.rindex('-')] for f in image_lst])
    label_arr = np.where(label_arr == 'COVID', 'COVID-19', label_arr) ## replace COVID with COVID-19 for consistency
    label_arr = np.where(label_arr == 'Viral Pneumonia', 'Pneumonia', label_arr)
    label_arr = np.where(label_arr == 'Normal', 'No Finding', label_arr) ## replace 'Normal' with 'No Finding'
    df_cr['image_path'] = image_lst
    df_cr['label'] = label_arr
    df_cr['label'] = df_cr['label'].str.split(pat='.') ## makes each label a list (random sep so that no split on space)
    df = df.append(df_cr)

    df = df.reset_index(drop=True)
    df.to_csv(labels_path, index=False)
    
    ## Reads in the new csv and adds label_num col (requires label to be str rather than list of str)
    df = pd.read_csv(labels_path)
    df['label_num'] = df.groupby(['label']).ngroup()
    df['label'] = df['label'].apply(lambda x: x[1:-1].split(', ')) ## convert to list of str
    
display(df.head(10))

Unnamed: 0,image_path,label,label_num
0,./data/nih/images/00000001_000.png,['Cardiomegaly'],416
1,./data/nih/images/00000001_001.png,"['Cardiomegaly', 'Emphysema']",390
2,./data/nih/images/00000001_002.png,"['Cardiomegaly', 'Effusion']",388
3,./data/nih/images/00000002_000.png,['No Finding'],827
4,./data/nih/images/00000003_000.png,['Hernia'],777
5,./data/nih/images/00000003_001.png,['Hernia'],777
6,./data/nih/images/00000003_002.png,['Hernia'],777
7,./data/nih/images/00000003_003.png,"['Hernia', 'Infiltration']",772
8,./data/nih/images/00000003_004.png,['Hernia'],777
9,./data/nih/images/00000003_005.png,['Hernia'],777


Get stats on the data: 
- Number of images with each label (individual)
- Number of unique labels (individual)
- Number of labels total (including combos)

In [3]:
unique_labels_dict = {} ## keys are str
unique_combos_dict = {} ## keys are tuples of str
for i in range(df.shape[0]):
    label = tuple(df.at[i, 'label'])
    for l in label:
        if l not in unique_labels_dict:
            unique_labels_dict[l] = 0
        unique_labels_dict[l] += 1
    
    if label not in unique_combos_dict:
        unique_combos_dict[label] = 0
    unique_combos_dict[label] += 1

df_label_nums = pd.DataFrame.from_dict(unique_labels_dict, orient='index', columns=['count']).sort_values(by=['count'], ascending=False)
df_combo_nums = pd.DataFrame.from_dict(unique_combos_dict, orient='index', columns=['count']).sort_values(by=['count'], ascending=False)
print("Number of total images: ", df_label_nums['count'].sum())
print("Number of total individual labels: ", df_label_nums.shape[0])
print("Number of total label combos (includes individual labels): ", df_combo_nums.shape[0])
print("****************************")
print("Number of images with each individual label (Top 20)")
display(df_label_nums.head(20))
print("\n")
print("Number of images with each combo label (Top 20)")
display(df_combo_nums.head(20))
print("\n")

## Get number of labels with number of images in each range
## > 10k
## 1k - 10k
## 100 - 1k
## 10 - 100
## < 10
bins =  np.array([0, 10, 100, 1000, 10000, 100000])
df_combo_counts = pd.DataFrame(columns=['count interval', 'number of labels'])
df_combo_counts['count interval'] = ["<= 10", "(10, 100]", "(100, 1k]", "(1k, 10k]", "> 10k"]

df_combo_counts['number of labels'] = [df_combo_nums[df_combo_nums['count'] <= 10].size,
                               df_combo_nums[(df_combo_nums['count'] > 10) & (df_combo_nums['count'] <= 1e2)].size,
                               df_combo_nums[(df_combo_nums['count'] > 1e2) & (df_combo_nums['count'] <= 1e3)].size,
                               df_combo_nums[(df_combo_nums['count'] > 1e3) & (df_combo_nums['count'] <= 1e4)].size,
                               df_combo_nums[df_combo_nums['count'] > 1e4].size
                              ]
    
display(df_combo_counts.head())


Number of total images:  164298
Number of total individual labels:  35
Number of total label combos (includes individual labels):  856
****************************
Number of images with each individual label (Top 20)


Unnamed: 0,count
'No Finding',70575
'Infiltration',19894
'Effusion',13317
'Atelectasis',11559
'Nodule',6331
'Lung_Opacity',6012
'Mass',5782
'Pneumothorax',5302
'Consolidation',4667
'COVID-19',4200




Number of images with each combo label (Top 20)


Unnamed: 0,count
"('No Finding',)",70575
"('Infiltration',)",9547
"('Lung_Opacity',)",6012
"('Atelectasis',)",4215
"('Effusion',)",3955
"('COVID-19',)",3616
"('Nodule',)",2705
"('Pneumothorax',)",2194
"('Mass',)",2139
"('Pneumonia',)",1766






Unnamed: 0,count interval,number of labels
0,<= 10,642
1,"(10, 100]",152
2,"(100, 1k]",46
3,"(1k, 10k]",15
4,> 10k,1


### Convert images to fixed-size arrays

In [None]:
def load_data(df):
    """
    df: DataFrame of image paths, label (list of str) and label num
    """
    image_arrays = []
    image_labels = df['label_num'].to_numpy()
    for index, row in df.iterrows():
        image = load_img(row['image_path'])
        image_arr = img_to_array(image)
        image_arr_resized = cv2.resize(image_arr, (IMAGE_SIZE, IMAGE_SIZE), interpolation=cv2.INTER_LANCZOS4)
        image_arr_norm = image_arr_resized / 255 ## pixels in [0, 255] -- normalize to [0, 1]
        image_arrays.append(image_arr_norm)

    all_images = np.stack(images)
    all_labels = to_categorical(image_labels, num_classes=image_labels.max() + 1) # one-hot
    return all_images, all_labels

## Preview first image and label
all_images, all_labels = load_data(df)
plt.imshow(all_images[0])
plt.xlabel(all_labels[0])

Save the images and labels

In [None]:
## Comment this out after running for the first time
saved_data_path='saved_data' 
with open(saved_data_path, 'wb') as f:
    np.save(f, all_images)
    np.save(f, all_labels)

Load images and labels

In [None]:
saved_data_path='saved_data'
with open(saved_data_path, 'rb') as f:
    all_images = np.load(f)
    all_labels = np.load(f)

## Step 2: Pre-Trained CheXNet

Note: CheXNet = DenseNet121 trained on ChestX-ray14 dataset (multi-task binary classification)

Pre-trained weights: https://github.com/brucechou1983/CheXNet-Keras

In [18]:
def load_chexnet_pretrained(class_names=np.arange(14), weights_path='chexnet_weights.h5', 
                            input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3)):

    img_input = tf.keras.layers.Input(shape=input_shape)
    base_model = tf.keras.applications.densenet.DenseNet121(include_top=False, weights=None, 
                                                            input_tensor=img_input, pooling='avg')


    x = base_model.output
    predictions = tf.keras.layers.Dense(len(class_names), activation="sigmoid", name="predictions")(x)
    model = tf.keras.models.Model(inputs=img_input, outputs=predictions)
    model.load_weights(weights_path)

    return model


def load_chexnet(output_dim, mode=0):
    """
    output_dim: dimension of output (either for prediction or embedding dim)
    mode: prediction (0), nearest neighbours (1)
    """
    
    base_model_old = load_chexnet_pretrained()
    x = base_model_old.layers[-2].output ## remove old prediction layer
    
    ## The prediction/embedding head can be more complicated if you want
    if mode == 0: ## prediction
        predictions = tf.keras.layers.Dense(output_dim, activation='softmax', name='prediction')(x)
        chexnet = tf.keras.models.Model(inputs=base_model_old.inputs, outputs=predictions)
        
    else: ## nearest neighbours -- get embedding
        embeddings = tf.keras.layers.Dense(output_dim, name='embedding')(x)
        chexnet = tf.keras.models.Model(inputs=base_model_old.inputs, outputs=embeddings)
    
    return chexnet
    
chexnet = load_chexnet(20)
print(chexnet.summary())

Model: "model_13"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
zero_padding2d_14 (ZeroPadding2 (None, 230, 230, 3)  0           input_8[0][0]                    
__________________________________________________________________________________________________
conv1/conv (Conv2D)             (None, 112, 112, 64) 9408        zero_padding2d_14[0][0]          
__________________________________________________________________________________________________
conv1/bn (BatchNormalization)   (None, 112, 112, 64) 256         conv1/conv[0][0]                 
___________________________________________________________________________________________

Total params: 7,058,004
Trainable params: 6,974,356
Non-trainable params: 83,648
__________________________________________________________________________________________________
None


### Class balancing -- for finetuned CheXNet baseline and finetuned CheXNet w/ supervised contrastive learning
Oversampling and undersampling

In [None]:
## TODO
balanced_data =
balanced_labels = 

### Data split -- note that the split on nih data must coincide with pre-trained split

In [None]:
## TODO

### Train baseline -- cross-entropy

In [None]:
train_x, train_y, val_x, val_y, test_x, test_y = train_val_test_split(balanced_data, balanced_labels)

output_dim = ..
chexnet_ce = load(output_dim)
chexnet_ce.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

batch_size = ..
epochs = ..
chexnet_ce.fit(x=train_x, y=train_y,
              validation_data=(val_x, val_y),
              batch_size=,
              epochs=)

### Evaluate CE baseline

In [None]:
print(chexnet_ce.evaluate(test_x, test_y))

### Train baseline -- nearest neighbours

In [1]:
embedding_dim = ..
chexnet_nn = load(embedding_dim, mode=1)

## TODO

SyntaxError: invalid syntax (2568707853.py, line 1)