# MetaCheX

In [None]:
# standard library
import os
import pickle
import random
from glob import glob

# third-party
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
# from keras.utils.np_utils import to_categorical   
# from imblearn.over_sampling import RandomOverSampler
# Aliased because the notebook defines its own plotting helper named `precision_recall_curve`.
from sklearn.metrics import precision_recall_curve as sk_precision_recall_curve
from sklearn.metrics import roc_curve
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from tensorflow.python.ops.numpy_ops import np_config

np_config.enable_numpy_behavior()

# metachex materials
from metachex.configs.config import *
from metachex.dataloader import MetaChexDataset
from metachex.loss import Losses

In [None]:
# os.environ["CUDA_VISIBLE_DEVICES"]=""
# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
# Must run before any op touches the GPU. Iterating over the device list keeps the
# setting identical on every GPU and makes this cell a no-op on a CPU-only machine.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

### Instantiate Dataset and Splits

In [3]:
# Build the MetaCheX dataset wrapper; construction logs the pre-processing,
# truncation and train/val/test split steps (see the cell output).
dataset = MetaChexDataset()
# Disabled: class weights for a weighted loss are fetched later, in the training cell.
# indiv_class_weights, combo_class_weights = dataset.get_class_weights()
# Keep a short handle on the training split for the inspection cells below.
train_ds = dataset.train_ds

[INFO] pre-processing
Data already processed. Loading from save /home/elainesui/MetaCheX/data/data.pkl
[INFO] truncating dataset
[INFO] constructing tf train/val/test vars


2021-11-06 14:43:24.261239: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-06 14:43:24.262071: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-06 14:43:24.262775: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-06 14:43:24.263587: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-06 14:43:24.264375: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

NIH ds sizes [56603, 3485, 853]
Other ds sizes [0, 3787, 7994]
56603
7272
8847
Total samples:  72722
True split:  [0.778347680206815, 0.09999724980061055, 0.12165506999257446]
[INFO] shuffle & batch
[INFO] initialized


In [4]:
# elaine_repo_path = '/home/elainesui/MetaCheX'
# os.chdir(elaine_repo_path)

In [5]:
# Sanity check: pull one batch from the training pipeline and show the image
# tensor shape (the recorded run shows [8, 224, 224, 3]).
images, labels = next(iter(train_ds))
images.shape

2021-11-06 14:43:26.324453: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-11-06 14:43:26.701298: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


TensorShape([8, 224, 224, 3])

### Get data stats: 
- Number of images with each label (individual)
- Number of unique labels (individual)
- Number of labels total (including combos)

In [6]:
# Check out the data: label statistics for the full (un-condensed) dataframe.
# get_data_stats returns four objects that are reused (and overwritten) by later cells.
unique_labels_dict, df_combo_counts, df_label_nums, df_combo_nums = dataset.get_data_stats(dataset.df)

# Headline counts: df_combo_nums has one row per label combination with a 'count'
# column, df_label_nums has one row per individual label.
print("Number of total images: ", df_combo_nums['count'].sum())
print("Number of total individual labels (includes 'No Finding'): ", df_label_nums.shape[0])
print("Number of total label combos (includes individual labels): ", df_combo_nums.shape[0])
print("****************************")
print("Number of images with each individual label")
display(df_label_nums)
print("\n")
# The tail of the combo table holds the rarest label combinations.
print("Number of images with each combo label (Bottom 20)")
display(df_combo_nums.tail(20))
print("\n")
# Histogram-style summary: how many labels fall into each count interval.
display(df_combo_counts.head())

Number of total images:  103681
Number of total individual labels (includes 'No Finding'):  35
Number of total label combos (includes individual labels):  821
****************************
Number of images with each individual label


Unnamed: 0,count
Infiltration,19894
Effusion,13317
Atelectasis,11559
No Finding,10000
Nodule,6331
Lung_Opacity,6012
Mass,5782
Pneumothorax,5302
Consolidation,4667
COVID-19,4179




Number of images with each combo label (Bottom 20)


Unnamed: 0,count
Atelectasis|Consolidation|Effusion|Infiltration|Pneumonia,1
Atelectasis|Consolidation|Effusion|Infiltration|Nodule|Pleural_Thickening,1
Atelectasis|Mass|Nodule|Pneumonia,1
Effusion|Mass|Pneumonia|Pneumothorax,1
Atelectasis|Consolidation|Edema|Effusion|Infiltration|Pneumonia,1
Atelectasis|Consolidation|Effusion|Emphysema|Mass|Pneumothorax,1
Consolidation|Pneumonia|Pneumothorax,1
Atelectasis|Effusion|Fibrosis|Infiltration|Nodule,1
Effusion|Fibrosis|Mass|Pleural_Thickening,1
Cardiomegaly|Consolidation|Effusion|Infiltration|Mass|Pleural_Thickening,1






Unnamed: 0,count interval,number of labels
0,< 5,493
1,"[5, 100)",266
2,"[100, 1k)",46
3,"[1k, 10k)",15
4,>= 10k,1


### Get updated data stats

In [7]:
print("Stats for condensed dataset")
print("---------------------------")
# Same statistics as above, recomputed on the condensed dataframe. This overwrites
# the four names produced by the previous stats cell.
unique_labels_dict, df_combo_counts, df_label_nums, df_combo_nums = dataset.get_data_stats(dataset.df_condensed)
# Positional lookup: a filtered dataframe is not guaranteed to still carry index label 0.
print(dataset.df_condensed['label_multitask'].iloc[0].shape)

print("Number of total images: ", df_combo_nums['count'].sum())
print("Number of total individual labels (includes 'No Finding'): ", df_label_nums.shape[0])
print("Number of total label combos (includes individual labels): ", df_combo_nums.shape[0])
print("****************************")
print("Number of images with each individual label")
display(df_label_nums)
print("\n")
print("Number of images with each combo label (Bottom 20)")
display(df_combo_nums.tail(20))
print("\n")
display(df_combo_counts.head())

## Just making sure that the weights match with the correct labels
# Inverse-frequency weight per label, scaled by (total count / number of labels),
# then put in alphabetical order with 'No Finding' removed.
indiv = df_label_nums
indiv_weights = (1 / indiv['count']) * (indiv['count'].sum() / indiv.shape[0])
indiv_weights = indiv_weights.sort_index()
indiv_weights = indiv_weights.drop(['No Finding'])
display(indiv_weights)
indiv_class_weights = dict(enumerate(indiv_weights.values))
display(indiv_class_weights)

unique_labels = list(unique_labels_dict.keys())
unique_labels.remove('No Finding')
unique_labels.sort() ## alphabetical order
print(unique_labels)

# Turn the visual comparison into a real check: weight i must belong to label i.
assert list(indiv_weights.index) == unique_labels, "class weights are not aligned with unique_labels"

Stats for condensed dataset
---------------------------
(27,)
Number of total images:  100224
Number of total individual labels (includes 'No Finding'):  28
Number of total label combos (includes individual labels):  328
****************************
Number of images with each individual label


Unnamed: 0,count
Infiltration,19510
Effusion,12915
Atelectasis,11201
No Finding,10000
Nodule,6087
Lung_Opacity,6012
Mass,5470
Pneumothorax,5062
Consolidation,4398
COVID-19,4179




Number of images with each combo label (Bottom 20)


Unnamed: 0,count
Emphysema|Pneumonia,5
Consolidation|Effusion|Infiltration|Pneumothorax,5
Atelectasis|Nodule|Pneumonia,5
Effusion|Emphysema|Nodule,5
Atelectasis|Consolidation|Effusion|Emphysema,5
Cardiomegaly|Effusion|Emphysema|Pneumothorax,5
Consolidation|Effusion|Infiltration|Pneumonia,5
Cardiomegaly|Effusion|Infiltration|Pleural_Thickening,5
Atelectasis|Effusion|Nodule|Pleural_Thickening,5
Emphysema|Mass|Nodule,5






Unnamed: 0,count interval,number of labels
0,< 5,0
1,"[5, 100)",266
2,"[100, 1k)",46
3,"[1k, 10k)",15
4,>= 10k,1


Atelectasis             0.319563
COVID-19                0.856528
Cardiomegaly            1.393316
Consolidation           0.813876
Edema                   1.698021
Effusion                0.277153
Emphysema               1.536235
Fibrosis                2.321290
Hernia                 20.222760
Infiltration            0.183466
Influenza             715.885714
Klebsiella            357.942857
Legionella            357.942857
Lung_Opacity            0.595381
MERS-CoV              357.942857
Mass                    0.654375
Mycoplasma            325.402597
Nocardia              447.428571
Nodule                  0.588045
Pleural_Thickening      1.147621
Pneumocystis          119.314286
Pneumonia               1.053083
Pneumothorax            0.707117
SARS                  223.714286
Streptococcus         162.701299
Tuberculosis          198.857143
Varicella             596.571429
Name: count, dtype: float64

{0: 0.31956330429681024,
 1: 0.8565275356373706,
 2: 1.3933159094700551,
 3: 0.8138764373416488,
 4: 1.6980211439414477,
 5: 0.27715281234445,
 6: 1.536235438381361,
 7: 2.321289605336298,
 8: 20.222760290556902,
 9: 0.18346635425056748,
 10: 715.8857142857144,
 11: 357.9428571428572,
 12: 357.9428571428572,
 13: 0.5953806672369547,
 14: 357.9428571428572,
 15: 0.6543745103160095,
 16: 325.40259740259745,
 17: 447.42857142857144,
 18: 0.5880447792719848,
 19: 1.147620574359914,
 20: 119.31428571428572,
 21: 1.0530828394906064,
 22: 0.7071174578088841,
 23: 223.71428571428572,
 24: 162.70129870129873,
 25: 198.85714285714286,
 26: 596.5714285714286}

['Atelectasis', 'COVID-19', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Influenza', 'Klebsiella', 'Legionella', 'Lung_Opacity', 'MERS-CoV', 'Mass', 'Mycoplasma', 'Nocardia', 'Nodule', 'Pleural_Thickening', 'Pneumocystis', 'Pneumonia', 'Pneumothorax', 'SARS', 'Streptococcus', 'Tuberculosis', 'Varicella']


## Step 2: Finetuned CheXNet Baseline

Note: CheXNet = DenseNet121 trained on ChestX-ray14 dataset (multi-task binary classification)

Pre-trained weights: https://github.com/brucechou1983/CheXNet-Keras

In [8]:
# Magic numbers
# NOTE(review): `batchsize` is not referenced by any visible cell (training uses
# dataset.batch_size) -- confirm whether it is still needed.
batchsize=8
# Width of the model's output layer; the label tensors in train_ds are 27-wide.
num_classes = 27
def load_chexnet_pretrained(class_names=np.arange(14), weights_path='chexnet_weights.h5', 
                            input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3)):
    """Rebuild the CheXNet architecture and load its pre-trained weights.

    A DenseNet121 backbone (no top, average pooling, no ImageNet weights) is
    frozen and topped with a sigmoid Dense layer named "predictions" that has
    one unit per entry of ``class_names``.

    Args:
        class_names: sequence whose length sets the number of output units.
        weights_path: path of the weights file passed to ``load_weights``.
        input_shape: shape of a single input image.

    Returns:
        The Keras model with the weights from ``weights_path`` loaded.
    """
    inputs = tf.keras.layers.Input(shape=input_shape)

    # Backbone is created without weights; they come from `weights_path` below.
    backbone = tf.keras.applications.densenet.DenseNet121(
        include_top=False,
        weights=None,
        input_tensor=inputs,
        pooling='avg',
    )
    backbone.trainable = False  # freeze every backbone layer

    head = tf.keras.layers.Dense(len(class_names), activation="sigmoid", name="predictions")
    pretrained = tf.keras.models.Model(inputs=inputs, outputs=head(backbone.output))
    pretrained.load_weights(weights_path)
    return pretrained


def load_chexnet(output_dim):
    """Return pre-trained CheXNet with its prediction layer swapped for a new one.

    Args:
        output_dim: number of units in the new sigmoid output layer.

    Returns:
        A Keras model that shares the pre-trained model's inputs and ends in a
        fresh Dense layer named 'prediction'.
    """
    pretrained = load_chexnet_pretrained()

    # Output of the second-to-last layer, i.e. everything except the old prediction layer.
    features = pretrained.layers[-2].output

    ## The prediction head can be more complicated if you want
    new_head = tf.keras.layers.Dense(output_dim, name='prediction', activation='sigmoid')
    return tf.keras.models.Model(inputs=pretrained.inputs, outputs=new_head(features))
    
# Build the baseline with one output unit per class.
chexnet = load_chexnet(num_classes)
# summary() prints the table itself and returns None, so it is not wrapped in print().
chexnet.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
zero_padding2d (ZeroPadding2D)  (None, 230, 230, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1/conv (Conv2D)             (None, 112, 112, 64) 9408        zero_padding2d[0][0]             
__________________________________________________________________________________________________
conv1/bn (BatchNormalization)   (None, 112, 112, 64) 256         conv1/conv[0][0]                 
____________________________________________________________________________________________

Total params: 7,065,179
Trainable params: 27,675
Non-trainable params: 7,037,504
__________________________________________________________________________________________________
None


In [9]:
# Confirm the last layer is the new head (named 'prediction' in load_chexnet).
chexnet.layers[-1].name

'prediction'

In [10]:
# Display the dataset's element spec (batched image and label shapes/dtypes).
train_ds

<BatchDataset shapes: ((None, 224, 224, 3), (None, 27)), types: (tf.float32, tf.uint8)>

### Train baseline -- multi-task binary classification

In [11]:
# Where the model weights are written during training.
checkpoint_path = "training_progress/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Callback that stores only the weights (not the full model) and logs each save.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path,
    verbose=1,
    save_weights_only=True,
)

In [12]:
# Per-class weights for the weighted loss.
# NOTE(review): the meaning of `one_cap` is defined in MetaChexDataset.get_class_weights -- confirm.
class_weights, _, _ = dataset.get_class_weights(one_cap=False)

loss_fn = Losses(class_weights, batch_size=dataset.batch_size)

unique_labels = list(unique_labels_dict.keys())
unique_labels.remove('No Finding')
unique_labels.sort() ## alphabetical order

output_dim = len(unique_labels)
chexnet.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4),
    loss=loss_fn.weighted_binary_crossentropy(),
    # loss='binary_crossentropy',
    metrics=[
        tf.keras.metrics.AUC(multi_label=True),
        'binary_accuracy',
        'accuracy',
        tfa.metrics.F1Score(average='micro', num_classes=num_classes),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ],
    # run_eagerly=True,
)

epochs = 150
# The datasets are already batched (see the element spec of train_ds), so no
# `batch_size` is passed: Keras expects it to be left unset for dataset inputs.
hist = chexnet.fit(dataset.train_ds,
                   validation_data=dataset.val_ds,
                   epochs=epochs,
                   steps_per_epoch=dataset.steps_per_epoch, ## size(train_ds) * 0.125 * 0.1
                   callbacks=[cp_callback])

# Persist the per-epoch metric history so it can be reloaded without re-training.
with open('./trainHistoryDict', 'wb') as file_pi:
    pickle.dump(hist.history, file_pi)

Epoch 1/150


2021-11-06 14:43:40.030507: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


  8/707 [..............................] - ETA: 2:07 - loss: 79.1641 - auc: 0.2098 - binary_accuracy: 0.7164 - accuracy: 0.0000e+00 - f1_score: 0.0000e+00 - precision: 0.0244 - recall: 0.0990








Epoch 00001: saving model to training_progress/cp.ckpt
Epoch 2/150




Epoch 00002: saving model to training_progress/cp.ckpt
Epoch 3/150




Epoch 00003: saving model to training_progress/cp.ckpt
Epoch 4/150
 70/707 [=>............................] - ETA: 2:15 - loss: 0.1256 - auc: 0.3786 - binary_accuracy: 0.9511 - accuracy: 0.3768 - f1_score: 0.4411 - precision: 0.8770 - recall: 0.1286

KeyboardInterrupt: 

In [None]:
# Scratch check of how BinaryAccuracy scores multi-label rows: first argument is
# y_true, second is y_pred. Two of the four entries agree here, so the expected
# result is 0.5 (assuming the metric's default threshold -- TODO confirm).
m = tf.keras.metrics.BinaryAccuracy()
m.update_state([[1, 0], [1, 0]], [[1,1], [0, 0]])
m.result().numpy()

In [None]:
# Re-create a fresh model with the same output width as the trained one, so the
# checkpointed weights can be loaded into it by the evaluation cell.
# NOTE(review): unlike the training cell, get_class_weights() is called without
# one_cap=False and Losses without batch_size -- confirm this difference is intended.
class_weights, _, _ = dataset.get_class_weights()
chexnet = load_chexnet(num_classes)
loss_fn = Losses(class_weights)

unique_labels = list(unique_labels_dict.keys())
unique_labels.remove('No Finding')
unique_labels.sort() ## alphabetical order

output_dim = len(unique_labels)
chexnet.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
                loss=loss_fn.weighted_binary_crossentropy(),
                #loss='binary_crossentropy',
                metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

### Evaluate CE baseline

In [17]:
# Restore the weights saved by the checkpoint callback and score the test split.
# NOTE(review): the recorded run of this cell failed inside the model with
# "expected ndim=4, found ndim=0" -- presumably dataset.test_ds does not yield
# batched image tensors; verify the test pipeline in MetaChexDataset.
chexnet.load_weights(checkpoint_path)
print(chexnet.evaluate(dataset.test_ds))

2021-11-06 14:59:59.816592: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.




ValueError: in user code:

    /opt/conda/lib/python3.7/site-packages/keras/engine/training.py:1330 test_function  *
        return step_function(self, iterator)
    /opt/conda/lib/python3.7/site-packages/keras/engine/training.py:1320 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:1286 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2849 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:3632 _call_for_each_replica
        return fn(*args, **kwargs)
    /opt/conda/lib/python3.7/site-packages/keras/engine/training.py:1313 run_step  **
        outputs = model.test_step(data)
    /opt/conda/lib/python3.7/site-packages/keras/engine/training.py:1267 test_step
        y_pred = self(x, training=False)
    /opt/conda/lib/python3.7/site-packages/keras/engine/base_layer.py:1037 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    /opt/conda/lib/python3.7/site-packages/keras/engine/functional.py:415 call
        inputs, training=training, mask=mask)
    /opt/conda/lib/python3.7/site-packages/keras/engine/functional.py:550 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    /opt/conda/lib/python3.7/site-packages/keras/engine/base_layer.py:1020 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    /opt/conda/lib/python3.7/site-packages/keras/engine/input_spec.py:218 assert_input_compatibility
        str(tuple(shape)))

    ValueError: Input 0 of layer zero_padding2d is incompatible with the layer: expected ndim=4, found ndim=0. Full shape received: ()


## Precision-Recall Plots (per class)

In [None]:
# Individual label names are the index of the per-label stats frame
# (its only column is 'count', so .keys() would list the column, not the labels).
df_label_nums.index

In [None]:
# Map each model-output dimension to its class name. The output has one unit per
# entry of `unique_labels` (alphabetical, 'No Finding' removed), which is the same
# list the plotting helper indexes with `class_dim`.
class_name = unique_labels
dimension = list(range(len(class_name)))
dim_name_mapping = dict(zip(dimension, class_name))

In [None]:
def precision_recall_curve(y_test, y_pred, class_dim, threshold):
    """Plot the precision-recall curve of one class on the current matplotlib axes.

    The curve is computed by scikit-learn from the raw scores in column
    ``class_dim`` of ``y_pred``, so it sweeps over every threshold. The point
    obtained by binarizing at ``threshold`` is marked on the curve.

    Args:
        y_test: 2-D array-like of ground-truth labels, read as ``y_test[:, class_dim]``
            (assumed to be binary 0/1 -- TODO confirm).
        y_pred: 2-D array-like of predicted scores, read as ``y_pred[:, class_dim]``.
        class_dim: column of the class to plot; also indexes the notebook-level
            ``unique_labels`` list to build the title.
        threshold: a score greater than or equal to this counts as a positive prediction.

    Returns:
        None.
    """
    class_labels_test = np.asarray(y_test)[:, class_dim]
    class_scores_pred = np.asarray(y_pred)[:, class_dim]

    # scikit-learn's function is imported under an alias because this helper shares
    # its name; it receives the un-thresholded scores so it can vary the threshold itself.
    precision, recall, _ = sk_precision_recall_curve(class_labels_test, class_scores_pred)

    plt.plot(recall, precision, color='turquoise')

    # Mark the operating point for `threshold` when both ratios are defined.
    predicted_positive = class_scores_pred >= threshold
    actual_positive = class_labels_test == 1
    num_predicted = np.count_nonzero(predicted_positive)
    num_actual = np.count_nonzero(actual_positive)
    if num_predicted and num_actual:
        true_positives = np.count_nonzero(predicted_positive & actual_positive)
        plt.scatter([true_positives / num_actual], [true_positives / num_predicted],
                    color='black', zorder=3, label='threshold = ' + str(threshold))
        plt.legend()

    plt.title('Precision vs. Recall for ' + str(unique_labels[class_dim]))
    plt.xlabel('Recall')
    plt.ylabel('Precision')

## Precision-Recall Stats

In [None]:
def precision_recall_stats(y_test, y_pred, class_dim, threshold):
    """Return ``(precision, recall)`` for one class at a fixed decision threshold.

    Args:
        y_test: 2-D array-like of ground-truth labels, read as ``y_test[:, class_dim]``;
            entries equal to 1 are treated as positives.
        y_pred: 2-D array-like of predicted scores, read as ``y_pred[:, class_dim]``.
        class_dim: column of the class to score.
        threshold: a score greater than or equal to this counts as a positive
            prediction (same convention as the plotting helper).

    Returns:
        Tuple ``(precision, recall)`` of Python floats. A ratio whose denominator
        is zero (no predicted positives / no actual positives) is reported as 0.0.
    """
    actual_positive = np.asarray(y_test)[:, class_dim] == 1
    predicted_positive = np.asarray(y_pred)[:, class_dim] >= threshold

    true_positives = int(np.count_nonzero(actual_positive & predicted_positive))
    num_predicted = int(np.count_nonzero(predicted_positive))
    num_actual = int(np.count_nonzero(actual_positive))

    precision = true_positives / num_predicted if num_predicted else 0.0
    recall = true_positives / num_actual if num_actual else 0.0
    return precision, recall

## Visualization

In [None]:
# TODO: t-SNE visualization of the test-image features (not implemented yet)