# MetaCheX

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os
import cv2
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
import tensorflow as tf
import tensorflow_addons as tfa
from glob import glob
# from keras.utils.np_utils import to_categorical   
# from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import roc_curve
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

# metachex materials
from metachex.configs.config import *
from metachex.dataloader import MetaChexDataset
from metachex.loss import Losses

In [2]:
# os.environ["CUDA_VISIBLE_DEVICES"]=""
tf.test.is_gpu_available()
physical_devices = tf.config.experimental.list_physical_devices('GPU')
config = tf.config.experimental.set_memory_growth(physical_devices[0], True)

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


2021-11-05 22:52:42.029080: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-05 22:52:42.085196: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-05 22:52:42.094207: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-05 22:52:42.094642: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA 

### Instantiate Dataset and Splits

In [3]:
# Instantiate dataset
dataset = MetaChexDataset()
# Get class weights (feed this into model construction for weighted loss)
# indiv_class_weights, combo_class_weights = dataset.get_class_weights()
# Grab training dataset
train_ds = dataset.train_ds

[INFO] pre-processing
Data already processed. Loading from save data/data.pkl
[INFO] truncating dataset
[INFO] constructing tf train/val/test vars


2021-11-05 22:52:43.264466: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-05 22:52:43.264976: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-05 22:52:43.265372: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-05 22:52:43.265779: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-05 22:52:43.266144: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

[INFO] shuffle & batch
[INFO] initialized


In [4]:
# elaine_repo_path = '/home/elainesui/MetaCheX'
# os.chdir(elaine_repo_path)

In [5]:
images, labels = next(iter(train_ds))
images.shape

2021-11-05 22:52:44.333188: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-11-05 22:52:44.403980: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


TensorShape([8, 224, 224, 3])

### Get data stats: 
- Number of images with each label (individual)
- Number of unique labels (individual)
- Number of labels total (including combos)

In [6]:
# Check out the data
unique_labels_dict, df_combo_counts, df_label_nums, df_combo_nums = dataset.get_data_stats(dataset.df)

print("Number of total images: ", df_label_nums['count'].sum())
print("Number of total individual labels (includes 'No Finding'): ", df_label_nums.shape[0])
print("Number of total label combos (includes individual labels): ", df_combo_nums.shape[0])
print("****************************")
print("Number of images with each individual label")
display(df_label_nums)
print("\n")
print("Number of images with each combo label (Bottom 20)")
display(df_combo_nums.tail(20))
print("\n")
display(df_combo_counts.head())

Number of total images:  113682
Number of total individual labels (includes 'No Finding'):  36
Number of total label combos (includes individual labels):  822
****************************
Number of images with each individual label


Unnamed: 0,count
No Finding,20000
Infiltration,19894
Effusion,13317
Atelectasis,11559
Nodule,6331
Lung_Opacity,6012
Mass,5782
Pneumothorax,5302
Consolidation,4667
COVID-19,4179




Number of images with each combo label (Bottom 20)


Unnamed: 0,count
Effusion|Fibrosis|Mass|Pleural_Thickening,1
Atelectasis|Effusion|Fibrosis|Pneumothorax,1
Cardiomegaly|Consolidation|Effusion|Infiltration|Mass|Pleural_Thickening,1
Atelectasis|Mass|Nodule|Pneumonia,1
Atelectasis|Emphysema|Fibrosis|Infiltration,1
Effusion|Emphysema|Infiltration|Mass,1
Edema|Infiltration|Mass|Nodule,1
Atelectasis|Consolidation|Effusion|Infiltration|Mass|Nodule|Pneumothorax,1
Emphysema|Mass|Pneumonia|Pneumothorax,1
Cardiomegaly|Effusion|Infiltration|Pneumonia,1






Unnamed: 0,count interval,number of labels
0,< 5,494
1,"[5, 100)",266
2,"[100, 1k)",46
3,"[1k, 10k)",15
4,>= 10k,1


### Get updated data stats

In [7]:
print("Stats for condensed dataset")
print("---------------------------")
unique_labels_dict, df_combo_counts, df_label_nums, df_combo_nums = dataset.get_data_stats(dataset.df_condensed)
print(dataset.df_condensed['label_multitask'][0].shape)

print("Number of total images: ", df_label_nums['count'].sum())
print("Number of total individual labels (includes 'No Finding'): ", df_label_nums.shape[0])
print("Number of total label combos (includes individual labels): ", df_combo_nums.shape[0])
print("****************************")
print("Number of images with each individual label")
display(df_label_nums)
print("\n")
print("Number of images with each combo label (Bottom 20)")
display(df_combo_nums.tail(20))
print("\n")
display(df_combo_counts.head())

## Just making sure that the weights match with the correct labels
indiv = df_label_nums
indiv_weights = (1 / indiv['count']) * (indiv['count'].sum() / indiv.shape[0])
indiv_weights = indiv_weights.sort_index()
indiv_weights = indiv_weights.drop(['No Finding'])
display(indiv_weights)
indiv_class_weights = dict(list(enumerate(indiv_weights.values)))
display(indiv_class_weights)

unique_labels = list(unique_labels_dict.keys())
unique_labels.remove('No Finding')
unique_labels.sort() ## alphabetical order
print(unique_labels)

Stats for condensed dataset
---------------------------
(27,)
Number of total images:  110224
Number of total individual labels (includes 'No Finding'):  28
Number of total label combos (includes individual labels):  328
****************************
Number of images with each individual label


Unnamed: 0,count
No Finding,20000
Infiltration,19510
Effusion,12915
Atelectasis,11201
Nodule,6087
Lung_Opacity,6012
Mass,5470
Pneumothorax,5062
Consolidation,4398
COVID-19,4179




Number of images with each combo label (Bottom 20)


Unnamed: 0,count
Emphysema|Pneumonia,5
Consolidation|Effusion|Infiltration|Pneumothorax,5
Atelectasis|Nodule|Pneumonia,5
Effusion|Emphysema|Nodule,5
Atelectasis|Consolidation|Effusion|Emphysema,5
Cardiomegaly|Effusion|Emphysema|Pneumothorax,5
Consolidation|Effusion|Infiltration|Pneumonia,5
Cardiomegaly|Effusion|Infiltration|Pleural_Thickening,5
Atelectasis|Effusion|Nodule|Pleural_Thickening,5
Emphysema|Mass|Nodule,5






Unnamed: 0,count interval,number of labels
0,< 5,0
1,"[5, 100)",266
2,"[100, 1k)",46
3,"[1k, 10k)",15
4,>= 10k,1


Atelectasis             0.351448
COVID-19                0.941989
Cardiomegaly            1.532336
Consolidation           0.895082
Edema                   1.867444
Effusion                0.304806
Emphysema               1.689516
Fibrosis                2.552900
Hernia                 22.240517
Infiltration            0.201772
Influenza             787.314286
Klebsiella            393.657143
Legionella            393.657143
Lung_Opacity            0.654786
MERS-CoV              393.657143
Mass                    0.719666
Mycoplasma            357.870130
Nocardia              492.071429
Nodule                  0.646718
Pleural_Thickening      1.262126
Pneumocystis          131.219048
Pneumonia               1.158156
Pneumothorax            0.777671
SARS                  246.035714
Streptococcus         178.935065
Tuberculosis          218.698413
Varicella             656.095238
Name: count, dtype: float64

{0: 0.35144821253204434,
 1: 0.9419888558438451,
 2: 1.532336095201023,
 3: 0.8950821802117845,
 4: 1.8674437516942262,
 5: 0.3048061501023173,
 6: 1.6895156345800122,
 7: 2.55289975912544,
 8: 22.24051654560129,
 9: 0.20177198506260527,
 10: 787.3142857142857,
 11: 393.65714285714284,
 12: 393.65714285714284,
 13: 0.6547856667617147,
 14: 393.65714285714284,
 15: 0.7196657090624184,
 16: 357.87012987012986,
 17: 492.07142857142856,
 18: 0.6467178295665235,
 19: 1.2621261393303715,
 20: 131.21904761904761,
 21: 1.1581557600975074,
 22: 0.7776711632894959,
 23: 246.03571428571428,
 24: 178.93506493506493,
 25: 218.69841269841268,
 26: 656.0952380952381}

['Atelectasis', 'COVID-19', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Influenza', 'Klebsiella', 'Legionella', 'Lung_Opacity', 'MERS-CoV', 'Mass', 'Mycoplasma', 'Nocardia', 'Nodule', 'Pleural_Thickening', 'Pneumocystis', 'Pneumonia', 'Pneumothorax', 'SARS', 'Streptococcus', 'Tuberculosis', 'Varicella']


## Step 2: Finetuned CheXNet Baseline

Note: CheXNet = DenseNet121 trained on ChestX-ray14 dataset (multi-task binary classification)

Pre-trained weights: https://github.com/brucechou1983/CheXNet-Keras

In [8]:
# Magic nunbers
batchsize=8
num_classes = 27
def load_chexnet_pretrained(class_names=np.arange(14), weights_path='chexnet_weights.h5', 
                            input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3)):

    img_input = tf.keras.layers.Input(shape=input_shape)
    base_model = tf.keras.applications.densenet.DenseNet121(include_top=False, weights=None, 
                                                            input_tensor=img_input, pooling='avg')
    base_model.trainable = False


    x = base_model.output
    predictions = tf.keras.layers.Dense(len(class_names), activation="sigmoid", name="predictions")(x)
    model = tf.keras.models.Model(inputs=img_input, outputs=predictions)
    model.load_weights(weights_path)

    return model


def load_chexnet(output_dim):
    """
    output_dim: dimension of output
    """
    
    base_model_old = load_chexnet_pretrained()
    x = base_model_old.layers[-2].output ## remove old prediction layer
    
    ## The prediction head can be more complicated if you want
    predictions = tf.keras.layers.Dense(output_dim, name='prediction', activation='sigmoid')(x)
    chexnet = tf.keras.models.Model(inputs=base_model_old.inputs,outputs=predictions)
    return chexnet
    
chexnet = load_chexnet(num_classes)
print(chexnet.summary())

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
zero_padding2d (ZeroPadding2D)  (None, 230, 230, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1/conv (Conv2D)             (None, 112, 112, 64) 9408        zero_padding2d[0][0]             
__________________________________________________________________________________________________
conv1/bn (BatchNormalization)   (None, 112, 112, 64) 256         conv1/conv[0][0]                 
____________________________________________________________________________________________

In [9]:
chexnet.layers[-1].name

'prediction'

In [10]:
train_ds

<BatchDataset shapes: ((None, 224, 224, 3), (None, 27)), types: (tf.float32, tf.uint8)>

### Train baseline -- multi-task binary classification

In [11]:
checkpoint_path = "training_progress/cp.ckpt" # path for saving model weights
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

In [12]:
class_weights, _, _ = dataset.get_class_weights(one_cap=False)

loss_fn = Losses(class_weights, batch_size=batchsize)

unique_labels = list(unique_labels_dict.keys())
unique_labels.remove('No Finding')
unique_labels.sort() ## alphabetical order

output_dim = len(unique_labels)
chexnet.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4),
                  loss=loss_fn.weighted_binary_crossentropy(),
#                 loss='binary_crossentropy',
                metrics=[tf.keras.metrics.AUC(multi_label=True),  'binary_accuracy', 'accuracy', tfa.metrics.F1Score(average='micro',num_classes=num_classes),  \
                         tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
#                 run_eagerly=True)

epochs = 150
hist = chexnet.fit(dataset.train_ds,
            validation_data=dataset.val_ds,
            epochs=epochs,
            steps_per_epoch=964, # ~110224*.7*.125*0.1
            batch_size=batchsize,
            callbacks=[cp_callback])

Epoch 1/150


2021-11-05 22:52:50.580917: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8204
2021-11-05 22:52:51.240745: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


 13/964 [..............................] - ETA: 14s - loss: 64.5341 - auc: 0.0438 - binary_accuracy: 0.8141 - accuracy: 0.0769 - f1_score: 0.1389 - precision: 0.0383 - recall: 0.1518     

2021-11-05 22:52:52.453110: I tensorflow/stream_executor/cuda/cuda_blas.cc:1760] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.







Epoch 00001: saving model to training_progress/cp.ckpt
Epoch 2/150
 86/964 [=>............................] - ETA: 14s - loss: 0.8579 - auc: 0.4312 - binary_accuracy: 0.9638 - accuracy: 0.5218 - f1_score: 0.4927 - precision: 0.6690 - recall: 0.3533








Epoch 00002: saving model to training_progress/cp.ckpt
Epoch 3/150




Epoch 00003: saving model to training_progress/cp.ckpt
Epoch 4/150




Epoch 00004: saving model to training_progress/cp.ckpt
Epoch 5/150




Epoch 00005: saving model to training_progress/cp.ckpt
Epoch 6/150




Epoch 00006: saving model to training_progress/cp.ckpt
Epoch 7/150




Epoch 00007: saving model to training_progress/cp.ckpt
Epoch 8/150




Epoch 00008: saving model to training_progress/cp.ckpt
Epoch 9/150




Epoch 00009: saving model to training_progress/cp.ckpt
Epoch 10/150





Epoch 00010: saving model to training_progress/cp.ckpt


In [13]:
m = tf.keras.metrics.BinaryAccuracy()
m.update_state([[1, 0], [1, 0]], [[1,1], [0, 0]])
m.result().numpy()

0.5

In [14]:
class_weights, _, _ = dataset.get_class_weights()
chexnet = load_chexnet(27)
loss_fn = Losses(class_weights)

unique_labels = list(unique_labels_dict.keys())
unique_labels.remove('No Finding')
unique_labels.sort() ## alphabetical order

output_dim = len(unique_labels)
chexnet.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
                loss=loss_fn.weighted_binary_crossentropy(),
                #loss='binary_crossentropy',
                metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

### Evaluate CE baseline

In [15]:
print(chexnet.evaluate(dataset.test_ds))



ValueError: in user code:

    /home/edwin/anaconda3/envs/metachex/lib/python3.8/site-packages/keras/engine/training.py:1330 test_function  *
        return step_function(self, iterator)
    /home/edwin/anaconda3/envs/metachex/lib/python3.8/site-packages/keras/engine/training.py:1320 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /home/edwin/anaconda3/envs/metachex/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:1286 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/edwin/anaconda3/envs/metachex/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2849 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/edwin/anaconda3/envs/metachex/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:3632 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/edwin/anaconda3/envs/metachex/lib/python3.8/site-packages/keras/engine/training.py:1313 run_step  **
        outputs = model.test_step(data)
    /home/edwin/anaconda3/envs/metachex/lib/python3.8/site-packages/keras/engine/training.py:1267 test_step
        y_pred = self(x, training=False)
    /home/edwin/anaconda3/envs/metachex/lib/python3.8/site-packages/keras/engine/base_layer.py:1037 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    /home/edwin/anaconda3/envs/metachex/lib/python3.8/site-packages/keras/engine/functional.py:414 call
        return self._run_internal_graph(
    /home/edwin/anaconda3/envs/metachex/lib/python3.8/site-packages/keras/engine/functional.py:550 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    /home/edwin/anaconda3/envs/metachex/lib/python3.8/site-packages/keras/engine/base_layer.py:1020 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    /home/edwin/anaconda3/envs/metachex/lib/python3.8/site-packages/keras/engine/input_spec.py:214 assert_input_compatibility
        raise ValueError('Input ' + str(input_index) + ' of layer ' +

    ValueError: Input 0 of layer zero_padding2d_2 is incompatible with the layer: expected ndim=4, found ndim=0. Full shape received: ()


## Precision-Recall Plots (per class)

In [None]:
df.label_nums.keys()

In [None]:
class_name = df.label_nums.keys()
dimension = list(range(0,28))
dim_name_mapping = dict(zip(dimension,class_name))

In [None]:
def precision_recall_curve(y_test, y_pred, class_dim, threshold):
    # Calculate precision - recall over varying threshold
    class_labels_test = y_test[:, class_dim] # list of assuming row vectors!
    class_labels_pred = np.copy(y_pred[:, class_dim])   # list of assuming row vectors!
    class_labels_pred[class_labels_pred >= threshold] = 1 # cast to 1 or 0
    class_labels_pred[class_labels_pred < threshold] = 0 # cast to 1 or 0
    precision, recall, thresholds = precision_recall_curve(class_labels_test, class_labels_pred)
    
    title = 'Precision vs. Recall for' + str(unique_labels[class_dim])# TODO: index into disease name
    plt.plot(recall, precision, color='turquoise', title=title)

## Precision-Recall Stats

In [None]:
def precision_recall_stats(y_test, y_pred, class_dim, threshold):
    # Calculate current precision recall stat
    pass 

## Visualization

In [None]:
# tsne visualization of test img features?