# MetaCheX

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os
import cv2
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
import tensorflow as tf
from glob import glob
# from keras.utils.np_utils import to_categorical   
# from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import roc_curve
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

# metachex materials
from metachex.configs.config import *
from metachex.dataloader import MetaChexDataset

In [2]:
# os.environ["CUDA_VISIBLE_DEVICES"]=""
tf.test.is_gpu_available()

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


2021-11-03 01:15:17.358809: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-03 01:15:17.429279: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-03 01:15:17.442086: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-03 01:15:17.443106: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA 

True

### Instantiate Dataset and Splits

In [3]:
# Instantiate dataset
dataset = MetaChexDataset()
# Get class weights (feed this into model construction for weighted loss)
indiv_class_weights, combo_class_weights = dataset.get_class_weights()
# Grab training dataset
train_ds = dataset.train_ds

[INFO] pre-processing
Data already processed. Loading from save ./data/data.pkl
[INFO] truncating dataset
[INFO] constructing tf train/val/test vars


2021-11-03 01:15:18.625110: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-03 01:15:18.625620: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-03 01:15:18.625996: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-03 01:15:18.626536: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-03 01:15:18.626913: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

[INFO] shuffle & batch
[INFO] initialized


In [4]:
images, labels = next(iter(train_ds))
images.shape

2021-11-03 01:15:20.578721: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-11-03 01:15:20.657661: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


TensorShape([32, 224, 224, 3])

### Number of unlabelled or unknown labels in covid-chestxray-dataset

In [3]:
full_path = os.path.join(PATH_TO_DATA_FOLDER, COVID_CHESTXRAY_METADATA_PATH)
df_cc = pd.read_csv(full_path)[['filename', 'finding']]
df_cc.rename(columns={'filename': 'image_path', 'finding': 'label'}, inplace=True)
df_cc = df_cc.reset_index(drop=True)
unknown = df_cc[(df_cc['label'] == 'todo') | (df_cc['label'] == 'Unknown')]
print(len(unknown))

84


In [4]:
full_path = os.path.join(PATH_TO_DATA_FOLDER, COVID_19_RADIOGRAPHY_IMAGES)
df_cr = pd.DataFrame(columns=['image_path', 'label'])
image_lst = sorted(list(glob(f"{full_path}/*"))) ## gets list of all image filepaths
label_arr = np.array([f[f.rindex('/') + 1:f.rindex('-')] for f in image_lst])
np.unique(label_arr)

array(['COVID', 'Lung_Opacity', 'Normal', 'Viral Pneumonia'], dtype='<U15')

### Get data stats: 
- Number of images with each label (individual)
- Number of unique labels (individual)
- Number of labels total (including combos)

In [5]:
# Check out the data
unique_labels_dict, df_combo_counts, df_label_nums, df_combo_nums = dataset.get_data_stats(dataset.df)

print("Number of total images: ", df_label_nums['count'].sum())
print("Number of total individual labels (includes 'No Finding'): ", df_label_nums.shape[0])
print("Number of total label combos (includes individual labels): ", df_combo_nums.shape[0])
print("****************************")
print("Number of images with each individual label")
display(df_label_nums)
print("\n")
print("Number of images with each combo label (Bottom 20)")
display(df_combo_nums.tail(20))
print("\n")
display(df_combo_counts.head())


Number of total images:  164299
Number of total individual labels (includes 'No Finding'):  36
Number of total label combos (includes individual labels):  822
****************************
Number of images with each individual label


Unnamed: 0,count
No Finding,70575
Infiltration,19894
Effusion,13317
Atelectasis,11559
Nodule,6331
Lung_Opacity,6012
Mass,5782
Pneumothorax,5302
Consolidation,4667
COVID-19,4200




Number of images with each combo label (Bottom 20)


Unnamed: 0,count
Effusion|Fibrosis|Mass|Pleural_Thickening,1
Atelectasis|Effusion|Fibrosis|Pneumothorax,1
Cardiomegaly|Consolidation|Effusion|Infiltration|Mass|Pleural_Thickening,1
Atelectasis|Mass|Nodule|Pneumonia,1
Atelectasis|Emphysema|Fibrosis|Infiltration,1
Effusion|Emphysema|Infiltration|Mass,1
Edema|Infiltration|Mass|Nodule,1
Atelectasis|Consolidation|Effusion|Infiltration|Mass|Nodule|Pneumothorax,1
Emphysema|Mass|Pneumonia|Pneumothorax,1
Cardiomegaly|Effusion|Infiltration|Pneumonia,1






Unnamed: 0,count interval,number of labels
0,< 5,494
1,"[5, 100)",266
2,"[100, 1k)",46
3,"[1k, 10k)",15
4,>= 10k,1


### Get updated data stats

In [6]:
print("Stats for condensed dataset")
print("---------------------------")
dataset.get_data_stats(dataset.df_condensed)

Stats for condensed dataset
---------------------------


({'Cardiomegaly': 2569,
  'Emphysema': 2330,
  'Effusion': 12915,
  'No Finding': 70575,
  'Hernia': 177,
  'Infiltration': 19510,
  'Mass': 5470,
  'Nodule': 6087,
  'Atelectasis': 11201,
  'Pneumothorax': 5062,
  'Pleural_Thickening': 3119,
  'Fibrosis': 1542,
  'Edema': 2108,
  'Consolidation': 4398,
  'Pneumonia': 3420,
  'COVID-19': 4200,
  'SARS': 16,
  'Pneumocystis': 30,
  'Streptococcus': 22,
  'Klebsiella': 10,
  'Legionella': 10,
  'Varicella': 6,
  'Mycoplasma': 11,
  'Influenza': 5,
  'Tuberculosis': 18,
  'Nocardia': 8,
  'MERS-CoV': 10,
  'Lung_Opacity': 6012},
   count interval  number of labels
 0            < 5                 0
 1       [5, 100)               266
 2      [100, 1k)                46
 3      [1k, 10k)                15
 4         >= 10k                 1,
                     count
 No Finding          70575
 Infiltration        19510
 Effusion            12915
 Atelectasis         11201
 Nodule               6087
 Lung_Opacity         6012
 Mass      

## Step 2: Finetuned CheXNet Baseline

Note: CheXNet = DenseNet121 trained on ChestX-ray14 dataset (multi-task binary classification)

Pre-trained weights: https://github.com/brucechou1983/CheXNet-Keras

In [7]:
def load_chexnet_pretrained(class_names=np.arange(14), weights_path='chexnet_weights.h5', 
                            input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3)):

    img_input = tf.keras.layers.Input(shape=input_shape)
    base_model = tf.keras.applications.densenet.DenseNet121(include_top=False, weights=None, 
                                                            input_tensor=img_input, pooling='avg')


    x = base_model.output
    predictions = tf.keras.layers.Dense(len(class_names), activation="sigmoid", name="predictions")(x)
    model = tf.keras.models.Model(inputs=img_input, outputs=predictions)
    model.load_weights(weights_path)

    return model


def load_chexnet(output_dim):
    """
    output_dim: dimension of output
    """
    
    base_model_old = load_chexnet_pretrained()
    x = base_model_old.layers[-2].output ## remove old prediction layer
    
    ## The prediction head can be more complicated if you want
    predictions = tf.keras.layers.Dense(output_dim, activation='softmax', name='prediction')(x)
    chexnet = tf.keras.models.Model(inputs=base_model_old.inputs, outputs=predictions)
    return chexnet
    
chexnet = load_chexnet(34)
print(chexnet.summary())

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
zero_padding2d (ZeroPadding2D)  (None, 230, 230, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1/conv (Conv2D)             (None, 112, 112, 64) 9408        zero_padding2d[0][0]             
__________________________________________________________________________________________________
conv1/bn (BatchNormalization)   (None, 112, 112, 64) 256         conv1/conv[0][0]                 
____________________________________________________________________________________________

### Class balancing -- for finetuned CheXNet baseline and finetuned CheXNet w/ supervised contrastive learning
Data augmentation (minor rotations, flips) and oversampling of minority classes (classes w/ < 1k examples); undersampling majority classes (classes w/ > 10k examples)

In [8]:
## TODO

### Data split -- note that the split on NIH data must coincide with pre-trained split (or else leakage)

In [9]:
## TODO

### Train baseline -- multi-task binary classification

In [10]:
train_x, train_y, val_x, val_y, test_x, test_y = train_val_test_split(balanced_data, balanced_labels)

output_dim = ..
chexnet_ce = load(output_dim)
chexnet_ce.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

batch_size = ..
epochs = ..
chexnet_ce.fit(x=train_x, y=train_y,
              validation_data=(val_x, val_y),
              batch_size=,
              epochs=)

SyntaxError: invalid syntax (3724479205.py, line 3)

### Evaluate CE baseline

In [None]:
print(chexnet_ce.evaluate(test_x, test_y))