## Connect to Google Drive using the PyDrive API

In [0]:
# Connect to Google Drive

# Install PyDrive quietly (-q), upgrading any existing copy (-U).
!pip install -U -q PyDrive

import tensorflow as tf
import timeit

# Create a TensorFlow ConfigProto and set gpu_options.allow_growth on it.
# NOTE(review): `config` is not passed to any session in the visible part of
# this notebook, so it may have no effect as written - confirm.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then create the PyDrive client from the
# application-default credentials. `drive` is used by the upload cell below.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
# Save the dataset archive to Drive, to avoid downloading the dataset again
# after every 12 h (the interval given by the original author).
# NOTE(review): the original comment referred to data.zip (45 GB), but the
# code below uploads 'sample.zip' - confirm which archive is intended.
# Create a Drive file entry titled 'sample.zip' from the notebook's "current files".
uploaded = drive.CreateFile({'title': 'sample.zip'})

# Local file whose content is uploaded; change the name to the actual file.
uploaded.SetContentFile('sample.zip')

# Trigger the upload (will take around 10 mins)
uploaded.Upload()

# Will show the file ID on successful upload to drive.
print('Uploaded file with ID {}'.format(uploaded.get('id')))



---



---



## Download and extract data into Colab Environment

In [0]:
# List the current working directory (long format) before downloading.
!ls -l

In [0]:
#@title Enter DataSet URL { display-mode: "form" }
# Download link for the dataset archive, fetched with wget in the next cell.
# NOTE(review): the link carries `Expires=` and `Signature=` query parameters,
# so it is presumably a time-limited signed URL - paste a fresh link before
# re-running, and avoid committing signed links to a shared notebook.
URL = "https://storage.googleapis.com/kaggle-datasets/4667/7773/sample.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1523541545&Signature=I%2FgQIQOrG5K7W4DvdIfaBCfBOC72WviXlvsZP93AB0EjFOZuWSBYA%2FqAbZ%2FDSty2wXzf8DqT3hyiNGtBSUKYeIofyJhbWYogV4ML0BwjPmDi0sgbm1ybGGeNalLW%2BewWcRONVy4ZL%2B5%2BG7HLhEGRgKLKQAnxBlouQPgoeEztYf1bEK23LHYc3H9Az8iFiWC9PwjtNT41bnd33t6KMfWXAGGRkC0%2F2EBfDwqVBdlJEqkX%2BND8ON7yQHDUOOEmyK9Bm3AajE0yvpfQw1V40LbDq8og3DwJwRQF5rRYMMsqCy7RdAv55N0IK3nDbN4baxNQc5tyqupurq9NROSmngdxwA%3D%3D" #@param {type:"string"}

In [0]:
# Remove any previous extraction so the cell can be re-run.
# NOTE(review): this line uses the absolute path /content/DATA/ while the
# lines below use the relative DATA/; that assumes the working directory is
# /content - confirm.
!rm -rf /content/DATA/
# Download the archive; `$URL` refers to the Python variable URL set in the form cell.
!wget '$URL' -O data.zip
# Unpack the downloaded archive into DATA/.
!mkdir DATA
!unzip data.zip -d DATA
# Unpack any nested .zip files inside DATA/, then delete them.
%cd DATA/
!unzip '*.zip'
!rm *.zip
%cd ..
# The downloaded archive is no longer needed once extracted.
!rm data.zip
# If the labels file is named sample_labels.csv, rename it to
# Data_Entry_2017.csv, which is the file name read by the preprocessing cell.
![ -e DATA/sample_labels.csv ] && mv DATA/sample_labels.csv DATA/Data_Entry_2017.csv

In [0]:
# List the working directory again to confirm that DATA/ was created.
!ls 

## Preprocessing

In [0]:
# import dependencies
import glob # file-name pattern matching (locating the image files)
import os # path manipulation

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [0]:
# Load the label table and print its column summary.
data = pd.read_csv('/content/DATA/Data_Entry_2017.csv')
data.info()


# Constants used by preprocess().
# The order of patho_list fixes the order of the entries in 'disease_vec'.
patho_list = [
    'Cardiomegaly', 'Emphysema', 'Effusion', 'Hernia', 'Nodule',
    'Pneumothorax', 'Atelectasis', 'Pleural_Thickening', 'Mass', 'Edema',
    'Consolidation', 'Infiltration', 'Fibrosis', 'Pneumonia', 'No Finding',
]
# Integer codes for the categorical columns.
gender_map = dict(M=0, F=1)
VP_map = dict(PA=0, AP=1)

In [0]:
def clean_age(df) :
  """Replace the textual 'Patient Age' column with an integer 'Age' in years.

  Each 'Patient Age' value is read as digits followed by a one-letter unit.
  Rows whose unit is 'M' are divided by 12 and rows whose unit is 'D' are
  divided by 365 (both rounded); every other row keeps its number as is.
  The frame is modified in place ('Patient Age' is dropped) and returned.
  """
  raw_age = df['Patient Age']
  df['Age Type'] = raw_age.map(lambda text: text[-1:])
  df['Age'] = raw_age.map(lambda text: text[:-1]).astype(int)

  # Convert month- and day-based ages to whole years, one unit at a time.
  for unit, per_year in (('M', 12.), ('D', 365.)):
    in_unit = df['Age Type'] == unit
    in_years = df.loc[in_unit, 'Age'].map(lambda n: round(n / per_year)).astype(int)
    df.loc[in_unit, ['Age']] = in_years

  df.drop(['Patient Age', 'Age Type'], axis=1, inplace=True)
  return df
  

def preprocess(df,G_map,V_map,D_list,MIN_CASES):
  """Turn the raw label table into a frame ready for the image generators.

  Arguments:
  df        -- label DataFrame; the code reads 'Finding Labels', 'Patient Age',
               'Patient Gender', 'View Position' and 'Image Index', and drops
               the four 'OriginalImage*' columns. Several steps modify it in place.
  G_map     -- dict mapping 'Patient Gender' values to integers
  V_map     -- dict mapping 'View Position' values to integers
  D_list    -- list of label names to look for in 'Finding Labels'
  MIN_CASES -- a label is kept only if MORE than MIN_CASES rows carry it

  Returns:
  df         -- the processed frame, rows in weighted-random order
  all_labels -- the labels kept, in D_list order; this is also the order of
                the entries in each row's 'disease_vec'
  """

  ##### STEP 1: get rid of | separated diseases
  #######################################################################
  # Convert | separated diseases to columns with binary values
  for dis in D_list :
    df[dis] = df['Finding Labels'].apply(lambda x: 1 if dis in x else 0)
  #######################################################################


  ##### STEP 2: clean the age, gender, view position columns
  #######################################################################
  # Patient Age is given in years, months or days
  df = clean_age(df)
  # Converts gender, view position to int
  df.replace({'Patient Gender':G_map, 'View Position':V_map},inplace=True)
  #######################################################################


  ##### STEP 3: map image location with correct datapoint
  #######################################################################
  # abs path of all the png images, keyed by file name
  all_image_paths = {os.path.basename(x): x for x in glob.glob(os.path.join('/content','DATA','images','*.png'))}
  # Maps Image Index with its respective path (None when the file is missing)
  df['path'] = df['Image Index'].map(all_image_paths.get)
  #######################################################################


  ##### STEP 4: choose diseases which have more than MIN_CASES cases
  #######################################################################
  # Use the arguments (D_list, df) rather than the notebook globals
  # (patho_list, data), so the function works on any frame passed to it.
  all_labels = [c_label for c_label in D_list if df[c_label].sum() > MIN_CASES]
  #######################################################################


  ##### STEP 5: Collect the considered labels into one multi-hot vector per row
  #######################################################################
  df['disease_vec'] = df[all_labels].values.tolist()
  #######################################################################


  ##### STEP 6: Drop unnecessary columns
  #######################################################################
  drop_list = ['OriginalImageWidth','OriginalImageHeight','OriginalImagePixelSpacing_x','OriginalImagePixelSpacing_y']
  df.drop(drop_list, axis=1,inplace=True)
  #######################################################################


  ##### STEP 7: weighted re-ordering of the data
  #######################################################################
  # The dataset is heavily imbalanced, so rows are drawn with a weight of
  # 0.04 + number of '|'-separated entries in 'Finding Labels'.
  # NOTE(review): sampling len(df) rows without replacement returns every row
  # exactly once, so the weights only influence the row ORDER, not the class
  # balance - confirm whether real resampling (fewer rows or replace=True)
  # was intended.
  num_diseases = df['Finding Labels'].map(lambda x: len(x.split('|')) if len(x)>0 else 0).values + 4e-2
  num_diseases /= num_diseases.sum()
  df = df.sample(len(df),weights=num_diseases)
  #######################################################################


  return df,all_labels

In [0]:
# Run the preprocessing pipeline with no minimum case count, keeping both the
# processed frame and the list of labels that were kept.
# `data` is rebound here, so re-running this cell requires reloading the CSV first.
data, all_labels = preprocess(
    data,
    gender_map,
    VP_map,
    D_list=patho_list,
    MIN_CASES=0,
)
data['disease_vec']

## IMPORTING IMAGES & VISUALISATION

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
# Bar chart of the most frequent 'Finding Labels' combinations.
# Only the top TOP_N_LABELS are shown.
TOP_N_LABELS = 15
label_counts = data['Finding Labels'].value_counts()[:TOP_N_LABELS]
bar_positions = np.arange(len(label_counts))+0.5
fig, ax1 = plt.subplots(1,1,figsize = (12, 8))
ax1.bar(bar_positions, label_counts)
ax1.set_xticks(bar_positions)
ax1.set_xticklabels(label_counts.index, rotation = 90)
# Label the axes so the figure can be read on its own.
ax1.set_xlabel('Finding Labels')
ax1.set_ylabel('Number of images')
_ = ax1.set_title('Most frequent label combinations (top {})'.format(TOP_N_LABELS))

Method 1: Use the garbage collector to clear the RAM (12 GB) after every image load. This is going to be slow, but it won't crash the runtime.

Method 2: Use PIL to import the images as PIL objects, or convert them to NumPy arrays.

In [0]:
from PIL import Image

# Target size for the images.
# NOTE(review): Standard_size is defined but not applied anywhere in this cell - confirm.
Standard_size = (512,512)

# Open every image listed in the 'path' column and keep the PIL objects in a new column.
# NOTE(review): this holds one PIL image object per row; whether the underlying
# files stay open depends on PIL's loading behaviour - check for "too many open
# files" errors on large frames.
data['images'] = data['path'].map(Image.open)

# Alternative that was tried: skimage.io.imread(path, as_grey=True) from scikit-image.

In [0]:
# Column summary of the frame after the 'images' column has been added.
data.info()

In [0]:
# Peek at the first few entries of the 'images' column.
data['images'].head()

## Model (Still incomplete)

In [0]:
from keras import layers
from keras.layers import Input, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D
from keras.layers import AveragePooling2D, MaxPooling2D, Dropout, GlobalMaxPooling2D, GlobalAveragePooling2D
from keras.models import Model
from keras.preprocessing import image
from keras.utils import layer_utils
from keras.utils.data_utils import get_file
from keras.applications.imagenet_utils import preprocess_input
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model

import keras.backend as K
K.set_image_data_format('channels_last')
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow

def XrayModel(input_shape):
    """
    Build a small CONV -> BN -> RELU -> MAXPOOL -> FLATTEN -> DENSE Keras model.

    Arguments:
    input_shape -- shape of the images of the dataset

    Returns:
    model -- a Model() instance in Keras
    """

    X_input = Input(input_shape)

    # Zero-padding is currently disabled, so the convolution must read
    # X_input directly (previously it read `X` before it was assigned).
    # X = ZeroPadding2D((3, 3))(X_input)
    # CONV -> BN -> RELU Block applied to the input
    X = Conv2D(32, (7, 7), strides = (1, 1), name = 'conv0')(X_input)
    X = BatchNormalization(axis = 3, name = 'bn0')(X)
    X = Activation('relu')(X)

    # MAXPOOL
    X = MaxPooling2D((2, 2), name='max_pool')(X)

    # FLATTEN X (means convert it to a vector) + FULLYCONNECTED
    X = Flatten()(X)
    # NOTE(review): a single ReLU output unit does not match the multi-label
    # 'disease_vec' target or the categorical_crossentropy loss used in the
    # cells below - confirm the intended output layer.
    X = Dense(1, activation='relu', name='fc')(X)

    # Create the Keras model instance used to train/test the model.
    model = Model(inputs = X_input, outputs = X, name='XrayModel')

    return model

In [0]:
# Separate the label vector from the remaining columns.
# The label vector built by preprocess() lives in 'disease_vec'; no cell
# creates a column named 'OHE', so selecting it raised a KeyError.
X = data.loc[:, data.columns != 'disease_vec']
y = data['disease_vec']

In [0]:
XrayModel = HappyModel(X_train.shape[1:])

In [0]:
XrayModel.compile(optimizer = "adam", loss = "categorical_crossentropy", metrics = ["accuracy"])

In [0]:
XrayModel.fit(x = X_train, y = Y_train, epochs = 5, batch_size=16)

In [0]:
from keras.applications import VGG16

In [100]:
from sklearn.model_selection import train_test_split

# Stratify on the first four characters of 'Finding Labels' so both splits
# see a similar mix of label prefixes.
strata = data['Finding Labels'].map(lambda x: x[:4])
train_df, valid_df = train_test_split(
    data,
    test_size = 0.25,
    random_state = 2018,
    stratify = strata,
)
print('train', train_df.shape[0], 'validation', valid_df.shape[0])

train 4204 validation 1402


In [101]:
# Display the training split to check the columns produced by preprocess().
train_df

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Gender,View Position,Cardiomegaly,Emphysema,Effusion,Hernia,...,Edema,Consolidation,Infiltration,Fibrosis,Pneumonia,No Finding,Age,path,disease_vec,images
4189,00020534_001.png,No Finding,1,20534,1,0,0,0,0,0,...,0,0,0,0,0,1,56,/content/DATA/images/00020534_001.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",<PIL.PngImagePlugin.PngImageFile image mode=L ...
5267,00028298_000.png,Infiltration,0,28298,1,0,0,0,0,0,...,0,0,1,0,0,0,51,/content/DATA/images/00028298_000.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",<PIL.PngImagePlugin.PngImageFile image mode=L ...
3107,00015338_014.png,No Finding,14,15338,0,1,0,0,0,0,...,0,0,0,0,0,1,44,/content/DATA/images/00015338_014.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",<PIL.PngImagePlugin.PngImageFile image mode=L ...
2916,00014358_016.png,Pneumothorax,16,14358,1,0,0,0,0,0,...,0,0,0,0,0,0,43,/content/DATA/images/00014358_016.png,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",<PIL.PngImagePlugin.PngImageFile image mode=L ...
3690,00018117_000.png,Mass,0,18117,0,0,0,0,0,0,...,0,0,0,0,0,0,34,/content/DATA/images/00018117_000.png,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",<PIL.PngImagePlugin.PngImageFile image mode=L ...
1665,00008701_011.png,Infiltration,11,8701,1,1,0,0,0,0,...,0,0,1,0,0,0,46,/content/DATA/images/00008701_011.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",<PIL.PngImagePlugin.PngImageFile image mode=L ...
3722,00018253_063.png,Infiltration,63,18253,1,1,0,0,0,0,...,0,0,1,0,0,0,71,/content/DATA/images/00018253_063.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]",<PIL.PngImagePlugin.PngImageFile image mode=L ...
4594,00022919_002.png,Nodule,2,22919,1,0,0,0,0,0,...,0,0,0,0,0,0,39,/content/DATA/images/00022919_002.png,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",<PIL.PngImagePlugin.PngImageFile image mode=L ...
2794,00013966_000.png,No Finding,0,13966,0,0,0,0,0,0,...,0,0,0,0,0,1,72,/content/DATA/images/00013966_000.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",<PIL.PngImagePlugin.PngImageFile image mode=L ...
385,00002003_005.png,Pleural_Thickening,5,2003,1,0,0,0,0,0,...,0,0,0,0,0,0,42,/content/DATA/images/00002003_005.png,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]",<PIL.PngImagePlugin.PngImageFile image mode=L ...


# Do Testing here

In [98]:
from keras.preprocessing.image import ImageDataGenerator

# Size every image is resized to by the generators.
IMG_SIZE = (128, 128)

# One shared generator: per-image normalisation plus light geometric augmentation.
core_idg = ImageDataGenerator(
    # per-sample normalisation
    samplewise_center=True,
    samplewise_std_normalization=True,
    # flips
    horizontal_flip = True,
    vertical_flip = False,
    # small shifts, rotation, shear and zoom
    width_shift_range=0.1,
    height_shift_range= 0.05,
    rotation_range=5,
    shear_range = 0.1,
    zoom_range=0.15,
    # how pixels outside the input boundaries are filled
    fill_mode = 'reflect',
)

Using TensorFlow backend.


In [0]:
def flow_from_dataframe(img_data_gen, in_df, path_col, y_col, **dflow_args):
    """Create a directory iterator and re-point it at the rows of a DataFrame.

    Arguments:
    img_data_gen -- object providing flow_from_directory (an ImageDataGenerator in this notebook)
    in_df        -- DataFrame holding one row per image; must not be empty,
                    because the first row's path is used to find the directory
    path_col     -- name of the column with the image file paths
    y_col        -- name of the column with the per-row label values
    dflow_args   -- extra keyword arguments forwarded to flow_from_directory

    Returns:
    df_gen -- the iterator, with its file list and labels replaced by the DataFrame's
    """
    # Directory of the first image; only used to construct the iterator.
    base_dir = os.path.dirname(in_df[path_col].values[0])
    print('## Ignore next message from keras, values are replaced anyways')
    df_gen = img_data_gen.flow_from_directory(base_dir, class_mode = 'sparse', **dflow_args)
    # Overwrite what the directory scan found with the DataFrame contents.
    df_gen.filenames = in_df[path_col].values
    df_gen.classes = np.stack(in_df[y_col].values)
    df_gen.samples = in_df.shape[0]
    df_gen.n = in_df.shape[0]
    # NOTE(review): _set_index_array is a private method of the iterator;
    # presumably it rebuilds the sampling index for the new `n` - confirm
    # against the installed Keras version.
    df_gen._set_index_array()
    df_gen.directory = '' # since we have the full path
    print('Reinserting dataframe: {} images'.format(in_df.shape[0]))
    return df_gen

In [0]:
# Arguments shared by every generator built from the dataframes.
_flow_kwargs = dict(
    path_col = 'path',
    y_col = 'disease_vec',
    target_size = IMG_SIZE,
    color_mode = 'grayscale',
)

train_gen = flow_from_dataframe(core_idg, train_df, batch_size = 32, **_flow_kwargs)

# we can use much larger batches for evaluation
valid_gen = flow_from_dataframe(core_idg, valid_df, batch_size = 256, **_flow_kwargs)

# used a fixed dataset for evaluating the algorithm: one big batch drawn once
_test_gen = flow_from_dataframe(core_idg, valid_df, batch_size = 1024, **_flow_kwargs)
test_X, test_Y = next(_test_gen)

In [0]:
# Draw one training batch and show its first 16 images with their labels.
t_x, t_y = next(train_gen)
fig, m_axs = plt.subplots(4, 4, figsize = (16, 16))
for image, target, panel in zip(t_x, t_y, m_axs.flatten()):
    # Images are normalised per sample, hence the symmetric colour range.
    panel.imshow(image[:,:,0], cmap = 'bone', vmin = -1.5, vmax = 1.5)
    present = [name for name, score in zip(all_labels, target) if score>0.5]
    panel.set_title(', '.join(present))
    panel.axis('off')

In [0]:
from keras.applications.mobilenet import MobileNet
from keras.layers import GlobalAveragePooling2D, Dense, Dropout, Flatten
from keras.models import Sequential

# MobileNet backbone without its classification head and without pretrained
# weights, sized from the batch drawn in the preview cell.
base_mobilenet_model = MobileNet(input_shape =  t_x.shape[1:], include_top = False, weights = None)

# Backbone -> global pooling -> dense head with one sigmoid unit per label.
multi_disease_model = Sequential([
    base_mobilenet_model,
    GlobalAveragePooling2D(),
    Dropout(0.5),
    Dense(512),
    Dropout(0.5),
    Dense(len(all_labels), activation = 'sigmoid'),
])
multi_disease_model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['binary_accuracy', 'mae'],
)
multi_disease_model.summary()

In [0]:
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau

# File that receives the best weights seen so far.
weight_path = "{}_weights.best.hdf5".format('xray_class')

callbacks_list = [
    # Save the weights whenever the validation loss improves.
    ModelCheckpoint(
        weight_path,
        monitor='val_loss',
        mode='min',
        save_best_only=True,
        save_weights_only = True,
        verbose=1,
    ),
    # Stop when the validation loss has not improved for 3 epochs.
    EarlyStopping(monitor="val_loss", mode="min", patience=3),
]
checkpoint, early = callbacks_list


In [0]:
# First short training run: one epoch of 100 generator batches, validated on
# the fixed (test_X, test_Y) batch, with checkpointing and early stopping.
multi_disease_model.fit_generator(train_gen, 
                                  steps_per_epoch=100,
                                  validation_data = (test_X, test_Y), 
                                  epochs = 1, 
                                  callbacks = callbacks_list)

In [0]:
# Share of the fixed evaluation batch that carries each label, in percent.
prevalence_pct = 100*np.mean(test_Y,0)
for label_name, pct in zip(all_labels, prevalence_pct):
    print('%s: %2.2f%%' % (label_name, pct))

In [0]:
# Predictions of the barely-trained model on the fixed evaluation batch.
pred_Y = multi_disease_model.predict(test_X, batch_size = 32, verbose = True)

In [0]:
from sklearn.metrics import roc_curve, auc

# One ROC curve per label, with the AUC shown in the legend.
fig, c_ax = plt.subplots(1,1, figsize = (9, 9))
for idx, c_label in enumerate(all_labels):
    truth = test_Y[:,idx].astype(int)
    scores = pred_Y[:,idx]
    fpr, tpr, thresholds = roc_curve(truth, scores)
    legend_text = '%s (AUC:%0.2f)'  % (c_label, auc(fpr, tpr))
    c_ax.plot(fpr, tpr, label = legend_text)
c_ax.legend()
c_ax.set_xlabel('False Positive Rate')
c_ax.set_ylabel('True Positive Rate')
fig.savefig('barely_trained_net.png')

In [114]:
# Continue training for five more epochs of 100 batches each; the checkpoint
# callback keeps the weights with the lowest validation loss in weight_path.
multi_disease_model.fit_generator(train_gen, 
                                  steps_per_epoch = 100,
                                  validation_data =  (test_X, test_Y), 
                                  epochs = 5, 
                                  callbacks = callbacks_list)

Epoch 1/5

Epoch 00001: val_loss improved from 0.68363 to 0.56792, saving model to xray_class_weights.best.hdf5
Epoch 2/5

Epoch 00002: val_loss improved from 0.56792 to 0.23477, saving model to xray_class_weights.best.hdf5
Epoch 3/5


Epoch 00003: val_loss improved from 0.23477 to 0.23264, saving model to xray_class_weights.best.hdf5
Epoch 4/5

Epoch 00004: val_loss improved from 0.23264 to 0.22717, saving model to xray_class_weights.best.hdf5
Epoch 5/5


Epoch 00005: val_loss improved from 0.22717 to 0.22363, saving model to xray_class_weights.best.hdf5


<keras.callbacks.History at 0x7f99c66bd4a8>

In [0]:
# Restore the weights saved by the checkpoint callback.
multi_disease_model.load_weights(weight_path)

In [0]:
# Predictions of the trained model on the fixed evaluation batch (overwrites the earlier pred_Y).
pred_Y = multi_disease_model.predict(test_X, batch_size = 32, verbose = True)


In [0]:
# Compare how often each diagnosis occurs (Dx) with the mean predicted
# score for that diagnosis (PDx), both in percent.
predicted_pct = 100*np.mean(pred_Y,0)
actual_pct = 100*np.mean(test_Y,0)
for label_name, p_count, t_count in zip(all_labels, predicted_pct, actual_pct):
    print('%s: Dx: %2.2f%%, PDx: %2.2f%%' % (label_name, t_count, p_count))

In [0]:
# ROC curves for the trained model. This cell repeats the ROC plot made after
# the first epoch; only the output file name differs.
from sklearn.metrics import roc_curve, auc
fig, c_ax = plt.subplots(1,1, figsize = (9, 9))
for (idx, c_label) in enumerate(all_labels):
    # Column idx of test_Y / pred_Y holds the truth / score for label c_label.
    fpr, tpr, thresholds = roc_curve(test_Y[:,idx].astype(int), pred_Y[:,idx])
    c_ax.plot(fpr, tpr, label = '%s (AUC:%0.2f)'  % (c_label, auc(fpr, tpr)))
c_ax.legend()
c_ax.set_xlabel('False Positive Rate')
c_ax.set_ylabel('True Positive Rate')
fig.savefig('trained_net.png')

In [0]:
# Show the evaluation images that carry the most findings, with the true
# labels (Dx) and the predicted scores (PDx).
# 'No Finding' is itself one of the labels, so it is excluded from the count;
# otherwise every image would count as having at least one finding and the
# ranking would be meaningless.
finding_cols = [col for col, name in enumerate(all_labels) if name != 'No Finding']
finding_counts = np.sum(test_Y[:, finding_cols], 1)
# Sort by descending number of findings; mergesort is stable, so ties keep
# their original order in the batch.
sickest_idx = np.argsort(-finding_counts, kind = 'mergesort')
fig, m_axs = plt.subplots(4, 2, figsize = (16, 32))
for (idx, c_ax) in zip(sickest_idx, m_axs.flatten()):
    c_ax.imshow(test_X[idx, :,:,0], cmap = 'bone')
    # True labels, abbreviated to six characters.
    stat_str = [n_class[:6] for n_class, n_score in zip(all_labels, 
                                                                  test_Y[idx]) 
                             if n_score>0.5]
    # Predicted score for every label that is either true or predicted above 0.5.
    pred_str = ['%s:%2.0f%%' % (n_class[:4], p_score*100)  for n_class, n_score, p_score in zip(all_labels, 
                                                                  test_Y[idx], pred_Y[idx]) 
                             if (n_score>0.5) or (p_score>0.5)]
    c_ax.set_title('Dx: '+', '.join(stat_str)+'\nPDx: '+', '.join(pred_str))
    c_ax.axis('off')
fig.savefig('trained_img_predictions.png')


### Notes

Converts the `Finding Labels` data to str and gets one-hot-encoded (OHE) dummy columns, using | as the separator
```
data['Finding Labels'].str.get_dummies('|')
disease_list = data['Finding Labels'].str.get_dummies('|').columns.values
```
IDEA:
Add a dummy row so that all the columns/diseases are also present in the testing set.
```
df.loc[-1] = ['DUMMY','|'.join(D_list),-1,-1,-1,'M','PA',-1,-1,-1.0,-1.0,'NaN']
```
