![alt text](https://drive.google.com/uc?export=view&id=1UXScsVx_Wni_JuDdB8LeTnM6jsPfIwkW)

## Model Building for Pneumonia Detection

#### Import libraries

In [None]:
import os
import csv
import cv2
import keras
import pydicom
import numpy as np # linear algebra
np.random.seed(42)
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
tf.random.set_seed(42)
import random as random
from sklearn import metrics
from sklearn import ensemble
from datetime import datetime
from tensorflow.keras import Sequential
from keras.optimizers import Adam
from keras.utils import Sequence
from skimage.transform import resize
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Concatenate, UpSampling2D,Conv2D
from keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense, Activation, BatchNormalization,GlobalMaxPool2D
from tensorflow.keras.applications.vgg16 import VGG16,preprocess_input

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, average_precision_score
from sklearn.metrics import roc_auc_score, auc, plot_confusion_matrix, plot_roc_curve, roc_curve

### File directories

In [None]:
# Input data files are available in the folder below; the two image
# sub-folder names are relative to it.
dataDir = 'C:/Anaconda/GreatLearning/6. Capstone Project/'
trainDataDir = 'stage_2_train_images'
testDataDir = 'stage_2_test_images'

# Read the label table and replace every missing value with 0 in one step.
train_labels = pd.read_csv(dataDir + 'stage_2_train_labels.csv').fillna(0)

# Show the first 5 records.
train_labels.head()

Unnamed: 0,patientId,x,y,width,height,Target
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,0.0,0.0,0.0,0.0,0
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,0.0,0.0,0.0,0.0,0
2,00322d4d-1c29-4943-afc9-b6754be640eb,0.0,0.0,0.0,0.0,0
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,0.0,0.0,0.0,0.0,0
4,00436515-870c-4b36-a041-de91049b9ab4,264.0,152.0,213.0,379.0,1


### Modelling with Convolutional Neural Network 

### Identify those who have pneumonia

In [None]:
# Maps an image id (first CSV column) to the list of its opacity boxes.
# Each box is [x, y, width, height] as ints, taken from CSV columns 1-4.
# Only rows whose sixth column (Target) is '1' are recorded.
opacity_locations = {}
# join the directory and the file name properly instead of concatenating
labels_path = os.path.join(dataDir, 'stage_2_train_labels.csv')
# newline='' is what the csv module documentation asks for when opening files
with open(labels_path, mode='r', newline='') as infile:
    reader = csv.reader(infile)
    # skip header
    next(reader, None)
    for row in reader:
        # a blank or truncated line carries no label; skip it instead of crashing
        if len(row) < 6:
            continue
        filename, location, pneumonia = row[0], row[1:5], row[5]
        if pneumonia == '1':
            # coordinates are written as floats ("264.0"): string -> float -> int
            box = [int(float(value)) for value in location]
            # start a new list on first sight of this id, then append
            opacity_locations.setdefault(filename, []).append(box)

In [None]:
# Number of distinct image ids that have at least one opacity box recorded.
len(opacity_locations)

6012

## Because the training data is large
- Build a data generator class that assembles the input data in batches, so the whole data set never has to fit in system memory.
- Use the Keras `Sequence` class as the base class of the data generator.

In [None]:
## parameters
# Side length (pixels) that the generator resizes images and masks to.
IMAGE_SIZE = 224
# Input width/height handed to the pretrained backbones; same value as IMAGE_SIZE.
img_width = IMAGE_SIZE
img_height = IMAGE_SIZE
# Number of files per batch produced by the generators.
BATCH_SIZE = 32
# NOTE(review): the three values below are not referenced in the visible cells.
kernel = 3
num_of_classes = 2
SHUFFLE_BUFFER_SIZE = 1000

In [None]:
from keras.utils import Sequence
import cv2
import pydicom
from skimage.transform import resize
class generator(keras.utils.Sequence):
    """Keras ``Sequence`` that reads DICOM files and yields them in batches.

    In training mode (``predict=False``) a batch is ``(images, masks)``; in
    predict mode it is ``(images, filenames)``.

    Parameters
    ----------
    folder : directory that contains the DICOM files.
    filenames : names of the files (with extension) inside ``folder``.
    opacity_locations : dict mapping a file name without extension to a list
        of ``[x, y, width, height]`` boxes. ``None`` means "no boxes at all".
    batch_size : number of files per batch.
    image_size : side length in pixels of the square images and masks returned.
    shuffle : shuffle the file order on construction and after every epoch.
    augment : in training mode, flip image and mask left-right about half
        of the time.
    predict : select predict mode instead of training mode.
    """

    def __init__(self, folder, filenames, opacity_locations=None, batch_size=32, image_size=IMAGE_SIZE, shuffle=True, augment=False, predict=False):
        super().__init__()
        self.folder = folder
        # private copy: shuffling must not reorder the list owned by the caller
        self.filenames = list(filenames)
        # an absent mapping behaves like an empty one
        self.opacity_locations = {} if opacity_locations is None else opacity_locations
        self.batch_size = batch_size
        self.image_size = image_size
        self.shuffle = shuffle
        self.augment = augment
        self.predict = predict
        self.on_epoch_end()

    def _to_rgb_square(self, img):
        """Return ``img`` with 3 channels, resized to ``image_size`` x ``image_size``."""
        # single-channel input: repeat it on a new trailing axis to get 3 channels
        if len(img.shape) != 3 or img.shape[2] != 3:
            img = np.stack((img,) * 3, -1)
        # resize every image, including ones that already had 3 channels
        # NOTE(review): pixel values are returned unscaled (no preprocess_input) - confirm intended
        return cv2.resize(img, dsize=(self.image_size, self.image_size), interpolation=cv2.INTER_CUBIC)

    def __load__(self, filename):
        """Load one file and return ``(image, mask)`` for training.

        The mask is a boolean array of shape ``(image_size, image_size, 1)``
        that is True inside the opacity boxes registered for this file.
        """
        # load dicom file as numpy array
        img = pydicom.dcmread(os.path.join(self.folder, filename)).pixel_array
        # the mask only has the two spatial axes, whatever the image layout
        msk = np.zeros(img.shape[:2])
        # boxes are keyed by the file name without extension
        key = filename.split('.')[0]
        # use the mapping given to this instance, not a module-level global
        for x, y, w, h in self.opacity_locations.get(key, ()):
            # add 1's at the location of the lung opacity
            msk[y:y+h, x:x+w] = 1
        # if augment then horizontal flip half the time
        if self.augment and random.random() > 0.5:
            img = np.fliplr(img)
            msk = np.fliplr(msk)
        # resize the mask and binarise the interpolated values again
        msk = resize(msk, (self.image_size, self.image_size), mode='reflect') > 0.5
        # add trailing channel dimension
        msk = np.expand_dims(msk, -1)
        img = self._to_rgb_square(img)
        return img, msk

    def __loadpredict__(self, filename):
        """Load one file and return only the image (predict mode)."""
        # load dicom file as numpy array
        img = pydicom.dcmread(os.path.join(self.folder, filename)).pixel_array
        return self._to_rgb_square(img)

    def __getitem__(self, index):
        """Return batch number ``index``."""
        # select batch
        first = index * self.batch_size
        filenames = self.filenames[first:first + self.batch_size]
        # predict mode: return images and filenames
        if self.predict:
            imgs = np.array([self.__loadpredict__(filename) for filename in filenames])
            return imgs, filenames
        # train mode: return images and masks
        items = [self.__load__(filename) for filename in filenames]
        # unzip images and masks
        imgs, msks = zip(*items)
        return np.array(imgs), np.array(msks)

    def on_epoch_end(self):
        """Reshuffle the file order when ``shuffle`` is enabled."""
        if self.shuffle:
            random.shuffle(self.filenames)

    def __len__(self):
        """Number of batches per epoch."""
        if self.predict:
            # return everything, including a final partial batch
            return int(np.ceil(len(self.filenames) / self.batch_size))
        # training: full batches only
        return len(self.filenames) // self.batch_size

### Split into train and validation files

In [None]:
dataDir = 'C:/Anaconda/GreatLearning/6. Capstone Project/'
trainDataDir = 'stage_2_train_images'
testDataDir = 'stage_2_test_images'
# Build the image folder from the variables above (no repeated literal, no
# doubled separator); the trailing '/' is kept as in the previous value.
folder = os.path.join(dataDir, trainDataDir) + '/'
# os.listdir returns entries in arbitrary order, so sort before shuffling:
# only then does a seeded shuffle give the same split on every machine.
filenames = sorted(os.listdir(folder))
np.random.shuffle(filenames)
# split into train and validation filenames
n_valid_samples = 2500
# fail loudly instead of silently training on an empty list
assert len(filenames) > n_valid_samples, (
    'need more than {} files in {}, found {}'.format(n_valid_samples, folder, len(filenames))
)
n_train_samples = len(filenames) - n_valid_samples
# the first n_valid_samples shuffled files validate, the rest train
valid_filenames = filenames[:n_valid_samples]
train_filenames = filenames[n_valid_samples:]
print('n train samples', len(train_filenames))
print('n valid samples', len(valid_filenames))

n train samples 24184
n valid samples 2500


### Generate Train and Validation data

In [None]:
# Training batches: file order is reshuffled every epoch, no flip augmentation.
train_gen = generator(
    folder,
    train_filenames,
    opacity_locations=opacity_locations,
    batch_size=BATCH_SIZE,
    image_size=IMAGE_SIZE,
    shuffle=True,
    augment=False,
    predict=False,
)
# Validation batches: fixed file order so evaluations are comparable.
valid_gen = generator(
    folder,
    valid_filenames,
    opacity_locations=opacity_locations,
    batch_size=BATCH_SIZE,
    image_size=IMAGE_SIZE,
    shuffle=False,
    predict=False,
)

### ResNet50 Model
- Transfer Learning Techniques, ResNet50


In [None]:
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.layers import Concatenate, UpSampling2D
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense,Flatten

# ImageNet-pretrained ResNet50 without its classification head, followed by
# five (Dense -> UpSampling2D) stages and a one-channel sigmoid output.
resnet50 = Sequential()
resnet50.add(
    ResNet50(
        weights='imagenet',
        include_top=False,
        input_shape=(img_width, img_height, 3),
    )
)

# Each stage narrows the channel count, then upsamples the feature map.
for n_units in (1024, 512, 256, 64, 8):
    resnet50.add(Dense(n_units, activation='relu'))
    resnet50.add(UpSampling2D())

# Single sigmoid channel as the final output.
resnet50.add(Dense(1, activation='sigmoid'))

# Keep the pretrained backbone (layer 0) frozen; only the added layers train.
resnet50.layers[0].trainable = False

### Compiling and Optimizing Augmented Model

In [None]:
# Compile the ResNet50 model with Adam at its default settings.
optimizer = Adam()  # alternative noted by the author: (lr=0.0001,decay =1e-5)
resnet50.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Print the layer-by-layer summary.
resnet50.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
resnet50 (Functional)        (None, 7, 7, 2048)        23587712  
_________________________________________________________________
dense (Dense)                (None, 7, 7, 1024)        2098176   
_________________________________________________________________
up_sampling2d (UpSampling2D) (None, 14, 14, 1024)      0         
_________________________________________________________________
dense_1 (Dense)              (None, 14, 14, 512)       524800    
_________________________________________________________________
up_sampling2d_1 (UpSampling2 (None, 28, 28, 512)       0         
_________________________________________________________________
dense_2 (Dense)              (None, 28, 28, 256)       131328    
_________________________________________________________________
up_sampling2d_2 (UpSampling2 (None, 56, 56, 256)       0

### Fit the augmented model on the training and validation datasets

In [None]:
# This cell may take several minutes to run.
# Short trial run: 2 epochs of 5 training batches each, validated on valid_gen.
start = datetime.now()
resnet50.fit(
    train_gen,
    validation_data=valid_gen,
    epochs=2,
    steps_per_epoch=5,
)
end = datetime.now()
# Wall-clock duration of the fit call.
elapsed = end - start
print('..........Run Time .........')
print('Time to fit augmented model is:\n {}'.format(elapsed))

Epoch 1/2
Epoch 2/2
..........Run Time .........
Time to fit augmented model is:
 0:09:43.224537


### RESNET50 Model Evaluation

In [None]:
# Evaluate on the validation generator; the result holds the loss followed by
# the metrics the model was compiled with (accuracy in the cells above).
resnet50_ev = resnet50.evaluate(valid_gen)



## VGG16 Model
- Transfer Learning Techniques, VGG16

In [None]:
from tensorflow.keras.applications.vgg16 import VGG16,preprocess_input

In [None]:
# VGG16: ImageNet-pretrained backbone without its classification head,
# followed by five (Dense -> UpSampling2D) stages and a one-channel sigmoid output.
vgg16 = Sequential()
vgg16.add(
    VGG16(
        weights='imagenet',
        include_top=False,
        input_shape=(img_width, img_height, 3),
    )
)

# Each stage sets the channel count, then upsamples the feature map.
for n_units in (1024, 512, 256, 64, 8):
    vgg16.add(Dense(n_units, activation='relu'))
    vgg16.add(UpSampling2D())

# Single sigmoid channel as the final output.
vgg16.add(Dense(1, activation='sigmoid'))

# Do not train the first layer (the pretrained backbone); it is already trained.
vgg16.layers[0].trainable = False

In [None]:
# Compile the VGG16 model with Adam at its default settings.
optimizer = Adam()  # alternative noted by the author: (lr=0.0001,decay =1e-5)
vgg16.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Print the layer-by-layer summary.
vgg16.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Functional)           (None, 7, 7, 512)         14714688  
_________________________________________________________________
dense_6 (Dense)              (None, 7, 7, 1024)        525312    
_________________________________________________________________
up_sampling2d_5 (UpSampling2 (None, 14, 14, 1024)      0         
_________________________________________________________________
dense_7 (Dense)              (None, 14, 14, 512)       524800    
_________________________________________________________________
up_sampling2d_6 (UpSampling2 (None, 28, 28, 512)       0         
_________________________________________________________________
dense_8 (Dense)              (None, 28, 28, 256)       131328    
_________________________________________________________________
up_sampling2d_7 (UpSampling2 (None, 56, 56, 256)      

In [None]:
# This cell may take several minutes to run.
# Short trial run: 2 epochs of 5 training batches each, validated on valid_gen.
start = datetime.now()
vgg16.fit(
    train_gen,
    validation_data=valid_gen,
    epochs=2,
    steps_per_epoch=5,
)
end = datetime.now()
# Wall-clock duration of the fit call.
elapsed = end - start
print('..........Run Time .........')
print('Time to fit augmented model is:\n {}'.format(elapsed))

Epoch 1/2
Epoch 2/2
..........Run Time .........
Time to fit augmented model is:
 0:17:46.553130


### VGG16 Model Evaluation

In [None]:
# Evaluate on the validation generator; the result holds the loss followed by
# the metrics the model was compiled with (accuracy in the cells above).
vgg16_ev = vgg16.evaluate(valid_gen)



## Compare the Models (ResNet50 vs. VGG16)

## Hyperparameter Tuning

**Mean IoU as a metric**

In [None]:
# mean iou as a metric
def mean_iou(y_true, y_pred):
    """Smoothed intersection-over-union, averaged over the batch.

    Predictions are rounded to 0/1 first. Both tensors are reduced over axes
    1, 2 and 3, so they must be rank-4 with the batch on axis 0.

    Args:
        y_true: ground-truth masks; any dtype castable to the dtype of ``y_pred``.
        y_pred: predicted values, rounded to the nearest integer here.

    Returns:
        Scalar tensor: mean over the batch of
        ``(intersection + 1) / (union + 1)``.
    """
    y_pred = tf.round(y_pred)
    # masks may arrive as bool or int; the product below needs matching dtypes
    y_true = tf.cast(y_true, y_pred.dtype)
    intersect = tf.reduce_sum(y_true * y_pred, axis=[1, 2, 3])
    # sum of both areas; the overlap is counted twice and removed below
    total = tf.reduce_sum(y_true, axis=[1, 2, 3]) + tf.reduce_sum(y_pred, axis=[1, 2, 3])
    # +1 on both sides avoids 0/0 when both masks are empty (score becomes 1)
    smooth = 1.0
    return tf.reduce_mean((intersect + smooth) / (total - intersect + smooth))

#### Callbacks (EarlyStopping, ModelCheckpoint, ReduceLROnPlateau)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
## Early stopping: stop once val_loss has not improved for 3 epochs.
earlystop = EarlyStopping(monitor='val_loss', patience=3)

## Model checkpoint: keep the weights of the best epoch by val_loss.
# Relative path, like the final save_weights call, so it works outside Kaggle
# too. The name template needs 'val_accuracy' in the training logs.
filepath = "LOHPT-{epoch:02d}-{val_accuracy:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min', save_weights_only=True)

## Reduce the learning rate when val_loss has stopped improving.
# min_lr must be below the optimizer's starting rate (0.0001 in the compile
# cell); with min_lr equal to it the callback could never lower anything.
reduceLROnPlat = ReduceLROnPlateau(monitor='val_loss', factor=0.8,
                                   patience=2, verbose=1, mode='auto',
                                   min_delta=0.0001, cooldown=5, min_lr=1e-6)

#### Compilation

In [None]:
# Recompile ResNet50 with a smaller learning rate and learning-rate decay.
# 'learning_rate' is the current argument name; 'lr' is a deprecated alias.
optimizer = Adam(learning_rate=0.0001, decay=1e-5)
# Track mean_iou as well: the plotting cell reads the 'mean_iou' and
# 'val_mean_iou' history entries, which exist only if the metric is registered.
resnet50.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy', mean_iou])

### Fitting the model

In [None]:
# Validation data is required here: every callback monitors 'val_loss' and the
# plotting cell reads the 'val_*' history entries. The callbacks were defined
# in the cell above but never passed to fit before.
history = resnet50.fit(
    train_gen,
    validation_data=valid_gen,
    epochs=5,
    callbacks=[earlystop, checkpoint, reduceLROnPlat],
)

In [None]:
# Save the weights of the tuned ResNet50 model (weights only, via save_weights)
# under a relative file name, i.e. in the current working directory.
resnet50.save_weights('FINAL_RESNET50.hdf5')

In [None]:
# pyplot is not imported in any of the visible earlier cells
import matplotlib.pyplot as plt

print('Below is the representation of variation of loss and mean_iou for training and validation data.')
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
# left panel: loss, right panel: mean IoU
panels = (("loss", "loss"), ("mean_iou", "iou"))
for ax, (metric, short_name) in zip(axes, panels):
    for prefix, split in (("", "Train"), ("val_", "Valid")):
        key = prefix + metric
        # a curve exists only if the metric was compiled in and, for 'val_'
        # keys, validation data was given to fit; skip absent ones rather
        # than raising KeyError
        if key in history.history:
            ax.plot(history.epoch, history.history[key], label="{} {}".format(split, short_name))
    ax.set_title(metric)
    ax.set_xlabel("epoch")
    ax.set_ylabel(short_name)
    # only draw a legend when at least one curve was plotted
    if ax.lines:
        ax.legend()
plt.show()