###  <span style="color:red">**This Notebook can be run from Google Colab:**</span>

https://colab.research.google.com

In [0]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import zipfile
import shutil
from google.colab import files
import json
import time
import pandas as pd

import keras
from keras.models import Model, Sequential, load_model
from keras.applications.resnet50 import ResNet50
from keras.layers import Input, Dense, Activation, Dropout, BatchNormalization,\
                          Conv2D, MaxPooling2D, Flatten, AveragePooling2D,\
                          GlobalAveragePooling2D, ZeroPadding2D
from keras.initializers import glorot_uniform
from keras import regularizers
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import RMSprop, Adam, Adamax, Nadam, SGD
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, \
                            classification_report

# Import PyDrive and associated libraries (to connect with GoogleDrive)
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# disable warnings
import warnings
warnings.simplefilter("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

### **Check if we are using GPU:**

In [28]:
from keras import backend as K
if K.backend() == "tensorflow":
    import tensorflow as tf
    device_name = tf.test.gpu_device_name()
    if device_name == '':
        device_name = "None"
    print('Using TensorFlow version:', tf.__version__, ', GPU:', device_name)

Using TensorFlow version: 1.15.0 , GPU: /device:GPU:0


### **Download Validation ('Control') patches from GoogleDrive:**

#### *Validation Patches were augmented with Patch_Generator, using 'stride=22' and then balanced by downsampling majority classes so we can compare accuracy of the model.*

###  **NOTE: Validation patches were generated from original, non-preprocessed images. In this way, we will ensure our model perform well at testing time when pre-processing may not be feasible. As example, being able to create masks/image annotation may not be feasible on testing data.**



In [4]:
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

file_id = '1fYVv6VwiotljBXOb2PAfWPCHVyugkbhJ' # Augmented and balanced pos_only

downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile(downloaded['title'])
print('Downloaded content: "{}"'.format(downloaded['title']))
print('Root dir content: {}'.format(os.listdir()))
patches_zip = downloaded['title']

Downloaded content: "Control.zip"
Root dir content: ['.config', 'adc.json', 'Control.zip', 'sample_data']


### **Unzip the Validation ('Control') patches:**

In [5]:
# Remove 'Patches' dir if it already exists
if 'Control' in os.listdir():
  shutil.rmtree('./Control')
with zipfile.ZipFile(patches_zip,"r") as zip:
    zip.extractall()
os.remove(downloaded['title'])
print('Root dir content: {}'.format(os.listdir()))

Root dir content: ['.config', 'adc.json', 'Control', 'sample_data']


### **Let's count patches by type and class:**

In [6]:
classes = ['C1','C2-3','C4-7','C5','C6','C8','C9','C10']
val_type = 'Control'

type_pos, pos_total = 0, 0
print("\nTotal '{}' Patches per location:".format(val_type))
for cls in classes:
    folder = './{}/{}_pos'.format(val_type,cls)
    n_pos = len(os.listdir(folder))
    type_pos += n_pos
    print('total_{}: {}'.format(cls,n_pos))
print('Total {}: {}'.format(val_type,type_pos))


Total 'Control' Patches per location:
total_C1: 364
total_C2-3: 364
total_C4-7: 364
total_C5: 364
total_C6: 364
total_C8: 364
total_C9: 364
total_C10: 364
Total Control: 2912


#### **Let's build the validation generator, using keras.preprocessing.image.ImageDataGenerator, rescaling image pixel values from [0,  255] to [0, 1]:**

In [7]:
c1_pos_folder = './Control/C1_pos'
img = plt.imread(c1_pos_folder + '/' + os.listdir(c1_pos_folder)[:5][0])
img_size = img.shape
val_batch_size = 64

val_datagen = ImageDataGenerator(rescale=1./255)

val_generator = val_datagen.flow_from_directory(
        './Control',
        target_size=(img_size[0],img_size[1]),
        batch_size=val_batch_size,
        class_mode='categorical',
        shuffle=False)

Found 2912 images belonging to 8 classes.


#### **Let's check what is the data generators' index for each class:**

In [8]:
print('validation_generator.class_indices:', str(json.dumps(val_generator.class_indices, indent=2, default=str)))

validation_generator.class_indices: {
  "C10_pos": 0,
  "C1_pos": 1,
  "C2-3_pos": 2,
  "C4-7_pos": 3,
  "C5_pos": 4,
  "C6_pos": 5,
  "C8_pos": 6,
  "C9_pos": 7
}


### **Download the final '8_classes' model from GoogleDrive:**

In [9]:
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

file_id = '1w0u_EKaSG8zkMRtYkNjFd3IOnR3IpQsJ' # model_8_classes_0.8465

downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile(downloaded['title'])
print('Downloaded content: "{}"'.format(downloaded['title']))
print('Root dir content: {}'.format(os.listdir()))
model_8_classes = downloaded['title']

Downloaded content: "model_8_classes_08465.h5"
Root dir content: ['.config', 'adc.json', 'model_8_classes_08465.h5', 'Control', 'sample_data']


#### **Let's evaluate '8 classes' model, on the validation set and compute relevant metrics:**

In [0]:
# load model:
eight_classes_model = load_model(model_8_classes)
#eight_classes_model.summary() # summarize model.

In [98]:
## Evaluate model on balanced validation patches:

y_true = val_generator.classes
Y_pred = eight_classes_model.predict_generator(val_generator)
y_pred = np.argmax(Y_pred, axis=1)

val_acc = accuracy_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)
class_names = [k for k in val_generator.class_indices]
c_report = classification_report(y_true, y_pred, target_names=class_names)

print('\nbalanced val_acc:\n', val_acc)
print('\nConfusion Matrix:\n', cm)
print('\nClassification Report:\n', c_report)


balanced val_acc:
 0.8464972527472527

Confusion Matrix:
 [[363   0   1   0   0   0   0   0]
 [  2 352   3   0   0   0   7   0]
 [  0 101 254   0   0   0   9   0]
 [  0   0   1 299  62   0   2   0]
 [  0   0   0 213 151   0   0   0]
 [  0   0   1   4   0 359   0   0]
 [  0   8   0  33   0   0 323   0]
 [  0   0   0   0   0   0   0 364]]

Classification Report:
               precision    recall  f1-score   support

     C10_pos       0.99      1.00      1.00       364
      C1_pos       0.76      0.97      0.85       364
    C2-3_pos       0.98      0.70      0.81       364
    C4-7_pos       0.54      0.82      0.65       364
      C5_pos       0.71      0.41      0.52       364
      C6_pos       1.00      0.99      0.99       364
      C8_pos       0.95      0.89      0.92       364
      C9_pos       1.00      1.00      1.00       364

    accuracy                           0.85      2912
   macro avg       0.87      0.85      0.84      2912
weighted avg       0.87      0.85      

### **Download the final 'C1_vs_C2-3_vs_all_other' model from GoogleDrive:**

In [29]:
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

file_id = '18De1DbqyxD1JlNpue6LIUZXd73VgAN-f' # final model C1 vs C2-3 vs all_other

downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile(downloaded['title'])
print('Downloaded content: "{}"'.format(downloaded['title']))
print('Root dir content: {}'.format(os.listdir()))
model_C1_C2_3 = downloaded['title']

Downloaded content: "model_C1_C2-3_08983.h5"
Root dir content: ['.config', 'model_C1_C2-3_08983.h5', 'model_C1_vs_C2-3_0888.h5', 'adc.json', 'model_C4-7_C5_083.h5', 'model_8_classes_08465.h5', 'Control', 'sample_data']


#### **Let's load and evaluate 'C1_vs_C2-3_vs_all_other' model, on the validation set and compute relevant metrics:**

In [0]:
# load model:
C1_C2_3_model = load_model(model_C1_C2_3)
#C1_C2_3_model.summary() # summarize model.

In [31]:
# class indices for C1_C2_3_model are as follows:
C1_C2_3_model_class_dict = {0:"C1_pos", 1:"C2-3_pos",2: "all_other_pos"}

scores = C1_C2_3_model.predict_generator(val_generator)

correct = 0
for i, img_path in enumerate(val_generator.filenames):
    if i >= len(scores):
        break
    if 'C1_pos' in img_path and np.argmax(scores[i]) == 0:
        correct += 1
    elif 'C2-3_pos' in img_path and np.argmax(scores[i]) == 1:
        correct += 1
    elif np.argmax(scores[i]) == 2:
        correct += 1

print("Correct:", correct, " Total: ", len(scores))
print('accuracy (non-balanced):', correct/len(scores))

Correct: 2789  Total:  2912
accuracy (non-balanced): 0.957760989010989


### **Download the final 'C4-7_vs_C5_vs_all_other' model from GoogleDrive:**

In [24]:
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

file_id = '1-4e6W-yR13q3ckpgo8O9QVMTncWITHwg' # model_C4-7_vs_C5_vs_all-other

downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile(downloaded['title'])
print('Downloaded content: "{}"'.format(downloaded['title']))
print('Root dir content: {}'.format(os.listdir()))
model_C4_7_C5 = downloaded['title']

Downloaded content: "model_C4-7_C5_083.h5"
Root dir content: ['.config', 'model_C1_vs_C2-3_0888.h5', 'adc.json', 'model_C4-7_C5_083.h5', 'model_8_classes_08465.h5', 'Control', 'sample_data']



#### **Let's load and evaluate 'C4-7_vs_C5_vs_all_other' model, on the validation set and compute relevant metrics:**

In [0]:
# load model:
C4_7_C5_model = load_model(model_C4_7_C5)
#C4_7_C5_model.summary() # summarize model.

In [96]:
# class indices for C4-7_C5_model are as follows:

C4_7_C5_model_class_dict = {0:"C4-7_pos", 1:"C5_pos",2: "all_other_pos"}

scores = C4_7_C5_model.predict_generator(val_generator)

correct = 0
for i, img_path in enumerate(val_generator.filenames):
    if i >= len(scores):
        break
    if 'C4-7_pos' in img_path and np.argmax(scores[i]) == 0:
        correct += 1
    elif 'C5_pos' in img_path and np.argmax(scores[i]) == 1:
        correct += 1
    elif np.argmax(scores[i]) == 2:
        correct += 1

print("Correct:", correct, " Total: ", len(scores))
print('accuracy (non-balanced):', correct/len(scores))

Correct: 2660  Total:  2912
accuracy (non-balanced): 0.9134615384615384


### **Download Validation ('Control') patches for Positive vs Negative model:**

#### *Validation Patches were augmented with Patch_Generator, using 'stride=22' and then balanced by downsampling majority classes, then grouped all positive and all negative repsectively, then downsample again pos vs neg so we can compare accuracy of the Positive Vs Negative model.*

###  **NOTE: Validation patches were generated from original, non-preprocessed images. In this way, we will ensure our model perform well at testing time when pre-processing may not be feasible. As example, being able to create masks/image annotation may not be feasible on testing data.**

In [66]:
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

file_id = '1LDbG7uH1dl_lt5FlIZfe3IhDjDol23c9' # Augmented+ balanced pos+neg

downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile(downloaded['title'])
print('Downloaded content: "{}"'.format(downloaded['title']))
print('Root dir content: {}'.format(os.listdir()))
patches_zip = downloaded['title']

Downloaded content: "Control _pos_neg.zip"
Root dir content: ['.config', 'model_C1_C2-3_08983.h5', 'model_C1_vs_C2-3_0888.h5', 'Control _pos_vs_neg_128_vs22_minval_1024_yes_cls_balance_yes_pos_neg_balance.zip', 'adc.json', 'model_C4-7_C5_083.h5', 'Control _pos_neg.zip', 'model_8_classes_08465.h5', 'Control', 'sample_data']


### **Unzip the Validation ('Control') pos vs neg patches:**

In [67]:
# Remove 'Patches' dir if it already exists
if 'Control _pos_neg' in os.listdir():
  shutil.rmtree('./Control _pos_neg')
with zipfile.ZipFile(patches_zip,"r") as zip:
    zip.extractall()
os.remove(downloaded['title'])
print('Root dir content: {}'.format(os.listdir()))

Root dir content: ['.config', 'model_C1_C2-3_08983.h5', 'model_C1_vs_C2-3_0888.h5', 'Control _pos_vs_neg_128_vs22_minval_1024_yes_cls_balance_yes_pos_neg_balance.zip', 'adc.json', 'model_C4-7_C5_083.h5', 'Control _pos_neg', 'model_8_classes_08465.h5', 'Control', 'sample_data']


### **Let's count patches by class:**

In [78]:
pos_folder = 'Control _pos_neg/Control/pos'
neg_folder = 'Control _pos_neg/Control/neg'
n_pos = len(os.listdir(pos_folder))
n_neg = len(os.listdir(neg_folder))
total = n_pos + n_neg
print('total: {} = {} positive + {} negative'.format(total,n_pos,n_neg))

total: 7420 = 3710 positive + 3710 negative


#### **Let's build the validation generator, using keras.preprocessing.image.ImageDataGenerator, rescaling image pixel values from [0,  255] to [0, 1]:**

In [79]:
img = plt.imread(pos_folder + '/' + os.listdir(pos_folder)[:5][0])
img_size = img.shape
val_batch_size = 64

pos_neg_val_datagen = ImageDataGenerator(rescale=1./255)

pos_neg_val_generator = pos_neg_val_datagen.flow_from_directory(
        'Control _pos_neg/Control',
        target_size=(img_size[0],img_size[1]),
        batch_size=val_batch_size,
        class_mode='categorical',
        shuffle=False)

Found 7420 images belonging to 2 classes.


#### **Let's check what is the data generators' index for each class:**

In [82]:
print('validation_generator.class_indices:', str(json.dumps(pos_neg_val_generator.class_indices, indent=2, default=str)))

validation_generator.class_indices: {
  "neg": 0,
  "pos": 1
}


### **Download the final 'Positive vs Negative' model from GoogleDrive:**

In [93]:
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

file_id = '1-BxPnguFXE7PHmzKadW0AnwWO9VqTywR' # model pos-neg

downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile(downloaded['title'])
print('Downloaded content: "{}"'.format(downloaded['title']))
print('Root dir content: {}'.format(os.listdir()))
pos_vs_neg = downloaded['title']

Downloaded content: "model_pos_neg_2.h5"
Root dir content: ['.config', 'model_C1_C2-3_08983.h5', 'model_C1_vs_C2-3_0888.h5', 'model_pos_neg_2.h5', 'adc.json', 'model_C4-7_C5_083.h5', 'Control _pos_neg', 'model_8_classes_08465.h5', 'model_pos_neg.h5', 'Control', 'sample_data']


#### **Let's load and evaluate 'Positive vs Negative' model, on the validation set and compute relevant metrics:**

In [0]:
# load model:
pos_vs_neg_model = load_model(pos_vs_neg)
#pos_vs_neg_C5_model.summary() # summarize model.

In [97]:
## Evaluate model on balanced validation patches:

y_true = pos_neg_val_generator.classes
Y_pred = pos_vs_neg_model.predict_generator(pos_neg_val_generator)
y_pred = np.argmax(Y_pred, axis=1)

val_acc = accuracy_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)
class_names = [k for k in pos_neg_val_generator.class_indices]
c_report = classification_report(y_true, y_pred, target_names=class_names)

print('\nbalanced val_acc:\n', val_acc)
print('\nConfusion Matrix:\n', cm)
print('\nClassification Report:\n', c_report)


balanced val_acc:
 0.9973045822102425

Confusion Matrix:
 [[3690   20]
 [   0 3710]]

Classification Report:
               precision    recall  f1-score   support

         neg       1.00      0.99      1.00      3710
         pos       0.99      1.00      1.00      3710

    accuracy                           1.00      7420
   macro avg       1.00      1.00      1.00      7420
weighted avg       1.00      1.00      1.00      7420

