<a href="https://colab.research.google.com/github/clyde2020/ML_Portfolio/blob/main/Histopathologic_Cancer_Detection/HCD_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libraries

In [None]:
import pandas as pd
import numpy as np
import os
import shutil
from shutil import copyfile
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
from tensorflow.keras.optimizers import RMSprop, Adam
import matplotlib.pyplot as plt
import random
!pip install keras-tuner -q
from keras_tuner import RandomSearch
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import load_model
import pickle
from keras.models import load_model
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_curve
from keras.callbacks import ReduceLROnPlateau
import cv2

from google.colab import drive
drive.mount('/content/gdrive')

[?25l[K     |███▍                            | 10 kB 30.5 MB/s eta 0:00:01[K     |██████▊                         | 20 kB 36.3 MB/s eta 0:00:01[K     |██████████                      | 30 kB 12.4 MB/s eta 0:00:01[K     |█████████████▍                  | 40 kB 9.3 MB/s eta 0:00:01[K     |████████████████▊               | 51 kB 5.7 MB/s eta 0:00:01[K     |████████████████████            | 61 kB 5.7 MB/s eta 0:00:01[K     |███████████████████████▍        | 71 kB 5.7 MB/s eta 0:00:01[K     |██████████████████████████▊     | 81 kB 6.3 MB/s eta 0:00:01[K     |██████████████████████████████  | 92 kB 6.5 MB/s eta 0:00:01[K     |████████████████████████████████| 98 kB 3.7 MB/s 
[?25hMounted at /content/gdrive


## Initialize variables

In [None]:
# Project root on the mounted Google Drive. The label CSV, the raw training
# folder and the split-output folder defined later in this cell are built from it.
initial_dir = '/content/gdrive/MyDrive/Histopath'  # Mandatory update

def append_ext(file):
  """Return ``file`` with the '.tif' extension appended.

  The function is idempotent: a name that already ends in '.tif' is returned
  unchanged, so re-running a cell that applies it to a column cannot produce
  names such as 'abc.tif.tif'.

  Args:
    file: Image id or file name as a string.

  Returns:
    The name ending in exactly one '.tif' suffix.
  """
  if file.endswith('.tif'):
    return file
  return file + '.tif'

def remove_ext(file):
  """Return ``file`` without its final extension.

  Only the last extension is stripped, so a stem that itself contains a dot is
  kept intact ('a.b.tif' -> 'a.b'). Names without an extension are returned
  unchanged.

  Args:
    file: File name as a string.

  Returns:
    The file name with the trailing extension removed.
  """
  return os.path.splitext(file)[0]

# Label table with one row per training image (columns used later: 'id', 'label').
df = pd.read_csv(f'{initial_dir}/train_labels.csv')

# Folder holding the raw training images, and the root folder for the
# train/valid split created further down.
source_folder = f'{initial_dir}/train'
home_dir = f'{initial_dir}/Full_set'

# Seed passed to train_test_split as random_state.
state = 60

# Image size and label mode; presumably intended for the Keras generators.
target_size = (96, 96)
class_mode = 'binary'

In [None]:
# Names of all entries in the raw training folder (os.listdir returns them in arbitrary order).
tif_source_list = os.listdir(source_folder)

## View image

In [None]:
def get_image(path):
  """Load the image at ``path`` and return it as an RGB array.

  OpenCV decodes images in BGR channel order; the channels are swapped to RGB
  so the result displays with correct colours in matplotlib.

  Args:
    path: Path of the image file to read.

  Returns:
    The decoded image in RGB channel order.

  Raises:
    OSError: If OpenCV cannot read the file (missing, unreadable or corrupt).
      cv2.imread signals this by returning None instead of raising.
  """
  im_bgr = cv2.imread(path)
  if im_bgr is None:
    raise OSError('Could not read image: {}'.format(path))
  return cv2.cvtColor(im_bgr, cv2.COLOR_BGR2RGB)

In [None]:
# Preview one image from the raw training folder of this project, so the cell
# works without editing a hard-coded path.
path = '{}/{}'.format(source_folder, tif_source_list[0])
new_image = get_image(path=path)
plt.imshow(new_image)
plt.show()
print(new_image.shape)

## Process data

In [None]:
# Lookup table: image id (no extension) -> label, taken from the label table.
picdict = {image_id: label for image_id, label in zip(df['id'], df['label'])}

# Lazy, single-pass iterator over the training file names with the extension stripped.
source_list = map(remove_ext, tif_source_list)

How many of each class are represented?

In [None]:
# Number of rows per label value, i.e. the class balance of the full label table.
df['label'].value_counts()

0    130908
1     89117
Name: label, dtype: int64

In [None]:
sample_size = 80000

# Draw the same number of images from each class so the working set is balanced.
# Every random step is seeded with `state`; otherwise each run would pick a
# different subset and the train/valid split would not be reproducible.
df_0_train = df[df['label'] == 0].sample(sample_size, random_state=state)
df_1_train = df[df['label'] == 1].sample(sample_size, random_state=state)

# Put into single dataframe
df_train_full = pd.concat([df_0_train, df_1_train], axis=0).reset_index(drop=True)

# Add the extension so the ids match the file names on disk
df_train_full['id'] = df_train_full['id'].apply(append_ext)

# Shuffle the dataframe so the two classes are interleaved
df_train_full = shuffle(df_train_full, random_state=state)

In [None]:
# Split into train, valid split lists
df_train, df_valid = train_test_split(df_train_full, 
                                      test_size=10000, 
                                      random_state=state, 
                                      stratify=df_train_full['label'])
train_list = list(df_train['id'])
valid_list = list(df_valid['id'])
# train_pre_list = list(df_train['id'])
# valid_pre_list = list(df_valid['id'])

# Get id lists
# df_source = pd.DataFrame(tif_source_list, columns=['id'])
# df_train_list = df_source[df_source.id.isin(train_pre_list)]
# df_valid_list = df_source[df_source.id.isin(valid_pre_list)]

# train_list = list(df_train_list['id'])
# valid_list = list(df_valid_list['id'])

# Make sure lists have correct # of elements
print(len(train_list), len(valid_list))
assert (len(train_list) + len(valid_list)) == (2 * sample_size)

150000 10000


Set data folder variables

In [None]:
# Update these variables according to the dataset to use
train_comp = '{}/Train'.format(home_dir)
valid_comp = '{}/Valid'.format(home_dir)
benign_train_comp = '{}/Benign'.format(train_comp)
benign_valid_comp = '{}/Benign'.format(valid_comp)
mal_train_comp = '{}/Malignant'.format(train_comp)
mal_valid_comp = '{}/Malignant'.format(valid_comp)

Create the folders before splitting the data

In [None]:
# Create the four leaf folders; os.makedirs also creates the missing parents
# (home_dir, Train, Valid). exist_ok=True lets the cell be re-run without
# raising FileExistsError.
# NOTE: existing folder contents are kept. Empty home_dir manually before
# copying a different sample, otherwise files from the old split remain mixed in.
for folder in (benign_train_comp, benign_valid_comp, mal_train_comp, mal_valid_comp):
  os.makedirs(folder, exist_ok=True)

This function copies each listed image into the malignant or benign folder according to its label

In [None]:
def split_data(LIST, SOURCE, MAL_FOLDER, BENIGN_FOLDER):
  """Copy the files named in LIST from SOURCE into a per-class folder.

  The label of each file is looked up in the module-level ``picdict`` using
  the file name without its extension.

  Args:
    LIST: Iterable of file names (with extension) located in SOURCE.
    SOURCE: Folder that contains the files.
    MAL_FOLDER: Destination for files whose label is not 0.
    BENIGN_FOLDER: Destination for files whose label is 0.

  Raises:
    KeyError: If a file's id is missing from ``picdict``.
  """
  for fname in LIST:
    label = picdict[remove_ext(fname)]
    destination = BENIGN_FOLDER if label == 0 else MAL_FOLDER
    shutil.copy(src='{}/{}'.format(SOURCE, fname), dst=destination)

Execute the data split

In [None]:
# Copy the validation files first, then the training files, each into its own
# Malignant/Benign folder pair.
for file_list, mal_folder, benign_folder in (
    (valid_list, mal_valid_comp, benign_valid_comp),
    (train_list, mal_train_comp, benign_train_comp),
):
  split_data(LIST=file_list,
             SOURCE=source_folder,
             MAL_FOLDER=mal_folder,
             BENIGN_FOLDER=benign_folder)

This file was found to be corrupt during training

In [None]:
bad_file_name = 'cd0e59e19393ad3545664a31b149f15ef2f909c2.tif'
bad_file = f'{mal_train_comp}/{bad_file_name}'

# The split is random, so the corrupt file may sit in either malignant folder
# or may not have been sampled at all. Remove it wherever it is, and do nothing
# if it is absent (also makes the cell safe to re-run).
for folder in (mal_train_comp, mal_valid_comp):
  candidate = f'{folder}/{bad_file_name}'
  if os.path.exists(candidate):
    os.remove(candidate)

Data Augmentation

In [None]:
batch_size = 32

# Training images are rescaled to [0, 1] and randomly augmented (rotation,
# shifts, shear, horizontal flip) each time they are drawn.
train_datagen = ImageDataGenerator(rescale=1/255.0,
                                  rotation_range=45,
                                  width_shift_range=0.2,
                                  height_shift_range=0.2,
                                  shear_range=0.2,
                                  horizontal_flip=True,
                                  fill_mode='nearest')

# target_size and class_mode come from the configuration cell instead of being
# repeated as literals, so the image size is defined in one place.
train_generator = train_datagen.flow_from_directory(
    train_comp,
    target_size=target_size,
    batch_size=batch_size,
    class_mode=class_mode)

# Validation images are only rescaled, never augmented.
validation_datagen = ImageDataGenerator(rescale=1/255.0)

validation_generator = validation_datagen.flow_from_directory(
    valid_comp,
    target_size=target_size,
    batch_size=batch_size,
    class_mode=class_mode)

# Define and execute the model

In [None]:
state = 75

# Start from a clean Keras graph and seed TensorFlow so weight initialisation
# is repeatable.
tf.keras.backend.clear_session()
tf.random.set_seed(state)


def _conv_stage(filters, **first_conv_kwargs):
  """Return the layers of one convolutional stage as a list.

  A stage is two (3x3 Conv2D -> BatchNormalization -> ReLU) groups followed by
  2x2 max pooling and 20% dropout.

  Args:
    filters: Number of filters used by both Conv2D layers of the stage.
    **first_conv_kwargs: Extra keyword arguments for the first Conv2D only
      (used to pass ``input_shape`` to the first layer of the model).

  Returns:
    List of eight Keras layers, in the order they are applied.
  """
  return [
      tf.keras.layers.Conv2D(filters=filters,
                             kernel_size=3,
                             activation='relu',
                             **first_conv_kwargs),
      tf.keras.layers.BatchNormalization(),
      tf.keras.layers.ReLU(),
      tf.keras.layers.Conv2D(filters=filters,
                             kernel_size=3,
                             activation='relu'),
      tf.keras.layers.BatchNormalization(),
      tf.keras.layers.ReLU(),
      tf.keras.layers.MaxPooling2D(2),
      tf.keras.layers.Dropout(0.2),
  ]


# Three convolutional stages (32, 64, 128 filters) followed by a dense head
# with a single sigmoid output for the binary label.
# NOTE(review): every Conv2D/Dense before a BatchNormalization already applies
# activation='relu' and is followed by a second ReLU layer. If Conv -> BN -> ReLU
# was intended, the inner activation should be dropped. Left unchanged here
# because it alters the trained model.
model = tf.keras.models.Sequential([
    *_conv_stage(32, input_shape=(96, 96, 3)),
    *_conv_stage(64),
    *_conv_stage(128),

    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.ReLU(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-3)

model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['acc']
              )

# Halve the learning rate (down to 1e-4) when the monitored metric stalls.
# NOTE(review): this monitors the training accuracy ('acc'), not a validation
# metric -- confirm that is intended.
reduce_lr = ReduceLROnPlateau(monitor='acc',
                              factor=0.5,
                              patience=1,
                              cooldown=2,
                              min_lr=1e-4
                              )

# Derive the checkpoint location from initial_dir and make sure the folder
# exists; otherwise the first save fails after a full training epoch.
saved_models_dir = '{}/Saved_models'.format(initial_dir)
os.makedirs(saved_models_dir, exist_ok=True)

# Keep only the weights of the epoch with the best validation accuracy.
mc = ModelCheckpoint(monitor='val_acc',
                     filepath='{}/6Layer_bestmodel_011722.h5'.format(saved_models_dir),
                     verbose=1,
                     save_best_only=True,
                     mode='max'
                     )

cd = [mc, reduce_lr]

In [None]:
# Train for 20 epochs, validating after each epoch; the callbacks in `cd`
# handle checkpointing and learning-rate reduction.
fit_options = dict(epochs=20,
                   verbose=1,
                   validation_data=validation_generator,
                   callbacks=cd)
history = model.fit(train_generator, **fit_options)

Epoch 1/20
Epoch 00001: val_acc improved from -inf to 0.66470, saving model to /content/gdrive/MyDrive/Histopath/Saved_models/6Layer_bestmodel_011722.h5
Epoch 2/20
Epoch 00002: val_acc improved from 0.66470 to 0.69300, saving model to /content/gdrive/MyDrive/Histopath/Saved_models/6Layer_bestmodel_011722.h5
Epoch 3/20
Epoch 00003: val_acc did not improve from 0.69300
Epoch 4/20
Epoch 00004: val_acc improved from 0.69300 to 0.73830, saving model to /content/gdrive/MyDrive/Histopath/Saved_models/6Layer_bestmodel_011722.h5
Epoch 5/20
Epoch 00005: val_acc improved from 0.73830 to 0.84750, saving model to /content/gdrive/MyDrive/Histopath/Saved_models/6Layer_bestmodel_011722.h5
Epoch 6/20
Epoch 00006: val_acc did not improve from 0.84750
Epoch 7/20
Epoch 00007: val_acc improved from 0.84750 to 0.88590, saving model to /content/gdrive/MyDrive/Histopath/Saved_models/6Layer_bestmodel_011722.h5
Epoch 8/20
Epoch 00008: val_acc improved from 0.88590 to 0.90170, saving model to /content/gdrive/MyD

# Get Predictions and create Submission CSV file

In [None]:
# All paths are derived from initial_dir so that updating it in the
# configuration cell is sufficient.
# Reload the best checkpoint written during training.
model = load_model('{}/Saved_models/6Layer_bestmodel_011722.h5'.format(initial_dir))

# Blank submission template; its 'id' column gets the '.tif' extension so it
# can be matched against the test file names.
y_pred = pd.read_csv('{}/blank_sample_submission.csv'.format(initial_dir))
test_comp = '{}/test'.format(initial_dir)
y_pred['id'] = y_pred['id'].apply(append_ext)

In [None]:
# Listing the test folder on Drive is slow, so the file list is cached on disk.
# Load the cache when it exists; otherwise build the list and write the cache,
# so the cell also works on a fresh setup.
test_files_cache = '{}/Saved_data/full_test_files'.format(initial_dir)

if os.path.exists(test_files_cache):
  # NOTE: pickle.load can execute arbitrary code; only load a cache file that
  # this notebook wrote itself.
  with open(test_files_cache, 'rb') as f:
    test_files = pickle.load(f)
else:
  test_files = pd.DataFrame({'file_name': os.listdir(test_comp)})
  os.makedirs(os.path.dirname(test_files_cache), exist_ok=True)
  with open(test_files_cache, 'wb') as f:
    pickle.dump(test_files, f)

In [None]:
# Test images are only rescaled to [0, 1], never augmented.
test_datagen = ImageDataGenerator(rescale=1/255.0)

# Unlabelled (class_mode=None), unshuffled, one image per batch, so the
# predictions come back in the same order as test_generator.filenames.
test_flow_options = dict(
    dataframe=test_files,
    directory=test_comp,
    x_col='file_name',
    target_size=(96, 96),
    batch_size=1,
    class_mode=None,
    shuffle=False,
)
test_generator = test_datagen.flow_from_dataframe(**test_flow_options)

Found 57458 validated image filenames.


In [None]:
# Number of prediction steps = number of batches the generator yields.
# Taken from the generator rather than from test_files because the generator
# only keeps validated file names, and so the count stays correct if
# batch_size changes.
len_test = len(test_generator)
len_test

57458

In [None]:
# Run the model over the test generator for len_test steps; verbose=1 shows a
# progress bar.
preds = model.predict(test_generator, steps=len_test, verbose=1)



Process the prediction labels

In [None]:
# Flatten the (n, 1) sigmoid outputs to shape (n,), round at 0.5 and store the
# result as integers. astype returns a new array, so it must be assigned;
# otherwise the labels stay float and are written to the CSV as 0.0 / 1.0.
# np.ravel (instead of preds[:, 0]) keeps the cell safe to re-run on the
# already-flattened array.
preds = np.round(np.ravel(preds)).astype(int)
preds

array([0, 1, 1, ..., 0, 0, 1])

Merge the prediction labels into the submission dataframe

In [None]:
# File names in the order the generator served them, i.e. the order of preds.
test_file_names = test_generator.filenames

# Pair each file name with its predicted label, then join onto the submission
# template by 'id'. Both frames have a 'label' column, so the merge produces
# 'label_x' (template) and 'label_y' (prediction).
df_preds = pd.DataFrame({'id': test_file_names, 'label': preds})
y_pred = pd.merge(y_pred, df_preds, on='id')

Get submission dataframe into correct format

In [None]:
# Keep the predicted label ('label_y') as the final 'label' column and discard
# both merge-suffixed columns.
y_pred = (
    y_pred
    .assign(label=y_pred['label_y'])
    .drop(columns=['label_x', 'label_y'])
)

# Strip the extension again so the ids are back in the template's original form.
y_pred['id'] = y_pred['id'].apply(remove_ext)

In [None]:
# Write the submission next to the other project files; the path is derived
# from initial_dir so it follows the configured project folder.
y_pred.to_csv('{}/sample_submission.csv'.format(initial_dir), index=False)

# Get AUC

In [None]:
# Second pass over the validation folder: one image per batch and no
# shuffling, so predictions line up with val_test_generator.classes.
val_flow_options = dict(
    target_size=(96, 96),
    batch_size=1,
    class_mode='binary',
    shuffle=False,
)
val_test_generator = validation_datagen.flow_from_directory(valid_comp, **val_flow_options)

Found 10000 images belonging to 2 classes.


In [None]:
# One prediction step per batch of the validation generator.
len_val_test = len(val_test_generator)

# The model output has one column per image; take that column as a flat array.
val_scores = model.predict(val_test_generator, steps=len_val_test, verbose=1)
val_test_pred = val_scores[:, 0]



In [None]:
# True class index of every validation image, in generator order.
val_test_true = val_test_generator.classes

# Round the scores at 0.5 and store them as integers. astype returns a new
# array, so the result must be assigned back to take effect.
val_test_pred = np.round(val_test_pred).astype(int)
val_test_pred

array([0, 0, 0, ..., 1, 1, 0])

In [None]:
# Accuracy = fraction of validation images whose predicted label equals the true one.
matches = val_test_pred == val_test_true
acc_val_test = matches.sum() / len(val_test_pred)
print('Validation accuracy: {:.3f}'.format(acc_val_test))

Validation accuracy: 0.916


In [None]:
# A ROC curve must be built from the continuous sigmoid scores. val_test_pred
# holds labels already rounded to 0/1, which collapses the curve to a single
# operating point and makes the "AUC" equal to balanced accuracy. The scores
# were overwritten by the rounding step, so predict once more to recover them.
val_test_generator.reset()
val_test_prob = model.predict(val_test_generator,
                              steps=len(val_test_generator),
                              verbose=1)[:, 0]
fpr, tpr, thresholds = roc_curve(val_test_true, val_test_prob)

In [None]:
# Area under the ROC curve defined by the (fpr, tpr) points.
auc_val_test = auc(x=fpr, y=tpr)
print('Validation AUC: {:.3f}'.format(auc_val_test))

Validation AUC: 0.916
