Packages

In [1]:
from os import listdir
import matplotlib.pyplot as plt
from imutils import paths
import cv2
import os
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.image import load_img
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.utils import to_categorical
import pandas as pd
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
import numpy as np
import tensorflow as tf

Data preparation

In [3]:
# Look at data folder structure
classes_dir = listdir("../data/Kather_texture_2016_image_tiles_5000")
classes_dir

['.DS_Store',
 '01_TUMOR',
 '02_STROMA',
 '03_COMPLEX',
 '04_LYMPHO',
 '05_DEBRIS',
 '06_MUCOSA',
 '07_ADIPOSE',
 '08_EMPTY']

In [None]:
# Check files
files = listdir("data/Kather_texture_2016_image_tiles_5000/01_TUMOR")
for n in range(5):
    print(files[n])

In [None]:
# visualize example image
file = cv2.imread("data/Kather_texture_2016_image_tiles_5000/01_TUMOR/1A11_CRC-Prim-HE-07_022.tif_Row_601_Col_151.tif")
# file = cv2.cvtColor(file, cv2.COLOR_BGR2RGB)
# file = cv2.resize(file, (128, 128))
plt.imshow(file)

In [None]:
# Load data
print("[INFO] loading images...")
imagePaths = list(paths.list_images("data/Kather_texture_2016_image_tiles_5000"))
data = []
labels = []
# loop over the image paths
for imagePath in imagePaths:
    # extract the class label from the filename
    label = (imagePath.split(os.path.sep)[-2][1])
    # Since we are going to use MobileNetV2 we need to resize the images
    # to the expected size by the pre-trained network.
    image = load_img(imagePath, target_size=(150, 150))
    image = img_to_array(image)
    image = preprocess_input(image)
    image = tf.cast(image, tf.float32) / 255.0 #normalize
    data.append(image)
    labels.append(label)
# convert the data and labels to NumPy arrays
data = np.array(data, dtype="float32")
labels = np.array(labels)

In [None]:
data.shape

In [None]:
data[0].shape

In [None]:
print(labels[10], labels[2500], labels[4500])

In [None]:
data[0]

In [None]:
plt.imshow(data[0]*255) # * 255 because of previous normalization

In [None]:
# perform one-hot encoding on the labels
lb = LabelBinarizer()
labels = lb.fit_transform(labels)
# labels = to_categorical(labels)
print(labels[0], labels[0][0], type(labels[0]))

In [None]:
print(type(labels[0]))

In [None]:
# index list
index = [x for x in range(0, len(labels))]
print(len(index), index[-1])

In [None]:
# Split of data into train and test
# (trainX, testX, trainY, testY, train_index, test_index) = train_test_split(data, labels, index, test_size=0.20, stratify=labels, random_state=42)

In [None]:
# Split into train, val, test set
x, x_test, y, y_test = train_test_split(data, labels, test_size=0.2,train_size=0.8)
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.25,train_size =0.75)

Checking if shuffeled correctly

In [None]:
# plt.imshow(trainX[1983])

In [None]:
# print(trainY[1983], "\n", train_index[1983])

In [None]:
# plt.imshow(data[3746])

In [None]:
# print(labels[3746],"\n", labels[3746] == trainY[1983])

In [None]:
# plt.imshow(testX[800])

In [None]:
# print(testY[800], "\n", test_index[800])

In [None]:
# plt.imshow(data[4823])

In [None]:
# print(labels[4832],"\n", labels[4832] == trainY[800])

In [None]:
# trainY[0]

In [None]:
# trainX_df = pd.DataFrame(trainX)
# trainY_df = pd.DataFrame(trainY)
# # testX_series = pd.Series(testX)
# # testY_series = pd.Series(testY)
# print(trainY_df.head())

In [None]:
# # Saving datasets
# training_set = pd.concat([pd.Series(trainX), pd.Series(trainY), pd.Series(train_index)], axis=1)
# test_set = pd.concat([pd.Series(testX), pd.Series(testX), pd.Series(test_index)], axis=1)
#
# training_set.to_csv('training.csv', index=False)
# test_set.to_csv('test.csv', index=False)

Checking shape

In [None]:
print('1- Training set:', x_train.shape, y_train.shape)
print('2- Validation set:', x_val.shape, y_val.shape)
print('3- Testing set:', x_test.shape, y_test.shape)

Initial network

In [None]:
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import AveragePooling2D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import shutil

In [None]:
tf.keras.backend.clear_session()

In [None]:
INIT_LR = 1e-4 # learning rate
EPOCHS = 200
BS = 256

In [None]:
def model_cnn_v1():
    # must define the input shape in the first layer of the neural network
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Conv2D(32, 3, 3, input_shape= (150,150,3)))
    model.add(tf.keras.layers.Activation('relu'))
    model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(tf.keras.layers.Conv2D(64, 3, 3))
    model.add(tf.keras.layers.Activation('relu'))
    model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(64))
    model.add(tf.keras.layers.Activation('relu'))
    model.add(tf.keras.layers.Dropout(0.5))
    model.add(tf.keras.layers.Dense(8)) # 1 output value for corresponding class
    model.add(tf.keras.layers.Activation('sigmoid'))
    model.compile(loss='categorical_crossentropy', # categorical_crossentropy for two or more label classes
    optimizer='adam',
    metrics=['accuracy'])
    return model

In [None]:
model=model_cnn_v1()
model.summary()

In [None]:
# initial weights
initial_weights = model.get_weights()
model.set_weights(initial_weights)

In [None]:
# define path to save the mnodel
path_model='model_cnn_v1.weights.best.hdf5'
shutil.rmtree(path_model, ignore_errors=True)
checkpointer = tf.keras.callbacks.ModelCheckpoint(filepath=path_model, verbose = 1,save_best_only=True)

In [None]:
history=model.fit(x_train,
                y_train,
                batch_size=BS,
                epochs=EPOCHS,
                validation_data=(x_val, y_val),
                callbacks=[checkpointer])

In [None]:
# Visualize accuracy
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs_range = range(200) #adjust epoch range if needed

plt.figure(figsize=(20, 10))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()

In [None]:
aug = ImageDataGenerator(
    rotation_range=20,
    zoom_range=0.15,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.15,
    horizontal_flip=True,
    fill_mode="nearest")

In [None]:
#clear session
tf.keras.backend.clear_session()
# Model with data augmentation
history2 =model.fit(aug.flow(x_train, y_train, batch_size=BS),
                epochs=EPOCHS,
                validation_data=(x_val, y_val),
                callbacks=[checkpointer])

#additionally possible
# steps_per_epoch=len(x_train) // BS,
# validation_steps=len(x_val) // BS,

In [None]:
# Visualize accuracy
acc = history2.history['accuracy']
val_acc = history2.history['val_accuracy']

loss = history2.history['loss']
val_loss = history2.history['val_loss']

epochs_range = range(200) #adjust epoch range if needed

plt.figure(figsize=(20, 10))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()

In [None]:
# opt = Adam(learning_rate=INIT_LR, weight_decay=INIT_LR / EPOCHS)
# model.compile(loss="binary_crossentropy", optimizer=opt,
#     metrics=["accuracy"])
# H = model.fit(
#     aug.flow(x_train, y_train, batch_size=BS),
#     steps_per_epoch=len(x_train) // BS,
#     validation_data=(x_val, y_val),
#     validation_steps=len(x_val) // BS,
#     epochs=EPOCHS)

In [None]:
print("[INFO] evaluating network...")
predIdxs = model.predict(x_test, batch_size=BS, verbose=0)
predIdxs = np.argmax(predIdxs, axis=1)
print(classification_report(np.argmax(y_test, axis=1), predIdxs, # labels transformed back to single unit
    target_names=lb.classes_))
model.save("model_cancer_detection_cnn_v1", save_format="h5")

In [None]:
# Precision-recall (PR) curve plots the
# precision = TP / (TP + FP) vs.
# the recall (true-positive rate) = TP / (TP + FN) for various threshold. The higher the better.

In [None]:
cancer_image = [x for x in predIdxs]

In [None]:
plt.imshow(x_test[0]*255)