In [1]:
# Importing necessary library

import matplotlib.pyplot as plt
import numpy as np
import os
import tensorflow as tf
import random
import shutil
import pandas as pd
import PIL.Image as Image
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau
from numpy.random import seed

# set seed
seed(2)
tf.random.set_seed(3)
random.seed(0)

# set supported tf operations to be deterministic on gpu
os.environ['TF_DETERMINISTIC_OPS'] = '1'

In [2]:
print(tf.__version__)
print(np.__version__)

2.4.1
1.19.5


1. Copy training files to tmp
2. Move all images under LUAD, LUSC, MESO from their subdirs to the parent folder, respectively..

    For example, move all files under train/LUAD/TCGA-05-4249-01Z-00-DX1.9fce0297-cc19-4c04-872c-4466ee4024db to train/LUAD.
    
3. Delete those subdirs

In [3]:
!mkdir /kaggle/tmp/

In [4]:
cp -r ../input/histopathology-dataset/train /kaggle/tmp/

In [5]:
%cd /kaggle/tmp/train
!ls -l

/kaggle/tmp/train
total 36
drwxr-xr-x  74 root root 12288 Apr 25 17:03 LUAD
drwxr-xr-x 158 root root 20480 Apr 25 17:04 LUSC
drwxr-xr-x  11 root root  4096 Apr 25 17:04 MESO


In [6]:
%cd /kaggle/tmp/train/LUAD
!find . -mindepth 2 -type f -print -exec mv {} . \;
!rm -R -- */

/kaggle/tmp/train/LUAD
./TCGA-44-6774-01Z-00-DX1.f169485b-f863-4be0-9844-258d78170b64/TCGA-44-6774-01Z-00-DX1.f169485b-f863-4be0-9844-258d78170b64_20x_12.jpg
./TCGA-44-6774-01Z-00-DX1.f169485b-f863-4be0-9844-258d78170b64/TCGA-44-6774-01Z-00-DX1.f169485b-f863-4be0-9844-258d78170b64_20x_3.jpg
./TCGA-44-6774-01Z-00-DX1.f169485b-f863-4be0-9844-258d78170b64/TCGA-44-6774-01Z-00-DX1.f169485b-f863-4be0-9844-258d78170b64_20x_16.jpg
./TCGA-44-6774-01Z-00-DX1.f169485b-f863-4be0-9844-258d78170b64/TCGA-44-6774-01Z-00-DX1.f169485b-f863-4be0-9844-258d78170b64_20x_14.jpg
./TCGA-44-6774-01Z-00-DX1.f169485b-f863-4be0-9844-258d78170b64/TCGA-44-6774-01Z-00-DX1.f169485b-f863-4be0-9844-258d78170b64_20x_1.jpg
./TCGA-44-6774-01Z-00-DX1.f169485b-f863-4be0-9844-258d78170b64/TCGA-44-6774-01Z-00-DX1.f169485b-f863-4be0-9844-258d78170b64_20x_13.jpg
./TCGA-44-6774-01Z-00-DX1.f169485b-f863-4be0-9844-258d78170b64/TCGA-44-6774-01Z-00-DX1.f169485b-f863-4be0-9844-258d78170b64_20x_2.jpg
./TCGA-44-6774-01Z-00-DX1.f169485b-

In [7]:
%cd /kaggle/tmp/train/LUSC
!find . -mindepth 2 -type f -print -exec mv {} . \;
!rm -R -- */

/kaggle/tmp/train/LUSC
./TCGA-21-1077-01Z-00-DX1.e7d0d3ca-b24c-4b2b-bb8f-13cc6991fb78/TCGA-21-1077-01Z-00-DX1.e7d0d3ca-b24c-4b2b-bb8f-13cc6991fb78_20x_12.jpg
./TCGA-21-1077-01Z-00-DX1.e7d0d3ca-b24c-4b2b-bb8f-13cc6991fb78/TCGA-21-1077-01Z-00-DX1.e7d0d3ca-b24c-4b2b-bb8f-13cc6991fb78_20x_3.jpg
./TCGA-21-1077-01Z-00-DX1.e7d0d3ca-b24c-4b2b-bb8f-13cc6991fb78/TCGA-21-1077-01Z-00-DX1.e7d0d3ca-b24c-4b2b-bb8f-13cc6991fb78_20x_5.jpg
./TCGA-21-1077-01Z-00-DX1.e7d0d3ca-b24c-4b2b-bb8f-13cc6991fb78/TCGA-21-1077-01Z-00-DX1.e7d0d3ca-b24c-4b2b-bb8f-13cc6991fb78_20x_9.jpg
./TCGA-21-1077-01Z-00-DX1.e7d0d3ca-b24c-4b2b-bb8f-13cc6991fb78/TCGA-21-1077-01Z-00-DX1.e7d0d3ca-b24c-4b2b-bb8f-13cc6991fb78_20x_6.jpg
./TCGA-21-1077-01Z-00-DX1.e7d0d3ca-b24c-4b2b-bb8f-13cc6991fb78/TCGA-21-1077-01Z-00-DX1.e7d0d3ca-b24c-4b2b-bb8f-13cc6991fb78_20x_4.jpg
./TCGA-21-1077-01Z-00-DX1.e7d0d3ca-b24c-4b2b-bb8f-13cc6991fb78/TCGA-21-1077-01Z-00-DX1.e7d0d3ca-b24c-4b2b-bb8f-13cc6991fb78_20x_1.jpg
./TCGA-21-1077-01Z-00-DX1.e7d0d3ca-b24

In [8]:
%cd /kaggle/tmp/train/MESO
!find . -mindepth 2 -type f -print -exec mv {} . \;
!rm -R -- */

/kaggle/tmp/train/MESO
./TCGA-3H-AB3U-01Z-00-DX2.E66C5B39-8F58-44E0-BC1B-5688CF0B03A0/TCGA-3H-AB3U-01Z-00-DX2.E66C5B39-8F58-44E0-BC1B-5688CF0B03A0_20x_6.jpg
./TCGA-3H-AB3U-01Z-00-DX2.E66C5B39-8F58-44E0-BC1B-5688CF0B03A0/TCGA-3H-AB3U-01Z-00-DX2.E66C5B39-8F58-44E0-BC1B-5688CF0B03A0_20x_7.jpg
./TCGA-3H-AB3U-01Z-00-DX2.E66C5B39-8F58-44E0-BC1B-5688CF0B03A0/TCGA-3H-AB3U-01Z-00-DX2.E66C5B39-8F58-44E0-BC1B-5688CF0B03A0_20x_2.jpg
./TCGA-3H-AB3U-01Z-00-DX2.E66C5B39-8F58-44E0-BC1B-5688CF0B03A0/TCGA-3H-AB3U-01Z-00-DX2.E66C5B39-8F58-44E0-BC1B-5688CF0B03A0_20x_1.jpg
./TCGA-3H-AB3U-01Z-00-DX2.E66C5B39-8F58-44E0-BC1B-5688CF0B03A0/TCGA-3H-AB3U-01Z-00-DX2.E66C5B39-8F58-44E0-BC1B-5688CF0B03A0_20x_4.jpg
./TCGA-3H-AB3U-01Z-00-DX2.E66C5B39-8F58-44E0-BC1B-5688CF0B03A0/TCGA-3H-AB3U-01Z-00-DX2.E66C5B39-8F58-44E0-BC1B-5688CF0B03A0_20x_3.jpg
./TCGA-3H-AB3U-01Z-00-DX2.E66C5B39-8F58-44E0-BC1B-5688CF0B03A0/TCGA-3H-AB3U-01Z-00-DX2.E66C5B39-8F58-44E0-BC1B-5688CF0B03A0_20x_5.jpg
./TCGA-3H-AB3S-01Z-00-DX2.943D90C4-82AC

Go back to the original working directory

In [9]:
%cd /kaggle/working

/kaggle/working


Load the data into trainning set and validation set (30%).  

Enable AUTOTUNE prefetching to improve performance.

Note: change batch_size to a smaller number if you are seeing OOM. However, it might be impacting model performance if you use a much smaller Batch_size.

Data is unbalanced

In [10]:
!ls -l /kaggle/tmp/train/MESO | wc -l
!ls -l /kaggle/tmp/train/LUAD | wc -l
!ls -l /kaggle/tmp/train/LUSC | wc -l

404
5559
16444


In [11]:
BATCH_SIZE = 32
IMG_SIZE = (512, 512)
train_dir = "/kaggle/tmp/train/"

train_dataset = image_dataset_from_directory(
    train_dir,
    validation_split=0.3,
    subset="training",
    seed=1338,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE
)

validation_dataset = image_dataset_from_directory(
    train_dir,
    validation_split=0.3,
    subset="validation",
    seed=1338,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE
)

class_names = np.array(train_dataset.class_names)
print(class_names)

AUTOTUNE = tf.data.AUTOTUNE

train_dataset = train_dataset.prefetch(buffer_size=AUTOTUNE)
validation_dataset = validation_dataset.prefetch(buffer_size=AUTOTUNE)

Found 22404 files belonging to 3 classes.
Using 15683 files for training.
Found 22404 files belonging to 3 classes.
Using 6721 files for validation.
['LUAD' 'LUSC' 'MESO']


Use data augmentation since our dataset is pretty small and inbalanced. We want to introduce some sample diversity. Each epoch will train our model using different generated images, which greatly increases our training dataset.

In [12]:
data_augmentation = tf.keras.Sequential([
  tf.keras.layers.experimental.preprocessing.RandomFlip('horizontal_and_vertical', seed=1337),
  tf.keras.layers.experimental.preprocessing.RandomRotation(0.2, seed=1337),
])

Create the base model from the pre-trained MobileNet v2. I picked MobileNet because it's lightweight but efficient. 

Build a model by concatenating the data augmentation, rescaling and feature extractor layer. Add a dropout layer to reduce overfitting.

In [13]:
preprocess_input = tf.keras.applications.mobilenet_v2.preprocess_input
IMG_SHAPE = IMG_SIZE + (3,)
base_model = tf.keras.applications.MobileNetV2(input_shape=IMG_SHAPE, include_top=False, weights='imagenet')

base_model.trainable = False

# Add a classification head
image_batch, label_batch = next(iter(train_dataset))
feature_batch = base_model(image_batch)
global_average_layer = tf.keras.layers.GlobalAveragePooling2D()
feature_batch_average = global_average_layer(feature_batch)

# Predict to 3 classes
prediction_layer = tf.keras.layers.Dense(3)
prediction_batch = prediction_layer(feature_batch_average)

# Build the model
inputs = tf.keras.Input(shape=(512, 512, 3))
x = data_augmentation(inputs)
x = preprocess_input(x)

# The base model contains batchnorm layers. We want to keep them in inference mode
# when we unfreeze the base model for fine-tuning, so we make sure that the
# base_model is running in inference mode here by setting training=False

x = base_model(x, training=False)
x = global_average_layer(x)
x = tf.keras.layers.Dropout(0.2, seed=1234)(x)
outputs = prediction_layer(x)
model = tf.keras.Model(inputs, outputs)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5


Compile the model

In [14]:
base_learning_rate = 0.0001
model.compile(optimizer=tf.keras.optimizers.Adam(lr=base_learning_rate),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['acc'])

Train the model with cross-entropy.  
Since our data is inbalanced, I am assigning higher weights for LUAD and MESO.

Note: At the beginning, we are only training a few layers on top of the MobileNet v2, since we want to evaluate how suitable this pre-trained network is for our data. We will do fine-tuning later if the preliminary result looks OK.

In [15]:
initial_epochs = 3

history = model.fit(train_dataset,
                    epochs=initial_epochs,
                    validation_data=validation_dataset,class_weight = {0:6, 1:1, 2:10}) # based on imbalanced data

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [16]:
base_model.trainable = True
fine_tune_at = 100

for layer in base_model.layers[:fine_tune_at]:
    layer.trainable =  False

# Use a much smaller learning rate to avoid overfitting.
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer = tf.keras.optimizers.RMSprop(lr=base_learning_rate/10),
              metrics=['acc'])

total_epochs = 6

learning_rate_reduction = ReduceLROnPlateau(monitor="val_loss", patience = 5, verbose=1,factor=0.8, min_lr=0.000005)
callbacks_list = [learning_rate_reduction]

history_fine = model.fit(train_dataset,
                         epochs=total_epochs,
                         initial_epoch=history.epoch[-1], callbacks=callbacks_list, validation_data=validation_dataset, class_weight = {0:5, 1:1, 2:30})


Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [17]:
def predict_image(image_path):

    image = load_img(image_path)
    image = np.array(image)
    prediction = model.predict(image[np.newaxis,...]).flatten()
    #print(prediction)
    predicted_id = np.argmax(prediction, axis=-1)
    return class_names[predicted_id]

In [18]:
from pathlib import Path

def get_all_subdirs(path):
    
    p = Path(path)
    
    return [str(dir) for dir in p.iterdir() if dir.is_dir()]

def get_all_files(path):
    
    p = Path(path)
    
    return [str(f) for f in p.iterdir() if f.is_file()]


In [19]:
from collections import Counter
def check_accuracy(path, correct_label):
    '''
    path should be somethinglike ../input/histopathology-dataset/dev/LUAD
    '''
    
    all_patch_set = get_all_subdirs(path)
    total_sets = len(all_patch_set)
    correct_prediction = 0
    
    for patch_set in all_patch_set:
        counter = Counter()
        images = get_all_files(patch_set)
        for image in images:
            prediction = predict_image(image)
            #print(prediction)
            counter[prediction] += 1
        major_vote = counter.most_common(1)[0][0] # TODO: Handle tie scenario
        if major_vote == correct_label:
            correct_prediction += 1
    
    return correct_prediction / total_sets
        
                
        

In [20]:
print(check_accuracy("../input/histopathology-dataset/dev/LUAD", 'LUAD'))
# Scoll down has the accuracy

0.15789473684210525


In [21]:
print(check_accuracy("../input/histopathology-dataset/dev/LUSC", 'LUSC'))

0.9512195121951219


In [22]:
print(check_accuracy("../input/histopathology-dataset/dev/MESO", 'MESO'))

0.0


In [23]:
print(check_accuracy("../input/histopathology-dataset/dev/LUAD", 'LUAD'))

0.15789473684210525


Reference: https://www.tensorflow.org/tutorials/images/transfer_learning