# Skin Lesion Classifier

## Google Colab

In [None]:
# Colab only: mount Google Drive, then make the project folder the working directory
# so that relative paths used later (e.g. 'saved_models', 'logs') resolve inside it.
import os

from google.colab import drive

drive.mount('/content/drive/')
# !ls '/content/drive/My Drive/Colab Notebooks'

os.chdir('/content/drive/My Drive/Colab Notebooks/isic-2019')

In [None]:
# Run this cell to mount your Google Drive (Colab only).
# NOTE(review): the previous cell already calls drive.mount('/content/drive/');
# this second mount looks redundant - confirm before removing it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# !cp '/content/drive/My Drive/Colab Notebooks/ISIC_2019_Training_Input.zip' '/home/ISIC_2019_Training_Input.zip'
# !cp '/content/drive/My Drive/Colab Notebooks/ISIC_2019_Training_GroundTruth.csv' '/home/ISIC_2019_Training_GroundTruth.csv'
# !unzip -qq '/home/ISIC_2019_Training_Input.zip' -d '/home'

## Environment

### Install Python Packages

In [None]:
# !pip3 install -r requirements.txt

### Check whether you’re running Pillow or Pillow-SIMD

In [None]:
# According to the author, if PILLOW_VERSION has a postfix, it is Pillow-SIMD0.
# (Assuming that Pillow will never make a .postX release).
!python -c "from PIL import Image; print(Image.PILLOW_VERSION)"

### Check whether Pillow or Pillow-SIMD is using libjpeg-turbo

In [None]:
import PIL
from PIL import features, Image
from packaging import version

# Read the version from PIL.__version__: the Image.PILLOW_VERSION constant used
# previously was deprecated and is absent from current Pillow releases, where
# accessing it raises AttributeError.
pillow_version = PIL.__version__

# The 'libjpeg_turbo' feature flag can only be queried from Pillow 5.4.0 onwards.
if version.parse(pillow_version) >= version.parse("5.4.0"):
    if features.check_feature('libjpeg_turbo'):
        print("libjpeg-turbo is on")
    else:
        print("libjpeg-turbo is not on")
else:
    print("libjpeg-turbo' status can't be derived - need Pillow(-SIMD)? >= 5.4.0 to tell, current version {}".format(pillow_version))

### System Information

In [None]:
import tensorflow as tf
import platform
from tensorflow.python.client import device_lib

!python3 --version

print('\nTensorFlow Version: ', tf.VERSION)

print('\nNVIDIA:')
!nvcc --version
# !nvidia-smi

print('\nCPU:')
!lscpu

print('\nOS:')
print(platform.platform())

print('\nDevices:')
print(device_lib.list_local_devices())

## Import Training Data

In [None]:
import pandas as pd
import numpy as np
import os
from collections import Counter
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Root folder that holds the dermoscopic images and the ground-truth CSV.
# Raw string: '\I' is not a valid escape sequence, so the plain literal triggers
# an invalid-escape warning on recent Python versions. The resulting path is unchanged.
data_folder = r'C:\ISIC_2019'
# data_folder = '/home'
# data_folder = '/home/jupyter'
derm_image_folder = os.path.join(data_folder, 'ISIC_2019_Training_Input')
df_ground_truth = pd.read_csv(os.path.join(data_folder, 'ISIC_2019_Training_GroundTruth.csv'))

# Category names, not including UNK: CSV columns 1..8 (column 0 is the image id)
category_names = list(df_ground_truth.columns.values[1:9])
known_category_num = len(category_names)
print("Number of known categories: {}".format(known_category_num))
print(category_names, '\n')

# Mapping from category name to integer index
print('Category to Index:')
category_to_index = {name: index for index, name in enumerate(category_names)}
print(category_to_index, '\n')

# Full path of every image file: <derm_image_folder>/<image id>.jpg
df_ground_truth['path'] = [
    os.path.join(derm_image_folder, image_id + '.jpg')
    for image_id in df_ground_truth['image']
]
# Integer label = position of the largest value among the eight category columns
# (vectorized replacement for a per-row np.argmax loop).
df_ground_truth['category'] = df_ground_truth.iloc[:, 1:9].values.argmax(axis=1)

count_per_category = Counter(df_ground_truth['category'])
total_sample_count = sum(count_per_category.values())
print("Original training data has {} samples.".format(total_sample_count))

# Absolute count and percentage share of each category
for i, c in enumerate(category_names):
    print("'%s':\t%d\t(%.2f%%)" % (c, count_per_category[i], count_per_category[i]*100/total_sample_count))

fig = plt.bar(count_per_category.keys(), count_per_category.values())

df_ground_truth.head()

### Shuffle and Split Original Training Data into Training and Validation Sets

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed shared by the split and by the generators / pipelines defined later
seed = 1

# Stratified 80/20 split so both subsets keep the per-category proportions
df_train, df_val = train_test_split(
    df_ground_truth,
    test_size=0.2,
    random_state=seed,
    stratify=df_ground_truth['category'],
)

# Training subset: size and per-category distribution
sample_count_train = len(df_train.index)
count_per_category_train = Counter(df_train['category'])
print("Training set has {} samples.".format(sample_count_train))
for cat_idx, cat_name in enumerate(category_names):
    cat_count = count_per_category_train[cat_idx]
    print("'%s':\t%d\t(%.2f%%)" % (cat_name, cat_count, cat_count*100/sample_count_train))

# Validation subset: size and per-category distribution
sample_count_val = len(df_val.index)
count_per_category_val = Counter(df_val['category'])
print("\nValidation set has {} samples.".format(sample_count_val))
for cat_idx, cat_name in enumerate(category_names):
    cat_count = count_per_category_val[cat_idx]
    print("'%s':\t%d\t(%.2f%%)" % (cat_name, cat_count, cat_count*100/sample_count_val))

### Class Weights based on the Training Set

In [None]:
from sklearn.utils import class_weight

# Compute class weights for imbalanced data.
# The arguments are passed by keyword because recent scikit-learn releases
# reject positional `classes` / `y`; older releases accept the same keyword names.
train_classes = np.unique(df_train['category'])
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=train_classes,
    y=df_train['category'])
# Key each weight by its actual class label rather than by position, so the mapping
# stays correct even if the labels present in the training set are not exactly 0..n-1.
class_weight_dict = {int(label): weight for label, weight in zip(train_classes, class_weights)}
# class_weight_dict = dict(zip(category_names, class_weights))
print('Class Weights:')
print(class_weight_dict)

### Samples of each Category

In [None]:
from IPython.display import Image

# Group the training rows by integer category so a few examples of each can be shown
category_groups = df_train.groupby('category')

# Number of samples for each category
num_per_category = 3

# One row of plots per category, one column per sample; ticks are hidden on all axes
fig, axes = plt.subplots(nrows=known_category_num, ncols=num_per_category, figsize=(9, 24))
plt.setp(plt.gcf().get_axes(), xticks=[], yticks=[])
fig.patch.set_facecolor('white')

for idx, category_name in enumerate(category_names):
    samples = category_groups.get_group(idx).head(num_per_category)
    for col, (row_label, row) in enumerate(samples.iterrows()):
        ax = axes[idx, col]
        ax.imshow(plt.imread(row['path']))
        ax.set_xlabel(row['image'])
        # Label each row once, on its left-most plot. The column index is tested
        # directly because Axes.is_first_col() was deprecated in Matplotlib 3.4 and
        # is no longer available in later releases.
        if col == 0:
            ax.set_ylabel(category_name, fontsize=20)
            ax.yaxis.label.set_color('blue')

fig.tight_layout()

## Pre-process the Data

In [None]:
from keras.preprocessing import image                  
from tqdm import tqdm

def path_to_tensor(img_path, size=(224, 224)):
    """Load a single image file as a batch containing one image.

    Args:
        img_path: path of the image file to load.
        size: forwarded to `image.load_img` as `target_size`.

    Returns:
        The array produced by `image.img_to_array` with a new leading axis of
        length 1, e.g. shape (1, 224, 224, 3) for the default size
        (assuming a channels-last image data format).
    """
    # PIL image resized to `size`
    pil_img = image.load_img(img_path, target_size=size)
    # 3D array for the single image
    img_array = image.img_to_array(pil_img)
    # prepend the batch axis
    return img_array[np.newaxis, ...]

def paths_to_tensor(img_paths, size=(224, 224)):
    """Load several image files and stack them into one batch.

    Args:
        img_paths: iterable of image file paths (wrapped in a tqdm progress bar).
        size: forwarded to `path_to_tensor` for every image.

    Returns:
        The per-image arrays returned by `path_to_tensor`, stacked along the first axis.
    """
    single_image_batches = []
    for one_path in tqdm(img_paths):
        single_image_batches.append(path_to_tensor(one_path, size))
    return np.vstack(single_image_batches)

## Evaluation Metrics

In [None]:
# Ref: https://stackoverflow.com/a/54620037/2437361
import keras.backend as K

def balanced_accuracy(y_true, y_pred):
    """
    Average per-class recall for a multi-class classification problem.

    Both tensors are reduced over axis 0 (presumably the batch axis, with one
    column per class - confirm against the generators' label shape).

    Notes grounded in the computation below:
      * a sample counts as a true positive only when `y_true * y_pred`, clipped
        to [0, 1], rounds to 1;
      * a class with no positive samples in the batch gets recall 0 (0 divided
        by epsilon), which lowers the mean.
    """
    eps = K.epsilon()
    hits_per_class = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=0)
    positives_per_class = K.sum(K.round(K.clip(y_true, 0, 1)), axis=0)
    recall_per_class = hits_per_class / (positives_per_class + eps)
    return K.mean(recall_per_class)

## Create a vanilla CNN as benchmark model

### Image Augmentation Pipeline

In [None]:
from image_iterator import ImageIterator
from Augmentor import Pipeline
from Augmentor.Operations import CropPercentageRange
from keras import backend as K
from keras.utils import np_utils

# Settings shared by the training and validation generators of the vanilla CNN.
# input_size is used below as (width, height) for the resize operations.
input_size = (224, 224)
batch_size = 40
# Image data format reported by the Keras backend; passed on to ImageIterator
data_format = K.image_data_format()

### Training Data Generator
# The operations are added to the pipeline in the order listed below.
# TODO: Maybe remove black borders
vanilla_p_train = Pipeline()
# Random crop (project/Augmentor operation; presumably keeps 80%-100% of the image area
# at a random, non-centred position - confirm against CropPercentageRange)
vanilla_p_train.add_operation(CropPercentageRange(probability=1, min_percentage_area=0.8, max_percentage_area=1, centre=False))
# Rotate an image by either 90, 180, or 270 degrees randomly
vanilla_p_train.rotate_random_90(probability=0.5)
# Resize an image to the network input size
vanilla_p_train.resize(probability=1, width=input_size[0], height=input_size[1])
# Flip the image along its vertical axis
vanilla_p_train.flip_top_bottom(probability=0.5)
# Flip the image along its horizontal axis
vanilla_p_train.flip_left_right(probability=0.5)
# Random change brightness of an image
vanilla_p_train.random_brightness(probability=0.5, min_factor=0.9, max_factor=1.1)
# Random change saturation of an image
vanilla_p_train.random_color(probability=0.5, min_factor=0.9, max_factor=1.1)
# Set the seed
vanilla_p_train.set_seed(seed)
# Print the pipeline summary
vanilla_p_train.status()

# Training batches: image paths plus one-hot labels, augmented by vanilla_p_train.
# NOTE(review): rescale=1./255 presumably multiplies pixel values by 1/255, as Keras'
# ImageDataGenerator does - confirm in image_iterator.py.
generator_train = ImageIterator(
    image_paths=df_train['path'].tolist(),
    labels=np_utils.to_categorical(df_train['category'], num_classes=known_category_num),
    augmentation_pipeline=vanilla_p_train,
    batch_size=batch_size,
    shuffle=True,
    seed=seed,
    rescale=1./255,
    data_format=data_format
)


### Validation Data Generator
# Validation images are only resized; no random augmentation is applied.
vanilla_p_val = Pipeline()
# Resize an image
vanilla_p_val.resize(probability=1, width=input_size[0], height=input_size[1])
# Set the seed
vanilla_p_val.set_seed(seed)
vanilla_p_val.status()

# NOTE(review): shuffle=True is also set for the validation generator - confirm this is intended.
generator_val = ImageIterator(
    image_paths=df_val['path'].tolist(),
    labels=np_utils.to_categorical(df_val['category'], num_classes=known_category_num),
    augmentation_pipeline=vanilla_p_val,
    batch_size=batch_size,
    shuffle=True,
    seed=seed,
    rescale=1./255,
    data_format=data_format
)

In [None]:
# from keras.preprocessing.image import ImageDataGenerator
# from keras import backend as K
# from Augmentor import DataFramePipeline
# from Augmentor.Operations import CropPercentageRange

# input_size = (224, 224)
# batch_size = 40

# ### Training Data Generator
# #TODO Maybe remove black borders
# vanilla_p_train = DataFramePipeline(source_dataframe=df_train, image_col='path', category_col='category')
# # Random crop
# vanilla_p_train.add_operation(CropPercentageRange(probability=1, min_percentage_area=0.8, max_percentage_area=1, centre=False))
# # Rotate an image by either 90, 180, or 270 degrees randomly
# vanilla_p_train.rotate_random_90(probability=0.5)
# # Resize an image
# vanilla_p_train.resize(probability=1, width=input_size[0], height=input_size[1])
# # Flip the image along its vertical axis
# vanilla_p_train.flip_top_bottom(probability=0.5)
# # Flip the image along its horizontal axis
# vanilla_p_train.flip_left_right(probability=0.5)
# # Random change brightness of an image
# vanilla_p_train.random_brightness(probability=1, min_factor=0.9, max_factor=1.1)
# # Random change saturation of an image
# vanilla_p_train.random_color(probability=1, min_factor=0.9, max_factor=1.1)
# # Set the seed
# vanilla_p_train.set_seed(seed)

# generator_train = vanilla_p_train.keras_generator(batch_size=batch_size, scaled=True, image_data_format=K.image_data_format())
# # vanilla_p_train.status()

# ### Validation Data Generator
# vanilla_p_val = DataFramePipeline(source_dataframe=df_val, image_col='path', category_col='category')
# # Center crop
# vanilla_p_val.crop_centre(probability=1, percentage_area=0.9)
# # Resize an image
# vanilla_p_val.resize(probability=1, width=input_size[0], height=input_size[1])
# # Set the seed
# vanilla_p_val.set_seed(seed)

# generator_val = vanilla_p_val.keras_generator(batch_size=batch_size, scaled=True, image_data_format=K.image_data_format())
# # vanilla_p_val.status()

# # datagen_train = ImageDataGenerator(rescale=1./255)
# # generator_train = datagen_train.flow_from_dataframe(
# #     dataframe=df_train, x_col='path', y_col='category', class_mode='categorical', target_size=input_size, batch_size=batch_size, seed=seed)

# # datagen_val = ImageDataGenerator(rescale=1./255)
# # generator_val = datagen_val.flow_from_dataframe(
# #     dataframe=df_val, x_col='path', y_col='category', class_mode='categorical', target_size=input_size, batch_size=batch_size, seed=seed)

In [None]:
# # Print some info for debugging
# images, labels = next(generator_train)
# # print(len(vanilla_p_train.augmentor_images))
# print(images.shape)
# print(labels.shape)
# plt.imshow(images[0]);

### CNN Model

In [None]:
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.layers import Dropout, Flatten, Dense, Reshape
from keras.models import Sequential
from keras.optimizers import Adam

# Starting learning rate
lr_start = 1e-3

# Vanilla CNN: three conv + max-pool stages (32, 64, 128 filters), then dropout,
# global average pooling and a softmax layer with one unit per known category.
model = Sequential([
    Conv2D(32, 3, padding='same', activation='relu',
           input_shape=(input_size[0], input_size[1], 3)),
    MaxPooling2D(pool_size=2),

    Conv2D(64, 3, padding='same', activation='relu'),
    MaxPooling2D(pool_size=2),

    Conv2D(128, 3, padding='same', activation='relu'),
    MaxPooling2D(pool_size=2),

    Dropout(rate=0.3),
    GlobalAveragePooling2D(),
    Dense(known_category_num, activation='softmax'),
])

model.summary()

# Compile the model: Adam optimizer, categorical cross-entropy loss,
# reporting balanced accuracy alongside plain accuracy.
model.compile(
    optimizer=Adam(lr=lr_start),
    loss='categorical_crossentropy',
    metrics=[balanced_accuracy, 'accuracy'],
)

In [None]:
# # Display images of a batch for debugging
# images, labels = next(generator_train)

# fig = plt.figure(figsize=(20,10))

# for i, img in enumerate(images):
#     ax = fig.add_subplot(4, 10, i + 1, xticks=[], yticks=[])
#     ax.imshow(np.uint8(255 * img))
#     image_idx = np.argmax(labels[i])
#     ax.set(title=category_names[image_idx])

### Train the vanilla CNN

In [None]:
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, CSVLogger

# Output folders for checkpoints and training logs. exist_ok=True replaces the
# check-then-create pattern and keeps the cell safe to re-run.
os.makedirs('saved_models', exist_ok=True)
os.makedirs('logs', exist_ok=True)

# Save the model whenever the validation balanced accuracy improves.
# mode='max' states explicitly that a higher value of this custom metric is better.
checkpoint_balanced_acc = ModelCheckpoint(
    filepath='saved_models/vanilla_best_balanced_acc.hdf5',
    monitor='val_balanced_accuracy',
    mode='max',
    verbose=1,
    save_best_only=True)

# Save the model whenever the validation loss improves.
checkpoint_loss = ModelCheckpoint(
    filepath='saved_models/vanilla_best_loss.hdf5',
    monitor='val_loss',
    verbose=1,
    save_best_only=True)

# Reduce learning rate when the validation loss has stopped improving.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, min_lr=1e-5, verbose=1)

# Stop training when the validation loss has stopped improving.
early_stop = EarlyStopping(monitor='val_loss', patience=22, verbose=1)

# Callback that streams epoch results to a csv file.
# append=True: re-running this cell adds rows to an existing log instead of overwriting it.
csv_logger = CSVLogger('logs/vanilla.training.csv', append=True)

epoch_num = 100

# Integer division: a final partial batch is not counted in the steps per epoch.
history = model.fit_generator(
    generator_train,
    class_weight=class_weight_dict,
    max_queue_size=10,
    workers=2,
    use_multiprocessing=False,
    steps_per_epoch=sample_count_train//batch_size,
    epochs=epoch_num,
    verbose=1,
    callbacks=[checkpoint_balanced_acc, checkpoint_loss,
               reduce_lr, early_stop, csv_logger],
    validation_data=generator_val,
    validation_steps=sample_count_val//batch_size)

### Model Complexity Graph

In [None]:
# NOTE(review): wildcard import - only plot_complexity_graph is used in this cell;
# consider importing it by name once it is confirmed nothing else from `visuals` is needed.
from visuals import *

# Pass the CSV written by the CSVLogger callback of the vanilla training run
# (presumably plotted as per-epoch training/validation curves - confirm in visuals.py).
plot_complexity_graph('logs/vanilla.training.csv')

### Load the Model with the Best Balanced Accuracy

In [None]:
from keras.models import load_model

# Reload the checkpoint saved for the best validation balanced accuracy.
# The custom metric function is supplied through custom_objects under its own name.
vanilla_model = load_model(
    'saved_models/vanilla_best_balanced_acc.hdf5',
    custom_objects=dict(balanced_accuracy=balanced_accuracy),
)
# vanilla_model.summary()

### Classify Dermoscopic Images with the Vanilla CNN

In [None]:
import random

def vanilla_classify(img_path, topk=5, rescale=1./255):
    """Classify one dermoscopic image with the vanilla CNN.

    Args:
        img_path: path of the image file to classify.
        topk: number of highest-probability categories to return.
        rescale: factor applied to the loaded pixel values before prediction.
            The default matches the `rescale=1./255` given to the training and
            validation generators of the vanilla CNN; without it the model would
            receive raw 0-255 values it was not trained on.

    Returns:
        Tuple `(idx_topk, names, probs)`: category indices sorted by decreasing
        predicted probability, the matching category names, and the matching
        probabilities.
    """
    # Single-image batch, scaled the same way as the generator output
    batch = path_to_tensor(img_path) * rescale
    # The batch holds one image, so row 0 is its probability vector
    predicted_vector = vanilla_model.predict(batch)[0]
    # Negate so that argsort orders from highest to lowest probability
    idx_topk = np.argsort(-predicted_vector)[:topk]
    probs = predicted_vector[idx_topk]
    names = [category_names[idx] for idx in idx_topk]

    return idx_topk, names, probs

# Classify one randomly picked validation image and show its class probabilities
topk = 8
random_position = random.randrange(len(df_val))
df_row = df_val.iloc[random_position]
idx_topk, names, probs = vanilla_classify(df_row['path'], topk=topk)
# print(probs)

# Figure with two panels; the image id is the overall title
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 4))
fig.patch.set_facecolor('white')
fig.suptitle(df_row['image'])

# Left panel: the input image, titled with its ground-truth category
ax1.imshow(plt.imread(df_row['path']))
ax1.set_title(category_names[df_row['category']])

# Right panel: horizontal bar chart of the top-k probabilities, most likely class on top
bar_positions = np.arange(topk)
ax2.barh(bar_positions, probs)
ax2.set_title("Top {0} probabilities".format(topk))
ax2.set_aspect(0.1)
ax2.set_yticks(bar_positions)
ax2.set_yticklabels(names, size='medium')
ax2.yaxis.tick_right()
ax2.set_xlim(0, 1.0)
ax2.invert_yaxis()

## Transfer Learning

### Image Augmentation Pipeline for Transfer Learning

In [None]:
from transfer_learning import build_aug_pipeline

# Build the training / validation augmentation pipelines for transfer learning.
# Dataframe-based arguments, currently disabled:
#     train_dataframe=df_train, val_dataframe=df_val,
#     image_col='path', category_col='category',
p_train, p_val = build_aug_pipeline(input_size=(224, 224), seed=seed)

# Print a summary of each pipeline under its heading
for heading, aug_pipeline in (('Training Augmentation:', p_train),
                              ('Validation Augmentation:', p_val)):
    print(heading)
    aug_pipeline.status()

In [None]:
from transfer_learning import build_finetune_model
from keras.applications.densenet import DenseNet201, preprocess_input as preprocess_input_densenet

# Settings for the DenseNet201 transfer-learning run.
# ImageIterator, np_utils, K and Adam are not imported in this cell; they come from
# the earlier vanilla-CNN cells, which must have been run first.
batch_size = 40
lr_start = 1e-3 # Starting learning rate
# DenseNet's own input preprocessing is passed as preprocessing_function here,
# where the vanilla generators pass rescale=1./255 instead
preprocessing_func = preprocess_input_densenet
data_format = K.image_data_format()

# Training batches: image paths plus one-hot labels, augmented by p_train
generator_train = ImageIterator(
    image_paths=df_train['path'].tolist(),
    labels=np_utils.to_categorical(df_train['category'], num_classes=known_category_num),
    augmentation_pipeline=p_train,
    batch_size=batch_size,
    shuffle=True,
    seed=seed,
    preprocessing_function=preprocessing_func,
    data_format=data_format
)

# Validation batches, processed by p_val.
# NOTE(review): shuffle=True is also set for validation - confirm this is intended.
generator_val = ImageIterator(
    image_paths=df_val['path'].tolist(),
    labels=np_utils.to_categorical(df_val['category'], num_classes=known_category_num),
    augmentation_pipeline=p_val,
    batch_size=batch_size,
    shuffle=True,
    seed=seed,
    preprocessing_function=preprocessing_func,
    data_format=data_format
)

# ImageNet-pretrained DenseNet201 without its classification top
base_model = DenseNet201(include_top=False, weights='imagenet')
# base_model_name is used later to name the checkpoint and log files
base_model_name = 'DenseNet201'
# Arguments for build_finetune_model (presumably the sizes of the fully connected
# layers added on top and their dropout rate - confirm in transfer_learning.py)
fc_layers = [512]
dropout = 0.3

# base_model_layers_trainable=False: presumably freezes the pretrained layers so only
# the new head is trained - confirm in transfer_learning.py
densenet_model = build_finetune_model(
    base_model,
    fc_layers=fc_layers,
    dropout=dropout,
    num_classes=known_category_num,
    base_model_layers_trainable=False)

densenet_model.summary()

# Compile the model with the same optimizer, loss and metrics as the vanilla CNN
densenet_model.compile(optimizer=Adam(lr=lr_start), loss='categorical_crossentropy', metrics=[balanced_accuracy, 'accuracy'])

In [None]:
# # Print some info for debugging
# images, labels = next(generator_train)
# print(images.shape)
# # print(images[0])
# # plt.imshow(images[0]);

### Training

In [None]:
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping, CSVLogger

# Output folders for checkpoints and training logs. exist_ok=True replaces the
# check-then-create pattern and keeps the cell safe to re-run.
os.makedirs('saved_models', exist_ok=True)
os.makedirs('logs', exist_ok=True)

# Save the model whenever the validation balanced accuracy improves.
# mode='max' states explicitly that a higher value of this custom metric is better.
checkpoint_balanced_acc = ModelCheckpoint(
    filepath='saved_models/{}_best_balanced_acc.hdf5'.format(base_model_name),
    monitor='val_balanced_accuracy',
    mode='max',
    verbose=1,
    save_best_only=True)

# Save the model whenever the validation loss improves.
checkpoint_loss = ModelCheckpoint(
    filepath='saved_models/{}_best_loss.hdf5'.format(base_model_name),
    monitor='val_loss',
    verbose=1,
    save_best_only=True)

# Reduce learning rate when the validation loss has stopped improving.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, min_lr=1e-5, verbose=1)

# Stop training when the validation loss has stopped improving.
early_stop = EarlyStopping(monitor='val_loss', patience=22, verbose=1)

# Callback that streams epoch results to a csv file.
# append=True: re-running this cell adds rows to an existing log instead of overwriting it.
csv_logger = CSVLogger('logs/{}.training.csv'.format(base_model_name), append=True)

epoch_num = 125

# Integer division: a final partial batch is not counted in the steps per epoch.
history = densenet_model.fit_generator(
    generator_train,
    class_weight=class_weight_dict,
    max_queue_size=10,
    workers=2,
    use_multiprocessing=False,
    steps_per_epoch=sample_count_train//batch_size,
    epochs=epoch_num,
    verbose=1,
    callbacks=[checkpoint_balanced_acc, checkpoint_loss,
               reduce_lr, early_stop, csv_logger],
    validation_data=generator_val,
    validation_steps=sample_count_val//batch_size)

In [None]:
# NOTE(review): wildcard import - only plot_complexity_graph is used in this cell;
# consider importing it by name once it is confirmed nothing else from `visuals` is needed.
from visuals import *

# Pass the CSV written by the CSVLogger callback of the transfer-learning run
# (the file name is derived from base_model_name).
plot_complexity_graph('logs/{}.training.csv'.format(base_model_name))