# Skin Lesion Classifier

## Google Colab

In [None]:
# Run this cell to mount Google Drive for Colab
import os

try:
    from google.colab import drive
except ImportError:
    # google.colab cannot be imported here, so there is no Drive to mount;
    # the working directory is left untouched and the rest of the notebook can run.
    drive = None
    print('google.colab is not available - skipping Google Drive mount.')

if drive is not None:
    drive.mount('/content/drive/')
    # !ls '/content/drive/My Drive/Colab Notebooks'

    # Work from the project folder stored on Google Drive
    os.chdir('/content/drive/My Drive/Colab Notebooks/isic-2019')

In [None]:
# Optional (Colab): copy the ISIC 2019 image archive and ground-truth CSV from
# Google Drive to /home and unzip the images there. Uncomment the lines below to run.
# !cp '/content/drive/My Drive/Colab Notebooks/ISIC_2019_Training_Input.zip' '/home/ISIC_2019_Training_Input.zip'
# !cp '/content/drive/My Drive/Colab Notebooks/ISIC_2019_Training_GroundTruth.csv' '/home/ISIC_2019_Training_GroundTruth.csv'
# !unzip -qq '/home/ISIC_2019_Training_Input.zip' -d '/home'

In [None]:
# Optional: uninstall Pillow and build Pillow-SIMD from source with AVX2 enabled.
# Uncomment the two commands below to run.
# # Ref https://docs.fast.ai/performance.html
# !pip uninstall -y pillow pil jpeg libtiff libjpeg-turbo
# !CFLAGS="${CFLAGS} -mavx2" pip install --upgrade --no-cache-dir --force-reinstall --no-binary :all: --compile pillow-simd

## Environment

### Install Python Packages

In [None]:
# Optional: uncomment to install the packages listed in requirements.txt.
# !pip3 install -r requirements.txt

### Check whether you’re running Pillow or Pillow-SIMD?

In [None]:
# According to the author, if PILLOW_VERSION has a postfix, it is Pillow-SIMD0.
# (Assuming that Pillow will never make a .postX release).
!python -c "from PIL import Image; print(Image.PILLOW_VERSION)"

### Check whether Pillow or Pillow-SIMD is using libjpeg-turbo

In [None]:
import PIL
from PIL import features, Image
from packaging import version

# Read the version from PIL.__version__: Image.PILLOW_VERSION was removed in
# Pillow 7.0, and the name `Image` is rebound by a later cell of this notebook.
pillow_version = PIL.__version__

# The 'libjpeg_turbo' feature flag can only be queried from Pillow(-SIMD) 5.4.0 onwards.
if version.parse(pillow_version) >= version.parse("5.4.0"):
    if features.check_feature('libjpeg_turbo'):
        print("libjpeg-turbo is on")
    else:
        print("libjpeg-turbo is not on")
else:
    print("libjpeg-turbo' status can't be derived - need Pillow(-SIMD)? >= 5.4.0 to tell, current version {}".format(pillow_version))

### Confirm TensorFlow can see the GPU

In [None]:
import tensorflow as tf

# Abort early unless TensorFlow reports the first GPU under its expected device name.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print("Found GPU at: {}".format(device_name))
else:
    raise SystemError('GPU device not found')

### System Information

In [None]:
import tensorflow as tf
import platform
from tensorflow.python.client import device_lib

!python3 --version

print('\nTensorFlow Version: ', tf.VERSION)

print('\nNVIDIA:')
!nvcc --version
# !nvidia-smi

print('\nCPU:')
!lscpu

print('\nMemory:')
!cat /proc/meminfo

print('\nOS:')
print(platform.platform())

print('\nDevices:')
print(device_lib.list_local_devices())

## Import Training Data

In [None]:
import pandas as pd
import numpy as np
import os
from collections import Counter
import matplotlib.pyplot as plt
from data import load_isic_data
%load_ext autoreload
%autoreload 2
%matplotlib inline

# dermoscopic images folder path
data_folder = 'C:\ISIC_2019'
# data_folder = '/home'
# data_folder = '/home/jupyter'
derm_image_folder = os.path.join(data_folder, 'ISIC_2019_Training_Input')
ground_truth_file = os.path.join(data_folder, 'ISIC_2019_Training_GroundTruth.csv')

df_ground_truth, category_names = load_isic_data(derm_image_folder, ground_truth_file)
known_category_num = len(category_names)
print("Number of known categories: {}".format(known_category_num))
print(category_names, '\n')

# mapping from category to index
print('Category to Index:')
category_to_index = dict((c, i) for i, c in enumerate(category_names))
print(category_to_index, '\n')

count_per_category = Counter(df_ground_truth['category'])
total_sample_count = sum(count_per_category.values())
print("Original training data has {} samples.".format(total_sample_count))

for i, c in enumerate(category_names):
    print("'%s':\t%d\t(%.2f%%)" % (c, count_per_category[i], count_per_category[i]*100/total_sample_count))

# fig = plt.bar(count_per_category.keys(), count_per_category.values())
fig = plt.bar(category_names, [count_per_category[i] for i in range(len(category_names))])

df_ground_truth.head()

### Shuffle and Split Original Training Data into Training and Validation Sets

In [None]:
from data import train_validation_split

df_train, df_val = train_validation_split(df_ground_truth)


def _print_category_breakdown(counts, total):
    """Print one line per known category: its name, sample count and share of `total`."""
    for index, name in enumerate(category_names):
        n = counts[index]
        print("'%s':\t%d\t(%.2f%%)" % (name, n, n*100/total))


# Training split
sample_count_train = df_train.shape[0]
print("Training set has {} samples.".format(sample_count_train))
count_per_category_train = Counter(df_train['category'])
_print_category_breakdown(count_per_category_train, sample_count_train)

# Validation split
sample_count_val = df_val.shape[0]
print("\nValidation set has {} samples.".format(sample_count_val))
count_per_category_val = Counter(df_val['category'])
_print_category_breakdown(count_per_category_val, sample_count_val)

### Class Weights based on the Training Set

In [None]:
from data import compute_class_weight_dict

# Class weights are computed from the training split only.
class_weight_dict = compute_class_weight_dict(df_train)
print('Class Weights Dictionary:', class_weight_dict, sep='\n')

### Samples of each Category

In [None]:
from IPython.display import Image

category_groups = df_train.groupby('category')

# Number of samples for each category
num_per_category = 3

# One row per category, one column per sample.
# squeeze=False keeps `axes` two-dimensional even with a single row or column,
# so axes[row, col] indexing below always works.
fig, axes = plt.subplots(nrows=known_category_num, ncols=num_per_category,
                         figsize=(9, 24), squeeze=False)
plt.setp(fig.get_axes(), xticks=[], yticks=[])
fig.patch.set_facecolor('white')

for idx, val in enumerate(category_names):
    samples = category_groups.get_group(idx).head(num_per_category)
    for col, (_, row) in enumerate(samples.iterrows()):
        ax = axes[idx, col]
        ax.imshow(plt.imread(row['path']))
        ax.set_xlabel(row['image'])
        # Label each row once, on its left-most panel. The column index is used
        # instead of Axes.is_first_col(), which was removed in Matplotlib 3.6.
        if col == 0:
            ax.set_ylabel(val, fontsize=20)
            ax.yaxis.label.set_color('blue')

fig.tight_layout()

## Create a Vanilla CNN as Benchmark Model

### Train the Vanilla CNN

In [None]:
# Shared hyper-parameters for training and evaluation
input_size = (224, 224)   # (width, height) passed to the resize step
batch_size = 40
rescale = 1./255          # scaling factor handed to the image iterator
# os.cpu_count() may return None when the count cannot be determined; fall back to 1
workers = os.cpu_count() or 1
epoch_num = 125

In [None]:
from main import train_vanilla

# Train the benchmark CNN on the training split, passing the validation split,
# the class weights and the shared batch size / epoch count.
train_vanilla(
    df_train,
    df_val,
    known_category_num,
    class_weight_dict,
    batch_size,
    epoch_num,
)

### Model Complexity Graph

In [None]:
from visuals import *

# The original cell read `classifier.model_name`, but no `classifier` object is
# created anywhere in this notebook, so a fresh kernel raised NameError here.
# NOTE(review): 'vanilla' is inferred from the checkpoint name
# 'saved_models/vanilla_best_balanced_acc.hdf5' - confirm it matches the log file name.
vanilla_model_name = 'vanilla'
plot_complexity_graph("logs/{}.training.csv".format(vanilla_model_name))

### Load Model with Best Balanced Accuracy

In [None]:
from keras.models import load_model
from metrics import balanced_accuracy

# Checkpoint of the vanilla CNN with the best balanced accuracy
best_vanilla_checkpoint = 'saved_models/vanilla_best_balanced_acc.hdf5'
# The project's balanced_accuracy metric is handed to Keras by name via custom_objects
custom_metrics = {'balanced_accuracy': balanced_accuracy}

model = load_model(filepath=best_vanilla_checkpoint, custom_objects=custom_metrics)
# model.summary()

### Compute Balanced Accuracy on all Validation Samples

In [None]:
import os
from sklearn.metrics import balanced_accuracy_score, recall_score
from keras import backend as K
from keras.utils import np_utils
from Augmentor import Pipeline
from image_iterator import ImageIterator

# Validation pipeline: the only operation is a resize to the model's input size
target_width, target_height = input_size
p_val = Pipeline()
p_val.resize(probability=1, width=target_width, height=target_height)

# Image paths and one-hot labels of the whole validation split, in DataFrame order
val_image_paths = df_val['path'].tolist()
val_labels_onehot = np_utils.to_categorical(df_val['category'], num_classes=known_category_num)

generator = ImageIterator(
    image_paths=val_image_paths,
    labels=val_labels_onehot,
    augmentation_pipeline=p_val,
    batch_size=batch_size,
    shuffle=False,  # shuffle must be False otherwise will get a wrong balanced accuracy
    rescale=rescale,
    pregen_augmented_images=False,  # Only 1 epoch.
    data_format=K.image_data_format()
)

# print(len(generator))
predicted_vector = model.predict_generator(generator, verbose=0, workers=workers)

# Compare the arg-max class of every prediction with the category index from df_val
y_pred = np.argmax(predicted_vector, axis=1)
y_true = df_val['category'].values

print('balanced_accuracy_score: ', balanced_accuracy_score(y_true, y_pred))
# print('macro recall_score: ', recall_score(y_true, y_pred, average='macro'))

### Classify Dermoscopic Images with the Vanilla CNN

In [None]:
import random
from utils import path_to_tensor

def vanilla_classify(img_path, topk=5):
    """Classify one image with the loaded `model` and return its `topk` best guesses.

    Args:
        img_path: path of the image; it is converted with `path_to_tensor`.
        topk: number of highest-scoring categories to return.

    Returns:
        A tuple `(idx_topk, names, probs)`: the category indices sorted by
        descending score, their names from `category_names`, and their scores.
    """
    scores = model.predict(path_to_tensor(img_path))
    # Negating the scores makes argsort rank them from highest to lowest
    ranking = np.argsort(-scores)
    best_indices = ranking[0, :topk]
    best_scores = np.take(scores, best_indices)
    best_names = [category_names[position] for position in best_indices]

    return best_indices, best_names, best_scores

# Rank every known category for one randomly chosen validation sample
topk = known_category_num
random_position = random.randrange(len(df_val.index))
df_row = df_val.iloc[random_position]
idx_topk, names, probs = vanilla_classify(df_row['path'], topk=topk)
# print(probs)

# Two panels side by side: the image on the left, the probabilities on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
fig.patch.set_facecolor('white')

# The figure title is the sample's image identifier
fig.suptitle(df_row['image'])

# Left panel: the input image, titled with the category name from df_val
ax1.set_title(category_names[df_row['category']])
ax1.imshow(plt.imread(df_row['path']))

# Right panel: horizontal bars, highest probability at the top
bar_positions = np.arange(topk)
ax2.set_title("Top {0} probabilities".format(topk))
ax2.barh(bar_positions, probs)
ax2.set_aspect(0.1)
ax2.set_yticks(bar_positions)
ax2.set_yticklabels(names, size='medium')
ax2.yaxis.tick_right()
ax2.set_xlim(0, 1.0)
ax2.invert_yaxis()

## Transfer Learning

### Train Models by Transfer Learning

In [None]:
from main import train_transfer_learning

# Train the transfer-learning models with the same splits, class weights,
# batch size and epoch count as the vanilla benchmark.
train_transfer_learning(
    df_train,
    df_val,
    known_category_num,
    class_weight_dict,
    batch_size,
    epoch_num,
)

### Complexity Graph of Transfer Learning Models

In [None]:
from visuals import *

# Plot one complexity graph per model, each read from that model's training log
model_names = ['DenseNet201', 'Xception', 'NASNetLarge']
for model_name in model_names:
    training_log = "logs/{}.training.csv".format(model_name)
    plot_complexity_graph(training_log)