In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import roc_curve, auc, roc_auc_score, recall_score
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense,GlobalAveragePooling2D, Flatten
from keras.applications.vgg16 import VGG16
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, CSVLogger

# function to filter for only desired categories
def get_categories(df, col, categories):
    return df[df[col].isin(categories)]

# function to change to binary label is cancer/not cancer
def cancer_labeling(dx):
    if dx == "akiec" or dx == 'mel' or dx == 'bcc':
        return 'cancer'
    else:
        return 'not cancer'
    
# function to change to binary label is melanoma/not melanoma
def melanoma_labeling(dx):
    if dx == 'mel':
        return '1-melanoma'
    else:
        return '0-not-melanoma'

In [61]:
meta = pd.read_csv('skin-cancer-mnist-ham10000/HAM10000_metadata.csv')

# filter images dataset for desired categories
categories = ['bkl', 'nv', 'mel']
meta_filtered = get_categories(meta, 'dx', categories)

# append jpg to image id
meta_filtered['image_id'] = meta_filtered['image_id'].transform(lambda x: x + '.jpg')

meta_filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419.jpg,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030.jpg,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769.jpg,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661.jpg,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633.jpg,bkl,histo,75.0,male,ear


In [62]:
# Get subset
meta_filtered.sort_values('dx')

mel = meta_filtered[meta_filtered['dx'] == 'mel'].sample(500)
bkl = meta_filtered[meta_filtered['dx'] == 'bkl'].sample(500)
nv = meta_filtered[meta_filtered['dx'] == 'nv'].sample(1000)

df = mel.append([bkl, nv])

df.groupby('dx').count()

Unnamed: 0_level_0,lesion_id,image_id,dx_type,age,sex,localization
dx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bkl,10,10,10,10,10,10
mel,10,10,10,10,10,10
nv,20,20,20,20,20,20


In [63]:
# Oversample mel to balance classes
# try with class weights, different oversampling rates
# df = df.append(mel)
# df.groupby('dx').count()

# try without class weights

In [64]:
# split data into train, test, validate
X_train, X_test, y_train, y_test = train_test_split(
    df['image_id'], 
    df['dx'], 
    test_size=0.2, 
    random_state=1, 
    stratify=df['dx']
)

# X_train, X_val, y_train, y_val = train_test_split(
#     X_train, 
#     y_train, 
#     test_size=0.2, 
#     random_state=1,
#     stratify=y_train
# )

train = pd.DataFrame(X_train)
train['label'] = y_train.apply(lambda x: melanoma_labeling(x))
test = pd.DataFrame(X_test)
test['label'] = y_test.apply(lambda x: melanoma_labeling(x))

print(train.groupby('label').count())
print(test.groupby('label').count())


                image_id
label                   
0-not-melanoma        24
1-melanoma             8
                image_id
label                   
0-not-melanoma         6
1-melanoma             2


In [65]:
# Create image generator classes
train_datagen = ImageDataGenerator(
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    rotation_range=45,
    horizontal_flip=True,
    vertical_flip=True
)
test_datagen = ImageDataGenerator(rescale=1./255)

# Import image data
img_dir = 'skin-cancer-mnist-ham10000/HAM10000/'
img_resize = (128, 128)

train_generator = train_datagen.flow_from_dataframe(
    train, 
    directory=img_dir, 
    x_col='image_id', 
    y_col='label', 
    target_size=img_resize,
    class_mode='binary',
    batch_size=25,
    drop_duplicates=False
)

# val_generator = train_datagen.flow_from_dataframe(
#     val, 
#     directory=img_dir, 
#     x_col='image_id', 
#     y_col='melanoma', 
#     target_size=img_resize,
#     class_mode='binary',
#     batch_size=25
# )

test_generator = test_datagen.flow_from_dataframe(
    test, 
    directory=img_dir, 
    x_col='image_id', 
    y_col='label', 
    target_size=img_resize,
    class_mode='binary',
    batch_size=25,
    drop_duplicates=False
)

train_steps = train_generator.n//train_generator.batch_size
test_steps = test_generator.n//test_generator.batch_size

# print(train_generator.class_indices)
# print(test_generator.class_indices)
# print(val_generator.class_indices)


Found 32 images belonging to 2 classes.
Found 8 images belonging to 2 classes.


In [66]:
# Save model checkpoints
checkpointer = ModelCheckpoint('saved_model/VGG16.hdf5', verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='acc', 
                              factor=0.5, 
                              patience=2, 
                              verbose=1, 
                              mode='max', 
                              min_lr=0.00002)

class_weights = class_weight.compute_class_weight(
           'balanced',
            np.unique(train_generator.classes), 
            train_generator.classes)
class_weights

array([0.66666667, 2.        ])

In [67]:
def build_model(optimizer, unfrozen_layers, epochs):
    # Build base model
    base_model = VGG16(weights='imagenet',include_top=False, input_shape=(128,128,3))
    
    for layer in base_model.layers:
        layer.trainable = False
    if unfrozen_layers == None:
        base_model.trainable = False
    else:
        base_model.trainable = True
        for layer in unfrozen_layers:
            base_model.layers[layer].trainable = True
            print(base_model.layers[layer].name, base_model.layers[layer].trainable)
            
    # Visusalize which layers are trainable         
#     layers = [(layer, layer.name, layer.trainable) for layer in base_model.layers]
#     layers_df = pd.DataFrame(layers, columns=['Layer', 'Layer Name', 'Layer Trainable'])
#     with open('base_model_layers.csv', 'a'):
#             layers_df.to_csv('base_model_layers.csv', mode='a')
            
    # Build model
    model = Sequential()
    model.add(base_model)
    model.add(GlobalAveragePooling2D())
    # Add dense layers so that the model can learn more complex functions and classify for better results.
    model.add(Dense(1024,activation='relu'))
    model.add(Dense(512,activation='relu'))
    model.add(Dense(256,activation='relu'))

    # Final layer with sigmoid activation
    model.add(Dense(1,activation='sigmoid'))

    print(model.summary())

    # Train Model
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'],)
    history = model.fit_generator(
        generator=train_generator,
        steps_per_epoch=train_generator.batch_size,
        epochs=epochs,
        verbose=1,
        callbacks=[checkpointer, reduce_lr],
        class_weight=class_weights
    )

    # Make a prediction
    predictions = model.predict_generator(test_generator, steps=test_steps, verbose=1)
    
    return predictions, history


In [68]:
def scan_optimizers(optimizers):
    for optimizer in optimizers:
        print(optimizer)
        # Import base model
        predictions, model_history = build_model(optimizer, unfrozen_layers, epochs)
#         predictions_classes = [0 if x >= 0.5 else 1 for x in predictions.flatten()]
        
        # Measure ROC/AUC
        # fpr = false positive rate; tpr = true positive rate; at various thresholds
        fpr, tpr, thresholds = roc_curve(test_generator.classes, predictions) 
        roc_auc = roc_auc_score(test_generator.classes, predictions)

        # Plot ROC/AUC
        plt.plot([0, 1], [0, 1], 'k--')
        plt.plot(fpr, tpr, label='area = {:.3f}'.format(roc_auc))
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.title('ROC curve')
        plt.legend(loc='best')
        plt.show()

        # Write to csv file
        history = pd.DataFrame.from_dict(model_history.history)
        history['auc'] = roc_auc
        history['epochs'] = epochs
        history['class_weights'] = weights_on
        history['adjusted_lr'] = adjusted_lr_on
        history['optimizer'] = optimizer
        history['pretrain'] = pretrain_on
        history['oversample'] = oversample
        history['unfrozen_layers'] = unfrozen_layers
        history.head

        with open('vgg16_log.csv', 'a'):
            history.to_csv('vgg16_log.csv', mode='a')


In [69]:
# Iterate through layers, unfreezing them for training

def scan_layers(unfrozen_layers, ref_layers):
    # Import base model
    predictions, model_history = build_model(optimizer, unfrozen_layers, epochs)        

#         # Import base model
#         base_model = VGG16(weights='imagenet',include_top=False, input_shape=(128,128,3))

#         layers = [(layer, layer.name, layer.trainable) for layer in base_model.layers]
#         layers_df = pd.DataFrame(layers, columns=['Layer', 'Layer Name', 'Layer Trainable'])
#         layers_df.head()

#         predictions, model_history = build_model(base_model, optimizer, [i], epochs)
#         predictions_classes = [0 if x >= 0.5 else 1 for x in predictions.flatten()]

    # Measure ROC/AUC
    # fpr = false positive rate; tpr = true positive rate; at various thresholds
    fpr, tpr, thresholds = roc_curve(test_generator.classes, predictions) 
    roc_auc = roc_auc_score(test_generator.classes, predictions)

    # Plot ROC/AUC
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr, label='area = {:.3f}'.format(roc_auc))
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

    # Write to csv file
    history = pd.DataFrame.from_dict(model_history.history)
    history['auc'] = roc_auc
    history['epochs'] = epochs
    history['class_weights'] = weights_on
    history['adjusted_lr'] = adjusted_lr_on
    history['optimizer'] = optimizer
    history['pretrain'] = pretrain_on
    history['oversample'] = oversample
    layer_names = []
    for layer in unfrozen_layer:
        layer_names.append(ref_layers[layer].name)
        history['unfrozen_layer'] = layer_names
    history.head

    with open('vgg16_log.csv', 'a'):
        history.to_csv('vgg16_log.csv', mode='a')



In [70]:
# DONE Iterate through optimizers

# Varaints
# epochs = 10
# weights_on = False
# adjusted_lr_on = True
# pretrain_on = False
# oversample = False
# optimizer = ''
# unfrozen_layers = None

#optimizers =['SGD', 'RMSProp', 'Adagrad', 'Adadelta', 'Adam', 'Nadam']
# scan_optimizers(optimizers)

# Iterate through LAYERS
ref = VGG16(weights='imagenet',include_top=False, input_shape=(128,128,3))
LAYERS = len(ref.layers)
REF_LAYERS = ref.layers

# Varaints
epochs = 10
weights_on = False
adjusted_lr_on = True
pretrain_on = False
oversample = False
optimizer = 'RMSProp'
unfrozen_layers = ''

for i in range(1, LAYERS):
    unfrozen_layers = [i]
    scan_layers(unfrozen_layers, REF_LAYERS)



block1_conv1 True
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Model)                (None, 4, 4, 512)         14714688  
_________________________________________________________________
global_average_pooling2d_12  (None, 512)               0         
_________________________________________________________________
dense_45 (Dense)             (None, 1024)              525312    
_________________________________________________________________
dense_46 (Dense)             (None, 512)               524800    
_________________________________________________________________
dense_47 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_48 (Dense)             (None, 1)                 257       
Total params: 15,896,385
Trainable params: 1,183,489
Non-trainable params: 14,712,896
______________________________________

KeyboardInterrupt: 

In [30]:
test.summary()
test.layers

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 128, 128, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 128, 128, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 128, 128, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 64, 64, 64)        0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 64, 64, 128)       73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 64, 64, 128)       147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 32, 32, 128)       0         
__________

[<keras.engine.input_layer.InputLayer at 0x1a26125668>,
 <keras.layers.convolutional.Conv2D at 0x1a29a2db00>,
 <keras.layers.convolutional.Conv2D at 0x1a29a2df28>,
 <keras.layers.pooling.MaxPooling2D at 0x1a29a3c4a8>,
 <keras.layers.convolutional.Conv2D at 0x1a29a3cf98>,
 <keras.layers.convolutional.Conv2D at 0x1a26182438>,
 <keras.layers.pooling.MaxPooling2D at 0x1a2619f6a0>,
 <keras.layers.convolutional.Conv2D at 0x1a2619f518>,
 <keras.layers.convolutional.Conv2D at 0x1a31e64e48>,
 <keras.layers.convolutional.Conv2D at 0x1a31e92080>,
 <keras.layers.pooling.MaxPooling2D at 0x1a31f446a0>,
 <keras.layers.convolutional.Conv2D at 0x1a31f44518>,
 <keras.layers.convolutional.Conv2D at 0x1a31f62e48>,
 <keras.layers.convolutional.Conv2D at 0x1a31f92080>,
 <keras.layers.pooling.MaxPooling2D at 0x1a31fad6a0>,
 <keras.layers.convolutional.Conv2D at 0x1a31fad518>,
 <keras.layers.convolutional.Conv2D at 0x1a31fc9e48>,
 <keras.layers.convolutional.Conv2D at 0x1a31ff9080>,
 <keras.layers.pooling.Max

In [None]:
## Function for model tuning
# Unfreeze different layers
# Optimization functions
# Adjusted learning rate/not
# Write to csv file