<a href="https://colab.research.google.com/github/richmondvan/melanoma-detection/blob/master/process.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SETUP

**This cell is mandatory!**

- Imports modules

- Mounts Google Drive

- Sets up constants

In [0]:
# Must be run every time!
import numpy as np
import tensorflow as tf
import pathlib
from tensorflow.keras import models, layers, losses
import math
from google.colab import drive 
import os

# Mount Google Drive; every path below lives under this mount point.
drive.mount('/content/gdrive')

# Dataset folder on Drive, and the downloaded archive stored inside it.
DIRECTORY_PATH = "/content/gdrive/My Drive/Dataset/"
DATASET_FILEPATH = DIRECTORY_PATH + "dataset.zip"

In [0]:
# Report which TensorFlow build this runtime is using.
print(tf.version.VERSION)

In [0]:
!pip install tf-nightly

One-time: Download the dataset archive from the ISIC Archive

In [0]:
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` has been loaded.
import urllib.request

# ISIC Archive download query: images from the listed dataset ids that are
# labelled benign or malignant and tagged "Challenge 2019: Training".
ISIC_DOWNLOAD_URL = "https://isic-archive.com/api/v1/image/download?include=all&filter={%22operator%22:%22and%22,%22operands%22:[{%22operator%22:%22in%22,%22operands%22:[{%22identifier%22:%22meta.datasetId%22,%22type%22:%22objectid%22},[%225a2ecc5e1165975c945942a4%22,%225a2ecc5e1165975c945942a2%22,%225a2ecc5d1165975c94594292%22,%225a2ecc5d1165975c9459428e%22,%225a2ecc5d1165975c94594284%22,%225aaf6f2a116597691367292e%22,%225a2ecc5d1165975c9459427e%22,%225a2ecc5d1165975c9459428a%22]]},{%22operator%22:%22and%22,%22operands%22:[{%22operator%22:%22in%22,%22operands%22:[{%22identifier%22:%22meta.clinical.benign_malignant%22,%22type%22:%22string%22},[%22benign%22,%22malignant%22]]},{%22operator%22:%22in%22,%22operands%22:[{%22identifier%22:%22meta.tags%22,%22type%22:%22string%22},[%22Challenge%202019:%20Training%22]]}]}]}"

# Save the archive to Drive so it only has to be downloaded once.
urllib.request.urlretrieve(ISIC_DOWNLOAD_URL, DATASET_FILEPATH)

One-time: Extract data from archive

In [0]:
from zipfile import ZipFile

# Unpack every member of the downloaded archive into the dataset directory.
with ZipFile(DATASET_FILEPATH) as archive:
    archive.extractall(path=DIRECTORY_PATH)

# More Setup

One-time: Converts CSV file into a dictionary for classification lookups

In [0]:
import csv

METADATA_PATH = DIRECTORY_PATH + "ISIC-images/metadata.csv"

# Build a lookup from image name to its benign/malignant label.
# newline='' is what the csv module asks for, so that it can handle line
# endings (including newlines embedded in quoted fields) itself.
with open(METADATA_PATH, mode='r', newline='') as infile:
    reader = csv.DictReader(infile)
    GROUND_TRUTH_DICT = {row['name']: row['meta.clinical.benign_malignant'] for row in reader}

# Show a short summary rather than dumping every entry into the cell output.
print(len(GROUND_TRUTH_DICT), "labels loaded; first entries:",
      list(GROUND_TRUTH_DICT.items())[:5])

One-time: Sort images into proper directories

In [0]:
import shutil
import os

IMAGE_PATH = pathlib.Path(DIRECTORY_PATH + "ISIC-images/")
BENIGN_PATH = "/content/gdrive/My Drive/Dataset/DatasetSorted/benign/"
MALIGNANT_PATH = "/content/gdrive/My Drive/Dataset/DatasetSorted/malignant/"

# Destination folder for each ground-truth label.
LABEL_DIRECTORIES = {"benign": BENIGN_PATH, "malignant": MALIGNANT_PATH}

# Snapshot the listing before moving anything so the directory is not being
# modified while it is still being iterated.
pathlist = list(IMAGE_PATH.glob("*/*.jpg"))

# shutil.move needs the destination folders to exist already.
if pathlist:
    for label_directory in LABEL_DIRECTORIES.values():
        os.makedirs(label_directory, exist_ok=True)

for path in pathlist:
    # path.stem removes only the ".jpg" suffix. str.strip(".jpg") would instead
    # strip any of the characters '.', 'j', 'p', 'g' from BOTH ends of the name.
    key = GROUND_TRUTH_DICT.get(path.stem)
    label_directory = LABEL_DIRECTORIES.get(key)
    if label_directory is None:
        # Unknown or missing label: leave the file where it is instead of
        # moving it to whatever destination the previous iteration used.
        print("error: no benign/malignant label for", path.name, "- skipped")
        continue

    shutil.move(str(path), label_directory + path.name)

Sort images into training, validation, test sets


In [0]:
import shutil
import os
from random import shuffle

BENIGN_PATH = "/content/gdrive/My Drive/Dataset/DatasetSorted/benign/"
MALIGNANT_PATH = "/content/gdrive/My Drive/Dataset/DatasetSorted/malignant/"

TRAINING_PATH = "/content/gdrive/My Drive/Dataset/DatasetSorted/training/"
VALIDATION_PATH = "/content/gdrive/My Drive/Dataset/DatasetSorted/validation/"
TEST_PATH = "/content/gdrive/My Drive/Dataset/DatasetSorted/test/"


def split_class_images(source_dir, class_name, split_roots=None):
    """Move one class's images into the training/validation/test folders.

    Every ``*.jpg`` directly inside ``source_dir`` is moved to
    ``<split root>/<class_name>/``. Images are dealt out in cycles of five:
    positions 1, 2 and 5 of each cycle go to training, position 3 goes to
    validation and position 4 goes to test (a 60/20/20 split).

    Args:
        source_dir: Directory holding the images of a single class.
        class_name: Sub-folder name used under every split root.
        split_roots: Optional mapping with the keys ``"training"``,
            ``"validation"`` and ``"test"`` giving the root folder of each
            split. Defaults to TRAINING_PATH, VALIDATION_PATH and TEST_PATH.

    Returns:
        dict mapping each split name to the number of images moved into it.
    """
    if split_roots is None:
        split_roots = {
            "training": TRAINING_PATH,
            "validation": VALIDATION_PATH,
            "test": TEST_PATH,
        }

    # Sort a snapshot of the listing: the split becomes reproducible and the
    # directory is not modified while it is still being iterated.
    image_paths = sorted(pathlib.Path(source_dir).glob("*.jpg"))
    moved = {"training": 0, "validation": 0, "test": 0}
    if not image_paths:
        return moved

    # shutil.move needs the destination folders to exist already.
    for split_name in moved:
        os.makedirs(os.path.join(str(split_roots[split_name]), class_name), exist_ok=True)

    for counter, path in enumerate(image_paths, start=1):
        key = counter % 5
        if key == 3:
            split_name = "validation"
        elif key == 4:
            split_name = "test"
        else:
            split_name = "training"

        destination = os.path.join(str(split_roots[split_name]), class_name, path.name)
        shutil.move(str(path), destination)
        moved[split_name] += 1

    return moved


# One summary line per class instead of one printed number per image.
print("benign:", split_class_images(BENIGN_PATH, "benign"))
print("malignant:", split_class_images(MALIGNANT_PATH, "malignant"))


**Mandatory!**

Sets up dataset


In [0]:
def get_label(file_path):
    """Return whether the file's parent directory name equals ``PRESENCE``.

    The class is read from the second-to-last component of ``file_path``,
    i.e. the directory that directly contains the file.
    """
    path_components = tf.strings.split(file_path, os.path.sep)
    class_directory = path_components[-2]
    return class_directory == PRESENCE

def decode_img(img):
    """Decode JPEG bytes into a 224x224 float32 RGB image tensor.

    The image is decoded with three channels, converted to float32 via
    ``convert_image_dtype`` (floats in the [0, 1] range), and then resized
    with padding to 224x224.
    """
    decoded = tf.image.decode_jpeg(img, channels=3)
    as_float = tf.image.convert_image_dtype(decoded, tf.float32)
    return tf.image.resize_with_pad(as_float, 224, 224)

def process_path(file_path):
    """Load one image file and return an ``(image, label)`` pair.

    The image comes from ``decode_img`` applied to the raw file contents;
    the label comes from ``get_label`` applied to the path.
    """
    raw_bytes = tf.io.read_file(file_path)
    image = decode_img(raw_bytes)
    return image, get_label(file_path)

def process_path_raw(file_path):
    """Load one image file and return only the decoded image (no label)."""
    return decode_img(tf.io.read_file(file_path))

def prepare_for_training(ds, cache=True, shuffle_buffer_size=1000):
    """Cache, shuffle, batch, repeat and prefetch a dataset.

    Args:
        ds: Dataset to prepare.
        cache: Falsy to skip caching; a non-empty string to cache to that
            file name (for data that does not fit in memory); any other
            truthy value to cache in memory.
        shuffle_buffer_size: Buffer size passed to ``shuffle``.

    Returns:
        The dataset batched by ``BATCH_SIZE``, repeating forever, with
        background prefetching sized by ``AUTOTUNE``.
    """
    if cache:
        ds = ds.cache(cache) if isinstance(cache, str) else ds.cache()

    return (
        ds.shuffle(buffer_size=shuffle_buffer_size)
        .batch(BATCH_SIZE)
        # Repeat forever.
        .repeat()
        # Let batches be fetched in the background while the model trains.
        .prefetch(buffer_size=AUTOTUNE)
    )

In [0]:
# Split roots as pathlib objects so they can be globbed directly.
TRAINING_PATH = pathlib.Path("/content/gdrive/My Drive/Dataset/DatasetSorted/training/")
VALIDATION_PATH = pathlib.Path("/content/gdrive/My Drive/Dataset/DatasetSorted/validation/")
TEST_PATH = pathlib.Path("/content/gdrive/My Drive/Dataset/DatasetSorted/test/")

# get_label compares each image's class directory name against this value.
PRESENCE = np.array(["malignant"])
BATCH_SIZE = 32
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Number of images per split; reused below as the shuffle buffer sizes.
TRAIN_LEN = sum(1 for _ in TRAINING_PATH.glob("*/*.jpg"))
VALID_LEN = sum(1 for _ in VALIDATION_PATH.glob("*/*.jpg"))

# Training split: one file listing feeds both the labelled pipeline (trainX)
# and the image-only pipeline (trainXRaw).
trainXRaw = tf.data.Dataset.list_files(str(TRAINING_PATH / "*" / "*.jpg"))
trainX = prepare_for_training(
    trainXRaw.map(process_path, num_parallel_calls=AUTOTUNE),
    shuffle_buffer_size=TRAIN_LEN,
)
trainXRaw = prepare_for_training(
    trainXRaw.map(process_path_raw, num_parallel_calls=AUTOTUNE),
    shuffle_buffer_size=TRAIN_LEN,
)

# Validation split: labelled pipeline only.
validationXRaw = tf.data.Dataset.list_files(str(VALIDATION_PATH / "*" / "*.jpg"))
validationX = prepare_for_training(
    validationXRaw.map(process_path, num_parallel_calls=AUTOTUNE),
    shuffle_buffer_size=VALID_LEN,
)

In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
import tensorflow_hub as hub

feature_extractor_url = "https://tfhub.dev/google/tf2-preview/mobilenet_v2/feature_vector/2"

# Feature extractor loaded from TF Hub, taking 224x224 RGB inputs.
feature_extractor_layer = hub.KerasLayer(feature_extractor_url,
                                        input_shape=(224,224,3))

# Freeze the extractor so only the classification head below is trained.
feature_extractor_layer.trainable = False

# Classification head. The hidden layers need a non-linearity: Dense layers
# without an activation are purely linear, so a stack of them collapses into
# a single linear map and adds no modelling capacity.
model = models.Sequential([
    feature_extractor_layer,
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    # Single raw logit; the loss below is configured with from_logits=True.
    layers.Dense(1)
])
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['acc'])
model.summary()






In [0]:
# trainX and validationX repeat forever (prepare_for_training calls
# `.repeat()`), so Keras needs explicit step counts to know where an epoch
# and a validation pass end. One epoch covers every image once.
history = model.fit(
    x=trainX,
    validation_data=validationX,
    steps_per_epoch=math.ceil(TRAIN_LEN / BATCH_SIZE),
    validation_steps=math.ceil(VALID_LEN / BATCH_SIZE),
)

Old code: do not touch, do not run.


In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Dataset locations on Google Drive (plain strings in this cell).
BENIGN_PATH = "/content/gdrive/My Drive/Dataset/DatasetSorted/benign/"
MALIGNANT_PATH = "/content/gdrive/My Drive/Dataset/DatasetSorted/malignant/"

TRAINING_PATH = "/content/gdrive/My Drive/Dataset/DatasetSorted/training/"
VALIDATION_PATH = "/content/gdrive/My Drive/Dataset/DatasetSorted/validation/"
TEST_PATH = "/content/gdrive/My Drive/Dataset/DatasetSorted/test/"

# Batch/epoch settings and the image size requested from the generators.
batch_size = 128
epochs = 15
IMG_HEIGHT = 150
IMG_WIDTH = 150

CLASS_NAMES = ['benign', 'malignant']

# Training generator: scales pixel values by 1/255 and applies shear, zoom
# and horizontal/vertical flip augmentation.
train_image_generator = ImageDataGenerator(
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True)
# Validation generator: scaling only, no augmentation.
validation_image_generator = ImageDataGenerator(rescale=1./255)

# Options shared by both directory iterators.
flow_options = dict(
    batch_size=batch_size,
    shuffle=True,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode='binary',
    classes=CLASS_NAMES,
)

train_data_gen = train_image_generator.flow_from_directory(
    directory=TRAINING_PATH, **flow_options)

val_data_gen = validation_image_generator.flow_from_directory(
    directory=VALIDATION_PATH, **flow_options)

**Mandatory!**

Prepare model

In [0]:
mirrored_strategy = tf.distribute.MirroredStrategy()

# Build, compile and train the CNN inside the distribution strategy's scope.
with mirrored_strategy.scope():
    model = models.Sequential([
        layers.Conv2D(32, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
        layers.Conv2D(32, 3, padding='same', activation='relu'),
        layers.MaxPooling2D(),
        layers.Conv2D(16, 3, padding='valid', activation='relu'),
        layers.Conv2D(16, 3, padding='valid', activation='relu'),
        layers.MaxPooling2D(),
        layers.MaxPooling2D(),
        layers.Flatten(),
        layers.Dense(16, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(16, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(16, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(16, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(1, activation='sigmoid')
    ])
    # The output layer already applies a sigmoid, so the loss receives
    # probabilities, not logits. from_logits=True would make the loss apply a
    # second sigmoid to values that are already in [0, 1].
    model.compile(optimizer='adam', loss=losses.BinaryCrossentropy(from_logits=False), metrics=['accuracy'])

    model.summary()

    # Train one epoch at a time, saving the weights to Drive after each one.
    for epoch_index in range(5):
        history = model.fit(x=train_data_gen, epochs=1, verbose=1, validation_data=val_data_gen, validation_steps=1000 // batch_size, steps_per_epoch=5000 // batch_size)
        model.save_weights('/content/gdrive/My Drive/Dataset/weights.h5')