<a href="https://colab.research.google.com/github/richmondvan/melanoma-detection/blob/master/process.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SETUP

**This cell is mandatory!**

- Imports modules

- Mounts Google Drive

- Sets up constants

In [30]:
# Must be run every time!
import os
import pathlib

import numpy as np
import tensorflow as tf

from google.colab import drive

# Mount Google Drive at /content/gdrive; every dataset path below lives under
# this mount point.
drive.mount('/content/gdrive')

# Folder on Drive that holds the downloaded archive and the extracted images.
DIRECTORY_PATH = "/content/gdrive/My Drive/Dataset/"
# Location of the downloaded archive inside that folder.
DATASET_FILEPATH = DIRECTORY_PATH + "dataset.zip"

Mounted at /content/gdrive


One-time: Download the image dataset from the ISIC Archive

In [0]:
# `import urllib` alone does not guarantee that the `urllib.request` submodule
# is loaded; import it explicitly so `urlretrieve` is always available.
import urllib.request

# ISIC Archive download query, reproduced verbatim: benign/malignant images
# tagged "Challenge 2019: Training" from the listed dataset ids.
ISIC_DOWNLOAD_URL = "https://isic-archive.com/api/v1/image/download?include=all&filter={%22operator%22:%22and%22,%22operands%22:[{%22operator%22:%22in%22,%22operands%22:[{%22identifier%22:%22meta.datasetId%22,%22type%22:%22objectid%22},[%225a2ecc5e1165975c945942a4%22,%225a2ecc5e1165975c945942a2%22,%225a2ecc5d1165975c94594292%22,%225a2ecc5d1165975c9459428e%22,%225a2ecc5d1165975c94594284%22,%225aaf6f2a116597691367292e%22,%225a2ecc5d1165975c9459427e%22,%225a2ecc5d1165975c9459428a%22]]},{%22operator%22:%22and%22,%22operands%22:[{%22operator%22:%22in%22,%22operands%22:[{%22identifier%22:%22meta.clinical.benign_malignant%22,%22type%22:%22string%22},[%22benign%22,%22malignant%22]]},{%22operator%22:%22in%22,%22operands%22:[{%22identifier%22:%22meta.tags%22,%22type%22:%22string%22},[%22Challenge%202019:%20Training%22]]}]}]}"

# Save the archive to DATASET_FILEPATH on Google Drive.
urllib.request.urlretrieve(ISIC_DOWNLOAD_URL, DATASET_FILEPATH)

One-time: Extract data from archive

In [0]:
from zipfile import ZipFile

# Unpack every member of the downloaded archive into DIRECTORY_PATH.
with ZipFile(DATASET_FILEPATH, mode='r') as archive:
    archive.extractall(path=DIRECTORY_PATH)

# More Setup

One-time: Converts CSV file into a dictionary for classification lookups

In [0]:
import csv
from collections import Counter

METADATA_PATH = DIRECTORY_PATH + "ISIC-images/metadata.csv"

# The csv module asks for files to be opened with newline='' so that quoted
# fields containing line breaks are parsed correctly.
with open(METADATA_PATH, mode='r', newline='') as infile:
    reader = csv.DictReader(infile)
    # Maps the 'name' column to the 'meta.clinical.benign_malignant' column.
    GROUND_TRUTH_DICT = {
        row['name']: row['meta.clinical.benign_malignant'] for row in reader
    }

# Print a summary instead of the whole dictionary: the entry count and how many
# entries carry each label value (which also exposes unexpected labels).
print(len(GROUND_TRUTH_DICT), "labels loaded")
print(Counter(GROUND_TRUTH_DICT.values()))

One-time: Sort images into proper directories

In [0]:
import shutil
import os
import pathlib

IMAGE_PATH = pathlib.Path(DIRECTORY_PATH + "ISIC-images/")
BENIGN_PATH = "/content/gdrive/My Drive/Dataset/DatasetSorted/benign/"
MALIGNANT_PATH = "/content/gdrive/My Drive/Dataset/DatasetSorted/malignant/"

# Destination folder for each recognised ground-truth label.
DESTINATION_BY_LABEL = {"benign": BENIGN_PATH, "malignant": MALIGNANT_PATH}

# shutil.move does not create missing parent folders, so make sure both
# destination folders exist before moving anything.
for destination in DESTINATION_BY_LABEL.values():
    os.makedirs(destination, exist_ok=True)

# Snapshot the listing first so files are not moved out of the folders while
# the glob is still iterating over them.
for path in list(IMAGE_PATH.glob("*/*.jpg")):
    # path.stem drops only the final ".jpg" suffix. str.strip(".jpg") would
    # instead remove any of the characters '.', 'j', 'p', 'g' from both ends.
    label = GROUND_TRUTH_DICT.get(path.stem)
    destination = DESTINATION_BY_LABEL.get(label)

    if destination is None:
        # Missing or unrecognised label: report it and leave the file in place
        # rather than moving it to a destination left over from a previous
        # iteration.
        print(f"error: no benign/malignant label for {path.name} (got {label!r})")
        continue

    shutil.move(str(path), destination + path.name)

**Mandatory!**

Set up some functions before we load the dataset

In [0]:
def get_label(file_path):
    """Derive the label of an image from the name of the folder it sits in.

    Args:
        file_path: path of the image file, as a string tensor.

    Returns:
        The result of comparing the parent folder name with the module-level
        CLASS_NAMES (defined outside this function; presumably an array of
        class folder names, which would make the result a boolean vector with
        one entry per class -- TODO confirm).
    """
    # Splitting on the OS path separator leaves the file name last and the
    # class folder second to last.
    class_dir = tf.strings.split(file_path, os.path.sep)[-2]
    return class_dir == CLASS_NAMES

def decode_img(img, target_height=224, target_width=224):
    """Decode JPEG bytes into a float image padded/resized to a fixed size.

    Args:
        img: the compressed JPEG contents as a string tensor.
        target_height: output height in pixels (defaults to 224).
        target_width: output width in pixels (defaults to 224).

    Returns:
        The result of `tf.image.resize_with_pad` applied to the decoded
        3-channel image after conversion to float32.
    """
    # Convert the compressed string to a 3D uint8 tensor with 3 colour channels.
    img = tf.image.decode_jpeg(img, channels=3)
    # `convert_image_dtype` converts to floats in the [0,1] range.
    img = tf.image.convert_image_dtype(img, tf.float32)
    # Resize to the requested size; `resize_with_pad` pads rather than
    # distorting the aspect ratio.
    return tf.image.resize_with_pad(img, target_height, target_width)

def process_path(file_path):
    """Turn an image file path into an `(image, label)` pair.

    Args:
        file_path: path of the image file, as a string tensor.

    Returns:
        A tuple of the image produced by `decode_img` and the label produced
        by `get_label`.
    """
    label = get_label(file_path)
    # Read the raw file contents as a string, then decode them.
    image = decode_img(tf.io.read_file(file_path))
    return image, label

def prepare_for_training(ds, cache=True, shuffle_buffer_size=1000):
    """Build an endlessly repeating, shuffled, batched input pipeline.

    Reads the module-level BATCH_SIZE and AUTOTUNE, which must be defined
    before this function is called.

    Args:
        ds: the dataset to prepare.
        cache: falsy to skip caching; a string to cache to that file name
            (for datasets that don't fit in memory); any other truthy value
            to cache in memory.
        shuffle_buffer_size: buffer size passed to `shuffle`.

    Returns:
        The dataset after optional caching, then shuffle, repeat, batch and
        prefetch.
    """
    if cache:
        # A string selects a cache file; anything else keeps the cache in memory.
        ds = ds.cache(cache) if isinstance(cache, str) else ds.cache()

    return (
        ds.shuffle(buffer_size=shuffle_buffer_size)
        # Repeat forever.
        .repeat()
        .batch(BATCH_SIZE)
        # `prefetch` lets the dataset fetch batches in the background while
        # the model is training.
        .prefetch(buffer_size=AUTOTUNE)
    )

**Mandatory!**

Prepare dataset for loading

In [0]:
IMAGE_PATH = pathlib.Path("/content/gdrive/My Drive/Dataset/DatasetSorted/")
IMAGE_COUNT = len(list(IMAGE_PATH.glob("*/*.jpg")))
assert IMAGE_COUNT > 0, f"no .jpg files found under {IMAGE_PATH}"

# Class folder names that get_label compares each file's parent folder against.
# Sorted so the label order does not depend on directory listing order.
# NOTE(review): assumes every sub-folder of IMAGE_PATH is a class -- confirm.
CLASS_NAMES = np.array(
    sorted(item.name for item in IMAGE_PATH.glob("*") if item.is_dir())
)

# Fixed seed so the train/validation/test split is reproducible between runs.
SPLIT_SEED = 42

# List the files in a deterministic order, then shuffle the *paths* exactly
# once. Without shuffle=False / reshuffle_each_iteration=False the order would
# change on every pass over the dataset, so the take/skip splits below would
# no longer be disjoint. Shuffling paths rather than decoded images also keeps
# the shuffle buffer small.
list_ds = tf.data.Dataset.list_files(str(IMAGE_PATH/"*/*.jpg"), shuffle=False)
list_ds = list_ds.shuffle(
    IMAGE_COUNT, seed=SPLIT_SEED, reshuffle_each_iteration=False
)

AUTOTUNE = tf.data.experimental.AUTOTUNE

labeled_ds = list_ds.map(process_path, num_parallel_calls=AUTOTUNE)

BATCH_SIZE = 32

# 60% train / 20% validation / remainder test.
train_size = int(0.6 * IMAGE_COUNT)
valid_size = int(0.2 * IMAGE_COUNT)

train_ds = labeled_ds.take(train_size)
test_ds = labeled_ds.skip(train_size)
valid_ds = test_ds.take(valid_size)
test_ds = test_ds.skip(valid_size)

train_ds = prepare_for_training(train_ds, shuffle_buffer_size=train_size)