# ISIC_2019: Train, Test, Validation Split

In [1]:
import tensorflow as tf
from skimage import io
import os
import cv2
from tqdm import tqdm
from glob import glob
# pip install -U albumentations --user
import warnings
warnings.filterwarnings("ignore")
import numpy as np

In [2]:
# Create the output directories that will receive the training, testing and
# validation images of every class after the split.
# exist_ok=True makes this cell safe to re-run: without it os.makedirs raises
# FileExistsError as soon as the directories already exist.
for clas in ['mel', 'nv', 'bcc', 'ak', 'bkl', 'df', 'vasc', 'scc']:
    os.makedirs(f'Data/train_70%/{clas}', exist_ok=True)
    os.makedirs(f'Data/Processed_Data/test/{clas}', exist_ok=True)
    os.makedirs(f'Data/Processed_Data/valid/{clas}', exist_ok=True)

In [2]:
# Collect the image paths of every class found in the raw "train" folder.

dataset_path = "Data/train/"


def _class_image_paths(class_name):
    """Return the sorted, normalised paths of the .jpg files of one class folder."""
    pattern = os.path.join(dataset_path + class_name, "*.jpg")
    # Sort first, then normalise each path, exactly in that order.
    return [os.path.normpath(path) for path in sorted(glob(pattern))]


ak_images = _class_image_paths("ak")
bcc_images = _class_image_paths("bcc")
bkl_images = _class_image_paths("bkl")
df_images = _class_image_paths("df")
mel_images = _class_image_paths("mel")
nv_images = _class_image_paths("nv")
scc_images = _class_image_paths("scc")
vasc_images = _class_image_paths("vasc")


In [3]:
ak_images[0:3]

['Data\\train\\ak\\ISIC_0024468.jpg',
 'Data\\train\\ak\\ISIC_0024470.jpg',
 'Data\\train\\ak\\ISIC_0024511.jpg']

In [4]:
# Define a helper that shuffles a list of image paths (X) so that the later split is taken from a random order.
# The random state is fixed at 42 so that the same shuffled order is obtained every time the notebook is re-run.
from sklearn.utils import shuffle
def shuffling(X):
    """Return X shuffled by sklearn's `shuffle` with the seed fixed at random_state=42."""
    return shuffle(X, random_state=42)

In [5]:
# Shuffle every class list with the same fixed-seed helper, one class after another.
(
    ak_images,
    bcc_images,
    bkl_images,
    df_images,
    mel_images,
    nv_images,
    scc_images,
    vasc_images,
) = map(
    shuffling,
    (
        ak_images,
        bcc_images,
        bkl_images,
        df_images,
        mel_images,
        nv_images,
        scc_images,
        vasc_images,
    ),
)

In [6]:
# check for successful shuffling
ak_images[0:3]

['Data\\train\\ak\\ISIC_0059843.jpg',
 'Data\\train\\ak\\ISIC_0061582.jpg',
 'Data\\train\\ak\\ISIC_0028820.jpg']

In [13]:
# define a function to split all the image classes into train, test and valid:
# the split will be done as following: 70% of each class to train, 15% to test and 15% to validation

def train_test_valid_split(images, train_frac=0.7, test_frac=0.15):
    """Split a sequence into consecutive train, test and validation slices.

    The leading `train_frac` share of `images` becomes the training slice, the
    following `test_frac` share the test slice, and everything that remains
    the validation slice. The order of `images` is not changed here.

    Args:
        images: Sliceable sequence with a length (e.g. a list of image paths).
        train_frac: Fraction of the items assigned to training (default 0.7).
        test_frac: Fraction of the items assigned to testing (default 0.15).

    Returns:
        A tuple `(images_train, images_test, images_valid)`.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to
            more than 1.
    """
    if train_frac < 0 or test_frac < 0 or train_frac + test_frac > 1:
        raise ValueError(
            "train_frac and test_frac must be non-negative and sum to at most 1, "
            f"got train_frac={train_frac}, test_frac={test_frac}"
        )

    total = len(images)
    # Boundaries are computed once; np.round is kept so the default split
    # sizes stay exactly what they were with the hard-coded 0.7 / 0.15.
    train_end = int(np.round(total * train_frac))
    test_end = train_end + int(np.round(total * test_frac))

    images_train = images[:train_end]
    images_test = images[train_end:test_end]
    images_valid = images[test_end:]
    return images_train, images_test, images_valid

In [15]:
# applying the train, test and validation split on each class of images
ak_train, ak_test, ak_valid = train_test_valid_split(ak_images)
bcc_train, bcc_test, bcc_valid = train_test_valid_split(bcc_images)
bkl_train, bkl_test, bkl_valid = train_test_valid_split(bkl_images)
df_train, df_test, df_valid = train_test_valid_split(df_images)
mel_train, mel_test, mel_valid = train_test_valid_split(mel_images)
nv_train, nv_test, nv_valid = train_test_valid_split(nv_images)
scc_train, scc_test, scc_valid = train_test_valid_split(scc_images)
vasc_train, vasc_test, vasc_valid = train_test_valid_split(vasc_images)

In [18]:
# Define a function that reads the images from the given paths and saves them in the specified save path folder:

def read_save(images, save_path):
    """Read every image in `images` with OpenCV and write it into `save_path`.

    Each image is written under its original file name (name and extension).

    Args:
        images: Sequence of image file paths (its length is used for the
            progress bar).
        save_path: Directory the images are written to.

    Raises:
        OSError: If an image cannot be read or cannot be written.
    """
    for src_path in tqdm(images, total=len(images)):
        # os.path.basename honours the platform's path separator (splitting on
        # "\\" only works on Windows) and keeps names with extra dots intact.
        file_name = os.path.basename(src_path)

        image = cv2.imread(src_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread reports failure by returning None instead of raising.
            raise OSError(f"Could not read image: {src_path}")

        image_path = os.path.join(save_path, file_name)
        # cv2.imwrite returns False when the file could not be written;
        # surface that instead of silently dropping the image.
        if not cv2.imwrite(image_path, image):
            raise OSError(f"Could not write image: {image_path}")

In [None]:
# Write the training portion of every class into its own sub-folder.

save_path = "Data/train_70%/"

for class_name, class_images in (
    ("ak", ak_train),
    ("bcc", bcc_train),
    ("bkl", bkl_train),
    ("df", df_train),
    ("mel", mel_train),
    ("nv", nv_train),
    ("scc", scc_train),
    ("vasc", vasc_train),
):
    read_save(class_images, save_path=save_path + class_name)

In [None]:
# Read and save all the testing data:

# The trailing "/" is required: the class name is appended by plain string
# concatenation, so without it the target would be "Data/Processed_Data/testak"
# rather than the "Data/Processed_Data/test/ak" directory created earlier.
save_path = "Data/Processed_Data/test/"

read_save(ak_test, save_path=save_path+"ak")
read_save(bcc_test, save_path=save_path+"bcc")
read_save(bkl_test, save_path=save_path+"bkl")
read_save(df_test, save_path=save_path+"df")
read_save(mel_test, save_path=save_path+"mel")
read_save(nv_test, save_path=save_path+"nv")
read_save(scc_test, save_path=save_path+"scc")
read_save(vasc_test, save_path=save_path+"vasc")

In [None]:
# Read and save all the validation data:

# The trailing "/" is required: the class name is appended by plain string
# concatenation, so without it the target would be "Data/Processed_Data/validak"
# rather than the "Data/Processed_Data/valid/ak" directory created earlier.
save_path = "Data/Processed_Data/valid/"

read_save(ak_valid, save_path=save_path+"ak")
read_save(bcc_valid, save_path=save_path+"bcc")
read_save(bkl_valid, save_path=save_path+"bkl")
read_save(df_valid, save_path=save_path+"df")
read_save(mel_valid, save_path=save_path+"mel")
read_save(nv_valid, save_path=save_path+"nv")
read_save(scc_valid, save_path=save_path+"scc")
read_save(vasc_valid, save_path=save_path+"vasc")