In [2]:
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os

In [3]:
# Every raw input lives under the same dataset folder, so derive each
# location from that single base instead of repeating the prefix.
raw_dir = "../data/raw/skin-cancer-mnist-ham10000"

md_path = f"{raw_dir}/HAM10000_metadata.csv"
img_path_p1, img_path_p2 = (
    f"{raw_dir}/HAM10000_images_part_{part}" for part in (1, 2)
)

In [4]:
# Read the metadata CSV into a DataFrame; later cells use its
# "image_id" and "dx" columns.
md = pd.read_csv(filepath_or_buffer=md_path)

In [5]:
# Function to add path column to metadata
def get_image_path(img_id, search_dirs=None):
    """Return the path of the JPEG for ``img_id``, or ``None`` if it is missing.

    Parameters
    ----------
    img_id : str
        Image identifier; the file looked up is ``<img_id>.jpg``.
    search_dirs : iterable of str, optional
        Directories to search, in priority order. When omitted, the two
        image folders ``img_path_p1`` and ``img_path_p2`` are searched,
        in that order.

    Returns
    -------
    str or None
        Path of the first existing match, or ``None`` when none of the
        searched directories contains the file.
    """
    if search_dirs is None:
        # Resolved at call time so the notebook's configured folders are used.
        search_dirs = (img_path_p1, img_path_p2)

    filename = f"{img_id}.jpg"
    for directory in search_dirs:
        # Let os.path.join supply the separator instead of concatenating "/".
        candidate = os.path.join(directory, filename)
        if os.path.exists(candidate):
            return candidate

    # Image doesn't exist in any searched folder
    return None

In [6]:
# Resolve an on-disk location for every image id, store it in a new
# "path" column, then discard the rows whose image could not be located.
resolved_paths = md["image_id"].apply(get_image_path)
md["path"] = resolved_paths
md.dropna(subset=["path"], inplace=True)

# Quick sanity check on the first few resolved paths
print(md["path"].head())

0    ../data/raw/skin-cancer-mnist-ham10000/HAM1000...
1    ../data/raw/skin-cancer-mnist-ham10000/HAM1000...
2    ../data/raw/skin-cancer-mnist-ham10000/HAM1000...
3    ../data/raw/skin-cancer-mnist-ham10000/HAM1000...
4    ../data/raw/skin-cancer-mnist-ham10000/HAM1000...
Name: path, dtype: object


In [7]:
# Load and preprocess images
img_size = 128  # target edge length in pixels

# Preallocate the final array so the decoded images are written straight
# into it, instead of being collected in a Python list and copied again.
images = np.empty((len(md), img_size, img_size, 3), dtype=np.uint8)

for i, path in enumerate(md["path"]):
    # The context manager releases the file handle as soon as the pixels
    # have been copied out.
    with Image.open(path) as img:
        # Force 3-channel RGB so every entry has the same shape, then resize
        # to a square (the original aspect ratio is not preserved).
        resized = img.convert("RGB").resize((img_size, img_size))
        images[i] = np.asarray(resized)

In [8]:
# Turn the string disease labels in "dx" into integer class ids.
# The fitted encoder is kept in `le`.
le = LabelEncoder()
encoded_dx = le.fit_transform(md["dx"])
md["label_numeric"] = encoded_dx
print(set(md["label_numeric"]))

# Same labels as a bare numpy array, in the row order of md
labels = md["label_numeric"].to_numpy()


{0, 1, 2, 3, 4, 5, 6}


For the train/validation/test split I will use a 70-15-15 ratio: 70% for training, 15% for validation and 15% for testing.

The `stratify` parameter ensures that the proportion of each class in every split matches the original dataset. This does not remove the class imbalance itself, but it keeps the class proportions consistent across the train, validation and test sets.

In [9]:
# One fixed seed for both stages so the partition is reproducible
split_seed = 37

# Stage 1: hold out 30% of the data (future validation + test),
# keeping the class proportions of `labels` in both parts.
x_train, x_temp, y_train, y_temp = train_test_split(
    images, labels, test_size=0.3, stratify=labels, random_state=split_seed
)

# Stage 2: cut the held-out 30% in half -> 15% validation, 15% test,
# again stratified on the held-out labels.
x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=split_seed
)

In [10]:
# Report the shape of each split: train, validation, test (in that order)
for split_array in (x_train, x_val, x_test):
    print(split_array.shape)

(7010, 128, 128, 3)
(1502, 128, 128, 3)
(1503, 128, 128, 3)


Since the dataset comprises 10,015 images (7,010 + 1,502 + 1,503), these numbers indicate that the dataset has been split correctly.

~7000 = 70% Train

~1500 = 15% Validate

~1500 = 15% Test

In [16]:
# Save preprocessed data
num_classes = len(set(md["dx"]))  # distinct disease labels; not used by the saves below

processed_dir = "../data/processed"
# np.save does not create missing parent folders, so make sure the
# output directory exists before writing (no-op when it already does).
os.makedirs(processed_dir, exist_ok=True)

# File stem -> array; each entry is written to <processed_dir>/<stem>.npy
splits_to_save = {
    "x_train": x_train,
    "y_train": y_train,
    "x_val": x_val,
    "y_val": y_val,
    "x_test": x_test,
    "y_test": y_test,
}

print("Saving preprocessed data...")
for stem, array in splits_to_save.items():
    np.save(f"{processed_dir}/{stem}.npy", array)
print("Preprocessed data successfully saved.")



Saving preprocessed data...
Preprocessed data successfully saved.
