## This notebook preprocesses the ISIC 2019 dataset for skin lesion classification. The steps include:
- Loading and cleaning the dataset
- Splitting data into train, validation, and test sets
- Computing class weights to handle imbalanced classes
- Setting up data augmentation for training
- Saving processed data for reproducibility

In [1]:
import pandas as pd
import numpy as np
import os
import cv2 as cv
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.image import ImageDataGenerator

2025-10-22 18:11:43.112560: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Define paths to dataset
data_path = '../data/ISIC_2019_Training_Input' # Directory containing images
csv_path = '../data/ISIC_2019_Training_GroundTruth.csv'  # Ground truth CSV file

In [3]:
assert os.path.exists(csv_path), f"File {csv_path} not found!"
assert os.path.exists(data_path), f"File {data_path} not found!"

In [4]:
# Step 1: Load and clean the dataset
print('Loading and Cleaning data...')
df = pd.read_csv(csv_path)

Loading and Cleaning data...


In [5]:
# Remove rows with invalid labels (ensure exactly one label per row)
df = df[df[['MEL', 'NV', 'BCC', 'AK', 'BKL', 'DF', 'VASC', 'SCC', 'UNK']].sum(axis=1) == 1]

In [6]:
invalid_rows = len(pd.read_csv(csv_path)) - len(df)
print(f"Count of deleted rows: {invalid_rows}")

Count of deleted rows: 0


In [7]:
# Add image paths to DataFrame for easy access
df['image_path'] = df['image'].apply(lambda x: os.path.join(data_path, x + '.jpg'))

In [8]:
df.head()

Unnamed: 0,image,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK,image_path
0,ISIC_0000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,../data/ISIC_2019_Training_Input/ISIC_0000000.jpg
1,ISIC_0000001,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,../data/ISIC_2019_Training_Input/ISIC_0000001.jpg
2,ISIC_0000002,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,../data/ISIC_2019_Training_Input/ISIC_0000002.jpg
3,ISIC_0000003,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,../data/ISIC_2019_Training_Input/ISIC_0000003.jpg
4,ISIC_0000004,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,../data/ISIC_2019_Training_Input/ISIC_0000004.jpg


In [9]:
# Check for invalid or missing images
def check_image(img_path):
    """Check if an image file exists and can be read."""
    if not os.path.exists(img_path):
        return False
    img = cv.imread(img_path)
    return img is not None

df['valid_image'] = df['image_path'].apply(check_image)
# df = df[df['valid_image']].drop(columns = ['valid_image'])
print(f"Total valid images: {len(df)}")

Total valid images: 25331


In [10]:
invalid_images = df[~df['valid_image']]['image'].tolist()
if invalid_images:
    print(f"invalid_images: {invalid_images}")

In [11]:
df = df[df['valid_image']].drop(columns = ['valid_image'])

In [12]:
# Step 2: Convert one-hot encoded labels to categorical
class_columns = ['MEL', 'NV', 'BCC', 'AK', 'BKL', 'DF', 'VASC', 'SCC', 'UNK']
df['label'] = df[class_columns].idxmax(axis=1)

In [13]:
# Step 3: Split data into train (70%), validation (15%) and test (15%) sets
print('Splitting data...')
train_df, temp_df = train_test_split(df, test_size = 0.3, stratify = df['label'], random_state = 42)
val_df, test_df = train_test_split(temp_df, test_size = 0.5, stratify = temp_df['label'], random_state = 42)
print(f"Train: {len(train_df)}, Validation: {len(val_df)}, Test: {len(test_df)}")

Splitting data...
Train: 17731, Validation: 3800, Test: 3800


In [14]:
print("Class distribution Train:")
print(train_df['label'].value_counts())
print("Class distribution Validation:")
print(val_df['label'].value_counts())
print("Class distribution Test:")
print(test_df['label'].value_counts())

Class distribution Train:
label
NV      9012
MEL     3165
BCC     2326
BKL     1837
AK       607
SCC      440
VASC     177
DF       167
Name: count, dtype: int64
Class distribution Validation:
label
NV      1932
MEL      678
BCC      498
BKL      394
AK       130
SCC       94
VASC      38
DF        36
Name: count, dtype: int64
Class distribution Test:
label
NV      1931
MEL      679
BCC      499
BKL      393
AK       130
SCC       94
VASC      38
DF        36
Name: count, dtype: int64


In [15]:
# Step 4: Compute class weights to handle imbalanced classes
print("Computing class weights...")
classes = np.unique(df['label'])
print(classes)
class_weights = compute_class_weight('balanced', classes = classes, y = df['label'])
class_weight_dict = dict(zip(classes, class_weights))
print("Class weights:", class_weight_dict)

Computing class weights...
['AK' 'BCC' 'BKL' 'DF' 'MEL' 'NV' 'SCC' 'VASC']
Class weights: {'AK': 3.652104959630911, 'BCC': 0.9528663857959675, 'BKL': 1.2066977896341464, 'DF': 13.248430962343097, 'MEL': 0.7002156125608138, 'NV': 0.24593203883495146, 'SCC': 5.041998407643312, 'VASC': 12.515316205533598}


In [16]:
# Step 5: Set up data augmentation for training and normalization for validation/test
train_datagen = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.resnet50.preprocess_input,    rotation_range = 20, # Random rotation up to 20 degrees
    width_shift_range = 0.2, # Random horizontal shift
    height_shift_range = 0.2, # Random vertical shift
    horizontal_flip = True, # Random horizontal flip
    zoom_range = 0.2, # Random zoom
    fill_mode = 'nearest' # Fill new pixels with nearest value
)

val_test_datagen = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.resnet50.preprocess_input) # Only normalize for validation/test

# Configure data generators
target_size = (224, 224) # Resize images for EfficientNetB0
batch_size = 16 

train_generator = train_datagen.flow_from_dataframe(
    train_df,
    x_col = 'image_path',
    y_col = 'label',
    target_size = target_size,
    batch_size = batch_size,
    class_mode = 'categorical'
)

val_generator = val_test_datagen.flow_from_dataframe(
    val_df,
    x_col = 'image_path',
    y_col = 'label',
    target_size = target_size,
    batch_size = batch_size,
    class_mode = 'categorical'
)

test_generator = val_test_datagen.flow_from_dataframe(
    test_df,
    x_col = 'image_path',
    y_col = 'label',
    target_size = target_size,
    batch_size = batch_size,
    class_mode = 'categorical',
    shuffle = False # Keep test data order to evaluation   
)

print("Class indices:", train_generator.class_indices)

Found 17731 validated image filenames belonging to 8 classes.
Found 3800 validated image filenames belonging to 8 classes.
Found 3800 validated image filenames belonging to 8 classes.
Class indices: {'AK': 0, 'BCC': 1, 'BKL': 2, 'DF': 3, 'MEL': 4, 'NV': 5, 'SCC': 6, 'VASC': 7}


In [17]:
# Step 6: Save processed DataFrames for reproducibility
train_df.to_csv('train_split.csv', index = False)
val_df.to_csv('val_split.csv', index = False)
test_df.to_csv('test_split.csv', index = False)
print('Data splits saved as CSV.')

Data splits saved as CSV.


In [18]:
# Step 7: Save class weights for training
np.save('class_weights.npy', class_weight_dict)
print('Class weights saved.')

Class weights saved.


In [19]:
assert os.path.exists('train_split.csv'), "train_split.csv didn't save!"