## Mount Google Drive

In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive so the project folder stored on Drive
# is reachable through the Colab filesystem.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Set Project Directory and Change Path

In [2]:
import os

# Make the project folder on the mounted Drive the working directory, so the
# relative "data/..." paths used when reading and writing CSVs resolve inside it.
project_dir = '/content/drive/MyDrive/Skin-Cancer-Detection'
os.chdir(project_dir)

## Data Loading

In [3]:
import pandas as pd

# Read the HAM10000 metadata CSV (path is relative to the current working directory).
metadata_path = "data/HAM10000_metadata.csv"
df = pd.read_csv(metadata_path)

# Report the table size: number of rows, and number of distinct `lesion_id` values.
total_images = len(df)
unique_lesion_count = df['lesion_id'].nunique()
print(f"Total images: {total_images}")
print(f"Unique lesions: {unique_lesion_count}")

Total images: 10015
Unique lesions: 7470


## Leakage-Free Train/Val/Test Split (Grouped by Lesion, Stratified by `dx`)

In [4]:
from sklearn.model_selection import train_test_split

# Reduce the table to the first row seen for each lesion_id, so that the split
# below is drawn over lesions rather than over individual rows.
unique_lesions = df.drop_duplicates(subset='lesion_id')

# Stage 1: keep 70% of the lesions for training and set the remaining 30% aside,
# stratifying on the `dx` column.
train_lesions, temp_lesions = train_test_split(
    unique_lesions,
    test_size=0.3,
    random_state=55,
    stratify=unique_lesions['dx'],
)

# Stage 2: cut the held-out lesions in half to obtain the validation and test
# sets, again stratifying on `dx` and reusing the same seed.
val_lesions, test_lesions = train_test_split(
    temp_lesions,
    test_size=0.5,
    random_state=55,
    stratify=temp_lesions['dx'],
)

In [5]:
# Expand each lesion-level split back to row level: every row of `df` whose
# lesion_id belongs to a split's lesion table goes to that split. Each result is
# an independent copy, so later edits to a split do not touch `df`.
train, val, test = (
    df[df['lesion_id'].isin(lesions['lesion_id'])].copy()
    for lesions in (train_lesions, val_lesions, test_lesions)
)

In [6]:
# Persist each split's metadata next to the source CSV, without the index column.
for split_frame, csv_path in (
    (train, "data/train_meta.csv"),
    (val, "data/val_meta.csv"),
    (test, "data/test_meta.csv"),
):
    split_frame.to_csv(csv_path, index=False)

In [7]:
# Summarise the finished split.
print("Leakage-free Train/Val/Test split completed.")

# Row counts per split.
print(f"Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")

# Distinct lesion_id counts per split (one output line, assembled from three pieces).
print(
    f"Train unique lesions: {train['lesion_id'].nunique()}, "
    f"Val unique: {val['lesion_id'].nunique()}, "
    f"Test unique: {test['lesion_id'].nunique()}"
)

Leakage-free Train/Val/Test split completed.
Train: 7019, Val: 1504, Test: 1492
Train unique lesions: 5229, Val unique: 1120, Test unique: 1121


## Verify No Lesion Overlap Between Splits

In [8]:
# Build each split's set of lesion IDs once and reuse it for every check below.
train_ids = set(train['lesion_id'])
val_ids = set(val['lesion_id'])
test_ids = set(test['lesion_id'])

# Every lesion ID that landed in some split, and every ID shared by two or more splits.
all_ids = train_ids | val_ids | test_ids
overlap = (train_ids & val_ids) | (train_ids & test_ids) | (val_ids & test_ids)
print("Overlap between splits:", overlap)

if len(overlap) == 0:
    print("No leakage: splits are completely disjoint by lesion.")
else:
    print("Leakage detected! Overlapping lesion IDs found.")

# Stop the notebook on a failed check: a printed warning alone is easy to miss
# under "Run All", and later cells would otherwise continue with leaky splits.
assert not overlap, f"{len(overlap)} lesion IDs appear in more than one split"

# The splits must also account for every lesion in the source table; a shortfall
# means some rows of `df` were silently dropped from all three splits.
assert len(all_ids) == df['lesion_id'].nunique(dropna=False), (
    "Some lesion IDs in df were not assigned to any split"
)

Overlap between splits: set()
No leakage: splits are completely disjoint by lesion.
