In [17]:
from pathlib import Path
DATA_DIR = Path(r"E:\skin-project\datasets\ham10000")
IMAGES_DIR = DATA_DIR / "images"
print("DATA_DIR exists:", DATA_DIR.exists())
print("IMAGES_DIR exists:", IMAGES_DIR.exists())
print("Number of image files (first 100k):", sum(1 for _ in IMAGES_DIR.rglob("*") if _.is_file()))
print("Sample image files:", [str(p.name) for p in list(IMAGES_DIR.rglob("*"))[:10]])
print("CSV files:", [p.name for p in DATA_DIR.glob("*.csv")])


DATA_DIR exists: True
IMAGES_DIR exists: True
Number of image files (first 100k): 10015
Sample image files: ['ISIC_0024306.jpg', 'ISIC_0024307.jpg', 'ISIC_0024308.jpg', 'ISIC_0024309.jpg', 'ISIC_0024310.jpg', 'ISIC_0024311.jpg', 'ISIC_0024312.jpg', 'ISIC_0024313.jpg', 'ISIC_0024314.jpg', 'ISIC_0024315.jpg']
CSV files: ['HAM10000_metadata.csv', 'hmnist_28_28_L.csv', 'hmnist_28_28_RGB.csv', 'hmnist_8_8_L.csv', 'hmnist_8_8_RGB.csv']


In [18]:
import pandas as pd
from pathlib import Path
DATA_DIR = Path(r"E:\skin-project\datasets\ham10000")
# prefer HAM10000_metadata.csv if present, else pick the largest CSV
candidates = list(DATA_DIR.glob("*metadata*.csv")) + list(DATA_DIR.glob("*.csv"))
# pick the first metadata-like, else the largest
meta = None
for c in candidates:
    if "metadata" in c.name.lower():
        meta = c
        break
if meta is None:
    meta = max(candidates, key=lambda p: p.stat().st_size)

print("Using metadata file:", meta.name, "size:", meta.stat().st_size)
df_meta_raw = pd.read_csv(meta)
print("Raw metadata shape:", df_meta_raw.shape)
display(df_meta_raw.head(5))


Using metadata file: HAM10000_metadata.csv size: 563277
Raw metadata shape: (10015, 7)


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [19]:
from pathlib import Path
IMAGES_DIR = DATA_DIR / "images"

def find_image_path(img_id, root=IMAGES_DIR):
    # try common extensions first (fast)
    for ext in [".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"]:
        p = root / f"{img_id}{ext}"
        if p.exists():
            return str(p)
    # fallback: recursive scan for filename containing id
    hit = next(root.rglob(f"*{img_id}*"), None)
    return str(hit) if hit is not None else None

# copy raw to working df
df = df_meta_raw.copy()
# ensure image_id column name
if "image_id" not in df.columns and "image" in df.columns:
    df = df.rename(columns={"image":"image_id"})

# create image_path
df["image_path"] = df["image_id"].apply(lambda x: find_image_path(str(x)))
print("Total rows:", len(df))
print("Found image_path for:", df["image_path"].notnull().sum(), "rows")
print("Missing image_path:", df["image_path"].isnull().sum())
# show first 10 missing ids (if any)
if df["image_path"].isnull().sum() > 0:
    display(df.loc[df["image_path"].isnull(), ["image_id"]].head(20))
else:
    display(df.head(5))


Total rows: 10015
Found image_path for: 10015 rows
Missing image_path: 0


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,image_path
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,E:\skin-project\datasets\ham10000\images\ISIC_...
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,E:\skin-project\datasets\ham10000\images\ISIC_...
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,E:\skin-project\datasets\ham10000\images\ISIC_...
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,E:\skin-project\datasets\ham10000\images\ISIC_...
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,E:\skin-project\datasets\ham10000\images\ISIC_...


In [20]:
TARGET_CLASSES = ["MEL","NV","BCC","AKIEC","BKL","DF","VASC"]

# show unique raw dx values
print("Raw dx unique sample:", sorted(df['dx'].dropna().unique())[:50])

# try simple normalization: uppercase + strip, map common long names to short codes
df['dx_norm'] = df['dx'].astype(str).str.upper().str.strip()

# mapping guesses (extend if your CSV uses other terms)
map_guess = {
    'MELANOMA': 'MEL',
    'MEL': 'MEL',
    'NEVUS': 'NV',
    'NV': 'NV',
    'MELANOCYTIC NEVUS': 'NV',
    'BASAL CELL CARCINOMA': 'BCC',
    'BCC': 'BCC',
    'AKIEC': 'AKIEC',  # actinic keratoses and intraepithelial carcinoma sometimes abbreviated
    'KERATOSIS': 'BKL',
    'BKL': 'BKL',
    'DERMATOFIBROMA': 'DF',
    'VASCULAR LESION': 'VASC',
    'VASC': 'VASC'
}

df['dx_mapped'] = df['dx_norm'].map(map_guess).fillna(df['dx_norm'])
# Count how many map to our TARGET_CLASSES
mapped_mask = df['dx_mapped'].isin(TARGET_CLASSES)
print("Rows mapping to TARGET_CLASSES:", mapped_mask.sum(), " / ", len(df))
print("Examples of dx_mapped values (unique sample):", sorted(df['dx_mapped'].unique())[:50])
# Filter to target classes
df = df.loc[mapped_mask].copy()
df['dx'] = df['dx_mapped']  # replace dx with mapped short codes
df.drop(columns=['dx_norm','dx_mapped'], inplace=True, errors='ignore')
print("After filtering to TARGET_CLASSES, rows:", len(df))
df.head(5)


Raw dx unique sample: ['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']
Rows mapping to TARGET_CLASSES: 10015  /  10015
Examples of dx_mapped values (unique sample): ['AKIEC', 'BCC', 'BKL', 'DF', 'MEL', 'NV', 'VASC']
After filtering to TARGET_CLASSES, rows: 10015


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,image_path
0,HAM_0000118,ISIC_0027419,BKL,histo,80.0,male,scalp,E:\skin-project\datasets\ham10000\images\ISIC_...
1,HAM_0000118,ISIC_0025030,BKL,histo,80.0,male,scalp,E:\skin-project\datasets\ham10000\images\ISIC_...
2,HAM_0002730,ISIC_0026769,BKL,histo,80.0,male,scalp,E:\skin-project\datasets\ham10000\images\ISIC_...
3,HAM_0002730,ISIC_0025661,BKL,histo,80.0,male,scalp,E:\skin-project\datasets\ham10000\images\ISIC_...
4,HAM_0001466,ISIC_0031633,BKL,histo,75.0,male,ear,E:\skin-project\datasets\ham10000\images\ISIC_...


In [21]:
# ensure image_path not null and file exists
df = df.dropna(subset=['image_path']).reset_index(drop=True)
print("Rows after dropping missing images:", len(df))

# quick sanity: check file exists on disk for a sample
import os
sample_paths = df['image_path'].values[:5]
for p in sample_paths:
    print(p, "exists?", os.path.exists(p))

# now perform stratified split
from sklearn.model_selection import StratifiedShuffleSplit
RANDOM_STATE = 42
test_size = 0.15
val_size = 0.15

y = df['dx'].values
sss1 = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=RANDOM_STATE)
for train_idx, test_idx in sss1.split(df, y):
    df_trainval = df.iloc[train_idx].reset_index(drop=True)
    df_test = df.iloc[test_idx].reset_index(drop=True)

relative_val = val_size / (1 - test_size)
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=relative_val, random_state=RANDOM_STATE)
y2 = df_trainval['dx'].values
for train_idx, val_idx in sss2.split(df_trainval, y2):
    df_train = df_trainval.iloc[train_idx].reset_index(drop=True)
    df_val = df_trainval.iloc[val_idx].reset_index(drop=True)

print("Train, Val, Test sizes:", df_train.shape, df_val.shape, df_test.shape)


Rows after dropping missing images: 10015
E:\skin-project\datasets\ham10000\images\ISIC_0027419.jpg exists? True
E:\skin-project\datasets\ham10000\images\ISIC_0025030.jpg exists? True
E:\skin-project\datasets\ham10000\images\ISIC_0026769.jpg exists? True
E:\skin-project\datasets\ham10000\images\ISIC_0025661.jpg exists? True
E:\skin-project\datasets\ham10000\images\ISIC_0031633.jpg exists? True
Train, Val, Test sizes: (7009, 8) (1503, 8) (1503, 8)


In [22]:
label_map = {c:i for i,c in enumerate(TARGET_CLASSES)}
for name, d in [('train', df_train), ('val', df_val), ('test', df_test)]:
    out = d[['image_id','image_path','dx']].copy()
    out['label'] = out['dx'].map(label_map)
    out_path = DATA_DIR / f"{name}.csv"
    out.to_csv(out_path, index=False)
    print("Saved", out_path, "rows:", len(out))


Saved E:\skin-project\datasets\ham10000\train.csv rows: 7009
Saved E:\skin-project\datasets\ham10000\val.csv rows: 1503
Saved E:\skin-project\datasets\ham10000\test.csv rows: 1503
