In [1]:
# ============ Option A: Class-aware downsample ============

import os, random, numpy as np, pandas as pd
from pathlib import Path

# Paths: the defaults point at the original local layout; override them without
# editing the notebook by setting MELANOMA_CSV_PATH / MELANOMA_IMG_DIR.
CSV_PATH = os.environ.get(
    "MELANOMA_CSV_PATH",
    r"C:/Users/chait/Downloads/melanoma_classification/data/train.csv",
)
IMG_DIR  = Path(os.environ.get(
    "MELANOMA_IMG_DIR",
    r"C:/Users/chait/Downloads/melanoma_classification/data/jpeg/train",
))

# Subset configuration (everything a reader might want to tune)
N_TOTAL = 1000       # desired total rows in the subset
POS_FRACTION = 0.2   # target share of positives in the subset, if enough are available
SEED = 42            # seed for the sampling and the final shuffle

assert os.path.isfile(CSV_PATH), f"CSV not found: {CSV_PATH}"
assert IMG_DIR.exists(), f"Image directory not found: {IMG_DIR}"

# Load the full training CSV and keep it untouched so the subset can be re-drawn
df_full = pd.read_csv(CSV_PATH, low_memory=False)
assert "target" in df_full.columns, "CSV must include a target column."

# Class-aware subset: split by label, then sample each class separately
pos_df = df_full[df_full["target"] == 1].copy()
neg_df = df_full[df_full["target"] == 0].copy()

# Request at least one positive, never more than exist; negatives fill the remainder
n_pos = min(len(pos_df), max(1, int(POS_FRACTION * N_TOTAL)))
n_neg = min(len(neg_df), N_TOTAL - n_pos)

pos_s = pos_df.sample(n=n_pos, random_state=SEED) if n_pos > 0 else pos_df.head(0)
neg_s = neg_df.sample(n=n_neg, random_state=SEED) if n_neg > 0 else neg_df.head(0)

# Shuffle the combined rows and reset to a clean 0..n-1 index; later cells index by position
df = pd.concat([pos_s, neg_s], axis=0).sample(frac=1.0, random_state=SEED).reset_index(drop=True)

print("Subset sizes:", len(df), "positives:", int(df['target'].sum()), "pos_rate:", df['target'].mean())


Subset sizes: 1000 positives: 200 pos_rate: 0.2


In [2]:
# ============ Preprocess metadata and build features ============

# Categorical metadata columns that are one-hot encoded further down
cat_cols = ["sex", "anatom_site_general_challenge"]

# Missing categorical values become their own "Unknown" category
for c in cat_cols:
    df[c] = df[c].fillna("Unknown")

# Missing ages are imputed with the median age of the subset
df["age_approx"] = df["age_approx"].fillna(df["age_approx"].median())

# Vocabulary built from the subset: each distinct (stringified) value of a
# column is mapped to its position in sorted order
cat_vocab = {}
for c in cat_cols:
    cats = sorted(set(df[c].astype(str)))
    cat_vocab[c] = dict(zip(cats, range(len(cats))))

def _one_hot_from_vocab(values: pd.Series, mapping: dict) -> np.ndarray:
    """One-hot encode ``values`` using ``mapping`` (value -> column index).

    Values absent from ``mapping`` produce an all-zero row instead of raising.
    """
    codes = values.astype(str).map(mapping)          # NaN where the value is not in the mapping
    known = codes.notna().to_numpy()
    encoded = np.zeros((len(values), len(mapping)), dtype=np.float32)
    encoded[np.flatnonzero(known), codes[known].astype(int).to_numpy()] = 1.0
    return encoded

def build_meta_features(ddf: pd.DataFrame, vocab=None) -> np.ndarray:
    """Turn the metadata columns of ``ddf`` into a dense float32 feature matrix.

    Columns of the result, in order: scaled age (1 column), one-hot ``sex``,
    one-hot ``anatom_site_general_challenge``.

    Args:
        ddf: frame with ``sex``, ``anatom_site_general_challenge`` and
            ``age_approx`` columns.
        vocab: optional ``{column: {value: index}}`` mapping; defaults to the
            module-level ``cat_vocab``.

    Returns:
        Array of shape ``(len(ddf), 1 + len(vocab["sex"]) + len(vocab[site]))``
        with dtype float32.

    Categories that are not in the vocabulary (for example values only present
    outside the subset the vocabulary was built from) are encoded as all-zero
    one-hot rows; previously they became NaN indices and the int cast failed.
    """
    if vocab is None:
        vocab = cat_vocab
    sex_oh = _one_hot_from_vocab(ddf["sex"], vocab["sex"])
    site_oh = _one_hot_from_vocab(
        ddf["anatom_site_general_challenge"], vocab["anatom_site_general_challenge"]
    )
    # Age is divided by 100 and clipped into [0, 1]
    age = np.clip(ddf["age_approx"].values.astype(np.float32) / 100.0, 0.0, 1.0).reshape(-1, 1)
    return np.concatenate([age, sex_oh, site_oh], axis=1).astype(np.float32)

# Feature matrix for the whole subset; row i of `meta` belongs to row i of `df`
meta = build_meta_features(df)
# Width of the metadata vector, used as the input size of the metadata networks
META_DIM = meta.shape[-1]
print("META_DIM:", META_DIM)


META_DIM: 11


In [3]:
# ============ Patient-grouped stratified folds ============

from sklearn.model_selection import StratifiedKFold

assert "patient_id" in df.columns and "target" in df.columns, "CSV must include patient_id and target."

# Make the cell re-runnable: a leftover "fold" column from a previous run would
# turn the merge below into fold_x / fold_y and break df["fold"].
df = df.drop(columns=["fold"], errors="ignore")

# One row per patient; a patient counts as positive if any of their images is positive
pt = df.groupby("patient_id").agg(n=("image_name", "count"), y=("target", "max")).reset_index()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
pt["fold"] = -1
fold_col = pt.columns.get_loc("fold")
for fold, (_, val_idx) in enumerate(skf.split(pt, pt["y"])):
    # skf.split yields positional indices, so assign positionally
    pt.iloc[val_idx, fold_col] = fold

# A left merge keeps the row order of df, so the positional alignment with `meta` is preserved
df = df.merge(pt[["patient_id", "fold"]], on="patient_id", how="left")
assert df["fold"].notna().all() and (df["fold"] >= 0).all(), "Some images did not receive a fold."
print("Fold sizes (images):\n", df["fold"].value_counts().sort_index())


Fold sizes (images):
 fold
0    197
1    203
2    192
3    212
4    196
Name: count, dtype: int64


In [4]:
# ============ Transforms and Dataset (light, fast) ============

import cv2, albumentations as A
from albumentations.pytorch import ToTensorV2
import torch
from torch.utils.data import Dataset, DataLoader

IMG_SIZE = 320  # smaller for speed
MEAN = [0.485, 0.456, 0.406]
STD = [0.229, 0.224, 0.225]

def _build_tfms(*augmentations):
    """Compose resize -> optional augmentations -> normalize -> tensor conversion."""
    steps = [A.Resize(height=IMG_SIZE, width=IMG_SIZE, interpolation=cv2.INTER_CUBIC)]
    steps.extend(augmentations)
    steps.append(A.Normalize(mean=MEAN, std=STD))
    steps.append(ToTensorV2())
    return A.Compose(steps)

# Training adds a random horizontal flip; validation is deterministic
train_tfms = _build_tfms(A.HorizontalFlip(p=0.5))
valid_tfms = _build_tfms()

class MelanomaDataset(Dataset):
    """Dataset yielding ``(image_tensor, metadata_tensor, target)`` triples.

    Args:
        df_in: frame with ``image_name`` and ``target`` columns; its index is
            reset so rows are addressed by position.
        img_dir: directory containing ``<image_name>.jpg`` files.
        meta_array: per-row metadata features; row ``i`` must belong to row
            ``i`` of ``df_in``.
        tfms: callable invoked as ``tfms(image=img)`` that returns a mapping
            with an ``"image"`` entry.

    Raises:
        ValueError: if ``meta_array`` and ``df_in`` have different lengths.
    """

    def __init__(self, df_in, img_dir, meta_array, tfms):
        self.df = df_in.reset_index(drop=True)
        self.img_dir = Path(img_dir)
        self.meta = meta_array
        self.tfms = tfms
        # Rows are paired by position, so a length mismatch means misaligned metadata
        if len(self.meta) != len(self.df):
            raise ValueError(
                f"meta_array has {len(self.meta)} rows but df_in has {len(self.df)}; "
                "they must align row-for-row."
            )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        import warnings

        r = self.df.iloc[idx]
        p = self.img_dir / f"{r.image_name}.jpg"
        img = cv2.imread(str(p))
        if img is None:
            # cv2.imread returns None instead of raising; keep the best-effort
            # black placeholder but make the substitution visible.
            warnings.warn(
                f"Could not read image {p}; using an all-black placeholder.",
                RuntimeWarning,
                stacklevel=2,
            )
            img = np.zeros((IMG_SIZE, IMG_SIZE, 3), dtype=np.uint8)
        # OpenCV loads BGR; convert to RGB before the transforms
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        x = self.tfms(image=img)["image"]
        m = torch.tensor(self.meta[idx], dtype=torch.float32)
        y = torch.tensor(float(r.target), dtype=torch.float32)
        return x, m, y

def make_loaders_for_fold_quick(fold, batch_size=8):
    """Build train/validation DataLoaders for one cross-validation fold.

    Rows of the module-level ``df`` whose ``fold`` equals ``fold`` form the
    validation set; all other rows form the training set. ``meta`` is sliced
    with the same positional indices.

    Args:
        fold: fold id to hold out for validation.
        batch_size: training batch size; validation uses twice this value.

    Returns:
        ``(train_loader, valid_loader, trn_idx, val_idx)`` where both index
        values are 1-D integer arrays of row positions in ``df``.
    """
    arr = df["fold"].values
    trn_idx = np.where(arr != fold)[0]
    # np.where returns a 1-tuple; take [0] so val_idx is an index array like trn_idx
    # (the original returned the tuple itself to callers).
    val_idx = np.where(arr == fold)[0]
    dtr, dva = df.iloc[trn_idx], df.iloc[val_idx]
    mtr, mva = meta[trn_idx], meta[val_idx]

    train_ds = MelanomaDataset(dtr, IMG_DIR, mtr, train_tfms)
    valid_ds = MelanomaDataset(dva, IMG_DIR, mva, valid_tfms)

    # Windows-safe: num_workers=0, pin_memory=False
    tl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=False)
    vl = DataLoader(valid_ds, batch_size=batch_size*2, shuffle=False, num_workers=0, pin_memory=False)
    return tl, vl, trn_idx, val_idx

# Smoke test: build loaders for the fold of the first row and pull one training batch
first_fold = int(df["fold"].iloc[0])
tl, vl, tri, vai = make_loaders_for_fold_quick(fold=first_fold, batch_size=8)
xb, mb, yb = next(iter(tl))
print("Batch shapes:", *(tuple(t.shape) for t in (xb, mb, yb)))


  from .autonotebook import tqdm as notebook_tqdm


Batch shapes: (8, 3, 320, 320) (8, 11) (8,)


In [5]:
# ============ Metadata-only quick baseline ============

import torch.nn as nn
from sklearn.metrics import roc_auc_score, average_precision_score

# Use the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

def roc_pr(y_true, y_prob):
    """Return ``(ROC AUC, average precision)`` for the given labels and scores."""
    roc_auc = roc_auc_score(y_true, y_prob)
    avg_precision = average_precision_score(y_true, y_prob)
    return roc_auc, avg_precision

def pos_weight_from_labels(y):
    """Compute a dampened positive-class weight from binary labels.

    Args:
        y: array-like of 0/1 labels (ndarray, list, pandas Series, ...).

    Returns:
        ``sqrt(neg / pos)`` as a Python float, where ``pos`` is the sum of the
        labels and ``neg`` the remaining count. A small epsilon on both terms
        avoids division by zero, so empty input yields 1.0.
    """
    # np.asarray accepts lists and Series too; the original required an ndarray
    y = np.asarray(y, dtype=np.float32).ravel()
    pos = y.sum()
    neg = len(y) - pos
    # Square root softens the raw imbalance ratio
    return float(np.sqrt((neg + 1e-6) / (pos + 1e-6)))

class MetaMLP(nn.Module):
    """Small MLP that maps a metadata vector to a single logit per sample.

    Layout: ``dim_in -> 64 -> 32 -> 1``; each hidden layer is followed by
    ReLU, BatchNorm1d and Dropout (0.2, then 0.1).
    """

    def __init__(self, dim_in):
        super().__init__()
        wide, narrow = 64, 32
        layers = [
            nn.Linear(dim_in, wide),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(wide),
            nn.Dropout(p=0.2),
            nn.Linear(wide, narrow),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(narrow),
            nn.Dropout(p=0.1),
            nn.Linear(narrow, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, m):
        """Return logits of shape ``(batch,)`` for metadata ``m`` of shape ``(batch, dim_in)``."""
        logits = self.net(m)
        return logits.squeeze(1)

def train_meta_once(train_loader, valid_loader, epochs=2, lr=1e-3, pos_weight=1.0):
    """Train a fresh MetaMLP on metadata only and validate after every epoch.

    Both loaders yield ``(image, metadata, target)`` batches; the image is ignored.

    Returns:
        ``(best_auc, y_true, y_prob)`` taken from the epoch with the highest
        validation ROC AUC.
    """
    model = MetaMLP(META_DIM).to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight], device=DEVICE))

    best_auc, best_p, best_y = -1, None, None
    for ep in range(epochs):
        # ---- training pass ----
        model.train()
        for _, meta_batch, target_batch in train_loader:
            meta_batch = meta_batch.to(DEVICE)
            target_batch = target_batch.to(DEVICE)
            optimizer.zero_grad(set_to_none=True)
            batch_loss = criterion(model(meta_batch), target_batch)
            batch_loss.backward()
            optimizer.step()

        # ---- validation pass ----
        model.eval()
        val_probs, val_targets = [], []
        with torch.no_grad():
            for _, meta_batch, target_batch in valid_loader:
                logits = model(meta_batch.to(DEVICE))
                val_probs.append(torch.sigmoid(logits).cpu().numpy())
                val_targets.append(target_batch.numpy())
        y = np.concatenate(val_targets)
        p = np.concatenate(val_probs)
        auc, ap = roc_pr(y, p)

        # Keep the predictions of the best epoch so far
        if auc > best_auc:
            best_auc, best_p, best_y = auc, p, y
        print(f"[Meta] ep {ep} AUC {auc:.4f} | AP {ap:.4f}")
    return best_auc, best_y, best_p

# Run on a single fold for speed: hold out the fold that the first row belongs to
FOLD = int(df["fold"].iloc[0])
train_loader, valid_loader, trn_idx, val_idx = make_loaders_for_fold_quick(FOLD, batch_size=8)

# Class weight derived from the training labels only
train_targets = df["target"].values[trn_idx]
pos_w = pos_weight_from_labels(train_targets)

best_auc_meta, yv_m, pv_m = train_meta_once(
    train_loader,
    valid_loader,
    epochs=2,
    lr=1e-3,
    pos_weight=pos_w,
)
print(f"[Meta] Fold {FOLD} best AUC: {best_auc_meta:.4f}")


[Meta] ep 0 AUC 0.7293 | AP 0.4017
[Meta] ep 1 AUC 0.7091 | AP 0.4445
[Meta] Fold 0 best AUC: 0.7293


In [6]:
# ============ Image+metadata quick model ============

# Toggle between the two image+metadata models defined in this cell:
#   True  -> timm EfficientNet backbone (requires the `timm` package)
#   False -> the TinyCNN fallback in the else-branch (e.g. when timm is not installed)
USE_EFFICIENTNET = True

if USE_EFFICIENTNET:
    # Imported inside the branch so the fallback path does not need timm installed
    import timm
    # Mixed-precision helpers from the legacy torch.cuda.amp namespace
    from torch.cuda.amp import autocast, GradScaler

    class ImgMetaModel(nn.Module):
        """Image backbone plus a metadata branch, fused by concatenation into one logit.

        The timm backbone is created with ``num_classes=0`` and average pooling,
        so it returns pooled features. The metadata branch maps ``meta_dim`` to 32
        features. Both are concatenated and fed to a single linear head.
        """

        def __init__(self, backbone="tf_efficientnet_b0", meta_dim=META_DIM, dropout=0.2, pretrained=True):
            super().__init__()
            self.backbone = timm.create_model(
                backbone,
                pretrained=pretrained,
                num_classes=0,
                global_pool="avg",
            )
            feat_dim = self.backbone.num_features
            self.img_do = nn.Dropout(dropout)

            meta_layers = [
                nn.Linear(meta_dim, 64),
                nn.ReLU(inplace=True),
                nn.BatchNorm1d(64),
                nn.Dropout(0.1),
                nn.Linear(64, 32),
                nn.ReLU(inplace=True),
            ]
            self.meta = nn.Sequential(*meta_layers)
            self.head = nn.Sequential(nn.Linear(feat_dim + 32, 1))

        def forward(self, x, m):
            """Return one logit per sample for image batch ``x`` and metadata batch ``m``."""
            img_feat = self.img_do(self.backbone(x))
            meta_feat = self.meta(m)
            fused = torch.cat([img_feat, meta_feat], dim=1)
            return self.head(fused).squeeze(1)

    def train_img_once(train_loader, valid_loader, epochs=2, lr=3e-4, pos_weight=1.0, backbone="tf_efficientnet_b0"):
        """Train a fresh ImgMetaModel and validate after every epoch.

        Args:
            train_loader, valid_loader: loaders yielding ``(image, metadata, target)`` batches.
            epochs: number of passes over ``train_loader``.
            lr: AdamW learning rate.
            pos_weight: positive-class weight passed to ``BCEWithLogitsLoss``.
            backbone: timm model name used for the image backbone.

        Returns:
            ``(best_auc, y_true, y_prob)`` taken from the epoch with the highest
            validation ROC AUC.
        """
        model = ImgMetaModel(backbone=backbone, meta_dim=META_DIM, pretrained=True).to(DEVICE)
        opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
        bce = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight], device=DEVICE))

        # Mixed precision through torch.amp (torch.cuda.amp.autocast / GradScaler are
        # deprecated). AMP is enabled only on CUDA; on CPU the scaler and autocast
        # are explicit no-ops rather than warning and disabling themselves.
        use_amp = str(DEVICE).startswith("cuda")
        amp_device = "cuda" if use_amp else "cpu"
        scaler = torch.amp.GradScaler(amp_device, enabled=use_amp)

        best_auc, best_p, best_y = -1, None, None
        for ep in range(epochs):
            model.train()
            for xb, mb, yb in train_loader:
                xb, mb, yb = xb.to(DEVICE), mb.to(DEVICE), yb.to(DEVICE)
                opt.zero_grad(set_to_none=True)
                with torch.amp.autocast(device_type=amp_device, enabled=use_amp):
                    loss = bce(model(xb, mb), yb)
                # With AMP disabled, scale() returns the loss unchanged and step() just calls opt.step()
                scaler.scale(loss).backward()
                scaler.step(opt)
                scaler.update()

            model.eval()
            probs, ys = [], []
            with torch.no_grad():
                for xb, mb, yb in valid_loader:
                    p = torch.sigmoid(model(xb.to(DEVICE), mb.to(DEVICE))).cpu().numpy()
                    probs.append(p)
                    ys.append(yb.numpy())
            y = np.concatenate(ys)
            p = np.concatenate(probs)
            auc, ap = roc_pr(y, p)
            # Keep the predictions of the best epoch so far
            if auc > best_auc:
                best_auc, best_p, best_y = auc, p, y
            print(f"[Img+Meta b0] ep {ep} AUC {auc:.4f} | AP {ap:.4f}")
        return best_auc, best_y, best_p

    # Train and validate the EfficientNet model on the loaders built for FOLD
    best_auc_img, yv_i, pv_i = train_img_once(
        train_loader,
        valid_loader,
        epochs=2,
        lr=3e-4,
        pos_weight=pos_w,
        backbone="tf_efficientnet_b0",
    )
    print(f"[Img+Meta b0] Fold {FOLD} best AUC: {best_auc_img:.4f}")

else:
    # Tiny fallback if timm not available
    class TinyCNN(nn.Module):
        """Two-conv image encoder plus a one-layer metadata encoder, fused into one logit.

        The image branch ends in global average pooling and yields 32 features.
        The metadata branch maps ``meta_dim`` to 32 features. The head sees the
        64 concatenated features.
        """

        def __init__(self, meta_dim):
            super().__init__()
            conv_layers = [
                nn.Conv2d(3, 16, 3, padding=1),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(2),
                nn.Conv2d(16, 32, 3, padding=1),
                nn.ReLU(inplace=True),
                nn.AdaptiveAvgPool2d(1),
            ]
            self.conv = nn.Sequential(*conv_layers)
            self.meta = nn.Sequential(nn.Linear(meta_dim, 32), nn.ReLU(inplace=True))
            self.head = nn.Linear(32 + 32, 1)

        def forward(self, x, m):
            """Return one logit per sample for image batch ``x`` and metadata batch ``m``."""
            img_feat = self.conv(x).flatten(1)
            meta_feat = self.meta(m)
            fused = torch.cat([img_feat, meta_feat], dim=1)
            return self.head(fused).squeeze(1)

    def train_tiny_once(train_loader, valid_loader, epochs=2, lr=1e-3, pos_weight=1.0):
        """Train a fresh TinyCNN in full precision and validate after every epoch.

        Returns:
            ``(best_auc, y_true, y_prob)`` taken from the epoch with the highest
            validation ROC AUC.
        """
        model = TinyCNN(META_DIM).to(DEVICE)
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight], device=DEVICE))

        best_auc, best_p, best_y = -1, None, None
        for ep in range(epochs):
            # ---- training pass ----
            model.train()
            for img_batch, meta_batch, target_batch in train_loader:
                img_batch = img_batch.to(DEVICE)
                meta_batch = meta_batch.to(DEVICE)
                target_batch = target_batch.to(DEVICE)
                optimizer.zero_grad(set_to_none=True)
                batch_loss = criterion(model(img_batch, meta_batch), target_batch)
                batch_loss.backward()
                optimizer.step()

            # ---- validation pass ----
            model.eval()
            val_probs, val_targets = [], []
            with torch.no_grad():
                for img_batch, meta_batch, target_batch in valid_loader:
                    logits = model(img_batch.to(DEVICE), meta_batch.to(DEVICE))
                    val_probs.append(torch.sigmoid(logits).cpu().numpy())
                    val_targets.append(target_batch.numpy())
            y = np.concatenate(val_targets)
            p = np.concatenate(val_probs)
            auc, ap = roc_pr(y, p)

            # Keep the predictions of the best epoch so far
            if auc > best_auc:
                best_auc, best_p, best_y = auc, p, y
            print(f"[Img+Meta Tiny] ep {ep} AUC {auc:.4f} | AP {ap:.4f}")
        return best_auc, best_y, best_p

    # Train and validate the TinyCNN fallback on the loaders built for FOLD
    best_auc_img, yv_i, pv_i = train_tiny_once(
        train_loader,
        valid_loader,
        epochs=2,
        lr=1e-3,
        pos_weight=pos_w,
    )
    print(f"[Img+Meta Tiny] Fold {FOLD} best AUC: {best_auc_img:.4f}")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  scaler = GradScaler()
  with autocast():


[Img+Meta b0] ep 0 AUC 0.9168 | AP 0.7584


  with autocast():


[Img+Meta b0] ep 1 AUC 0.8637 | AP 0.6755
[Img+Meta b0] Fold 0 best AUC: 0.9168
