In [None]:
import os
import random
import cv2
import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as T
import torchvision.transforms.functional as F_v
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler, ConcatDataset
from torchvision import models
from tqdm.auto import tqdm

RANDOM_SEED = 42


def _seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN kernels."""
    # Same order as before: Python -> NumPy -> Torch (CPU) -> Torch (all CUDA devices).
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    # Ask cuDNN for deterministic algorithms and turn off its auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting it
    # here presumably only affects processes launched afterwards — confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)


_seed_everything(RANDOM_SEED)

In [None]:
# Kaggle input locations for the ISIC challenge data.
# NOTE(review): the two un-processed metadata CSVs below are not read by the cells
# visible in this section — confirm whether they are still needed.
TRAIN_METADATA_CSV = "/kaggle/input/kaggleisic-challenge/new-train-metadata.csv"
TEST_METADATA_CSV = "/kaggle/input/kaggleisic-challenge/students-test-metadata.csv"
# Training metadata read by load_metadata_dataset() when is_augmented is False.
TRAIN_METADATA_PROCESSED_CSV = (
    "/kaggle/input/kaggleisic-challenge/train-metadata-processed.csv"
)
# Test metadata; always read by load_metadata_dataset().
TEST_METADATA_PROCESSED_CSV = (
    "/kaggle/input/kaggleisic-challenge/test-metadata-processed.csv"
)
# HDF5 image archives passed to ISIC_HDF5_Dataset by load_hdf5_dataset().
TRAIN_HDF5 = "/kaggle/input/kaggleisic-challenge/train-image.hdf5"
TEST_HDF5 = "/kaggle/input/kaggleisic-challenge/test-image.hdf5"

# Augmented training metadata and images; both are selected when is_augmented is True.
TRAIN_METADATA_AUGMENTED_CSV = (
    "/kaggle/input/kaggleisic-challenge/train-metadata-augmented.csv"
)
TRAIN_AUGMENTED_HDF5 = "/kaggle/input/kaggleisic-challenge/train-image-augmented.hdf5"

# Output locations under the Kaggle working directory.
# NOTE(review): the final run in this section passes its own output paths, so these
# two constants look unused here — verify.
OUTPUT_FINAL_MODEL = "/kaggle/working/final_model.pth"
OUTPUT_FINAL_SUBMISSION = "/kaggle/working/final_submission.csv"

# Metadata column names; presumably dropped during tabular preprocessing.
# Not referenced by the cells visible in this section — verify against the caller.
DROP_COLUMNS = [
    "image_type",
    "patient_id",
    "copyright_license",
    "attribution",
    "anatom_site_general",
    "tbp_lv_location_simple",
]

In [None]:
class ISIC_HDF5_Dataset(Dataset):
    """
    A PyTorch Dataset that loads images from an HDF5 file given a DataFrame of IDs.

    Every item is decoded with OpenCV, converted to RGB, wrapped in a PIL image and
    passed through ``transform``. When no transform is given, a default
    resize-to-224x224 + ``ToTensor`` pipeline is used instead.
    """

    def __init__(
        self, df: pd.DataFrame, hdf5_path: str, transform=None, is_labelled: bool = True
    ):
        """
        Args:
            df (pd.DataFrame): DataFrame containing 'isic_id' and optionally 'target'.
            hdf5_path (str): Path to the HDF5 file containing images.
            transform (callable): Optional transforms to be applied on a sample.
            is_labelled (bool): Whether the dataset includes labels (for train/val).
        """
        self.df = df.reset_index(drop=True)
        self.hdf5_path = hdf5_path
        self.transform = transform
        self.is_labelled = is_labelled
        # Fallback pipeline, built once here instead of on every __getitem__ call.
        self._default_transform = T.Compose([T.Resize((224, 224)), T.ToTensor()])

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """
        Load and transform the sample at position ``idx``.

        Returns:
            ``(image, label, isic_id)`` when ``is_labelled`` is True, where ``label``
            is a float tensor built from the row's 'target' value; otherwise
            ``(image, isic_id)``.
        """
        row = self.df.iloc[idx]
        isic_id = row["isic_id"]

        # Load image from HDF5 (NumPy array, H x W x C) and convert it to a PIL image,
        # which is what the PIL-style transforms expect.
        image_rgb = self._load_image_from_hdf5(isic_id)
        image_pil = F_v.to_pil_image(image_rgb)

        if self.transform is not None:
            transform = self.transform
        else:
            transform = self._default_transform
        image = transform(image_pil)

        if not self.is_labelled:
            return image, isic_id

        label = torch.tensor(row["target"]).float()
        return image, label, isic_id

    def _load_image_from_hdf5(self, isic_id: str):
        """
        Loads and decodes an image from HDF5 by isic_id.
        Returns a NumPy array in RGB format (H x W x 3).

        Raises:
            ValueError: If OpenCV cannot decode the bytes stored under ``isic_id``.
        """
        # The file is opened per call, so no handle is kept on the instance.
        with h5py.File(self.hdf5_path, "r") as hf:
            encoded_bytes = hf[isic_id][()]  # encoded image bytes (uint8 array)

        # Decode the image bytes with OpenCV (returns BGR, or None on failure)
        image_bgr = cv2.imdecode(encoded_bytes, cv2.IMREAD_COLOR)
        if image_bgr is None:
            # Fail with the offending id instead of an opaque cvtColor error.
            raise ValueError(
                f"Could not decode image '{isic_id}' from '{self.hdf5_path}'"
            )

        # Convert to RGB
        return cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)

In [None]:
def load_metadata_dataset(
    train_frac=0.8, seed=42, is_subsampled=False, is_augmented=False
) -> tuple:
    """
    Load the metadata CSVs and split the training table into train/validation parts.

    Args:
        train_frac (float): Fraction of the training table kept for training.
        seed (int): Random seed for the stratified split and, when requested, for the
            balanced subsampling.
        is_subsampled (bool): If True, replace the train and validation frames with
            the output of ``create_balanced_subset``.
        is_augmented (bool): If True, read the augmented training metadata CSV
            instead of the processed one.
    Returns:
        tuple: ``(train_dataset, valid_dataset, test_dataset)`` DataFrames, each with
        a fresh 0..n-1 index.
    """
    # NOTE(review): with is_augmented=True the split is made on the augmented table;
    # if augmented rows derive from the same source image, copies may end up on both
    # sides of the split — verify against how the augmented CSV was built.
    train_file = (
        TRAIN_METADATA_AUGMENTED_CSV if is_augmented else TRAIN_METADATA_PROCESSED_CSV
    )

    # Load the metadata CSV files
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(TEST_METADATA_PROCESSED_CSV)

    # Perform stratified train/validation split to maintain class distribution
    train_dataset, valid_dataset = train_test_split(
        train_df, train_size=train_frac, stratify=train_df["target"], random_state=seed
    )

    # Reset index for train and validation datasets
    train_dataset = train_dataset.reset_index(drop=True)
    valid_dataset = valid_dataset.reset_index(drop=True)
    test_dataset = test_df.reset_index(drop=True)

    # Optionally create a balanced subset; forward the caller's seed so the whole
    # pipeline is controlled by a single value (previously the default was always used).
    if is_subsampled:
        train_dataset = create_balanced_subset(train_dataset, seed=seed)
        valid_dataset = create_balanced_subset(valid_dataset, seed=seed)

    print(f"train_dataset shape: {train_dataset.shape}")
    print(f"valid_dataset shape: {valid_dataset.shape}")
    print(f"test_dataset shape:  {test_dataset.shape}")

    return train_dataset, valid_dataset, test_dataset


def load_hdf5_dataset(
    transform: T.Compose,
    train_frac=0.8,
    seed=42,
    is_subsampled=False,
    is_augmented=False,
) -> tuple[ISIC_HDF5_Dataset, ISIC_HDF5_Dataset, ISIC_HDF5_Dataset]:
    """
    Load the ISIC dataset from HDF5 files and split it into train, validation, and test sets.
    Args:
        transform (T.Compose): Transformations to apply to the images. The same
            transform object is handed to all three datasets.
        train_frac (float): Fraction of the dataset to use for training.
        seed (int): Random seed for reproducibility.
        is_subsampled (bool): Forwarded to ``load_metadata_dataset``; requests the
            balanced train/validation subsets.
        is_augmented (bool): If True, use the augmented metadata CSV and the
            augmented training HDF5 file.
    Returns:
        tuple: A tuple containing the train, validation, and test datasets. Train and
        validation are labelled; the test dataset is created with ``is_labelled=False``.
    """
    # Load the metadata CSV files
    train_df_sub, valid_df_sub, test_df = load_metadata_dataset(
        train_frac=train_frac,
        seed=seed,
        is_subsampled=is_subsampled,
        is_augmented=is_augmented,
    )

    # Train and validation rows come from the same CSV, so they share one image file.
    train_file = TRAIN_AUGMENTED_HDF5 if is_augmented else TRAIN_HDF5

    # Create Datasets
    train_dataset = ISIC_HDF5_Dataset(
        df=train_df_sub, hdf5_path=train_file, transform=transform, is_labelled=True
    )

    valid_dataset = ISIC_HDF5_Dataset(
        df=valid_df_sub, hdf5_path=train_file, transform=transform, is_labelled=True
    )

    test_dataset = ISIC_HDF5_Dataset(
        df=test_df, hdf5_path=TEST_HDF5, transform=transform, is_labelled=False
    )

    return train_dataset, valid_dataset, test_dataset


def create_balanced_subset(
    df: pd.DataFrame, target_col="target", seed=42, neg_pos_ratio=2
) -> pd.DataFrame:
    """
    Keep every positive row and a random sample of the negative rows.

    Args:
        df (pd.DataFrame): Table with a binary label column.
        target_col (str): Name of the label column (1 = positive, 0 = negative).
        seed (int): Random state for both the negative sampling and the final shuffle.
        neg_pos_ratio (int | float): Negatives to keep per positive. The default of 2
            reproduces the original 2:1 ratio. The count is truncated to an integer
            and capped at the number of negatives available.
    Returns:
        pd.DataFrame: Shuffled subset with a fresh 0..n-1 index.
    Raises:
        ValueError: If ``neg_pos_ratio`` is negative.
    """
    if neg_pos_ratio < 0:
        raise ValueError("neg_pos_ratio must be non-negative")

    # Just keep all the cancer cases and subsample the healthy cases
    positives = df[df[target_col] == 1]
    all_negatives = df[df[target_col] == 0]

    n_negatives = min(int(len(positives) * neg_pos_ratio), len(all_negatives))
    negatives = all_negatives.sample(n=n_negatives, random_state=seed)

    # Shuffle so the positives are not grouped at the top of the frame.
    balanced_df = (
        pd.concat([positives, negatives])
        .sample(frac=1, random_state=seed)
        .reset_index(drop=True)
    )
    return balanced_df

In [None]:
# Metadata frames for the balanced, augmented split (default train_frac and seed).
metadata_splits = load_metadata_dataset(is_subsampled=True, is_augmented=True)
train_meta_df, valid_meta_df, test_meta_df = metadata_splits

In [None]:
# Per-channel normalisation statistics (the values commonly used with
# ImageNet-pretrained torchvision models).
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Resize -> tensor -> normalise; the same pipeline is used for all three splits.
view_transform = T.Compose(
    [
        T.Resize((224, 224)),
        T.ToTensor(),
        T.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

hdf5_splits = load_hdf5_dataset(
    transform=view_transform, is_subsampled=True, is_augmented=True
)
train_hdf5_dataset, valid_hdf5_dataset, test_hdf5_dataset = hdf5_splits

In [None]:
BATCH_SIZE = 8  # batch size
NUM_SAMPLES = 500  # samples per epoch
NUM_WORKERS = 0

# Inverse class frequency, rescaled so the class weights sum to 1.
class_counts = train_meta_df["target"].value_counts().sort_index()
inverse_frequency = 1.0 / class_counts
class_weights = inverse_frequency / inverse_frequency.sum()

# One weight per training row, looked up from that row's class.
sample_weights = train_meta_df["target"].map(class_weights).to_numpy()

# Draws NUM_SAMPLES indices per epoch, with replacement.
sampler = WeightedRandomSampler(sample_weights, NUM_SAMPLES, replacement=True)

In [None]:
# Upper bound on epochs in train_valid(); also the default epoch count of train_eval().
EPOCHS = 50
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 1e-3
# StepLR settings: passed as step_size (in scheduler steps, one per epoch here) and gamma.
SCHEDULER_STEP_SIZE = 4
SCHEDULER_GAMMA = 0.5
# Validation ROC AUC must exceed the best value so far by more than this to count
# as an improvement for early stopping.
MIN_DELTA = 0.001


def train_valid(model, train_loader, valid_loader, patience=5, is_multimodal=False):
    """
    Train ``model`` with a validation pass after every epoch and early stopping.

    Uses BCE-with-logits loss, Adam and a StepLR schedule. Training stops once the
    validation ROC AUC has failed to beat the best value by more than ``MIN_DELTA``
    for ``patience`` epochs, or after ``EPOCHS`` epochs.

    Args:
        model: Network to train; moved to CUDA when available, otherwise CPU.
        train_loader: Loader passed to the per-epoch training routine.
        valid_loader: Loader passed to the per-epoch validation routine.
        patience (int): Number of non-improving epochs tolerated before stopping.
        is_multimodal (bool): Selects the ``*_multimodal`` routines instead of the
            ``*_singles`` ones.
    Returns:
        int: Number of the last epoch that was run. When early stopping triggers this
        includes the non-improving epochs that exhausted ``patience``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    scheduler = StepLR(optimizer, step_size=SCHEDULER_STEP_SIZE, gamma=SCHEDULER_GAMMA)

    # Per-epoch ROC AUC history, used for the plot at the end
    train_history, valid_history = [], []

    best_valid_auc = 0
    epochs_no_improve = 0

    for epoch in range(1, EPOCHS + 1):
        train_step = train_multimodal if is_multimodal else train_singles
        train_auc = train_step(
            model, device, train_loader, optimizer, criterion, epoch
        )

        valid_step = validate_multimodal if is_multimodal else validate_singles
        valid_auc = valid_step(model, device, valid_loader, criterion, epoch)

        train_history.append(train_auc)
        valid_history.append(valid_auc)

        scheduler.step()
        current_lr = scheduler.get_last_lr()[0]
        print(f"Learning Rate: {current_lr}")

        # Early stopping bookkeeping: only a gain above the threshold resets the counter
        improved = valid_auc > best_valid_auc + MIN_DELTA
        if improved:
            best_valid_auc = valid_auc
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            print(
                f"No improvement in {epochs_no_improve} epochs (threshold of {MIN_DELTA})."
            )

        if epochs_no_improve >= patience:
            print(f"Early stopping triggered after {epoch} epochs!")
            break

    # Plot training and validation ROC AUC scores
    plot_train_valid_curves(train_history, valid_history)

    print("Training complete ✅")

    return epoch


def train_eval(
    model,
    full_loader,
    test_loader,
    early_stopping_epochs=EPOCHS,
    is_multimodal=False,
    output_model_file="model_final.pth",
    output_submission_file="submission.csv",
):
    """
    Train ``model`` for a fixed number of epochs, save it, and write test predictions.

    Args:
        model: Network to train; moved to CUDA when available, otherwise CPU.
        full_loader: Loader passed to the per-epoch training routine.
        test_loader: Loader passed to the evaluation routine.
        early_stopping_epochs (int): Number of epochs to train for.
        is_multimodal (bool): Selects the ``*_multimodal`` routines instead of the
            ``*_singles`` ones.
        output_model_file (str): Destination of the saved ``state_dict``.
        output_submission_file (str): Destination of the submission CSV.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    scheduler = StepLR(optimizer, step_size=SCHEDULER_STEP_SIZE, gamma=SCHEDULER_GAMMA)

    # Per-epoch training ROC AUC, used for the plot below
    auc_history = []

    for epoch in range(1, early_stopping_epochs + 1):
        train_step = train_multimodal if is_multimodal else train_singles
        auc_history.append(
            train_step(model, device, full_loader, optimizer, criterion, epoch)
        )

        scheduler.step()
        current_lr = scheduler.get_last_lr()[0]
        print(f"Learning Rate: {current_lr}")

    # Save final model
    output_model_path = output_model_file
    torch.save(model.state_dict(), output_model_path)
    print(f"Model saved to {output_model_path}")

    # Plot training ROC AUC scores
    plot_train_curves(auc_history)

    print("Training complete ✅")

    # Evaluate on test set
    evaluate = evaluate_multimodal if is_multimodal else evaluate_singles
    submission_df = evaluate(model, device, test_loader)

    # Save submission file
    submission_file_path = output_submission_file
    submission_df.to_csv(submission_file_path, index=False)

    print(
        f"Saved submission with {len(submission_df)} rows to {submission_file_path} ✅"
    )


def train_singles(model, device, train_loader, optimizer, criterion, epoch):
    """
    Run one training epoch for a single-input model.

    Each batch is expected to unpack as ``(inputs, labels, ids)``; the ids are ignored.

    Returns:
        The epoch's ROC AUC computed from the sigmoid of the logits, or 0.0 when
        ``roc_auc_score`` raises ``ValueError``.
    """
    model.train()
    running_loss = 0.0
    epoch_probs, epoch_targets = [], []

    progress = tqdm(train_loader, desc=f"Train Epoch {epoch}", leave=False)
    for batch_inputs, batch_targets, _ in progress:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        batch_logits = model(batch_inputs).view(-1)

        batch_loss = criterion(batch_logits, batch_targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        # Store probabilities (not raw logits) for the AUC computation
        epoch_probs.extend(torch.sigmoid(batch_logits).detach().cpu().numpy())
        epoch_targets.extend(batch_targets.cpu().numpy())

    avg_train_loss = running_loss / len(train_loader)
    try:
        train_auc = roc_auc_score(epoch_targets, epoch_probs)
    except ValueError:
        train_auc = 0.0  # In case only one class present

    print(
        f"Epoch {epoch}/{EPOCHS} | Train Loss: {avg_train_loss:.4f} | Train ROC AUC: {train_auc:.4f}"
    )
    return train_auc


def validate_singles(model, device, valid_loader, criterion, epoch):
    """
    Run one validation pass for a single-input model, without gradient tracking.

    Each batch is expected to unpack as ``(inputs, labels, ids)``; the ids are ignored.

    Returns:
        The ROC AUC computed from the sigmoid of the logits, or 0.0 when
        ``roc_auc_score`` raises ``ValueError``.
    """
    model.eval()
    val_loss = 0.0
    epoch_probs, epoch_targets = [], []

    progress = tqdm(valid_loader, desc=f"Validation Epoch {epoch}", leave=False)
    with torch.no_grad():
        for batch_inputs, batch_targets, _ in progress:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)

            batch_logits = model(batch_inputs).view(-1)
            val_loss += criterion(batch_logits, batch_targets).item()

            # Store probabilities (not raw logits) for the AUC computation
            epoch_probs.extend(torch.sigmoid(batch_logits).cpu().numpy())
            epoch_targets.extend(batch_targets.cpu().numpy())

    avg_val_loss = val_loss / len(valid_loader)
    try:
        val_auc = roc_auc_score(epoch_targets, epoch_probs)
    except ValueError:
        val_auc = 0.0

    print(
        f"Epoch {epoch}/{EPOCHS} | Validation Loss: {avg_val_loss:.4f} | Validation ROC AUC: {val_auc:.4f}"
    )
    return val_auc


def evaluate_singles(model, device, test_loader):
    """
    Predict probabilities for every sample of an unlabelled loader.

    Each batch is expected to unpack as ``(inputs, isic_ids)``.

    Returns:
        pd.DataFrame: Columns ``isic_id`` and ``target`` (sigmoid of the logit),
        sorted by ``isic_id`` with a fresh index. An empty loader yields an empty
        frame that still has both columns.
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for singles, isic_ids in tqdm(test_loader, desc="Inference on Test"):
            singles = singles.to(device)

            logits = model(singles).view(-1)
            probs = torch.sigmoid(logits).cpu().numpy()

            predictions.extend(
                {"isic_id": isic_id, "target": float(p)}
                for isic_id, p in zip(isic_ids, probs)
            )

    # Explicit columns keep the frame well-formed when no batch was produced;
    # without them sort_values(by="isic_id") raises KeyError on an empty frame.
    submission_df = pd.DataFrame(predictions, columns=["isic_id", "target"])
    submission_df = submission_df.sort_values(by="isic_id").reset_index(drop=True)

    return submission_df


def train_multimodal(model, device, train_loader, optimizer, criterion, epoch):
    """
    Run one training epoch for a model that takes images and metadata.

    Each batch is expected to unpack as ``(metadatas, images, labels)``; the metadata
    is cast to float and the model is called as ``model(images, metadatas)``.

    Returns:
        The epoch's ROC AUC computed from the sigmoid of the logits, or 0.0 when
        ``roc_auc_score`` raises ``ValueError``.
    """
    model.train()
    running_loss = 0.0
    epoch_probs, epoch_targets = [], []

    progress = tqdm(train_loader, desc=f"Train Epoch {epoch}", leave=False)
    for batch_meta, batch_images, batch_targets in progress:
        batch_meta = batch_meta.to(device).float()
        batch_images = batch_images.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        batch_logits = model(batch_images, batch_meta).view(-1)  # [batch_size]

        batch_loss = criterion(batch_logits, batch_targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        # Store probabilities (not raw logits) for the AUC computation
        epoch_probs.extend(torch.sigmoid(batch_logits).detach().cpu().numpy())
        epoch_targets.extend(batch_targets.cpu().numpy())

    avg_train_loss = running_loss / len(train_loader)
    try:
        train_auc = roc_auc_score(epoch_targets, epoch_probs)
    except ValueError:
        train_auc = 0.0

    print(
        f"Epoch {epoch}/{EPOCHS} | Train Loss: {avg_train_loss:.4f} | Train ROC AUC: {train_auc:.4f}"
    )
    return train_auc


def validate_multimodal(model, device, valid_loader, criterion, epoch):
    """
    Run one validation pass for a model that takes images and metadata.

    Each batch is expected to unpack as ``(metadatas, images, labels)``; the metadata
    is cast to float and the model is called as ``model(images, metadatas)``.
    No gradients are tracked.

    Returns:
        The ROC AUC computed from the sigmoid of the logits, or 0.0 when
        ``roc_auc_score`` raises ``ValueError``.
    """
    model.eval()
    val_loss = 0.0
    epoch_probs, epoch_targets = [], []

    progress = tqdm(valid_loader, desc=f"Validation Epoch {epoch}", leave=False)
    with torch.no_grad():
        for batch_meta, batch_images, batch_targets in progress:
            batch_meta = batch_meta.to(device).float()
            batch_images = batch_images.to(device)
            batch_targets = batch_targets.to(device)

            batch_logits = model(batch_images, batch_meta).view(-1)
            val_loss += criterion(batch_logits, batch_targets).item()

            # Store probabilities (not raw logits) for the AUC computation
            epoch_probs.extend(torch.sigmoid(batch_logits).cpu().numpy())
            epoch_targets.extend(batch_targets.cpu().numpy())

    avg_val_loss = val_loss / len(valid_loader)
    try:
        val_auc = roc_auc_score(epoch_targets, epoch_probs)
    except ValueError:
        val_auc = 0.0

    print(
        f"Epoch {epoch}/{EPOCHS} | Validation Loss: {avg_val_loss:.4f} | Validation ROC AUC: {val_auc:.4f}"
    )
    return val_auc


def evaluate_multimodal(model, device, test_loader):
    """
    Predict probabilities for every sample of an unlabelled image+metadata loader.

    Each batch is expected to unpack as ``(metadatas, images, isic_ids)``; the
    metadata is cast to float and the model is called as ``model(images, metadatas)``.

    Returns:
        pd.DataFrame: Columns ``isic_id`` and ``target`` (sigmoid of the logit),
        sorted by ``isic_id`` with a fresh index. An empty loader yields an empty
        frame that still has both columns.
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for metadatas, images, isic_ids in tqdm(test_loader, desc="Inference on Test"):
            metadatas, images = metadatas.to(device).float(), images.to(device)

            logits = model(images, metadatas).view(-1)
            probs = torch.sigmoid(logits).cpu().numpy()

            predictions.extend(
                {"isic_id": isic_id, "target": float(p)}
                for isic_id, p in zip(isic_ids, probs)
            )

    # Explicit columns keep the frame well-formed when no batch was produced;
    # without them sort_values(by="isic_id") raises KeyError on an empty frame.
    submission_df = pd.DataFrame(predictions, columns=["isic_id", "target"])
    submission_df = submission_df.sort_values(by="isic_id").reset_index(drop=True)

    return submission_df


def plot_train_valid_curves(train_aucs, val_aucs):
    """
    Plot per-epoch training and validation ROC AUC as two lines on one figure.

    Args:
        train_aucs (list): Training ROC AUC per epoch.
        val_aucs (list): Validation ROC AUC per epoch.
    """
    # Long-format table: one row per (epoch, phase) pair, training rows first
    epoch_axis = list(range(1, len(train_aucs) + 1))
    phase_labels = ["Train" for _ in train_aucs] + ["Validation" for _ in val_aucs]
    curve_df = pd.DataFrame(
        {
            "Epoch": epoch_axis + epoch_axis,
            "ROC AUC": train_aucs + val_aucs,
            "Phase": phase_labels,
        }
    )

    # Draw with seaborn on a fresh figure
    sns.set(style="whitegrid")
    plt.figure(figsize=(10, 6))
    sns.lineplot(data=curve_df, x="Epoch", y="ROC AUC", hue="Phase", marker="o")
    plt.title("Training vs Validation ROC AUC")
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.show()


def plot_train_curves(train_aucs):
    """
    Plot per-epoch training ROC AUC as a single line.

    Args:
        train_aucs (list): Training ROC AUC per epoch.
    """
    # One row per epoch, numbered from 1
    curve_df = pd.DataFrame(
        {
            "Epoch": list(range(1, len(train_aucs) + 1)),
            "ROC AUC": train_aucs,
        }
    )

    # Draw with seaborn on a fresh figure
    sns.set(style="whitegrid")
    plt.figure(figsize=(10, 6))
    sns.lineplot(data=curve_df, x="Epoch", y="ROC AUC", marker="o")
    plt.title("Training ROC AUC (Full Dataset)")
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.show()

# Only Images


In [None]:
# Training loader: the weighted sampler draws NUM_SAMPLES indices per epoch with
# replacement, so it also takes care of the ordering (no shuffle flag needed).
train_hdf5_loader = DataLoader(
    train_hdf5_dataset,
    batch_size=BATCH_SIZE,
    sampler=sampler,
    num_workers=NUM_WORKERS,
)

# Validation and test loaders keep the dataset order.
valid_hdf5_loader = DataLoader(
    valid_hdf5_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
)

test_hdf5_loader = DataLoader(
    test_hdf5_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
)

# Train + validation rows combined, used to fit the final model.
full_hdf5_dataset = ConcatDataset([train_hdf5_dataset, valid_hdf5_dataset])
# This loader feeds gradient updates, so it must be shuffled: without shuffling every
# epoch replays the same batch order, with all validation rows at the end.
full_hdf5_loader = DataLoader(
    full_hdf5_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
)

print(
    f"Train loader: {len(train_hdf5_loader)} batches (total = {NUM_SAMPLES} samples / {BATCH_SIZE} batches)"
)
print(f"Valid loader: {len(valid_hdf5_loader)} batches")
print(f"Test loader:  {len(test_hdf5_loader)} batches")
print(f"Full loader:  {len(full_hdf5_loader)} batches")

In [None]:
class ResNet50_Simple_ImageOnly(nn.Module):
    """
    ResNet-50 with pretrained weights whose final fully connected layer is replaced
    by a fresh ``nn.Linear`` producing ``out_features`` raw logits.
    """

    def __init__(self, out_features=1):
        """
        Args:
            out_features (int): Size of the replacement output layer.
        """
        super().__init__()

        # torchvision >= 0.13 deprecates `pretrained=True` in favour of the `weights`
        # enum (IMAGENET1K_V1 is the set `pretrained=True` maps to). Fall back to
        # the old flag when the enum is not available.
        weights_enum = getattr(models, "ResNet50_Weights", None)
        if weights_enum is not None:
            self.resnet = models.resnet50(weights=weights_enum.IMAGENET1K_V1)
        else:
            self.resnet = models.resnet50(pretrained=True)

        # Swap the stock classification head for one with the requested output size.
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, out_features)

    def forward(self, x):
        """Return the backbone's output for image batch ``x``."""
        return self.resnet(x)

In [None]:
model = ResNet50_Simple_ImageOnly()

# train_valid returns the number of the last epoch it ran.
stopping_epochs = train_valid(
    model=model,
    train_loader=train_hdf5_loader,
    valid_loader=valid_hdf5_loader,
    patience=5,
    is_multimodal=False,
)

In [None]:
# Fresh network for the final fit on train + validation data.
final_model = ResNet50_Simple_ImageOnly()

final_run_options = dict(
    early_stopping_epochs=stopping_epochs,
    is_multimodal=False,
    output_model_file="/kaggle/working/bench_hdf5.pth",
    output_submission_file="/kaggle/working/bench_hdf5.csv",
)
train_eval(final_model, full_hdf5_loader, test_hdf5_loader, **final_run_options)