In [119]:
# !pip install torch torchvision efficientnet-pytorch

# In this notebook I create two models:
- A tabular only model that is intended to be used to predict target based on only user input and the features generated by my image_feature_extractor.py program (uses OpenCV to extract geometric and color features from image directly).
- A hybrid model that combines my tabular model with a CNN by concatinating CNN feature embeddings with tabular data to retrain the tabular model.

In [232]:
import os
import itertools
from pathlib import Path
import time
from tqdm import tqdm
import h5py
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import numpy as np
import joblib
import pandas as pd
import polars as pl
import pickle
import json
from io import BytesIO
import timm
from torchvision import transforms
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import lightgbm as lgb
from sklearn.utils.class_weight import compute_sample_weight
from torch.utils.data.sampler import WeightedRandomSampler
import albumentations as A
from albumentations.pytorch import ToTensorV2
from PIL import Image

import warnings
warnings.filterwarnings("ignore")

In [233]:
root = Path('/kaggle/input/isic-2024-challenge')

train_path = root / 'train-metadata.csv'
test_path = root / 'test-metadata.csv'
subm_path = root / 'sample_submission.csv'

id_col = 'isic_id'
target_col = 'target'
group_col = 'patient_id'

err = 1e-5
sampling_ratio = 0.01
seed = 42

# Model 1 - Tabular only model

In [234]:
user_input_features_num = [
    # User Input Features
    'age_approx',                  # Ask user for age
    # 'sex',                         # Ask user for sex
    # 'anatom_site_general',         # Ask user to put in anatomical site (from list of options)
    'clin_size_long_diam_mm',      # Ask user for size of lesion in mm
    #'tbp_lv_location',	           # Classification of anatomical location, divides arms & legs to upper & lower; torso into thirds.+
    #'tbp_lv_location_simple',	   # Classification of anatomical location, simple.+
]

cv_features = [
    # Shape
    'tbp_lv_areaMM2',                    # Area of lesion (mm^2).+
    'tbp_lv_perimeterMM',                # Perimeter of lesion (mm).+

    'tbp_lv_minorAxisMM',	             # Smallest lesion diameter (mm).+ AXIS OF LEAST SECOND MOMENT!
    'tbp_lv_eccentricity',               # Eccentricity. (use min_inertia line and max_inertia line)
    'tbp_lv_area_perim_ratio',           # Border jaggedness, the ratio between lesions perimeter and area. Circular lesions will have low values; irregular shaped lesions will have higher values. Values range 0-10.+
    
    # Color Features
    'tbp_lv_A',	                         # A inside lesion.+
    'tbp_lv_Aext',	                     # A outside lesion.+
    'tbp_lv_B',                          # B inside lesion.+
    'tbp_lv_Bext',                       # B outside lesion.+
    'tbp_lv_C',                          # Chroma inside  lesion.+
    'tbp_lv_Cext',                       # Chroma outside lesion.+
    'tbp_lv_H',                          # Hue inside the lesion; calculated as the angle of A* and B* in LAB* color space. Typical values range from 25 (red) to 75 (brown).+
    'tbp_lv_Hext',                       # Hue outside lesion.+
    'tbp_lv_L',                          # L inside lesion.+
    'tbp_lv_Lext',                       # L outside lesion.+
    
    'tbp_lv_deltaA',	           # Average A contrast (inside vs. outside lesion).+
    'tbp_lv_deltaB',               # Color difference
    'tbp_lv_deltaL',	           # Average L contrast (inside vs. outside lesion).+
    # 'tbp_lv_stdL',	               # Standard deviation of L inside lesion.+
    # 'tbp_lv_stdLExt',	           # Standard deviation of L outside lesion.+
    'tbp_lv_deltaLBnorm',          # Contrast between the lesion and its immediate surrounding skin. Low contrast lesions tend to be faintly visible such as freckles; high contrast lesions tend to be those with darker pigment. Calculated as the average delta LB of the lesion relative to its immediate background in LAB* color space. Typical values range from 5.5 to 25.+
    # 'tbp_lv_color_std_mean',	   # Color irregularity, calculated as the variance of colors within the lesion's boundary.

    # Harder to implement
    # 'tbp_lv_radial_color_std_max',	     # Color asymmetry, a measure of asymmetry of the spatial distribution of color within the lesion. This score is calculated by looking at the average standard deviation in LAB* color space within concentric rings originating from the lesion center. Values range 0-10.+
    # 'tbp_lv_symm_2axis',	             # Border asymmetry; a measure of asymmetry of the lesion's contour about an axis perpendicular to the lesion's most symmetric axis. Lesions with two axes of symmetry will therefore have low scores (more symmetric), while lesions with only one or zero axes of symmetry will have higher scores (less symmetric). This score is calculated by comparing opposite halves of the lesion contour over many degrees of rotation. The angle where the halves are most similar identifies the principal axis of symmetry, while the second axis of symmetry is perpendicular to the principal axis. Border asymmetry is reported as the asymmetry value about this second axis. Values range 0-10.+
    # 'tbp_lv_symm_2axis_angle',	         # Lesion border asymmetry angle.+
    # 'tbp_lv_norm_border',	       # Border irregularity (0-10 scale); the normalized average of border jaggedness and asymmetry.+
    # 'tbp_lv_norm_color',	       # Color variation (0-10 scale); the normalized average of color asymmetry and color irregularity.+

    
]

derived_features = [
    'lesion_size_ratio',             # tbp_lv_minorAxisMM      / clin_size_long_diam_mm
    'lesion_shape_index',            # tbp_lv_areaMM2          / tbp_lv_perimeterMM **2
    'hue_contrast',                  # tbp_lv_H                - tbp_lv_Hext              abs
    'luminance_contrast',            # tbp_lv_L                - tbp_lv_Lext              abs
    'lesion_color_difference',       # tbp_lv_deltaA **2       + tbp_lv_deltaB **2 + tbp_lv_deltaL **2  sqrt  
    'perimeter_to_area_ratio',       # tbp_lv_perimeterMM      / tbp_lv_areaMM2
    'area_to_perimeter_ratio',       # tbp_lv_areaMM2          / tbp_lv_perimeterMM
    # 'color_consistency',             # tbp_lv_stdL             / tbp_lv_Lext
    # 'consistency_color',             # tbp_lv_stdL*tbp_lv_Lext / tbp_lv_stdL + tbp_lv_Lext
    'size_age_interaction',          # clin_size_long_diam_mm  * age_approx
    # 'hue_color_std_interaction',     # tbp_lv_H                * tbp_lv_color_std_mean
    'color_contrast_index',          # tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL + tbp_lv_deltaLBnorm
    'log_lesion_area',               # tbp_lv_areaMM2          + 1  np.log
    'normalized_lesion_size',        # clin_size_long_diam_mm  / age_approx
    'mean_hue_difference',           # tbp_lv_H                + tbp_lv_Hext    / 2
    'std_dev_contrast',              # tbp_lv_deltaA **2 + tbp_lv_deltaB **2 + tbp_lv_deltaL **2   / 3  np.sqrt
    'overall_color_difference',      # tbp_lv_deltaA           + tbp_lv_deltaB + tbp_lv_deltaL   / 3
    # 'color_variance_ratio',          # tbp_lv_color_std_mean   / tbp_lv_stdLExt
    'size_color_contrast_ratio',     # clin_size_long_diam_mm  / tbp_lv_deltaLBnorm
    'color_range',                   # abs(tbp_lv_L - tbp_lv_Lext) + abs(tbp_lv_A - tbp_lv_Aext) + abs(tbp_lv_B - tbp_lv_Bext)
    # 'shape_color_consistency',       # tbp_lv_eccentricity     * tbp_lv_color_std_mean
    'border_length_ratio',           # tbp_lv_perimeterMM      / pi * sqrt(tbp_lv_areaMM2 / pi)

    # # Harder to implement
    # 'age_size_symmetry_index',       # age_approx              * clin_size_long_diam_mm * tbp_lv_symm_2axis
    # 'index_age_size_symmetry',       # age_approx              * tbp_lv_areaMM2 * tbp_lv_symm_2axis
    # 'color_asymmetry_index',         # tbp_lv_symm_2axis       * tbp_lv_radial_color_std_max
    # 'color_shape_composite_index',   # tbp_lv_color_std_mean   + bp_lv_area_perim_ratio + tbp_lv_symm_2axis   / 3
    # 'symmetry_perimeter_interaction',# tbp_lv_symm_2axis       * tbp_lv_perimeterMM
    # 'comprehensive_lesion_index',    # tbp_lv_area_perim_ratio + tbp_lv_eccentricity + bp_lv_norm_color + tbp_lv_symm_2axis   / 4
    # 'border_color_interaction',      # tbp_lv_norm_border      * tbp_lv_norm_color
    # 'border_color_interaction_2',
    # 'lesion_severity_index',         # tbp_lv_norm_border      + tbp_lv_norm_color + tbp_lv_eccentricity / 3
    # 'shape_complexity_index',        # border_complexity       + lesion_shape_index
    # 'lesion_visibility_score',       # tbp_lv_deltaLBnorm      + tbp_lv_norm_color
    # 'symmetry_border_consistency',   # tbp_lv_symm_2axis       * tbp_lv_norm_border
    # 'consistency_symmetry_border',   # tbp_lv_symm_2axis       * tbp_lv_norm_border / (tbp_lv_symm_2axis + tbp_lv_norm_border)
    # 'border_complexity',             # tbp_lv_norm_border      + tbp_lv_symm_2axis
    # 'color_uniformity',              # tbp_lv_color_std_mean   / tbp_lv_radial_color_std_max
]

# Categoric USER INPUT
cat_cols = ['sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple']

new_feature_cols = user_input_features_num + cat_cols + cv_features + derived_features

In [235]:
def read_data(path):
    return (
        pl.read_csv(path)
        .with_columns(
            pl.col('age_approx').cast(pl.String).replace('NA', np.nan).cast(pl.Float64),
        )
        .with_columns(
            pl.col(pl.Float64).fill_nan(pl.col(pl.Float64).median()),
        )
        .with_columns(
            # Basic derived features we can calculate
            lesion_size_ratio              = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm'),
            lesion_shape_index             = pl.col('tbp_lv_areaMM2') / (pl.col('tbp_lv_perimeterMM') ** 2),
            hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')).abs(),
            luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs(),
            lesion_color_difference        = (pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2).sqrt(),
        )
        .with_columns(
            perimeter_to_area_ratio        = pl.col('tbp_lv_perimeterMM') / pl.col('tbp_lv_areaMM2'),
            area_to_perimeter_ratio        = pl.col('tbp_lv_areaMM2') / pl.col('tbp_lv_perimeterMM'),
            # color_consistency              = pl.col('tbp_lv_stdL') / pl.col('tbp_lv_Lext'),
            # consistency_color              = pl.col('tbp_lv_stdL') * pl.col('tbp_lv_Lext') / (pl.col('tbp_lv_stdL') + pl.col('tbp_lv_Lext')),
            size_age_interaction           = pl.col('clin_size_long_diam_mm') * pl.col('age_approx'),
            # hue_color_std_interaction      = pl.col('tbp_lv_H') * pl.col('tbp_lv_color_std_mean'),
        )
        .with_columns(
            color_contrast_index           = pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL') + pl.col('tbp_lv_deltaLBnorm'),
            log_lesion_area                = (pl.col('tbp_lv_areaMM2') + 1).log(),
            normalized_lesion_size         = pl.col('clin_size_long_diam_mm') / pl.col('age_approx'),
            mean_hue_difference            = (pl.col('tbp_lv_H') + pl.col('tbp_lv_Hext')) / 2,
            std_dev_contrast               = ((pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2) / 3).sqrt(),
            overall_color_difference       = (pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL')) / 3,
        )
        .with_columns(
            # color_variance_ratio           = pl.col('tbp_lv_color_std_mean') / pl.col('tbp_lv_stdLExt'),
            size_color_contrast_ratio      = pl.col('clin_size_long_diam_mm') / pl.col('tbp_lv_deltaLBnorm'),
            color_range                    = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs() + (pl.col('tbp_lv_A') - pl.col('tbp_lv_Aext')).abs() + (pl.col('tbp_lv_B') - pl.col('tbp_lv_Bext')).abs(),
            # shape_color_consistency        = pl.col('tbp_lv_eccentricity') * pl.col('tbp_lv_color_std_mean'),
            border_length_ratio            = pl.col('tbp_lv_perimeterMM') / (2 * np.pi * (pl.col('tbp_lv_areaMM2') / np.pi).sqrt()),
        ) # HARDER ONES AHEAD:
        # .with_columns(
        #     age_size_symmetry_index        = pl.col('age_approx') * pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_symm_2axis'),
        #     index_age_size_symmetry        = pl.col('age_approx') * pl.col('tbp_lv_areaMM2') * pl.col('tbp_lv_symm_2axis'),
        #     color_asymmetry_index          = pl.col('tbp_lv_radial_color_std_max') * pl.col('tbp_lv_symm_2axis'),
        #     color_shape_composite_index    = (pl.col('tbp_lv_color_std_mean') + pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_symm_2axis')) / 3,
        #     symmetry_perimeter_interaction = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_perimeterMM'),
        #     comprehensive_lesion_index     = (pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_eccentricity') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_symm_2axis')) / 4,
        #     border_color_interaction       = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color'),
        #     border_color_interaction_2     = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color') / (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color')),
        #     lesion_severity_index          = (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_eccentricity')) / 3,
        #     shape_complexity_index         = pl.col('border_complexity') + pl.col('lesion_shape_index'),
        #     lesion_visibility_score        = pl.col('tbp_lv_deltaLBnorm') + pl.col('tbp_lv_norm_color'),
        #     symmetry_border_consistency    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border'),
        #     consistency_symmetry_border    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border') / (pl.col('tbp_lv_symm_2axis') + pl.col('tbp_lv_norm_border')),
        #     border_complexity              = pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_symm_2axis'),
        #     color_uniformity               = pl.col('tbp_lv_color_std_mean') / (pl.col('tbp_lv_radial_color_std_max') + err),
        # )
        .to_pandas()
        .set_index(id_col)
    )

In [236]:
lgb_params = {
    'objective':        'binary',
    'verbosity':        -1,
    'n_iter':           200,
    'n_jobs':           2,
    'boosting_type':    'gbdt',
    'lambda_l1':        0.03335206514282942, 
    'lambda_l2':        0.005157393323802471, 
    'learning_rate':    0.030665870185795318, 
    'max_depth':        7, 
    'num_leaves':       239, 
    'colsample_bytree': 0.7573175155547233, 
    'colsample_bynode': 0.5005423904042993, 
    'bagging_fraction': 0.7937347683420382, 
    'bagging_freq':     4, 
    'min_data_in_leaf': 29, 
    'scale_pos_weight': 1.648349898918236,
}


estimator = VotingClassifier([
    ('lgb1', Pipeline([
        ('sampler', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=12)),
        ('classifier', lgb.LGBMClassifier(**lgb_params, random_state=12)),
    ])),
    ('lgb2', Pipeline([
        ('sampler', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=22)),
        ('classifier', lgb.LGBMClassifier(**lgb_params, random_state=22)),
    ])),
    ('lgb3', Pipeline([
        ('sampler', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=32)),
        ('classifier', lgb.LGBMClassifier(**lgb_params, random_state=32)),
    ])),
    ('lgb4', Pipeline([
        ('sampler', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=42)),
        ('classifier', lgb.LGBMClassifier(**lgb_params, random_state=42)),
    ])),
    ('lgb5', Pipeline([
        ('sampler', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=52)),
        ('classifier', lgb.LGBMClassifier(**lgb_params, random_state=52)),
    ])),
], voting='soft')

In [237]:
def custom_metric(estimator, X, y_true):
    y_hat = estimator.predict_proba(X)[:, 1]
    min_tpr = 0.80
    max_fpr = abs(1 - min_tpr)
    
    v_gt = abs(y_true - 1)
    v_pred = np.array([1.0 - x for x in y_hat])
    
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    
    return partial_auc

In [238]:
def preprocess(df_train, df_test, cat_cols, new_feature_cols):
    # Cast original categorical columns
    for col in cat_cols:
        df_train[col] = df_train[col].astype('category')
        df_test[col] = df_test[col].astype('category')
    
    encoder = OneHotEncoder(sparse_output=False, dtype=np.int32, handle_unknown='ignore')
    encoder.fit(df_train[cat_cols])
    
    new_cat_cols = [f'onehot_{i}' for i in range(len(encoder.get_feature_names_out()))]
    
    # Directly assign the transformed arrays
    df_train[new_cat_cols] = encoder.transform(df_train[cat_cols])
    df_train[new_cat_cols] = df_train[new_cat_cols].astype('category')
    df_test[new_cat_cols] = encoder.transform(df_test[cat_cols])
    df_test[new_cat_cols] = df_test[new_cat_cols].astype('category')

    # Update feature columns
    updated_feature_cols = [col for col in new_feature_cols if col not in cat_cols]
    updated_feature_cols.extend(new_cat_cols)

    joblib.dump(encoder, 'encoder.pkl')
    with open('feature_columns.json', 'w') as f:
        json.dump({
            "cat_cols": new_cat_cols,
            "new_feature_cols": updated_feature_cols
        }, f)
        
    return df_train, df_test, new_cat_cols, updated_feature_cols

In [239]:
def custom_cross_val(estimator, X, y, cv, groups):
    importances = []
    scores = []
    splits = list(cv.split(X, y, groups))
    
    # Progress bar for folds
    fold_iterator = tqdm(enumerate(splits), 
                        total=5, 
                        desc="Folds",
                        position=0)
    
    # for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    for fold_idx, (train_idx, val_idx) in fold_iterator:
        fold_start_time = time.time()
        
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        # Fit the model
        estimator.fit(X_train, y_train)
        
        # Get scores
        score = custom_metric(estimator, X_val, y_val)
        scores.append(score)
        
        # Extract and store feature importances from each LightGBM model
        fold_importance = np.zeros(len(new_feature_cols))
        
        # for name, pipeline in estimator.named_estimators_.items():
        for name, pipeline in estimator.named_estimators_.items(): 
            lgb_model = pipeline.named_steps['classifier']
            fold_importance += lgb_model.feature_importances_
        
        importances.append(fold_importance / len(estimator.named_estimators_))
        
        fold_time = time.time() - fold_start_time
        print(f"Fold {fold_idx + 1} completed in {fold_time:.2f} seconds")
        
        # Update fold progress bar with timing info
        fold_iterator.set_postfix({'Score': f'{score:.4f}'})
    
    print('\n')
    return np.array(scores), np.array(importances)

In [240]:
df_train = read_data(train_path)
df_competition_test = read_data(test_path)
df_subm = pd.read_csv(subm_path, index_col=id_col)

df_train, df_competition_test, cat_cols, new_feature_cols = preprocess(df_train, df_competition_test, cat_cols, new_feature_cols)

# Create holdout and continue as before
df_positive = df_train[df_train['target'] == 1]
df_negative = df_train[df_train['target'] == 0]

test_pos_size = int(0.1 * len(df_positive))
df_test_positive = df_positive.sample(n=test_pos_size, random_state=42)
df_train_positive = df_positive.drop(df_test_positive.index)

df_test_negative = df_negative.sample(n=test_pos_size, random_state=42)
df_train_negative = df_negative.drop(df_test_negative.index)

df_test = pd.concat([df_test_positive, df_test_negative], ignore_index=True).sample(frac=1, random_state=42)
df_train = pd.concat([df_train_positive, df_train_negative], ignore_index=True).sample(frac=1, random_state=42)

X = df_train[new_feature_cols]
y = df_train[target_col]
groups = df_train[group_col]
cv = StratifiedGroupKFold(5, shuffle=True, random_state=seed)

Unnamed: 0,age_approx,clin_size_long_diam_mm,tbp_lv_areaMM2,tbp_lv_perimeterMM,tbp_lv_minorAxisMM,tbp_lv_eccentricity,tbp_lv_area_perim_ratio,tbp_lv_A,tbp_lv_Aext,tbp_lv_B,...,onehot_28,onehot_29,onehot_30,onehot_31,onehot_32,onehot_33,onehot_34,onehot_35,onehot_36,onehot_37
17847,80.0,2.74,4.372302,9.420487,2.191781,0.680063,20.297220,17.320270,14.017710,29.256130,...,1,0,0,0,0,0,0,0,1,0
209585,60.0,9.13,27.453556,23.872196,3.736990,0.923051,20.758029,21.628310,10.168158,37.833219,...,0,0,0,0,0,0,0,0,1,0
202064,75.0,4.25,7.975230,12.500660,2.328767,0.812933,19.593990,18.658840,17.396250,24.560550,...,0,0,0,0,0,1,0,0,0,0
171342,65.0,2.59,4.053293,8.999792,2.199336,0.642109,19.982828,20.323963,17.244356,28.085807,...,1,0,0,0,0,0,0,0,1,0
344250,50.0,3.10,4.559955,8.952786,1.937279,0.803514,17.577450,16.198720,14.167720,25.286010,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,55.0,3.29,7.093263,10.095683,2.465753,0.597408,14.368959,18.903081,13.470260,24.052698,...,0,0,0,0,0,0,1,0,0,0
365838,55.0,2.60,4.222181,7.870664,2.054795,0.603512,14.671885,16.891008,14.745181,16.769671,...,0,0,0,0,0,1,0,0,0,0
131932,55.0,3.87,9.176206,11.291290,2.976213,0.637141,13.893890,20.056750,12.951640,32.592500,...,0,0,0,0,0,0,0,1,0,0
146867,80.0,4.87,8.425596,14.017250,2.617907,0.859545,23.319800,24.251860,21.976400,31.738420,...,0,0,0,0,1,0,0,0,0,0


In [169]:
# Run cross validation with timing
print("Starting cross-validation...")
total_start_time = time.time()

# Run cross validation
scores, all_fold_importances = custom_cross_val(
    estimator=estimator,
    X=X, 
    y=y,
    cv=cv,
    groups=groups
)

total_time = time.time() - total_start_time
# Print timing results
print(f"\nTraining Complete!")
print(f"Total time: {total_time:.2f} seconds ({total_time/60:.2f} minutes)")

# Print scores (equivalent to your original output)
print(f"\nModel Performance:")
print(f"Mean score: {np.mean(scores):.4f}")
print(f"Score std: {np.std(scores):.4f}")
print(f"All scores: {scores}")

# Print feature importances
mean_importances = all_fold_importances.mean(axis=0)
importance_df = pd.DataFrame({
    'feature': new_feature_cols,
    'importance': mean_importances
}).sort_values('importance', ascending=False)

print("\nTop 50 Most Important Features:")
print(importance_df.head(50))

Starting cross-validation...


Folds:  20%|██        | 1/5 [00:10<00:42, 10.73s/it, Score=0.1535]

Fold 1 completed in 10.72 seconds


Folds:  40%|████      | 2/5 [00:21<00:31, 10.52s/it, Score=0.1524]

Fold 2 completed in 10.38 seconds


Folds:  60%|██████    | 3/5 [00:32<00:22, 11.00s/it, Score=0.1688]

Fold 3 completed in 11.58 seconds


Folds:  80%|████████  | 4/5 [00:44<00:11, 11.30s/it, Score=0.1695]

Fold 4 completed in 11.75 seconds


Folds: 100%|██████████| 5/5 [00:56<00:00, 11.23s/it, Score=0.1452]

Fold 5 completed in 11.71 seconds



Training Complete!
Total time: 57.37 seconds (0.96 minutes)

Model Performance:
Mean score: 0.1579
Score std: 0.0096
All scores: [0.15354879 0.152416   0.16877795 0.1694917  0.14521481]

Top 50 Most Important Features:
                      feature  importance
18              tbp_lv_deltaB      273.96
13                   tbp_lv_H      263.08
8                 tbp_lv_Aext      259.92
34   overall_color_difference      254.00
21          lesion_size_ratio      245.72
29       color_contrast_index      245.60
15                   tbp_lv_L      240.80
16                tbp_lv_Lext      239.16
1      clin_size_long_diam_mm      238.16
5         tbp_lv_eccentricity      235.68
23               hue_contrast      233.68
20         tbp_lv_deltaLBnorm      229.56
14                tbp_lv_Hext      218.80
9                    tbp_lv_B      213.32
32        mean_hue_difference      213.20
7                    tbp_lv_A      210.08
17              tbp_lv_deltaA 




### Training Final Model (Tabular only)

In [111]:
# X, y = df_train[feature_cols], df_train[target_col]
X, y = df_train[new_feature_cols], df_train[target_col]

estimator.fit(X, y)

### Save model using Pickle

In [112]:
with open('model.pkl', 'wb') as file:
    pickle.dump(estimator, file)

In [113]:
# # Filter the data point after preprocessing
# data_point = df_train.loc[["ISIC_0082829"]]

# # Extract features
# X_point = data_point[new_feature_cols]




# # Get the probability of the target
# probability = estimator.predict_proba(X_point)

# # Extract probability for the positive class
# positive_class_prob = probability[0][1]

# print(f"Data point features ({X_point.shape[1]}): {X_point.columns.tolist()}")

# print(f"Probability of target being 1: {positive_class_prob}")

# Hybrid model

In [129]:
train_h5 = root / 'train-image.hdf5'
test_h5 = root / 'test-image.hdf5'

In [149]:
# Configuration for CNN
CNN_CONFIG = {
    "img_size": 224,
    "batch_size": 32,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "pretrained_model": "efficientnet_b0",
    "embedding_size": 1792
}

def print_gpu_memory():
    if torch.cuda.is_available():
        print(f"GPU Memory allocated: {torch.cuda.memory_allocated()/1024**2:.1f}MB")
        print(f"GPU Memory cached: {torch.cuda.memory_reserved()/1024**2:.1f}MB")

def calculate_pauc(y_true, y_pred, min_tpr=0.80):
    """Calculate partial AUC score matching your custom metric"""
    max_fpr = abs(1 - min_tpr)
    
    # Convert to numpy arrays if they're tensors
    if torch.is_tensor(y_true):
        y_true = y_true.cpu().numpy()
    if torch.is_tensor(y_pred):
        y_pred = y_pred.cpu().numpy()
    
    # Ensure y_true is flattened
    y_true = y_true.flatten()
    y_pred = y_pred.flatten()
    
    # Match your custom metric calculation
    v_gt = abs(y_true - 1)
    v_pred = 1.0 - y_pred
    
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    
    return partial_auc

# Define Image Dataset
class ISICImageDatasetHDF5(Dataset):
    def __init__(self, h5_path, df, transforms=None):
        self.h5_path = h5_path
        self.df = df
        self.transforms = transforms
        self.isic_ids = df['isic_id'].values
        self.targets = df['target'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        isic_id = self.isic_ids[idx]
        with h5py.File(self.h5_path, "r") as h5_file:
            img_data = h5_file[isic_id][()]
        img = Image.open(BytesIO(img_data))
        img = np.array(img)
    
        if self.transforms:
            augmented = self.transforms(image=img)
            img = augmented["image"]
    
        target = torch.tensor(self.targets[idx], dtype=torch.float32)
        return img, target

# Load Pretrained CNN
class EmbeddingModel(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.base_model = timm.create_model(model_name, pretrained=True, num_classes=0)
        self.embedding_size = self.base_model.num_features
        self.classifier = nn.Linear(self.embedding_size, 1)

    def forward(self, x):
        embeddings = self.base_model(x)
        logits = self.classifier(embeddings)
        return embeddings, logits

    def get_embeddings(self, x):
        """Helper method to get only embeddings during inference"""
        embeddings, _ = self.forward(x)
        return embeddings

def train_embedding_model_with_hdf5(df_train, h5_path, model, output_path):
    sample_weights = compute_class_weights(df_train, target_col='target')
    sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)

    transforms_fn = A.Compose([
        A.Resize(height=CNN_CONFIG["img_size"], width=CNN_CONFIG["img_size"]),
        A.HorizontalFlip(),
        A.VerticalFlip(),
        A.Rotate(limit=180),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2()
    ])

    train_dataset = ISICImageDatasetHDF5(h5_path, df_train, transforms=transforms_fn)
    train_loader = DataLoader(train_dataset, batch_size=CNN_CONFIG["batch_size"], 
                            sampler=sampler, num_workers=4, pin_memory=True)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    model.to(CNN_CONFIG["device"])

    print("Starting CNN embedding model training...")
    model.train()
    num_epochs = 5
    
    # Lists to store metrics
    train_losses = []
    pauc_scores = []
    
    for epoch in range(num_epochs):
        epoch_loss = 0
        all_targets = []
        all_predictions = []
        
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
        for imgs, targets in progress_bar:
            imgs, targets = imgs.to(CNN_CONFIG["device"]), targets.to(CNN_CONFIG["device"])
            optimizer.zero_grad()
            
            # Get embeddings and logits
            embeddings, logits = model(imgs)
            
            # Calculate BCE loss
            loss = criterion(logits, targets.unsqueeze(1))
            
            # Backward pass
            loss.backward()
            optimizer.step()
            
            # Store loss
            epoch_loss += loss.item()
            
            # Store predictions and targets for pAUC calculation
            predictions = torch.sigmoid(logits)
            all_targets.extend(targets.cpu().numpy())
            all_predictions.extend(predictions.detach().cpu().numpy())
            
            # Update progress bar
            progress_bar.set_postfix({'Loss': f'{epoch_loss / (progress_bar.n + 1):.4f}'})
        
        # Calculate epoch metrics
        avg_loss = epoch_loss / len(train_loader)
        pauc_score = calculate_pauc(np.array(all_targets), np.array(all_predictions))
        
        # Store metrics
        train_losses.append(avg_loss)
        pauc_scores.append(pauc_score)
        
        print(f"Epoch {epoch + 1} Loss: {avg_loss:.4f}, pAUC: {pauc_score:.4f}")
        print_gpu_memory()
    
    print("\nTraining Summary:")
    print(f"Final Loss: {train_losses[-1]:.4f}")
    print(f"Final pAUC: {pauc_scores[-1]:.4f}")
    print("CNN embedding model training completed.")
    
    torch.save({
        'model_state_dict': model.state_dict(),
        # 'train_losses': train_losses,
        # 'pauc_scores': pauc_scores,
    }, output_path)
    print(f"Model and training metrics saved to {output_path}")

def generate_cnn_embeddings_hdf5(df, h5_path, model, output_path):
    print("Generating CNN embeddings...")
    model.eval()
    transforms_fn = A.Compose([
        A.Resize(height=CNN_CONFIG["img_size"], width=CNN_CONFIG["img_size"]),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2()
    ])
    
    dataset = ISICImageDatasetHDF5(h5_path, df, transforms=transforms_fn)
    dataloader = DataLoader(dataset, batch_size=CNN_CONFIG["batch_size"], 
                          shuffle=False, num_workers=4, pin_memory=True)

    embeddings = []
    with torch.no_grad():
        for imgs, _ in tqdm(dataloader, desc="Generating Embeddings"):
            imgs = imgs.to(CNN_CONFIG["device"])
            features = model.get_embeddings(imgs).cpu().numpy()
            embeddings.append(features)

    embeddings = np.vstack(embeddings)
    embedding_df = pd.DataFrame(embeddings, index=df['isic_id'])
    embedding_df.to_csv(output_path)
    print("CNN embeddings generated.")
    print(f"Embeddings saved to {output_path}")

def compute_class_weights(df, target_col):
    class_counts = df[target_col].value_counts().to_dict()
    total_samples = len(df)
    weights = {cls: total_samples / count for cls, count in class_counts.items()}
    sample_weights = df[target_col].map(weights).values
    return sample_weights

In [None]:
if __name__ == "__main__":
    print("Loading training data...")
    
    RANDOM_SEED = 42
    train_h5 = root / 'train-image.hdf5'
    cnn_model_path = 'cnn_model_hdf5_balanced.pth'

    cat_cols = ['sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple']
    
    df_train = read_data(train_path)
    df_competition_test = read_data(test_path)
    df_subm = pd.read_csv(subm_path, index_col=id_col)

    df_train = df_train.reset_index()  # Ensures isic_id is now a column
    df_competition_test = df_competition_test.reset_index()
    
    # Preprocess now returns df_train, df_competition_test, cat_cols, new_feature_cols
    df_train, df_competition_test, cat_cols, new_feature_cols = preprocess(df_train, df_competition_test, cat_cols, new_feature_cols)

    # Create a holdout test set
    df_positive = df_train[df_train['target'] == 1]
    df_negative = df_train[df_train['target'] == 0]

    test_pos_size = int(0.1 * len(df_positive))
    df_test_positive = df_positive.sample(n=test_pos_size, random_state=RANDOM_SEED)
    df_train_positive = df_positive.drop(df_test_positive.index)

    df_test_negative = df_negative.sample(n=test_pos_size, random_state=RANDOM_SEED)
    df_train_negative = df_negative.drop(df_test_negative.index)

    # df_test = pd.concat([df_test_positive, df_test_negative], ignore_index=True).sample(frac=1, random_state=RANDOM_SEED)
    # df_train = pd.concat([df_train_positive, df_train_negative], ignore_index=True).sample(frac=1, random_state=RANDOM_SEED)
    df_test = pd.concat([df_test_positive, df_test_negative]).sample(frac=1, random_state=RANDOM_SEED)
    df_train = pd.concat([df_train_positive, df_train_negative]).sample(frac=1, random_state=RANDOM_SEED)

    df_test = df_test.reset_index().rename(columns={'index': 'isic_id'})

    # Create balanced subset for CNN training
    total_size = 10000
    df_positive = df_train[df_train['target'] == 1]
    df_negative = df_train[df_train['target'] == 0]
    pos_count = len(df_positive)
    neg_count = total_size - pos_count

    df_negative_sampled = df_negative.sample(n=neg_count, random_state=RANDOM_SEED)
    df_small = pd.concat([df_positive, df_negative_sampled], axis=0).sample(frac=1, random_state=RANDOM_SEED)

    # df_small_for_cnn = df_small.copy()
    df_small_for_cnn = df_small.reset_index()
    # df_small = df_small.set_index('isic_id')

    print("Initializing CNN embedding model...")
    embedding_model = EmbeddingModel(CNN_CONFIG["pretrained_model"])

    train_embedding_model_with_hdf5(df_small_for_cnn, train_h5, embedding_model, cnn_model_path)

In [None]:
print(df_test.columns)  
# should show 'isic_id' as a column now

# Use embeddings to retrain model

In [199]:
def custom_cross_val(estimator, X, y, cv, groups):
    importances = []
    scores = []
    splits = list(cv.split(X, y, groups))
    
    # Progress bar for folds
    fold_iterator = tqdm(enumerate(splits), 
                        total=5, 
                        desc="Folds",
                        position=0)
    
    for fold_idx, (train_idx, val_idx) in fold_iterator:
        fold_start_time = time.time()
        
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        # Fit the model
        estimator.fit(X_train, y_train)
        
        # Get scores
        score = custom_metric(estimator, X_val, y_val)
        scores.append(score)
        
        # Extract and store feature importances from each LightGBM model
        # Initialize with correct shape
        fold_importance = np.zeros(X.shape[1])
        
        for name, pipeline in estimator.named_estimators_.items(): 
            lgb_model = pipeline.named_steps['classifier']
            # Make sure feature importances match the number of features
            if len(lgb_model.feature_importances_) != X.shape[1]:
                continue  # Skip if shapes don't match
            fold_importance += lgb_model.feature_importances_
        
        importances.append(fold_importance / len(estimator.named_estimators_))
        
        fold_time = time.time() - fold_start_time
        print(f"Fold {fold_idx + 1} completed in {fold_time:.2f} seconds")
        
        # Update fold progress bar with timing info
        fold_iterator.set_postfix({'Score': f'{score:.4f}'})
    
    print('\n')
    return np.array(scores), np.array(importances)

In [209]:
print("\nBefore embedding generation:")
print("Sample of isic_ids:", df_small_for_cnn['isic_id'].head())
print("Sample of df_small index:", df_small.index[:5])

# Generate embeddings using df_small_for_cnn
print("\nGenerating embeddings...")
generate_cnn_embeddings_hdf5(df_small_for_cnn, train_h5, embedding_model, "train_embeddings.csv")


Before embedding generation:
Sample of isic_ids: 0    ISIC_3671934
1    ISIC_9529247
2    ISIC_4551312
3    ISIC_5323671
4    ISIC_2563670
Name: isic_id, dtype: object
Sample of df_small index: Index(['ISIC_3671934', 'ISIC_9529247', 'ISIC_4551312', 'ISIC_5323671',
       'ISIC_2563670'],
      dtype='object', name='isic_id')

Generating embeddings...
Generating CNN embeddings...


Generating Embeddings: 100%|██████████| 313/313 [02:57<00:00,  1.76it/s]


CNN embeddings generated.
Embeddings saved to train_embeddings.csv


In [211]:
# Load embeddings
embedding_df = pd.read_csv("train_embeddings.csv", index_col='isic_id')

# Verify matching
print("\nVerification after embedding generation:")
print("Sample of embedding_df index:", embedding_df.index[:5])
print(f"df_small shape: {df_small.shape}")
print(f"embedding_df shape: {embedding_df.shape}")
print(f"Number of overlapping indices: {len(set(df_small.index).intersection(set(embedding_df.index)))}")

for col in df_small.select_dtypes(include=['category']).columns:
    df_small[col] = df_small[col].cat.codes

# Debug column names
print("\nChecking for duplicate columns:")
print("Feature columns:", len(new_feature_cols), len(set(new_feature_cols)))
duplicates = [x for x in new_feature_cols if new_feature_cols.count(x) > 1]
print("Duplicate features:", duplicates)

unique_feature_cols = list(dict.fromkeys(new_feature_cols))

# Make sure all columns are unique before concatenation
X_with_embeddings = pd.concat([
    df_small[new_feature_cols].loc[:,~df_small[new_feature_cols].columns.duplicated()],  # Remove duplicates
    embedding_df
], axis=1)

# Update feature columns list to include embeddings
new_feature_cols_with_emb = unique_feature_cols + [f'emb_{i}' for i in range(embedding_df.shape[1])]


print("Final column check:")
print("Total columns:", X_with_embeddings.shape[1])
print("Unique columns:", len(X_with_embeddings.columns.unique()))

y = df_small[target_col]
groups = df_small[group_col]

print("\nFinal dataset shapes:")
print(f"X_with_embeddings shape: {X_with_embeddings.shape}")
print(f"y shape: {y.shape}")
print(f"groups shape: {groups.shape}")

# # Convert categorical columns to string type
# categorical_columns = X_with_embeddings.select_dtypes(include=['category']).columns
# for col in categorical_columns:
#     X_with_embeddings[col] = X_with_embeddings[col].astype(str)



if len(X_with_embeddings) > 0:
    print("\nProceeding with model training...")

    # Modified LightGBM parameters
    lgb_params_updated = lgb_params.copy()
    lgb_params_updated.update({
        'categorical_feature': None,
        'verbose': -1
    })
    
    # Create CV splitter
    cv = StratifiedGroupKFold(5, shuffle=True, random_state=seed)
    
    # Create estimator for balanced data
    estimator_balanced = VotingClassifier([
        ('lgb1', Pipeline([
            ('sampler', RandomUnderSampler(sampling_strategy=1, random_state=12)),
            ('classifier', lgb.LGBMClassifier(**lgb_params_updated, random_state=12)),
        ])),
        ('lgb2', Pipeline([
            ('sampler', RandomUnderSampler(sampling_strategy=1, random_state=22)),
            ('classifier', lgb.LGBMClassifier(**lgb_params_updated, random_state=22)),
        ])),
        ('lgb3', Pipeline([
            ('sampler', RandomUnderSampler(sampling_strategy=1, random_state=32)),
            ('classifier', lgb.LGBMClassifier(**lgb_params_updated, random_state=32)),
        ])),
        ('lgb4', Pipeline([
            ('sampler', RandomUnderSampler(sampling_strategy=1, random_state=42)),
            ('classifier', lgb.LGBMClassifier(**lgb_params_updated, random_state=42)),
        ])),
        ('lgb5', Pipeline([
            ('sampler', RandomUnderSampler(sampling_strategy=1, random_state=52)),
            ('classifier', lgb.LGBMClassifier(**lgb_params_updated, random_state=52)),
        ])),
    ], voting='soft')
    
    # Run cross-validation
    print("\nStarting cross-validation with embeddings...")
    scores, all_fold_importances = custom_cross_val(
        estimator=estimator_balanced,
        X=X_with_embeddings, 
        y=y,
        cv=cv,
        groups=groups
    )
    
    # Print results
    print("\nTraining Complete with Embeddings!")
    print(f"Mean score: {np.mean(scores):.4f}")
    print(f"Score std: {np.std(scores):.4f}")
    print(f"All scores: {scores}")
    
    # Compute feature importance
    mean_importances = all_fold_importances.mean(axis=0)
    importance_df = pd.DataFrame({
        'feature': new_feature_cols_with_emb,
        'importance': mean_importances
    }).sort_values('importance', ascending=False)
    
    print("\nTop 50 Most Important Features (with Embeddings):")
    print(importance_df.head(50))
    
    # Train final model
    estimator_balanced.fit(X_with_embeddings, y)
    
    # Save model
    with open('hybrid_model.pkl', 'wb') as file:
        pickle.dump(estimator_balanced, file)

else:
    raise ValueError("No data after joining features with embeddings. Check index alignment.")


Verification after embedding generation:
Sample of embedding_df index: Index(['ISIC_3671934', 'ISIC_9529247', 'ISIC_4551312', 'ISIC_5323671',
       'ISIC_2563670'],
      dtype='object', name='isic_id')
df_small shape: (10000, 109)
embedding_df shape: (10000, 1280)
Number of overlapping indices: 10000

Checking for duplicate columns:
Feature columns: 228 76
Duplicate features: ['onehot_0', 'onehot_1', 'onehot_2', 'onehot_3', 'onehot_4', 'onehot_5', 'onehot_6', 'onehot_7', 'onehot_8', 'onehot_9', 'onehot_10', 'onehot_11', 'onehot_12', 'onehot_13', 'onehot_14', 'onehot_15', 'onehot_16', 'onehot_17', 'onehot_18', 'onehot_19', 'onehot_20', 'onehot_21', 'onehot_22', 'onehot_23', 'onehot_24', 'onehot_25', 'onehot_26', 'onehot_27', 'onehot_28', 'onehot_29', 'onehot_30', 'onehot_31', 'onehot_32', 'onehot_33', 'onehot_34', 'onehot_35', 'onehot_36', 'onehot_37', 'onehot_0', 'onehot_1', 'onehot_2', 'onehot_3', 'onehot_4', 'onehot_5', 'onehot_6', 'onehot_7', 'onehot_8', 'onehot_9', 'onehot_10', 

Folds:  20%|██        | 1/5 [00:08<00:33,  8.42s/it, Score=0.1998]

Fold 1 completed in 8.42 seconds


Folds:  40%|████      | 2/5 [00:17<00:25,  8.51s/it, Score=0.1994]

Fold 2 completed in 8.58 seconds


Folds:  60%|██████    | 3/5 [00:26<00:17,  8.81s/it, Score=0.1996]

Fold 3 completed in 9.17 seconds


Folds:  80%|████████  | 4/5 [00:35<00:09,  9.04s/it, Score=0.1994]

Fold 4 completed in 9.37 seconds


Folds: 100%|██████████| 5/5 [00:44<00:00,  8.95s/it, Score=0.1995]

Fold 5 completed in 9.20 seconds



Training Complete with Embeddings!
Mean score: 0.1996
Score std: 0.0001
All scores: [0.19981785 0.1994102  0.19962518 0.19942158 0.19954706]

Top 50 Most Important Features (with Embeddings):
       feature  importance
887    emb_811       14.12
644    emb_568       12.04
310    emb_234       11.12
124     emb_48       10.88
853    emb_777       10.40
143     emb_67        9.52
1128  emb_1052        8.24
408    emb_332        7.60
1119  emb_1043        7.60
364    emb_288        7.44
610    emb_534        7.28
1193  emb_1117        6.80
1108  emb_1032        6.80
844    emb_768        6.08
446    emb_370        5.52
755    emb_679        5.40
1283  emb_1207        5.36
475    emb_399        5.32
885    emb_809        5.08
289    emb_213        5.04
1223  emb_1147        5.00
612    emb_536        4.92
288    emb_212        4.68
1292  emb_1216        4.24
697    emb_621        4.24
1135  emb_1059        4.04
1163  emb_1087        3.88
655    emb_579  




# Compare two models on holdout test set

In [241]:
df_test

Unnamed: 0,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,tbp_lv_Aext,...,onehot_28,onehot_29,onehot_30,onehot_31,onehot_32,onehot_33,onehot_34,onehot_35,onehot_36,onehot_37
33,1,IP_2945977,55.0,female,anterior torso,7.48,TBP tile: close-up,3D: XP,25.711526,17.473305,...,1,0,0,0,0,0,0,0,1,0
0,1,IP_2456971,60.0,male,head/neck,2.70,TBP tile: close-up,3D: white,26.867286,20.574389,...,0,0,1,0,0,0,0,0,0,0
34,1,IP_3028432,65.0,male,lower extremity,5.01,TBP tile: close-up,3D: white,20.284447,15.174651,...,0,0,0,0,1,0,0,0,0,0
12,1,IP_7797815,75.0,male,upper extremity,1.23,TBP tile: close-up,3D: white,11.869290,8.123813,...,0,0,0,0,0,1,0,0,0,0
10,1,IP_7411721,45.0,male,posterior torso,5.62,TBP tile: close-up,3D: XP,23.281400,15.250300,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20,1,IP_4513696,45.0,female,posterior torso,1.23,TBP tile: close-up,3D: XP,16.429892,13.567763,...,0,0,0,0,0,0,0,1,0,0
60,0,IP_2945977,55.0,female,posterior torso,4.17,TBP tile: close-up,3D: XP,18.997522,14.310354,...,0,0,0,0,0,0,0,1,0,0
71,0,IP_4304202,55.0,male,posterior torso,2.32,TBP tile: close-up,3D: XP,23.344960,16.223478,...,0,0,0,0,0,0,0,1,0,0
14,1,IP_0152575,60.0,female,anterior torso,5.27,TBP tile: close-up,3D: XP,15.614144,15.523538,...,1,0,0,0,0,0,0,0,1,0


In [251]:
def custom_pauc(y_true, y_pred, min_tpr=0.80):
    max_fpr = abs(1 - min_tpr)
    v_gt = abs(y_true - 1)
    v_pred = 1.0 - y_pred
    
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    return partial_auc

In [252]:
X_test = df_test[unique_feature_cols]
y_test = df_test[target_col]

with open('model.pkl', 'rb') as f:
    estimator = pickle.load(f)

y_pred_estimator = estimator.predict_proba(X_test)[:, 1]

# Compute partial AUC
auc_estimator = custom_pauc(y_test, y_pred_estimator)

print(f"AUC for the tabular model: {auc_estimator:.4f}")

AUC for the tabular model: 0.1648


# Compute pAUC on holdout test set for Hybrid model (too many bugs :( )

In [254]:
# # For the first model (tabular only)
# X_test = df_test[unique_feature_cols]  # Use the deduplicated columns
# y_test = df_test[target_col]

# # # For the second model (hybrid)
# # # First generate embeddings for test set
# # test_cnn = df_test.reset_index().rename(columns={'index': 'isic_id'})
# # test_cnn['isic_id'] = test_cnn['isic_id'].astype(str)
# # generate_cnn_embeddings_hdf5(test_cnn, train_h5, embedding_model, "test_embeddings.csv")
# # test_embeddings = pd.read_csv("test_embeddings.csv", index_col='isic_id')

# # # Combine features with embeddings for test set
# # X_test_hybrid = pd.concat([
# #     df_test[unique_feature_cols],
# #     test_embeddings
# # ], axis=1)

# # # Get predictions from the hybrid model
# # y_pred_estimator_balanced = estimator_balanced.predict_proba(X_test_hybrid)[:, 1]
# # # Evaluate (e.g. AUC)
# # auc_estimator_balanced = roc_auc_score(y_test, y_pred_estimator_balanced)

# # print(f"AUC for the hybrid model: {auc_estimator_balanced:.4f}")