# In this notebook I create two models:
- A tabular-only model that predicts the target using only user input and the features generated by my image_feature_extractor.py program (which uses OpenCV to extract geometric and color features directly from the image).
- A hybrid model that combines my tabular model with a CNN by concatenating the CNN feature embeddings with the tabular data and retraining the tabular model on the combined features.

Inspiration for tabular model used in this notebook: https://www.kaggle.com/code/greysky/isic-2024-only-tabular-data

# Setup

In [37]:
import os
import itertools
from pathlib import Path
import time
from tqdm import tqdm
import h5py
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import numpy as np
import joblib
import pandas as pd
import polars as pl
import pickle
import json
from io import BytesIO
import timm
from torchvision import transforms
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import lightgbm as lgb
from sklearn.utils.class_weight import compute_sample_weight
from torch.utils.data.sampler import WeightedRandomSampler
import albumentations as A
from albumentations.pytorch import ToTensorV2

from torchvision import models, transforms
from PIL import Image
import io

import warnings
warnings.filterwarnings("ignore")

In [38]:
# All competition inputs live under a single Kaggle dataset directory.
root = Path('/kaggle/input/isic-2024-challenge')
train_path, test_path, subm_path = (
    root / file_name
    for file_name in ('train-metadata.csv', 'test-metadata.csv', 'sample_submission.csv')
)

# Column roles: row identifier, binary label, and the grouping key for CV.
id_col, target_col, group_col = 'isic_id', 'target', 'patient_id'

err = 1e-5            # small epsilon for guarding divisions
sampling_ratio = 0.5  # previously was 0.01
seed = 42             # random_state for the cross-validation splitter

# Data Preparation

In [39]:
# Features the end user supplies directly.
# NOTE: despite the "_num" suffix this list also contains the categorical
# inputs (sex, anatomical site, location); those are the same names listed in
# `cat_cols` below.
user_input_features_num = [
    'age_approx',                  # Ask user for age
    'sex',                         # Ask user for sex
    'anatom_site_general',         # Ask user for anatomical site (from list of options)
    'clin_size_long_diam_mm',      # Ask user for size of lesion in mm
    'tbp_lv_location',             # Anatomical location; arms & legs split into upper & lower, torso into thirds
    'tbp_lv_location_simple',      # Anatomical location, simple
]

# Features intended to be reproduced from the image with computer vision.
cv_features = [
    # Shape
    'tbp_lv_areaMM2',              # Area of lesion (mm^2)
    'tbp_lv_perimeterMM',          # Perimeter of lesion (mm)
    'tbp_lv_minorAxisMM',          # Smallest lesion diameter (mm); axis of least second moment
    'tbp_lv_eccentricity',         # Eccentricity (use min_inertia line and max_inertia line)
    'tbp_lv_area_perim_ratio',     # Border jaggedness: perimeter/area ratio; low for circular lesions, higher for irregular ones (0-10)

    # Color (LAB* color space)
    'tbp_lv_A',                    # A inside lesion
    'tbp_lv_Aext',                 # A outside lesion
    'tbp_lv_B',                    # B inside lesion
    'tbp_lv_Bext',                 # B outside lesion
    'tbp_lv_C',                    # Chroma inside lesion
    'tbp_lv_Cext',                 # Chroma outside lesion
    'tbp_lv_H',                    # Hue inside lesion (angle of A* and B*); typically 25 (red) to 75 (brown)
    'tbp_lv_Hext',                 # Hue outside lesion
    'tbp_lv_L',                    # L inside lesion
    'tbp_lv_Lext',                 # L outside lesion

    # Contrast
    'tbp_lv_deltaA',               # Average A contrast (inside vs. outside lesion)
    'tbp_lv_deltaB',               # Average B contrast (inside vs. outside lesion)
    'tbp_lv_deltaL',               # Average L contrast (inside vs. outside lesion)
    'tbp_lv_deltaLBnorm',          # Lesion vs. immediate surrounding skin contrast; typically 5.5 to 25

    # Not used yet (harder to extract from an image):
    #   tbp_lv_stdL, tbp_lv_stdLExt, tbp_lv_color_std_mean,
    #   tbp_lv_radial_color_std_max, tbp_lv_symm_2axis, tbp_lv_symm_2axis_angle,
    #   tbp_lv_norm_border, tbp_lv_norm_color
]

# Engineered columns; each one is computed in read_data() from the columns above.
derived_features = [
    'lesion_size_ratio',           # tbp_lv_minorAxisMM / clin_size_long_diam_mm
    'lesion_shape_index',          # tbp_lv_areaMM2 / tbp_lv_perimeterMM ** 2
    'hue_contrast',                # abs(tbp_lv_H - tbp_lv_Hext)
    'luminance_contrast',          # abs(tbp_lv_L - tbp_lv_Lext)
    'lesion_color_difference',     # sqrt(deltaA ** 2 + deltaB ** 2 + deltaL ** 2)
    'perimeter_to_area_ratio',     # tbp_lv_perimeterMM / tbp_lv_areaMM2
    'area_to_perimeter_ratio',     # tbp_lv_areaMM2 / tbp_lv_perimeterMM
    'size_age_interaction',        # clin_size_long_diam_mm * age_approx
    'color_contrast_index',        # deltaA + deltaB + deltaL + deltaLBnorm
    'log_lesion_area',             # log(tbp_lv_areaMM2 + 1)
    'normalized_lesion_size',      # clin_size_long_diam_mm / age_approx
    'mean_hue_difference',         # (tbp_lv_H + tbp_lv_Hext) / 2
    'std_dev_contrast',            # sqrt((deltaA ** 2 + deltaB ** 2 + deltaL ** 2) / 3)
    'overall_color_difference',    # (deltaA + deltaB + deltaL) / 3
    'size_color_contrast_ratio',   # clin_size_long_diam_mm / tbp_lv_deltaLBnorm
    'color_range',                 # abs(L - Lext) + abs(A - Aext) + abs(B - Bext)
    'border_length_ratio',         # tbp_lv_perimeterMM / (2 * pi * sqrt(tbp_lv_areaMM2 / pi))

    # Not used yet (depend on the unused tbp_lv_* columns listed above):
    #   color_consistency, consistency_color, hue_color_std_interaction,
    #   color_variance_ratio, shape_color_consistency, age_size_symmetry_index,
    #   index_age_size_symmetry, color_asymmetry_index, color_shape_composite_index,
    #   symmetry_perimeter_interaction, comprehensive_lesion_index,
    #   border_color_interaction, border_color_interaction_2, lesion_severity_index,
    #   shape_complexity_index, lesion_visibility_score, symmetry_border_consistency,
    #   consistency_symmetry_border, border_complexity, color_uniformity
]

# Categoric USER INPUT
cat_cols = ['sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple']

# `user_input_features_num` already contains every name in `cat_cols`, so a
# plain concatenation would list each categorical column twice. De-duplicate
# while keeping first-seen order so that selecting these columns from a
# DataFrame never yields duplicated columns.
new_feature_cols = list(dict.fromkeys(
    user_input_features_num + cat_cols + cv_features + derived_features
))

In [40]:
def read_data(path):
    """Load a metadata CSV and append the engineered lesion features.

    Steps, in order:
      1. Read the CSV with polars.
      2. Re-parse ``age_approx``: cast to string, replace the literal ``'NA'``
         with NaN, cast to Float64.
      3. Fill NaN values in every Float64 column with that column's median.
         NOTE(review): ``fill_nan`` targets NaN values; whether null/missing
         entries are also covered depends on polars semantics - confirm.
      4. Add the derived ratio / contrast / interaction columns.
      5. Convert to pandas and index the frame by ``isic_id``.

    Several derived columns are plain divisions with no zero guard
    (e.g. by ``age_approx`` or ``tbp_lv_deltaLBnorm``), so they presumably
    become inf/NaN where the denominator is 0 - verify against the data.

    Args:
        path: Path of the metadata CSV to read.

    Returns:
        A pandas DataFrame indexed by ``isic_id`` containing the original
        columns plus the derived feature columns.
    """
    return (
        pl.read_csv(path)
        .with_columns(
            pl.col('age_approx').cast(pl.String).replace('NA', np.nan).cast(pl.Float64),
        )
        .with_columns(
            pl.col(pl.Float64).fill_nan(pl.col(pl.Float64).median()),
        )
        .with_columns(
            # Basic derived features we can calculate
            lesion_size_ratio              = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm'),
            lesion_shape_index             = pl.col('tbp_lv_areaMM2') / (pl.col('tbp_lv_perimeterMM') ** 2),
            hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')).abs(),
            luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs(),
            lesion_color_difference        = (pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2).sqrt(),
        )
        .with_columns(
            perimeter_to_area_ratio        = pl.col('tbp_lv_perimeterMM') / pl.col('tbp_lv_areaMM2'),
            area_to_perimeter_ratio        = pl.col('tbp_lv_areaMM2') / pl.col('tbp_lv_perimeterMM'),
            # color_consistency              = pl.col('tbp_lv_stdL') / pl.col('tbp_lv_Lext'),
            # consistency_color              = pl.col('tbp_lv_stdL') * pl.col('tbp_lv_Lext') / (pl.col('tbp_lv_stdL') + pl.col('tbp_lv_Lext')),
            size_age_interaction           = pl.col('clin_size_long_diam_mm') * pl.col('age_approx'),
            # hue_color_std_interaction      = pl.col('tbp_lv_H') * pl.col('tbp_lv_color_std_mean'),
        )
        .with_columns(
            color_contrast_index           = pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL') + pl.col('tbp_lv_deltaLBnorm'),
            log_lesion_area                = (pl.col('tbp_lv_areaMM2') + 1).log(),
            normalized_lesion_size         = pl.col('clin_size_long_diam_mm') / pl.col('age_approx'),
            # Despite its name this is the mean of the inside and outside hue.
            mean_hue_difference            = (pl.col('tbp_lv_H') + pl.col('tbp_lv_Hext')) / 2,
            std_dev_contrast               = ((pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2) / 3).sqrt(),
            overall_color_difference       = (pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL')) / 3,
        )
        .with_columns(
            # color_variance_ratio           = pl.col('tbp_lv_color_std_mean') / pl.col('tbp_lv_stdLExt'),
            size_color_contrast_ratio      = pl.col('clin_size_long_diam_mm') / pl.col('tbp_lv_deltaLBnorm'),
            color_range                    = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs() + (pl.col('tbp_lv_A') - pl.col('tbp_lv_Aext')).abs() + (pl.col('tbp_lv_B') - pl.col('tbp_lv_Bext')).abs(),
            # shape_color_consistency        = pl.col('tbp_lv_eccentricity') * pl.col('tbp_lv_color_std_mean'),
            # Perimeter divided by the circumference of a circle with the same area.
            border_length_ratio            = pl.col('tbp_lv_perimeterMM') / (2 * np.pi * (pl.col('tbp_lv_areaMM2') / np.pi).sqrt()),
        ) # HARDER ONES AHEAD:
        # Disabled: these need tbp_lv_* inputs that are not in the active feature lists.
        # .with_columns(
        #     age_size_symmetry_index        = pl.col('age_approx') * pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_symm_2axis'),
        #     index_age_size_symmetry        = pl.col('age_approx') * pl.col('tbp_lv_areaMM2') * pl.col('tbp_lv_symm_2axis'),
        #     color_asymmetry_index          = pl.col('tbp_lv_radial_color_std_max') * pl.col('tbp_lv_symm_2axis'),
        #     color_shape_composite_index    = (pl.col('tbp_lv_color_std_mean') + pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_symm_2axis')) / 3,
        #     symmetry_perimeter_interaction = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_perimeterMM'),
        #     comprehensive_lesion_index     = (pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_eccentricity') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_symm_2axis')) / 4,
        #     border_color_interaction       = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color'),
        #     border_color_interaction_2     = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color') / (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color')),
        #     lesion_severity_index          = (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_eccentricity')) / 3,
        #     shape_complexity_index         = pl.col('border_complexity') + pl.col('lesion_shape_index'),
        #     lesion_visibility_score        = pl.col('tbp_lv_deltaLBnorm') + pl.col('tbp_lv_norm_color'),
        #     symmetry_border_consistency    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border'),
        #     consistency_symmetry_border    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border') / (pl.col('tbp_lv_symm_2axis') + pl.col('tbp_lv_norm_border')),
        #     border_complexity              = pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_symm_2axis'),
        #     color_uniformity               = pl.col('tbp_lv_color_std_mean') / (pl.col('tbp_lv_radial_color_std_max') + err),
        # )
        .to_pandas()
        .set_index('isic_id')
    )

In [41]:
def preprocess(df_train, df_test, cat_cols, new_feature_cols):
    """One-hot encode the categorical columns and build the final feature list.

    The encoder is fitted on ``df_train`` only and then applied to both
    frames; categories unseen during fitting are ignored
    (``handle_unknown='ignore'``).

    Side effects:
      * ``df_train`` and ``df_test`` are modified in place (the original
        categorical columns are cast to ``category`` and ``onehot_<i>``
        columns are added); the same objects are also returned.
      * The fitted encoder is written to ``encoder.pkl`` and the column lists
        to ``feature_columns.json`` in the current working directory.

    Args:
        df_train: Training DataFrame; must contain every column in ``cat_cols``.
        df_test: Test DataFrame with the same categorical columns.
        cat_cols: Names of the raw categorical columns to encode.
        new_feature_cols: Candidate feature names (may include ``cat_cols``).

    Returns:
        Tuple ``(df_train, df_test, new_cat_cols, updated_feature_cols)`` where
        ``new_cat_cols`` are the generated ``onehot_<i>`` column names and
        ``updated_feature_cols`` is ``new_feature_cols`` without the raw
        categorical columns, followed by ``new_cat_cols``.
    """
    # Cast original categorical columns
    for col in cat_cols:
        df_train[col] = df_train[col].astype('category')
        df_test[col] = df_test[col].astype('category')

    encoder = OneHotEncoder(sparse_output=False, dtype=np.int32, handle_unknown='ignore')
    encoder.fit(df_train[cat_cols])

    new_cat_cols = [f'onehot_{i}' for i in range(len(encoder.get_feature_names_out()))]

    # Use one explicit dtype for every indicator column in both frames.
    # Inferring categories per frame (astype('category')) can give a column the
    # categories [0, 1] in train but only [0] in a small test frame, which makes
    # the two frames' dtypes incompatible.
    onehot_dtype = pd.CategoricalDtype(categories=np.array([0, 1], dtype=np.int32))

    for frame in (df_train, df_test):
        # Directly assign the transformed arrays, then cast to the shared dtype.
        frame[new_cat_cols] = encoder.transform(frame[cat_cols])
        frame[new_cat_cols] = frame[new_cat_cols].astype(onehot_dtype)

    # Update feature columns: raw categoricals are replaced by their indicators.
    updated_feature_cols = [col for col in new_feature_cols if col not in cat_cols]
    updated_feature_cols.extend(new_cat_cols)

    # Persist what is needed to rebuild identical features outside this notebook.
    joblib.dump(encoder, 'encoder.pkl')
    with open('feature_columns.json', 'w') as f:
        json.dump({
            "cat_cols": new_cat_cols,
            "new_feature_cols": updated_feature_cols
        }, f)

    return df_train, df_test, new_cat_cols, updated_feature_cols

In [42]:
# Read the data
print("Reading data...")
df_train_full = read_data(train_path)
df_competition_test = read_data(test_path)
df_subm = pd.read_csv(subm_path)

print("\nCreating train/test split...")
# Create a balanced hold-out set: 10% of the positives plus an equal number of
# negatives, both drawn without replacement.
# NOTE(review): rows are sampled per image, not per patient, so images of the
# same patient can land in both the training and the hold-out set.
pos_mask = df_train_full[target_col] == 1
pos_indices = df_train_full[pos_mask].index
neg_indices = df_train_full[~pos_mask].index

test_size = int(0.1 * len(pos_indices))
# Use the notebook-wide `seed` rather than a second hard-coded 42 so the split
# follows the configured seed. A fresh RandomState per draw keeps the draws
# identical to the previous behaviour when seed == 42.
test_pos_indices = np.random.RandomState(seed).choice(pos_indices, size=test_size, replace=False)
test_neg_indices = np.random.RandomState(seed).choice(neg_indices, size=test_size, replace=False)
test_indices = np.concatenate([test_pos_indices, test_neg_indices])

# Split the data
df_test = df_train_full.loc[test_indices].copy()
df_train = df_train_full.drop(test_indices).copy()

# Vectorised counts; the built-in sum() iterates the Series element by element.
n_train_pos = (df_train[target_col] == 1).sum()
n_test_pos = (df_test[target_col] == 1).sum()

print("\nTrain/test split sizes:")
print(f"Training set: {len(df_train)} ({n_train_pos} positive)")
print(f"Test set: {len(df_test)} ({n_test_pos} positive)")

# # Quick check of indices after split
# print("\nSample of training set indices after split:")
# print(df_train.index[:5])

Reading data...

Creating train/test split...

Train/test split sizes:
Training set: 400981 (354 positive)
Test set: 78 (39 positive)


In [54]:
# Take the first 5 positive samples from the hold-out set
pos_samples = df_test[df_test[target_col] == 1].index[:5]
print("Positive sample ISICs:")
for isic in pos_samples:
    print(isic)

# Take the first 2 negative samples from the hold-out set
neg_samples = df_test[df_test[target_col] == 0].index[:2]
print("\nNegative sample ISICs:")
for isic in neg_samples:
    print(isic)

# Combined list of IDs, positives first
sample_isics = list(pos_samples) + list(neg_samples)
print("\nAll samples in a list:")
print(sample_isics)

# Show the label and the user-supplied metadata for these samples
print("\nMetadata for these samples:")
print(df_test.loc[sample_isics, ['target','sex' ,'age_approx', 'clin_size_long_diam_mm', 'tbp_lv_location', 'anatom_site_general']])

Positive sample ISICs:
ISIC_1885694
ISIC_6776067
ISIC_6242031
ISIC_1164461
ISIC_9877311

Negative sample ISICs:
ISIC_9153490
ISIC_0157465

All samples in a list:
['ISIC_1885694', 'ISIC_6776067', 'ISIC_6242031', 'ISIC_1164461', 'ISIC_9877311', 'ISIC_9153490', 'ISIC_0157465']

Metadata for these samples:
              target     sex  age_approx  clin_size_long_diam_mm  \
isic_id                                                            
ISIC_1885694       1    male        60.0                    2.70   
ISIC_6776067       1    male        55.0                    5.15   
ISIC_6242031       1    male        80.0                   16.07   
ISIC_1164461       1                60.0                    1.13   
ISIC_9877311       1  female        50.0                    8.16   
ISIC_9153490       0    male        60.0                    2.60   
ISIC_0157465       0  female        55.0                    3.30   

                      tbp_lv_location anatom_site_general  
isic_id                

In [44]:
# One-hot encode the categoricals and obtain the final model feature list.
print("Preprocessing data...")
df_train, df_test, new_cat_cols, updated_feature_cols = preprocess(
    df_train,
    df_test,
    cat_cols,
    new_feature_cols,
)

# Design matrix, labels and patient groups used by cross-validation.
X, y, groups = (
    df_train[updated_feature_cols],
    df_train[target_col],
    df_train[group_col],
)

# Stratified on the target and grouped by patient, so a patient's rows never
# straddle the train and validation parts of a fold.
cv = StratifiedGroupKFold(shuffle=True, random_state=seed, n_splits=5)

print("\nData ready for cross-validation:")

Preprocessing data...

Data ready for cross-validation:


In [45]:
# X.shape

# Model 1 - Tabular only model

In [46]:
# Hyper-parameters shared by every LightGBM member of the ensemble.
lgb_params = dict(
    objective='binary',
    verbosity=-1,
    n_iter=200,
    n_jobs=2,
    boosting_type='gbdt',
    lambda_l1=0.03335206514282942,
    lambda_l2=0.005157393323802471,
    learning_rate=0.030665870185795318,
    max_depth=7,
    num_leaves=239,
    colsample_bytree=0.7573175155547233,
    colsample_bynode=0.5005423904042993,
    bagging_fraction=0.7937347683420382,
    bagging_freq=4,
    min_data_in_leaf=29,
    scale_pos_weight=1.648349898918236,
)

# Soft-voting ensemble of five identical pipelines that differ only in seed.
# Each pipeline under-samples the majority class and then fits LightGBM; the
# same seed drives both the sampler and the classifier of a member.
estimator = VotingClassifier(
    [
        (
            f'lgb{member_no}',
            Pipeline([
                ('sampler', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=member_seed)),
                ('classifier', lgb.LGBMClassifier(**lgb_params, random_state=member_seed)),
            ]),
        )
        for member_no, member_seed in enumerate((12, 22, 32, 42, 52), start=1)
    ],
    voting='soft',
)

In [47]:
def custom_metric(estimator, X, y_true, min_tpr=0.80):
    """Partial AUC above a minimum true-positive rate.

    Labels and scores are flipped (positives become negatives and vice versa)
    so that the region "TPR >= min_tpr" of the original ROC curve becomes the
    region "FPR <= 1 - min_tpr" of the flipped curve, which
    ``roc_auc_score(max_fpr=...)`` can integrate. ``roc_auc_score`` returns a
    standardised value in [0.5, 1]; the last step converts it back to the raw
    area, which lies in [0, 1 - min_tpr].

    Args:
        estimator: Fitted classifier exposing ``predict_proba``.
        X: Feature matrix to score.
        y_true: Binary ground-truth labels (0/1) aligned with ``X``.
        min_tpr: Lowest true-positive rate of interest. Defaults to 0.80.

    Returns:
        The un-normalised partial AUC as a float.
    """
    y_hat = estimator.predict_proba(X)[:, 1]
    max_fpr = abs(1 - min_tpr)

    # Flip classes and scores; vectorised instead of a per-element Python loop.
    v_gt = np.abs(np.asarray(y_true) - 1)
    v_pred = 1.0 - np.asarray(y_hat)

    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # Undo the standardisation applied by roc_auc_score for max_fpr < 1.
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    return partial_auc

In [48]:
def custom_cross_val(estimator, X, y, cv, groups):
    """Cross-validate the voting ensemble and collect feature importances.

    For every fold the estimator is refitted on the training part, scored on
    the validation part with ``custom_metric``, and the LightGBM feature
    importances are averaged over the ensemble members.

    Args:
        estimator: Voting ensemble whose members are pipelines with a
            ``'classifier'`` step exposing ``feature_importances_``.
        X: Feature DataFrame.
        y: Target Series aligned with ``X``.
        cv: Splitter providing ``split(X, y, groups)``.
        groups: Group labels passed to the splitter.

    Returns:
        Tuple ``(scores, importances)``: an array with one score per fold and
        an array of shape (n_folds, n_features) with the mean importances.
    """
    importances = []
    scores = []
    splits = list(cv.split(X, y, groups))

    # Progress bar for folds; sized from the actual number of splits so it
    # stays correct if the splitter is configured with a different n_splits.
    fold_iterator = tqdm(enumerate(splits),
                         total=len(splits),
                         desc="Folds",
                         position=0)

    for fold_idx, (train_idx, val_idx) in fold_iterator:
        fold_start_time = time.time()

        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Fit the model
        estimator.fit(X_train, y_train)

        # Get scores
        score = custom_metric(estimator, X_val, y_val)
        scores.append(score)

        # Sum the importances of every LightGBM member, then average below.
        fold_importance = np.zeros(len(X.columns))
        for name, pipeline in estimator.named_estimators_.items():
            lgb_model = pipeline.named_steps['classifier']
            fold_importance += lgb_model.feature_importances_

        importances.append(fold_importance / len(estimator.named_estimators_))

        fold_time = time.time() - fold_start_time
        print(f"Fold {fold_idx + 1} completed in {fold_time:.2f} seconds")

        # Show the fold score next to the progress bar
        fold_iterator.set_postfix({'Score': f'{score:.4f}'})

    print('\n')
    return np.array(scores), np.array(importances)

In [49]:
# Time the complete cross-validation run.
print("Starting cross-validation...")
total_start_time = time.time()
scores, all_fold_importances = custom_cross_val(estimator, X, y, cv, groups)
total_time = time.time() - total_start_time

# Timing summary
print("\nTraining Complete!")
print(f"Total time: {total_time:.2f} seconds ({total_time/60:.2f} minutes)")

# Fold scores: mean, spread, and the individual values
print("\nModel Performance:")
print(f"Mean score: {np.mean(scores):.4f}")
print(f"Score std: {np.std(scores):.4f}")
print(f"All scores: {scores}")

# Average the per-fold importances and rank the features by them.
mean_importances = np.mean(all_fold_importances, axis=0)
importance_df = pd.DataFrame(
    {'feature': updated_feature_cols, 'importance': mean_importances}
)
importance_df = importance_df.sort_values('importance', ascending=False)

print("\nTop 20 Most Important Features:")
print(importance_df.head(20))

Starting cross-validation...


Folds:  20%|██        | 1/5 [00:05<00:21,  5.38s/it, Score=0.1491]

Fold 1 completed in 5.38 seconds


Folds:  40%|████      | 2/5 [00:10<00:15,  5.29s/it, Score=0.1478]

Fold 2 completed in 5.22 seconds


Folds:  60%|██████    | 3/5 [00:16<00:10,  5.48s/it, Score=0.1699]

Fold 3 completed in 5.72 seconds


Folds:  80%|████████  | 4/5 [00:22<00:05,  5.93s/it, Score=0.1687]

Fold 4 completed in 6.61 seconds


Folds: 100%|██████████| 5/5 [00:28<00:00,  5.72s/it, Score=0.1375]

Fold 5 completed in 5.67 seconds



Training Complete!
Total time: 29.89 seconds (0.50 minutes)

Model Performance:
Mean score: 0.1546
Score std: 0.0127
All scores: [0.14911161 0.14784121 0.16991863 0.16867588 0.13748807]

Top 20 Most Important Features:
                      feature  importance
1      clin_size_long_diam_mm      121.32
13                   tbp_lv_H      116.64
18              tbp_lv_deltaB      108.80
34   overall_color_difference       99.04
20         tbp_lv_deltaLBnorm       96.12
23               hue_contrast       94.56
8                 tbp_lv_Aext       91.40
32        mean_hue_difference       89.60
3          tbp_lv_perimeterMM       86.56
29       color_contrast_index       84.48
7                    tbp_lv_A       74.68
4          tbp_lv_minorAxisMM       73.36
2              tbp_lv_areaMM2       70.96
35  size_color_contrast_ratio       70.12
31     normalized_lesion_size       69.88
21          lesion_size_ratio       69.56
28       size_age_interaction  




In [50]:
# Refit the ensemble on the whole training split. preprocess() has already
# been applied to df_train in an earlier cell, so it is not called again here.
X = df_train[updated_feature_cols]
y = df_train[target_col]

print(X.shape)

estimator.fit(X, y)

(400981, 76)


In [51]:
# Serialise the fitted ensemble next to the notebook.
# NOTE: only unpickle this file from a trusted source; pickle can execute code.
Path('model.pkl').write_bytes(pickle.dumps(estimator))

# Model 2 - Hybrid Model (CNN + Tabular Model)

In [26]:
# HDF5 image archives shipped with the competition data.
train_h5, test_h5 = (root.joinpath(archive) for archive in ('train-image.hdf5', 'test-image.hdf5'))

# sampling_ratio = 1 # Sampling ratio for hybrid model

In [27]:
class ImageFeatureExtractor(nn.Module):
    """ResNet18-based feature extractor.

    Wraps a pretrained torchvision ResNet18 with its final classification
    layer removed, so the forward pass returns one embedding per image
    instead of class logits.
    """
    def __init__(self):
        super().__init__()
        # Load pretrained ResNet18.
        # NOTE(review): newer torchvision releases prefer the `weights=`
        # argument over `pretrained=True`; kept as-is for compatibility.
        resnet = models.resnet18(pretrained=True)
        # Remove the final classification layer
        self.features = nn.Sequential(*list(resnet.children())[:-1])

    def forward(self, x):
        """Return a 2-D tensor of shape (batch, features).

        Everything after the batch axis is flattened. A bare ``squeeze()``
        would also drop the batch axis when the batch holds a single image,
        returning a 1-D tensor and breaking code that expects one row per
        image.
        """
        x = self.features(x)
        return torch.flatten(x, 1)

In [28]:
def create_augmentations():
    """Build the albumentations pipeline applied to positive samples.

    The transforms run in this order: flips and 90-degree rotations, colour
    jitter, then mild noise and blur.
    """
    geometric = [
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.RandomRotate90(p=0.5),
    ]
    photometric = [
        A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1, p=0.5),
    ]
    # Careful with these since they might affect medical features
    degradations = [
        A.GaussNoise(p=0.2),
        A.Blur(p=0.2),
    ]
    return A.Compose(geometric + photometric + degradations)

def extract_image_features(h5_path, df_index, target_series=None, batch_size=32, device='cuda', augment_positives=True, n_augmentations=2):
    """Extract CNN features from images in batches with optional augmentation.

    Images are read from the HDF5 archive by ID, resized to 224x224,
    normalised with the ImageNet mean/std, and passed through
    ``ImageFeatureExtractor``. When augmentation is enabled and targets are
    given, every positive sample contributes ``n_augmentations`` augmented
    copies in addition to the original image, so its ID appears several times
    in the returned ID list.

    Args:
        h5_path: Path of the HDF5 file holding the encoded images keyed by ID.
        df_index: Sequence of image IDs to process (sliced per batch).
        target_series: Optional mapping/Series from image ID to label; only
            rows with label 1 are augmented.
        batch_size: Number of IDs read per batch. The tensor batch can be
            larger because augmented copies are appended to it.
        device: Torch device. Falls back to CPU if CUDA is requested but not
            available.
        augment_positives: Whether to add augmented copies of positive samples.
        n_augmentations: Augmented copies created per positive sample.

    Returns:
        Tuple ``(features, valid_ids)``: a 2-D array with one row per
        processed image and the list of matching image IDs.

    Raises:
        ValueError: If none of the requested images could be processed.
    """
    # Requesting CUDA on a CPU-only machine would otherwise fail at `.to(device)`.
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        print("CUDA is not available; extracting image features on CPU instead.")
        device = 'cpu'
    on_cuda = str(device).startswith('cuda')

    model = ImageFeatureExtractor().to(device)
    model.eval()

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Setup augmentations if needed
    aug = create_augmentations() if augment_positives else None

    features_list = []
    valid_ids = []
    error_ids = []

    with h5py.File(h5_path, 'r') as h5_file:
        for i in tqdm(range(0, len(df_index), batch_size), desc="Extracting image features"):
            batch_ids = df_index[i:i + batch_size]
            batch_images = []
            batch_valid_ids = []

            for isic_id in batch_ids:
                try:
                    # Load image
                    img_bytes = h5_file[isic_id][()]
                    image = Image.open(io.BytesIO(img_bytes)).convert('RGB')
                    image = np.array(image)

                    # Apply augmentation for positive samples
                    if augment_positives and target_series is not None and target_series[isic_id] == 1:
                        for _ in range(n_augmentations):
                            aug_img = aug(image=image)['image']
                            batch_images.append(transform(Image.fromarray(aug_img)))
                            batch_valid_ids.append(isic_id)  # Duplicate ID for augmented samples

                    # Always include original image
                    batch_images.append(transform(Image.fromarray(image)))
                    batch_valid_ids.append(isic_id)

                except KeyError:
                    # ID missing from the archive (or from target_series): skip it.
                    error_ids.append(isic_id)
                    continue
                except Exception as e:
                    print(f"Error processing image {isic_id}: {str(e)}")
                    error_ids.append(isic_id)
                    continue

            if not batch_images:
                continue

            batch_tensor = torch.stack(batch_images).to(device)

            with torch.no_grad():
                batch_features = model(batch_tensor).cpu().numpy()

            # Guarantee one row per image. If the model squeezes its output, a
            # batch containing a single image comes back as a 1-D vector.
            batch_features = batch_features.reshape(len(batch_images), -1)

            features_list.append(batch_features)
            valid_ids.extend(batch_valid_ids)

            del batch_tensor
            if on_cuda:
                torch.cuda.empty_cache()

    if not features_list:
        print(f"\nFailed to find any valid images out of {len(df_index)} requested.")
        print(f"Error IDs sample: {error_ids[:10]}")
        raise ValueError("No valid images found")

    # Missing IDs are skipped silently above; report them once so they are not overlooked.
    if error_ids:
        print(f"Skipped {len(error_ids)} of {len(df_index)} requested images (sample: {error_ids[:10]})")

    return np.vstack(features_list), valid_ids

In [29]:
def get_sampling_ratio(n_pos, n_neg, n_augmentations=3):
    """Return the positive/negative sampling ratio, bounded to [0.01, 0.5].

    Args:
        n_pos: Number of original positive samples.
        n_neg: Number of negative samples (must be non-zero).
        n_augmentations: Augmented copies assumed per positive sample.

    Returns:
        ``n_pos * (1 + n_augmentations) / n_neg`` clipped to [0.01, 0.5].
    """
    total_pos = n_pos * (1 + n_augmentations)  # original + augmented
    raw_ratio = total_pos / n_neg
    if raw_ratio > 0.5:
        print("Warning: Sampling ratio is high!")
    # Enforce the 0.01 floor as well as the 0.5 cap; previously only the cap
    # was applied even though a minimum of 0.01 was intended.
    return max(0.01, min(0.5, raw_ratio))

In [30]:
def get_sampling_ratio(n_pos, n_neg, n_augmentations=3):
    """
    Compute the positive-to-negative sampling ratio, clipped to [0.01, 0.5].

    The positive count is scaled by (1 + n_augmentations) to account for the
    augmented copies of each positive sample. A warning is printed whenever
    the raw ratio had to be clipped.

    NOTE(review): the default of 3 augmentations does not match the 2
    augmented copies that extract_image_features creates per positive sample;
    confirm which value is intended.
    """
    lower_bound, upper_bound = 0.01, 0.5

    effective_positives = n_pos * (1 + n_augmentations)  # originals plus augmented copies
    raw_ratio = effective_positives / n_neg

    if raw_ratio > upper_bound:
        print(f"Warning: Raw sampling ratio {raw_ratio:.3f} was capped to 0.5")
        return upper_bound

    if raw_ratio < lower_bound:
        print(f"Warning: Raw sampling ratio {raw_ratio:.3f} was raised to 0.01")
        return lower_bound

    return raw_ratio

In [31]:
def train_and_evaluate_hybrid_model(df_train, df_test, train_h5, sample_size=None):
    """Cross-validate, fit and pickle the hybrid (tabular + CNN embedding) model.

    The function optionally sub-samples `df_train`, extracts CNN features for
    the training and test rows from `train_h5`, concatenates them with the
    tabular columns in `updated_feature_cols`, cross-validates a soft-voting
    ensemble of five under-sampled LightGBM pipelines, fits it on all
    training rows and writes the artifacts to ``hybrid_model_full.pkl``.

    Relies on notebook-level names: `target_col`, `group_col`, `seed`,
    `updated_feature_cols`, `lgb_params`, `cv`, `custom_cross_val`,
    `extract_image_features`, `get_sampling_ratio` and `ImageFeatureExtractor`.

    Args:
        df_train: training frame; its index values are passed to
            `extract_image_features` as image ids.
        df_test: hold-out frame, used the same way (its images are also read
            from `train_h5`).
        train_h5: path to the HDF5 file holding the images.
        sample_size: if given, train on a sub-sample of roughly this many
            rows (at least 50 positives, or all positives if fewer exist);
            if None, train on every row of `df_train`.

    Returns:
        (hybrid_scores, importance_df): the per-fold CV scores and a frame of
        mean feature importances sorted in descending order.

    Raises:
        ValueError: if the training data has no positive or no negative rows,
            or if CNN feature extraction fails.
    """
    pos_mask = df_train[target_col] == 1

    if sample_size is not None:
        # First determine minimum number of positive samples needed
        min_pos_samples = 50  # Set a minimum to ensure each fold has positive samples

        pos_indices = df_train[pos_mask].index
        neg_indices = df_train[~pos_mask].index

        # Take all positive samples if we have fewer than min_pos_samples
        n_pos = min(len(pos_indices), max(min_pos_samples, sample_size // 10))
        # Fill the rest of the budget with negatives
        n_neg = min(len(neg_indices), sample_size - n_pos)
    else:
        # No sub-sampling: the class counts come straight from the full frame.
        # (Previously n_pos / n_neg were undefined on this path and the later
        # get_sampling_ratio call raised NameError.)
        n_pos = int(pos_mask.sum())
        n_neg = int(len(df_train) - n_pos)

    if n_pos <= 0 or n_neg <= 0:
        raise ValueError(
            "Need at least one positive and one negative training sample "
            f"(got {n_pos} positive, {n_neg} negative)"
        )

    if sample_size is not None:
        print(f"Sampling {n_pos} positive and {n_neg} negative cases...")

        # Seeded generator so the sub-sample (and the scores) are reproducible
        rng = np.random.default_rng(seed)
        sampled_pos = rng.choice(pos_indices, size=n_pos, replace=False)
        sampled_neg = rng.choice(neg_indices, size=n_neg, replace=False)

        sampled_indices = np.concatenate([sampled_pos, sampled_neg])
        rng.shuffle(sampled_indices)  # Shuffle to mix positive and negative cases
        df_train = df_train.loc[sampled_indices]

    n_train_pos = int((df_train[target_col] == 1).sum())
    n_test_pos = int((df_test[target_col] == 1).sum())

    print("\nStarting with data sizes:")
    print(f"Training set: {len(df_train)} samples ({n_train_pos} positive)")
    print(f"Class ratio (neg/pos): {(len(df_train) - n_train_pos) / n_train_pos:.2f}")
    print(f"Test set: {len(df_test)} samples ({n_test_pos} positive)")

    total_start_time = time.time()

    # Extract CNN features with better error handling
    print("\nExtracting CNN features for training set...")
    try:
        train_cnn_features, train_valid_ids = extract_image_features(
            train_h5,
            df_train.index,
            target_series=df_train[target_col],  # Pass targets for augmentation
            batch_size=32,
            augment_positives=True  # Enable augmentation
        )
        print(f"Successfully extracted features for {len(train_valid_ids)} training images (including augmentations)")

        print("\nExtracting CNN features for test set...")
        test_cnn_features, test_valid_ids = extract_image_features(
            train_h5,
            df_test.index,
            augment_positives=False,  # No augmentation for test set
            batch_size=32
        )
        print(f"Successfully extracted features for {len(test_valid_ids)} test images")

    except ValueError as e:
        # Training and test images are both read from the same file, so a
        # single dump of its structure is all the diagnostics there is to show.
        print("\nError during feature extraction. Checking HDF5 files...")
        with h5py.File(train_h5, 'r') as f:
            h5_keys = list(f.keys())
        print("\nHDF5 structure (shared by training and test images):")
        print(f"Number of images: {len(h5_keys)}")
        print(f"Sample keys: {h5_keys[:5]}")

        raise ValueError("Feature extraction failed. Check HDF5 file paths and contents.") from e

    # Create feature names for CNN features
    cnn_feature_names = [f'cnn_feature_{i}' for i in range(train_cnn_features.shape[1])]

    # Create DataFrames with CNN features
    train_cnn_df = pd.DataFrame(
        train_cnn_features,
        index=train_valid_ids,
        columns=cnn_feature_names
    )

    test_cnn_df = pd.DataFrame(
        test_cnn_features,
        index=test_valid_ids,
        columns=cnn_feature_names
    )

    # Combine with tabular features (rows are selected in the order of the
    # ids returned by the extractor so both halves line up)
    X_train_hybrid = pd.concat([
        df_train.loc[train_valid_ids, updated_feature_cols],
        train_cnn_df
    ], axis=1)

    X_test_hybrid = pd.concat([
        df_test.loc[test_valid_ids, updated_feature_cols],
        test_cnn_df
    ], axis=1)

    y_train_hybrid = df_train.loc[train_valid_ids, target_col]
    groups_train_hybrid = df_train.loc[train_valid_ids, group_col]

    print("\nHybrid feature matrix shapes:")
    print(f"Training: {X_train_hybrid.shape}")
    print(f"Testing: {X_test_hybrid.shape}")

    # NOTE(review): get_sampling_ratio assumes 3 augmented copies per positive
    # by default - confirm this matches what extract_image_features produces.
    hybrid_sampling_ratio = get_sampling_ratio(n_pos, n_neg)

    # Create hybrid model with same structure as original but with updated
    # sampling ratio: one under-sampled LightGBM pipeline per seed.
    ensemble_seeds = (12, 22, 32, 42, 52)
    hybrid_model = VotingClassifier([
        (f'lgb{position}', Pipeline([
            ('sampler', RandomUnderSampler(sampling_strategy=hybrid_sampling_ratio, random_state=member_seed)),
            ('classifier', lgb.LGBMClassifier(**lgb_params, random_state=member_seed)),
        ]))
        for position, member_seed in enumerate(ensemble_seeds, start=1)
    ], voting='soft')

    # Cross-validation for hybrid model
    print("\nEvaluating hybrid model...")
    hybrid_scores, hybrid_importances = custom_cross_val(
        estimator=hybrid_model,
        X=X_train_hybrid,
        y=y_train_hybrid,
        cv=cv,
        groups=groups_train_hybrid
    )

    total_time = time.time() - total_start_time
    print(f"\nTraining Complete!")
    print(f"Total time: {total_time:.2f} seconds ({total_time/60:.2f} minutes)")

    print(f"Hybrid model mean CV score: {np.mean(hybrid_scores):.4f} (±{np.std(hybrid_scores):.4f})")

    # Feature importance analysis
    mean_importances = hybrid_importances.mean(axis=0)
    importance_df = pd.DataFrame({
        'feature': list(X_train_hybrid.columns),
        'importance': mean_importances
    }).sort_values('importance', ascending=False)

    print("\nTop 20 Most Important Features:")
    print(importance_df.head(20))

    # Train final hybrid model
    print("\nTraining final hybrid model...")
    hybrid_model.fit(X_train_hybrid, y_train_hybrid)

    # Save the hybrid model and feature info
    print("\nSaving hybrid model...")

    model_artifacts = {
        'model': hybrid_model,
        'feature_columns': list(X_train_hybrid.columns),
        'cnn_feature_names': cnn_feature_names,
        # NOTE(review): this stores a freshly constructed extractor, not the
        # instance used above - presumably both carry the same fixed
        # pretrained weights; confirm, otherwise inference features will differ.
        'feature_extractor': ImageFeatureExtractor().to('cpu'),  # Save CNN model too
        'transform': transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
    }

    with open('hybrid_model_full.pkl', 'wb') as f:
        pickle.dump(model_artifacts, f)

    print("\nSaved Succesfully!")

    return hybrid_scores, importance_df

In [32]:
if __name__ == "__main__":

    # Quick run on a 1000-row sub-sample of the training split;
    # pass sample_size=len(df_train) instead to train on all of df_train.
    hybrid_scores, importance_df = train_and_evaluate_hybrid_model(
        df_train=df_train,
        df_test=df_test,
        train_h5=train_h5,
        sample_size=1000,
    )

Sampling 100 positive and 900 negative cases...

Starting with data sizes:
Training set: 1000 samples (100 positive)
Class ratio (neg/pos): 9.00
Test set: 78 samples (39 positive)

Extracting CNN features for training set...


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 151MB/s] 
Extracting image features: 100%|██████████| 32/32 [00:13<00:00,  2.36it/s]


Successfully extracted features for 1200 training images (including augmentations)

Extracting CNN features for test set...


Extracting image features: 100%|██████████| 3/3 [00:00<00:00,  3.22it/s]


Successfully extracted features for 78 test images

Hybrid feature matrix shapes:
Training: (1200, 588)
Testing: (78, 588)

Evaluating hybrid model...


Folds:  20%|██        | 1/5 [00:10<00:42, 10.63s/it, Score=0.1730]

Fold 1 completed in 10.63 seconds


Folds:  40%|████      | 2/5 [00:21<00:31, 10.60s/it, Score=0.1691]

Fold 2 completed in 10.58 seconds


Folds:  60%|██████    | 3/5 [00:29<00:19,  9.74s/it, Score=0.1441]

Fold 3 completed in 8.71 seconds


Folds:  80%|████████  | 4/5 [00:38<00:09,  9.12s/it, Score=0.1592]

Fold 4 completed in 8.16 seconds


Folds: 100%|██████████| 5/5 [00:48<00:00,  9.74s/it, Score=0.1618]

Fold 5 completed in 10.61 seconds



Training Complete!
Total time: 64.56 seconds (1.08 minutes)
Hybrid model mean CV score: 0.1615 (±0.0100)

Top 20 Most Important Features:
                      feature  importance
1      clin_size_long_diam_mm       79.00
13                   tbp_lv_H       68.40
32        mean_hue_difference       56.32
14                tbp_lv_Hext       50.24
34   overall_color_difference       38.44
18              tbp_lv_deltaB       36.20
2              tbp_lv_areaMM2       33.28
4          tbp_lv_minorAxisMM       33.08
28       size_age_interaction       27.40
26    perimeter_to_area_ratio       27.04
29       color_contrast_index       26.92
3          tbp_lv_perimeterMM       25.08
109            cnn_feature_33       22.00
27    area_to_perimeter_ratio       21.28
389           cnn_feature_313       20.88
30            log_lesion_area       19.72
361           cnn_feature_285       19.40
23               hue_contrast       18.96
86             cnn_feature_





Saving hybrid model...

Saved Succesfully!


# Evaluate both models on the holdout test set

In [33]:
import pickle
import numpy as np
from sklearn.metrics import roc_auc_score
import h5py
from PIL import Image
import io
import torch
import torch.nn as nn
from torchvision import transforms
from tqdm import tqdm

def custom_metric_test(y_true, y_pred, min_tpr=0.80):
    """Calculate the partial AUC above `min_tpr` for test set predictions.

    Args:
        y_true: array-like of binary labels (0/1); lists, NumPy arrays and
            pandas Series are all accepted.
        y_pred: array-like of predicted probabilities for the positive class.
        min_tpr: lower bound of the true-positive-rate band that is scored
            (defaults to 0.80, the value previously hard-coded).

    Returns:
        The un-normalised partial AUC, a float between 0 and `1 - min_tpr`.
    """
    max_fpr = abs(1 - min_tpr)

    # Flip labels and scores: in the flipped problem the false-positive rate
    # equals 1 - TPR of the original one, so "TPR >= min_tpr" becomes
    # "FPR <= max_fpr", which roc_auc_score can restrict to directly.
    v_gt = np.abs(np.asarray(y_true) - 1)
    v_pred = 1.0 - np.asarray(y_pred, dtype=float)

    # With max_fpr set, roc_auc_score returns the standardised partial AUC
    # (0.5 = chance, 1.0 = perfect) ...
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # ... which is mapped back to the raw area under the restricted curve.
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    return partial_auc

def evaluate_tabular_model(model_path, df_test, feature_cols=None):
    """Evaluate the tabular-only model on `df_test`.

    Args:
        model_path: path to a pickle holding either the fitted model itself
            or a dict with a 'model' entry and, optionally, the training
            columns under 'feature_columns' or 'new_feature_cols'.
        df_test: frame containing the feature columns and a 'target' column.
        feature_cols: columns to feed the model. Columns stored in the pickle
            take precedence; if neither source provides any, every column of
            `df_test` except 'target' and 'patient_id' is used.

    Returns:
        (score, y_pred): the partial AUC from `custom_metric_test` and the
        predicted positive-class probabilities, in `df_test` row order.
    """
    # SECURITY: pickle.load executes arbitrary code - only load trusted files.
    with open(model_path, 'rb') as f:
        model_data = pickle.load(f)

    # Handle both dictionary and direct model cases
    if isinstance(model_data, dict):
        model = model_data['model']
        saved_cols = model_data.get("feature_columns") or model_data.get("new_feature_cols")
        # Only override the caller's columns when the artifact actually
        # stores some; previously a dict without them left feature_cols=None.
        if saved_cols:
            feature_cols = saved_cols
    else:
        model = model_data

    if feature_cols is None:
        # If no feature columns are known, use the ones from df_test
        feature_cols = [col for col in df_test.columns if col not in ('target', 'patient_id')]

    # Prepare test features
    X_test = df_test[list(feature_cols)]
    y_test = df_test['target']

    # Get predictions (probability of the positive class)
    y_pred = model.predict_proba(X_test)[:, 1]

    # Calculate metrics
    score = custom_metric_test(y_test, y_pred)

    return score, y_pred

def evaluate_hybrid_model(model_path, df_test, test_h5_path, batch_size=32, device='cuda'):
    """Evaluate the hybrid (tabular + CNN embedding) model on `df_test`.

    Args:
        model_path: path to the pickle written by the hybrid training step; it
            must hold a dict with 'model', 'feature_columns',
            'cnn_feature_names', 'feature_extractor' and 'transform'.
        df_test: frame whose index values are the HDF5 keys of the images and
            which contains the tabular feature columns plus 'target'.
        test_h5_path: HDF5 file storing the encoded image bytes per key.
        batch_size: number of images pushed through the CNN at once.
        device: torch device for the CNN; a CUDA device is replaced by 'cpu'
            when CUDA is not available.

    Returns:
        (score, y_pred, test_valid_ids): the partial AUC, the predicted
        positive-class probabilities and the ids of the rows that were scored
        (rows whose image could not be read are skipped).

    Raises:
        ValueError: if no image at all could be loaded.
    """
    # SECURITY: pickle.load executes arbitrary code - only load trusted files.
    with open(model_path, 'rb') as f:
        model_data = pickle.load(f)

    # Fall back gracefully on CPU-only machines instead of crashing in .to()
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        print("CUDA is not available; falling back to CPU for feature extraction.")
        device = 'cpu'

    model = model_data['model']
    feature_columns = model_data['feature_columns']
    cnn_feature_names = model_data['cnn_feature_names']
    feature_extractor = model_data['feature_extractor'].to(device)
    transform = model_data['transform']

    feature_extractor.eval()

    # Extract CNN features for test set
    print("Extracting CNN features for test set...")
    test_cnn_features, test_valid_ids = [], []

    with h5py.File(test_h5_path, 'r') as h5_file:
        for i in tqdm(range(0, len(df_test), batch_size)):
            batch_ids = df_test.index[i:i + batch_size]
            batch_images = []
            batch_valid_ids = []

            for isic_id in batch_ids:
                try:
                    # Load and process image
                    img_bytes = h5_file[isic_id][()]
                    image = Image.open(io.BytesIO(img_bytes)).convert('RGB')
                    image_tensor = transform(image)
                    batch_images.append(image_tensor)
                    batch_valid_ids.append(isic_id)
                except Exception as e:
                    # Best effort: report the image and leave it out of the evaluation
                    print(f"Error processing image {isic_id}: {str(e)}")
                    continue

            if not batch_images:
                continue

            # Process batch
            batch_tensor = torch.stack(batch_images).to(device)
            with torch.no_grad():
                batch_features = feature_extractor(batch_tensor).cpu().numpy()

            test_cnn_features.append(batch_features)
            test_valid_ids.extend(batch_valid_ids)

            del batch_tensor
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    if not test_cnn_features:
        # np.vstack([]) would fail with a far less helpful message
        raise ValueError(
            f"No valid images found: none of the {len(df_test)} requested ids "
            f"could be read from {test_h5_path}"
        )

    # Combine features
    test_cnn_features = np.vstack(test_cnn_features)
    test_cnn_df = pd.DataFrame(
        test_cnn_features,
        index=test_valid_ids,
        columns=cnn_feature_names
    )

    # Create final feature matrix
    cnn_name_set = set(cnn_feature_names)
    tabular_columns = [col for col in feature_columns if col not in cnn_name_set]
    X_test_hybrid = pd.concat([
        df_test.loc[test_valid_ids, tabular_columns],
        test_cnn_df
    ], axis=1)
    # Present the columns in exactly the order the model was trained on
    X_test_hybrid = X_test_hybrid.loc[:, list(feature_columns)]

    y_test = df_test.loc[test_valid_ids, 'target']

    # Get predictions (probability of the positive class)
    y_pred = model.predict_proba(X_test_hybrid)[:, 1]

    # Calculate metrics
    score = custom_metric_test(y_test, y_pred)

    return score, y_pred, test_valid_ids

def compare_models(df_test, tabular_model_path, hybrid_model_path, test_h5_path, feature_cols=None):
    """Compare the tabular and the hybrid model on the same test rows.

    Args:
        df_test: hold-out frame (index values are image ids, must contain 'target').
        tabular_model_path: pickle of the tabular-only model.
        hybrid_model_path: pickle of the hybrid model artifacts.
        test_h5_path: HDF5 file with the test images.
        feature_cols: tabular feature columns forwarded to `evaluate_tabular_model`.

    Returns:
        A DataFrame with the true label and both models' predicted
        probabilities, one row per image the hybrid model could score.
    """
    print("Evaluating tabular model...")
    tab_score, tab_pred = evaluate_tabular_model(tabular_model_path, df_test, feature_cols)

    print("\nEvaluating hybrid model...")
    hyb_score, hyb_pred, valid_ids = evaluate_hybrid_model(
        hybrid_model_path, df_test, test_h5_path
    )

    # Align the tabular predictions with the hybrid ones by id (not by
    # position) so both columns refer to the same rows in the same order.
    y_true = df_test.loc[valid_ids, 'target']
    tab_pred_aligned = pd.Series(tab_pred, index=df_test.index).loc[valid_ids].to_numpy()

    if len(valid_ids) != len(df_test):
        # Some images were unreadable: the hybrid score only covers the
        # remaining rows, so score the tabular model on that same subset to
        # keep the comparison like-for-like.
        print(
            f"\nNote: {len(df_test) - len(valid_ids)} test rows had no usable image; "
            f"both scores are computed on the remaining {len(valid_ids)} rows."
        )
        tab_score = custom_metric_test(y_true, tab_pred_aligned)

    # Create comparison DataFrame
    comparison_df = pd.DataFrame({
        'True_Label': y_true,
        'Tabular_Pred': tab_pred_aligned,
        'Hybrid_Pred': hyb_pred
    })

    print("\nModel Comparison Results:")
    print(f"Tabular Model pAUC: {tab_score:.4f}")
    print(f"Hybrid Model pAUC:  {hyb_score:.4f}")

    return comparison_df

In [55]:
# import os
# # print("\nContents of /kaggle directory:")
# # print(os.listdir('/kaggle'))

# # print("\nContents of Model directory (if it exists):")
# # if os.path.exists('/kaggle/input/models/pytorch/default/1'):
# #     print(os.listdir('/kaggle/input/models/pytorch/default/1'))

# # print("\nContents of /kaggle/working directory:")
# # print(os.listdir('/kaggle/working'))

# if os.path.exists('/kaggle/working/'):
#     print(os.listdir('/kaggle/working'))


In [52]:
# Show which tabular feature set is in play before scoring the two models
print(
    "Feature columns we're using:",
    f"Number of features: {len(updated_feature_cols)}",
    f"Sample features: {updated_feature_cols[:5]}",
    sep="\n",
)

# Both artifacts are read from /kaggle/working (an uploaded copy of the hybrid
# artifact would live at /kaggle/input/models/pytorch/default/1/hybrid_model_full.pkl).
# The hold-out images are read from the training HDF5 file.
comparison_df = compare_models(
    df_test,
    '/kaggle/working/model.pkl',
    '/kaggle/working/hybrid_model_full.pkl',
    train_h5,
    feature_cols=updated_feature_cols,
)

Feature columns we're using:
Number of features: 76
Sample features: ['age_approx', 'clin_size_long_diam_mm', 'tbp_lv_areaMM2', 'tbp_lv_perimeterMM', 'tbp_lv_minorAxisMM']
Evaluating tabular model...

Evaluating hybrid model...
Extracting CNN features for test set...


100%|██████████| 3/3 [00:00<00:00,  3.38it/s]



Model Comparison Results:
Tabular Model pAUC: 0.1695
Hybrid Model pAUC:  0.1285


In [18]:
# First let's look at what we actually saved
import pickle

# Tabular artifact: report the type of the unpickled object
print("Checking tabular model contents:")
with open('/kaggle/input/models/pytorch/default/1/model.pkl', mode='rb') as f:
    tabular_model = pickle.load(f)
print(f"Tabular model type: {type(tabular_model)}")

# Hybrid artifact: report its type and, when it is a dict, the keys it carries
print("\nChecking hybrid model contents:")
with open('/kaggle/input/models/pytorch/default/1/hybrid_model_full.pkl', mode='rb') as f:
    hybrid_model = pickle.load(f)
print(f"Hybrid model type: {type(hybrid_model)}")
if isinstance(hybrid_model, dict):
    print(f"Hybrid model keys: {hybrid_model.keys()}")
else:
    print(f"Hybrid model keys: {'Not a dictionary'}")

Checking tabular model contents:
Tabular model type: <class 'sklearn.ensemble._voting.VotingClassifier'>

Checking hybrid model contents:
Hybrid model type: <class 'dict'>
Hybrid model keys: dict_keys(['model', 'feature_columns', 'cnn_feature_names', 'feature_extractor', 'transform'])


# Training Final Models

### Final Model 1 - Tabular Only

In [None]:
# df_train_full_processed, df_competition_test_processed, new_cat_cols, updated_feature_cols = preprocess(
#     df_train_full, df_competition_test, cat_cols, new_feature_cols
# )

# X, y = df_train_full_processed[updated_feature_cols], df_train_full_processed[target_col] # UPDATE TO FULL TRAINING SET !

# estimator.fit(X, y)

# with open('model.pkl', 'wb') as file:
#     pickle.dump(estimator, file)

### Final Model 2 - Hybrid

In [None]:
# Not building until test and CV scores improve...