# <b><span style='color:#F1A424'>|</span> RUN ONLY ON GPU <span style='color:#F1A424'>|</span></b>

In [None]:
import warnings
warnings.filterwarnings('ignore')

import re
import gc
import os
import io
import copy
import timm
import h5py
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from PIL import Image
from io import BytesIO
import tensorflow as tf
import matplotlib.pyplot as plt
from collections import defaultdict

import torch
from torch import nn
import torch.nn.functional as F
from torchvision.transforms import ToTensor
from torchvision import datasets, transforms
from torch.optim import Adam, SGD, lr_scheduler
from torch.utils.data import Subset, Dataset, DataLoader, ConcatDataset

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score 
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from imblearn.pipeline import Pipeline
from sklearn.utils import class_weight

from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
import lightgbm as lgb
import catboost as cb
import xgboost as xgb

## Building the main dataframe and creating the paths to the metadata

In [None]:
# Load data
# Root folder of the ISIC 2024 competition files; every path below is built from it.
kaggle_path = '/kaggle/input/isic-2024-challenge'
train_metadata_path = os.path.join(kaggle_path, 'train-metadata.csv')
test_metadata_path = os.path.join(kaggle_path, 'test-metadata.csv')
# Image containers in HDF5 format (opened later by ImageLoaderWithMetadata, which looks images up by isic_id).
train_image_path = os.path.join(kaggle_path, 'train-image.hdf5')
test_image_path = os.path.join(kaggle_path, 'test-image.hdf5')

# Tabular metadata for the train and test splits.
isic_2024_metadata_df = pd.read_csv(train_metadata_path)
isic_2024_metadata_test_df = pd.read_csv(test_metadata_path)

# Notebook display of the raw training metadata.
isic_2024_metadata_df

## Creating Real Validation

In [None]:
# Split the training metadata by class (target 0 = benign, 1 = malignant), each with a fresh 0..n-1 index
# so that the positional slices taken afterwards are easy to reason about.
benign_df_temporary    = isic_2024_metadata_df[isic_2024_metadata_df['target'] == 0].reset_index(drop=True)
malignant_df_temporary = isic_2024_metadata_df[isic_2024_metadata_df['target'] == 1].reset_index(drop=True)

In [None]:
# Hold-out ("untouchable") rows that must never be seen during training.
# The cut-offs have to match the trainable slices (benign iloc[:30_000], malignant iloc[:300]):
# starting the malignant hold-out at row 100 would put rows 100-299 in BOTH sets and leak
# training lesions into the validation data.
benign_df_untouchable    = benign_df_temporary.iloc[30_000:].reset_index(drop=True)
malignant_df_untouchable = malignant_df_temporary.iloc[300:].reset_index(drop=True)

In [None]:
benign_df_untouchable.head(5)

In [None]:
malignant_df_untouchable.head(5)

In [None]:
untouchable_2024_df = pd.concat([malignant_df_untouchable, benign_df_untouchable]).reset_index(drop=True)

In [None]:
untouchable_2024_df['target'].value_counts()

In [None]:
# Rows available for training: the first 30_000 benign and the first 300 malignant rows
# of the per-class frames built above.
benign_df_trainable    = benign_df_temporary.iloc[:30_000].reset_index(drop=True)
malignant_df_trainable = malignant_df_temporary.iloc[:300].reset_index(drop=True)

In [None]:
# Rebuild the working metadata frame from the trainable rows only and shuffle it.
# NOTE(review): sample(frac=1) is called without random_state, so the row order (and therefore the
# positional benign subsets taken later) changes from run to run — pass a seed if reproducibility matters.
isic_2024_metadata_df = pd.concat([malignant_df_trainable, benign_df_trainable]).sample(frac=1).reset_index(drop=True)

In [None]:
isic_2024_metadata_df['target'].value_counts()

## Back to the old code

In [None]:
isic_2024_metadata_df.describe(include='all')

## Device initialization

In [None]:
# Choose the compute device by priority: CUDA GPU, then Apple MPS, then plain CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

# <b>1 <span style='color:#F1A424'>|</span> 2024 METADATA Pre-processing <span style='color:#F1A424'>|</span></b>

## Feature Engineering functions


In [None]:
# Extracting features and labels from the DataFrame
# Categorical columns; they are integer-encoded with an OrdinalEncoder further down.
# Note that 'patient_id' is treated as a categorical feature as well.
features_cat = ["sex", "tbp_tile_type", "tbp_lv_location", "tbp_lv_location_simple", "anatom_site_general",'patient_id']

# Raw numerical columns used as-is (the engineered columns are appended to these later).
features_num = [
    'age_approx', 'clin_size_long_diam_mm', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext', 
    'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L', 
    'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean', 
    'tbp_lv_deltaA', 'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLB',
    'tbp_lv_deltaLBnorm', 'tbp_lv_eccentricity', 'tbp_lv_minorAxisMM',
    'tbp_lv_nevi_confidence', 'tbp_lv_norm_border', 'tbp_lv_norm_color',
    'tbp_lv_perimeterMM', 'tbp_lv_radial_color_std_max', 'tbp_lv_stdL',
    'tbp_lv_stdLExt', 'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle',
    'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z',
]
# Columns deliberately left out of training. These two lists are purely informational in the
# visible part of the notebook: the training frame is built by selecting train_cols, not by dropping them.
useless_features = ['lesion_id','attribution', 'copyright_license', 'image_type','iddx_full','iddx_1','iddx_2','iddx_3','iddx_4','iddx_5','mel_mitotic_index']
# presumably columns that are not available at test time — TODO confirm against the competition data description
forbiden_features = ['mel_thick_mm','tbp_lv_dnn_lesion_confidence']
# Name of the label column, kept as a one-element list.
target = ['target']

In [None]:
def feature_engineering(df):
    """Add the hand-crafted lesion features to `df` and report their names.

    The frame is modified in place (pass a copy to keep the original intact).
    Ratios are not guarded against zero denominators, so the new columns may
    contain inf/NaN; the caller is expected to clean those up.

    Args:
        df: metadata frame holding the raw `tbp_lv_*`, `clin_size_long_diam_mm`
            and `age_approx` columns.

    Returns:
        (df, new_num_cols): the same frame object with the extra columns, and
        the list of the added column names in creation order.
    """
    # (column name, builder) pairs. They are applied strictly in this order because
    # "shape_complexity_index" reads two columns created earlier in the table.
    recipes = [
        ("lesion_size_ratio", lambda d: d["tbp_lv_minorAxisMM"] / d["clin_size_long_diam_mm"]),
        ("lesion_shape_index", lambda d: d["tbp_lv_areaMM2"] / (d["tbp_lv_perimeterMM"] ** 2)),
        ("hue_contrast", lambda d: (d["tbp_lv_H"] - d["tbp_lv_Hext"]).abs()),
        ("luminance_contrast", lambda d: (d["tbp_lv_L"] - d["tbp_lv_Lext"]).abs()),
        ("lesion_color_difference", lambda d: np.sqrt(d["tbp_lv_deltaA"] ** 2 + d["tbp_lv_deltaB"] ** 2 + d["tbp_lv_deltaL"] ** 2)),
        ("border_complexity", lambda d: d["tbp_lv_norm_border"] + d["tbp_lv_symm_2axis"]),
        ("color_uniformity", lambda d: d["tbp_lv_color_std_mean"] / d["tbp_lv_radial_color_std_max"]),
        ("3d_position_distance", lambda d: np.sqrt(d["tbp_lv_x"] ** 2 + d["tbp_lv_y"] ** 2 + d["tbp_lv_z"] ** 2)),
        ("perimeter_to_area_ratio", lambda d: d["tbp_lv_perimeterMM"] / d["tbp_lv_areaMM2"]),
        ("lesion_visibility_score", lambda d: d["tbp_lv_deltaLBnorm"] + d["tbp_lv_norm_color"]),
        ("symmetry_border_consistency", lambda d: d["tbp_lv_symm_2axis"] * d["tbp_lv_norm_border"]),
        ("color_consistency", lambda d: d["tbp_lv_stdL"] / d["tbp_lv_Lext"]),
        ("size_age_interaction", lambda d: d["clin_size_long_diam_mm"] * d["age_approx"]),
        ("hue_color_std_interaction", lambda d: d["tbp_lv_H"] * d["tbp_lv_color_std_mean"]),
        ("lesion_severity_index", lambda d: (d["tbp_lv_norm_border"] + d["tbp_lv_norm_color"] + d["tbp_lv_eccentricity"]) / 3),
        ("shape_complexity_index", lambda d: d["border_complexity"] + d["lesion_shape_index"]),
        ("color_contrast_index", lambda d: d["tbp_lv_deltaA"] + d["tbp_lv_deltaB"] + d["tbp_lv_deltaL"] + d["tbp_lv_deltaLBnorm"]),
        ("log_lesion_area", lambda d: np.log(d["tbp_lv_areaMM2"] + 1)),
        ("normalized_lesion_size", lambda d: d["clin_size_long_diam_mm"] / d["age_approx"]),
        # Despite its name this is the mean of the inner and outer hue values.
        ("mean_hue_difference", lambda d: (d["tbp_lv_H"] + d["tbp_lv_Hext"]) / 2),
        ("std_dev_contrast", lambda d: np.sqrt((d["tbp_lv_deltaA"] ** 2 + d["tbp_lv_deltaB"] ** 2 + d["tbp_lv_deltaL"] ** 2) / 3)),
        ("color_shape_composite_index", lambda d: (d["tbp_lv_color_std_mean"] + d["tbp_lv_area_perim_ratio"] + d["tbp_lv_symm_2axis"]) / 3),
        ("3d_lesion_orientation", lambda d: np.arctan2(d["tbp_lv_y"], d["tbp_lv_x"])),
        ("overall_color_difference", lambda d: (d["tbp_lv_deltaA"] + d["tbp_lv_deltaB"] + d["tbp_lv_deltaL"]) / 3),
        ("symmetry_perimeter_interaction", lambda d: d["tbp_lv_symm_2axis"] * d["tbp_lv_perimeterMM"]),
        ("comprehensive_lesion_index", lambda d: (d["tbp_lv_area_perim_ratio"] + d["tbp_lv_eccentricity"] + d["tbp_lv_norm_color"] + d["tbp_lv_symm_2axis"]) / 4),
    ]

    for column_name, build in recipes:
        df[column_name] = build(df)

    new_num_cols = [column_name for column_name, _ in recipes]
    return df, new_num_cols

## Feature engineering and encoding 2024

In [None]:
# Work on a copy: feature_engineering adds its columns in place.
isic_2024_metadata_engineered, new_num_cols = feature_engineering(isic_2024_metadata_df.copy())

# All numerical inputs (raw + engineered), then the full list of training columns.
num_cols = features_num + new_num_cols
train_cols = num_cols + features_cat 

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold

# Encoder that maps every categorical column to integer codes.
category_encoder = OrdinalEncoder(
    categories='auto', # The encoder will automatically determine the categories for each feature.
    dtype=int, # output them as integers
    handle_unknown='use_encoded_value', # The encoder will use a specified integer value for these unknown categories.
    unknown_value=-2, # which is -2 for unknown values
    encoded_missing_value=-1, # and -1 for encoded missing value
)

# Fit on the training frame and overwrite each categorical column with its integer codes.
X_cat = category_encoder.fit_transform(isic_2024_metadata_engineered[features_cat])
for c, cat_col in enumerate(features_cat):
    isic_2024_metadata_engineered[cat_col] = X_cat[:, c]

# The engineered ratios divide without guarding the denominator, so they can hold +/-inf;
# turn those into NaN and then replace every NaN in the frame by 0.
isic_2024_metadata_engineered = isic_2024_metadata_engineered.replace([np.inf, -np.inf], np.nan).fillna(0)

In [None]:
# Drop useless features
# Done by keeping only the training columns plus the label and the image identifier
# (isic_id is needed later to look images up).
isic_2024_metadata_engineered = isic_2024_metadata_engineered[train_cols + ['target','isic_id']]
isic_2024_metadata_engineered

## Displaying feature correlation

In [None]:
# Correlation heatmap over all numerical and encoded categorical features plus the target.
corr_data = isic_2024_metadata_engineered[num_cols + features_cat + ['target']].copy()
dataplot = sns.heatmap(corr_data.corr(), cmap="YlGnBu", annot=False)
plt.show()

In [None]:
# Absolute correlation of every training column with the target, strongest first.
correlations = isic_2024_metadata_engineered[train_cols+['target']].corr().loc[:, 'target'].abs()
sorted_correlations = correlations.sort_values(ascending=False)
print(sorted_correlations.apply(lambda x: f"{x*100:.2f}%"))
# Position 0 is skipped on the assumption that it is 'target' itself (correlation 1 with itself);
# the next five entries are reported as the most correlated features.
top_correlated_features = sorted_correlations.index.tolist()[1:6]
print("Most correlated features to the target :", top_correlated_features)

# <b>2 <span style='color:#F1A424'>|</span> 2024 IMAGE Preprocessing <span style='color:#F1A424'>|</span></b>


## ISIC 2024 Image Loader Class

In [None]:
class ImageLoaderWithMetadata(Dataset):
    """Dataset reading encoded images from an HDF5 file, keyed by `isic_id`.

    Args:
        df: metadata frame with an 'isic_id' column (and 'target' when
            `has_target` is true).
        file_hdf: path of the HDF5 file; it is opened read-only at
            construction time and kept open on the instance.
        transform: optional callable invoked as `transform(image=array)` that
            returns a mapping with an 'image' entry. When it is given, the
            transformed image is additionally divided by 255.
        subset: optional `(start, stop)` positional row range. It is applied
            only when `stop - start` is smaller than `len(df)`; otherwise the
            whole frame is used.
        has_target: whether items are `(image, target)` pairs or bare images.
    """

    def __init__(self, df, file_hdf, transform=None, subset=None, has_target=True):
        self.fp_hdf = h5py.File(file_hdf, mode="r")
        self.transform = transform
        self.has_target = has_target

        # Slice only for a range narrower than the frame; anything else keeps df untouched.
        take_slice = subset is not None and subset[1]-subset[0] < len(df)
        self.df = df.iloc[subset[0]:subset[1]].reset_index(drop=True) if take_slice else df

        self.isic_ids = self.df['isic_id'].values
        if self.has_target:
            self.targets = self.df['target'].values

    def __len__(self):
        return len(self.isic_ids)

    def __getitem__(self, index):
        # The HDF5 entry holds the encoded image bytes; PIL decodes them.
        encoded = self.fp_hdf[self.isic_ids[index]][()]
        image = Image.open(BytesIO(encoded))

        if self.transform:
            pixels = np.array(image)
            image = self.transform(image=pixels)['image'] / 255

        if not self.has_target:
            return image
        return (image, self.targets[index])


### Image displaying functions

In [None]:
import matplotlib.pyplot as plt
from numpy import random    

def display_random_grid_images(dataset, n=3, seed=None):
    """Show up to n*n randomly chosen samples of `dataset` in an n-by-n grid.

    Args:
        dataset: indexable object whose items are `(image, target)` pairs and
            which exposes an `isic_ids` attribute indexable by the same index.
            Images may be torch tensors in channel-first layout or anything
            `imshow` accepts.
        n: side length of the grid.
        seed: when given, NumPy's global random generator is seeded with it so
            the same images are drawn again.

    If the dataset holds fewer than n*n items, all of them are shown and the
    remaining cells are left blank.
    """
    if seed is not None:
        np.random.seed(seed)

    # Never ask for more unique indices than the dataset can provide.
    n_shown = min(n * n, len(dataset))
    indices = np.random.choice(len(dataset), n_shown, replace=False)

    # squeeze=False keeps `axs` a 2-D array even for n == 1, so flatten() always works.
    fig, axs = plt.subplots(n, n, figsize=(10, 10), squeeze=False)
    axs = axs.flatten()

    for ax, idx in zip(axs, indices):
        img, target = dataset[idx]
        isic_id = dataset.isic_ids[idx]
        if isinstance(img, torch.Tensor):
            # channel-first tensor -> channel-last array for imshow
            img = img.permute(1, 2, 0).numpy()

        ax.imshow(img)
        ax.axis('off')
        target_name = 'Benign' if target==0 else 'Malignant'
        ax.set_title(f'ID: {isic_id} \n Target: {target_name}')

    # Blank out the cells that received no image.
    for ax in axs[n_shown:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

## Data Augmentation functions


In [None]:
import albumentations as A
from albumentations.pytorch import ToTensorV2

image_size = (137, 137) # (height, width)


# Training pipeline for the 2024 images: geometric and colour augmentations followed by a
# fixed-size resize. The order matters: ToTensorV2 must stay last.
train_transform_and_augment = A.Compose([
    A.SmallestMaxSize(max_size=137),  # Rescale so the smallest side is 137 px, keeping the aspect ratio
    A.ShiftScaleRotate(shift_limit=0.2, scale_limit=0.01, rotate_limit=30, p=1),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),   
    A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.5, p=0.8),
    A.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.3),  # Slight R, G and B shift
    A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=10, val_shift_limit=10, p=0.1),  # Slight changes to hue, saturation, and value (brightness)
    
    A.Resize(image_size[0], image_size[1]),  # Resize to the target size
    ToTensorV2(),  # Convert the image to a PyTorch tensor
])

# Deterministic pipeline: resize and tensor conversion only, no augmentation.
train_transform_no_augment = A.Compose([
    A.Resize(image_size[0], image_size[1]),  # Resize to the target size
    ToTensorV2(),  # Convert the image to a PyTorch tensor
])

## Image Loading and Augmentation

In [None]:
# Per-class views of the engineered training frame, re-indexed from 0 so they can be sliced by position.
isic_2024_benign_df = isic_2024_metadata_engineered[isic_2024_metadata_engineered['target'] == 0].reset_index(drop=True)
isic_2024_malignant_df = isic_2024_metadata_engineered[isic_2024_metadata_engineered['target'] == 1].reset_index(drop=True)

In [None]:
# Define the sample models
subset_models = [(0, 10_000), (10_000, 20_000), (20_000, 30_000)]

# Function to create benign datasets for a given model
def create_benign_datasets(subset):
    """Build two loaders over the same benign row range.

    Args:
        subset: `(start, stop)` positional range into `isic_2024_benign_df`.

    Returns:
        (benign_no_augment, benign_augment): the first uses the plain resize
        pipeline, the second the augmenting pipeline.
    """
    plain, augmented = (
        ImageLoaderWithMetadata(df=isic_2024_benign_df, file_hdf=train_image_path, transform=pipeline, subset=subset)
        for pipeline in (train_transform_no_augment, train_transform_and_augment)
    )
    return plain, augmented

# Function to create malignant datasets with augmentations
def create_malignant_datasets():
    """Return four loaders over all malignant rows: one plain, then three augmented.

    The three augmented loaders share the same random pipeline, so concatenating
    the four datasets oversamples the malignant class with varied images.
    """
    pipelines = [train_transform_no_augment] + [train_transform_and_augment] * 3
    return [
        ImageLoaderWithMetadata(df=isic_2024_malignant_df, file_hdf=train_image_path, transform=pipeline)
        for pipeline in pipelines
    ]

# Create benign datasets for each model
# Only the non-augmented benign loader is kept; the augmented one returned second is discarded.
benign_dataset_model_1, _ = create_benign_datasets(subset_models[0])
benign_dataset_model_2, _ = create_benign_datasets(subset_models[1])
benign_dataset_model_3, _ = create_benign_datasets(subset_models[2])

# Create malignant datasets
# Each model gets its own list of four loaders (1 plain + 3 augmented) over the same malignant rows.
malignant_dataset_model_1 = create_malignant_datasets()
malignant_dataset_model_2 = create_malignant_datasets()
malignant_dataset_model_3 = create_malignant_datasets()

# <b>3 <span style='color:#F1A424'>|</span> 2018-19-20 IMAGE Pre-processing <span style='color:#F1A424'>|</span></b>

## Loading Paths and DataFrames

In [None]:
# Load data 2020, 2019 and 2018 (resized 256x256 JPEG versions of the earlier ISIC datasets)
train_2020_metadata_path = '/kaggle/input/isic-2020-jpg-256x256-resized/train-metadata.csv'
train_2019_metadata_path = '/kaggle/input/isic-2019-jpg-256x256-resized/train-metadata.csv'
train_2018_metadata_path = '/kaggle/input/isic-2018-jpg-256x256-resized/train-metadata.csv'

# Folders holding one <isic_id>.jpg file per image.
train_2020_jpg_image_path = '/kaggle/input/isic-2020-jpg-256x256-resized/train-image/image'
train_2019_jpg_image_path = '/kaggle/input/isic-2019-jpg-256x256-resized/train-image/image'
train_2018_jpg_image_path = '/kaggle/input/isic-2018-jpg-256x256-resized/train-image/image'

# Drop the saved index column and patient_id, which are not used for these datasets.
isic_2020_metadata_df = pd.read_csv(train_2020_metadata_path).drop(columns=['Unnamed: 0','patient_id'],inplace=False)
isic_2019_metadata_df = pd.read_csv(train_2019_metadata_path).drop(columns=['Unnamed: 0','patient_id'],inplace=False)
isic_2018_metadata_df = pd.read_csv(train_2018_metadata_path).drop(columns=['Unnamed: 0','patient_id'],inplace=False)

isic_2020_metadata_df

### Removing corrupted ISIC-2018 Images

In [None]:
tqdm.pandas()

import tensorflow as tf

def check_path(p):
    """Return True when the file at path `p` exists."""
    return tf.io.gfile.exists(p)


# Drop incomplete rows BEFORE casting: astype(int) raises if the target column still holds NaN.
isic_2018_metadata_df.dropna(axis=0, inplace=True)
isic_2018_metadata_df['target'] = isic_2018_metadata_df['target'].astype(int) # 0.0 -> 0

print("\nChecking 2018 image files ...")
image_files_2018 = train_2018_jpg_image_path + '/' + isic_2018_metadata_df['isic_id'] + '.jpg'
isic_2018_metadata_df['exists'] = image_files_2018.progress_apply(check_path)
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(isic_2018_metadata_df['exists'].value_counts())

# Keep only the rows whose image file is present, then drop the helper column and renumber the rows.
isic_2018_metadata_df = (
    isic_2018_metadata_df[isic_2018_metadata_df['exists'].astype(bool)]
    .drop(columns=['exists'])
    .reset_index(drop=True)
)
isic_2018_metadata_df

### Keep as many benign samples as malignant ones for 2020-2019-2018

In [None]:
def keep_balanced(df):
    """Return a class-balanced frame: every malignant row plus as many benign rows.

    The benign rows kept are the first ones in `df` order (no random sampling).
    If there are fewer benign than malignant rows, all benign rows are kept.
    The result lists the malignant rows first and has a fresh 0..n-1 index.
    """
    positives = df[df['target'] == 1]
    negatives = df[df['target'] == 0].head(len(positives))
    balanced = pd.concat([positives, negatives])
    return balanced.reset_index(drop=True)
    

In [None]:
isic_2020_metadata_df = keep_balanced(isic_2020_metadata_df)
isic_2019_metadata_df = keep_balanced(isic_2019_metadata_df)
isic_2018_metadata_df = keep_balanced(isic_2018_metadata_df)

In [None]:
print(isic_2020_metadata_df['target'].value_counts())
print(isic_2019_metadata_df['target'].value_counts())
print(isic_2018_metadata_df['target'].value_counts())

### Augmentation pipeline specific to 2018-2019-2020

In [None]:
# Pipeline for the 2018-2020 JPEG images. The first group of transforms degrades the images
# (darker, lower contrast, blur, noise, distortion) so they look closer to the 2024 images;
# the second group is the same geometric/colour augmentation used for 2024, except that the
# brightness/contrast step is the darkening-only one from the first group.
train_transform_and_augment_for_jpgs = A.Compose([
    A.SmallestMaxSize(max_size=137),  # Rescale so the smallest side is 137 px, keeping the aspect ratio
    
    ## Match 2024 Look 

    A.RandomBrightnessContrast(brightness_limit=(-0.1, 0), contrast_limit=(-0.3, 0), p=0.8),  # Reduce brightness and contrast
    A.GaussianBlur(blur_limit=(1, 3), p=1),
    A.RandomGamma(gamma_limit=(80, 120), p=1),
    A.GaussNoise(var_limit=(10.0, 20.0), p=1),
    A.MotionBlur(blur_limit=(3, 7), p=0.5),  # Introduce motion blur to simulate hand-held camera movement
    A.OpticalDistortion(distort_limit=0.05, shift_limit=0.05, p=0.5),  # Introduce optical distortion

    # Real Augmentation
    
    A.ShiftScaleRotate(shift_limit=0.2, scale_limit=0.01, rotate_limit=30, p=1),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),    
    A.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.3),  # Slight R, G and B shift
    A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=10, val_shift_limit=10, p=0.1),  # Slight changes to hue, saturation, and value (brightness)
    
    
    A.Resize(image_size[0], image_size[1]),  # Resize to the target size
    ToTensorV2(),  # Convert the image to a PyTorch tensor
])

## ISIC 2018-19-20 Image Loader Class

In [None]:
class ISICDataset_jpgs(Dataset):
    """Dataset over a folder of `<isic_id>.jpg` files described by a metadata frame.

    Args:
        df: metadata frame with an 'isic_id' column, plus a 'target' column
            when `has_target` is true.
        image_path: folder containing the JPEG files.
        has_target: whether items are `(image, target)` pairs or bare images.
        transform: optional callable invoked as `transform(image=array)` that
            returns a mapping with an 'image' entry. When it is given, the
            transformed image is additionally divided by 255.
        reference_columns, missing_value: accepted for call-site
            compatibility; not used by this class.
    """

    def __init__(self, df, image_path, has_target=False, transform=None, reference_columns=None, missing_value=-1):
        self.df = df
        # Read the label column only if it exists, so unlabeled frames
        # (has_target=False) can be wrapped without a KeyError.
        self.label = df['target'] if 'target' in df.columns else None
        self.isic_ids = df['isic_id']
        self.transform = transform
        self.img_path = os.path.join(image_path)
        self.has_target = has_target
        if self.has_target:
            self.targets = self.df['target'].values
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        # Positional lookup: a label-based self.isic_ids[index] picks the wrong row
        # (or raises) whenever the frame's index is not exactly 0..n-1.
        isic_id = self.isic_ids.iloc[index]
        img_path = os.path.join(self.img_path, isic_id + '.jpg')  
        image = Image.open(img_path)
        
        if self.transform:
            image = np.array(image)
            transformed = self.transform(image=image)  # Apply transformation
            image = transformed['image']
            image = image / 255 
                
        if self.has_target:
            target = self.targets[index]
            return (image, target) 
        else:
            return image
        
    def get_labels(self):
        """Return the 'target' column of the frame, or None when it has none."""
        return self.label

In [None]:
image_2020_dataset_aug = ISICDataset_jpgs(df = isic_2020_metadata_df, image_path = train_2020_jpg_image_path, has_target = True, transform=train_transform_and_augment_for_jpgs)
image_2019_dataset_aug = ISICDataset_jpgs(df = isic_2019_metadata_df, image_path = train_2019_jpg_image_path, has_target = True, transform=train_transform_and_augment_for_jpgs)
image_2018_dataset_aug = ISICDataset_jpgs(df = isic_2018_metadata_df, image_path = train_2018_jpg_image_path, has_target = True, transform=train_transform_and_augment_for_jpgs)

## Image Display and Comparison

In [None]:
seed = 42

display_random_grid_images(malignant_dataset_model_1[1],      n=3, seed=seed)
display_random_grid_images(image_2020_dataset_aug, n=3, seed=seed)
display_random_grid_images(image_2019_dataset_aug, n=3, seed=seed)
display_random_grid_images(image_2018_dataset_aug, n=3, seed=seed)

In [None]:
print(f'Number of 2020 samples in the augmented dataset : {len(image_2020_dataset_aug.isic_ids)}')
print(f'Number of 2019 samples in the augmented dataset : {len(image_2019_dataset_aug.isic_ids)}')
print(f'Number of 2018 samples in the augmented dataset : {len(image_2018_dataset_aug.isic_ids)}')

## DataFrames Concatenation

I created a class that, in addition to concatenating the datasets, exposes a `targets` attribute holding the labels of all concatenated samples.

In [None]:
class ConcatDatasetWithMetadataAndTarget(ConcatDataset):
    """ConcatDataset that also exposes the labels of all its samples as `targets`.

    Args:
        datasets: iterable of datasets whose items are `(image, target)` pairs
            and which each provide a `targets` array. Instances of this class
            qualify, so concatenations can be nested.
    """

    def __init__(self, datasets):
        super().__init__(datasets) 
        # The parent stored list(datasets) in self.datasets; iterate that copy so a
        # one-shot iterable (e.g. a generator) is not walked a second time while empty.
        self.targets = np.concatenate([dataset.targets for dataset in self.datasets])
    
    def __getitem__(self, idx):
        image, _ = super().__getitem__(idx)
        # The label is served from the concatenated array rather than from the child item.
        return image, self.targets[idx]

In [None]:
train_concat_previous_years_dataset = ConcatDatasetWithMetadataAndTarget([image_2020_dataset_aug, image_2019_dataset_aug, image_2018_dataset_aug])

In [None]:
# Trying without previous years
# (earlier variant kept for reference: 2024 data combined with the 2018-2020 images)
# train_2024_model_1 = ConcatDatasetWithMetadataAndTarget([*benign_dataset_model_1, *malignant_dataset_model_1, train_concat_previous_years_dataset])
# train_2024_model_2 = ConcatDatasetWithMetadataAndTarget([*benign_dataset_model_2, *malignant_dataset_model_2, train_concat_previous_years_dataset])
# train_2024_model_3 = ConcatDatasetWithMetadataAndTarget([*benign_dataset_model_3, *malignant_dataset_model_3, train_concat_previous_years_dataset])

# Three wrappers around the same 2018-2020 concatenation, one per model.
train_previous_years_1 = ConcatDatasetWithMetadataAndTarget([train_concat_previous_years_dataset])
train_previous_years_2 = ConcatDatasetWithMetadataAndTarget([train_concat_previous_years_dataset])
train_previous_years_3 = ConcatDatasetWithMetadataAndTarget([train_concat_previous_years_dataset])

# 2024-only training sets: one benign subset plus the four malignant loaders (1 plain + 3 augmented) per model.
train_2024_model_1 = ConcatDatasetWithMetadataAndTarget([benign_dataset_model_1, *malignant_dataset_model_1])
train_2024_model_2 = ConcatDatasetWithMetadataAndTarget([benign_dataset_model_2, *malignant_dataset_model_2])
train_2024_model_3 = ConcatDatasetWithMetadataAndTarget([benign_dataset_model_3, *malignant_dataset_model_3])


In [None]:
#train_final_dataset = ConcatDatasetWithMetadataAndTarget([train_concat_2024_dataset, train_concat_previous_years_dataset])
#train_final_dataset = train_concat_2024_dataset

In [None]:
print('Total number of images :', len(train_2024_model_1))

# <b> 5 <span style='color:#F1A424'>|</span> CNN Training <span style='color:#F1A424'>|</span></b>

## Creating Training Functions and Classes

### Definition of Generalized Mean Pooling (GeM)

$\text{GeM}(x) = \left( \frac{1}{H \times W} \sum_{i=1}^{H} \sum_{j=1}^{W} (\max(x(i,j), \epsilon))^p \right)^{\frac{1}{p}}$

where $x$ has shape $(N, C, H, W)$ and the pooling is applied independently to each sample and channel.

We recognize a normalized $p$-norm: $\text{GeM}(x) = (H \times W)^{-1/p} \, \lVert \max(x, \epsilon) \rVert_p$

This process allows the GeM layer to interpolate between average pooling (when $p=1$) and max pooling (as $p \to \infty$). For $p > 1$, it emphasizes larger values in the feature map more than average pooling does.

In [None]:
class GeM(nn.Module):
    """Generalized-mean pooling over the last two axes with a learnable exponent.

    Args:
        p: initial value of the exponent (stored as a trainable parameter).
        eps: lower bound the input is clamped to before being raised to `p`.
    """

    def __init__(self, p=3, eps=1e-6):
        super().__init__()
        self.p = nn.Parameter(torch.ones(1) * p)
        self.eps = eps  # clamp floor applied to the input before the power

    def gem(self, x, p=3, eps=1e-6):
        """Clamp `x` at `eps`, raise to `p`, average over the full window, take the p-th root."""
        window = (x.size(-2), x.size(-1))
        powered = x.clamp(min=eps).pow(p)
        return F.avg_pool2d(powered, window).pow(1/p)

    def forward(self, x):
        # Pool with the current learnable exponent and the configured floor.
        return self.gem(x, p=self.p, eps=self.eps)

    def __repr__(self):
        # Renders e.g. GeM(p=3.0000, eps=1e-06) using the current parameter value.
        current_p = self.p.data.tolist()[0]
        return f"{self.__class__.__name__}(p={current_p:.4f}, eps={self.eps})"

In [None]:
class ISICModel(nn.Module):
    """timm backbone followed by GeM pooling, a linear head and a sigmoid.

    Args:
        model_name: architecture name passed to `timm.create_model`.
        num_metadata_features: accepted but not used by this class.
        num_classes: number of outputs of the linear head.
        checkpoint_path: optional weights file forwarded to `timm.create_model`.

    The two dropout layers are created but not applied in `forward`.
    """

    def __init__(self, model_name, num_metadata_features, num_classes=1, checkpoint_path=None):
        super().__init__()
        backbone = timm.create_model(model_name, checkpoint_path=checkpoint_path)
        head_width = backbone.classifier.in_features
        # Remove the backbone's own pooling and classifier so it returns feature maps.
        backbone.classifier = nn.Identity()
        backbone.global_pool = nn.Identity()
        self.model = backbone
        self.in_features = head_width

        self.pooling = GeM()
        self.agressive_dropout  = nn.Dropout(0.5)
        self.soft_dropout       = nn.Dropout(0.2)
        self.linear             = nn.Linear(head_width, num_classes)
        self.sigmoid            = nn.Sigmoid()

    def forward(self, images):
        feature_maps = self.model(images)
        pooled = self.pooling(feature_maps).flatten(1)
        logits = self.linear(pooled)
        return self.sigmoid(logits)

# NOTE(review): the architecture name is 'efficientnet_b0' while the checkpoint file is a
# 'tf_efficientnet_b0' export — confirm the weights are meant for this timm variant.
model_name = 'efficientnet_b0'
# Passed to ISICModel for signature compatibility; the model does not use it.
num_metadata_features = 65
checkpoint_path = '/kaggle/input/tf-efficientnet/pytorch/tf-efficientnet-b0/1/tf_efficientnet_b0_aa-827b6e33.pth'

# Three identically initialised models, one per benign subset, each moved to the selected device.
cnn_model_1 = ISICModel(model_name, num_metadata_features, checkpoint_path=checkpoint_path)
cnn_model_2 = ISICModel(model_name, num_metadata_features, checkpoint_path=checkpoint_path)
cnn_model_3 = ISICModel(model_name, num_metadata_features, checkpoint_path=checkpoint_path)
cnn_model_1 = cnn_model_1.to(device)
cnn_model_2 = cnn_model_2.to(device)
cnn_model_3 = cnn_model_3.to(device)

### Weighted Loss function

In [None]:
def criterion(outputs, targets):
    """Class-weighted binary cross-entropy on predicted probabilities.

    Every sample's loss term is multiplied by its own weight — 1 for a
    negative (target 0) and 5 for a positive (target 1) — and the weighted
    terms are averaged over the batch. (Multiplying the plain mean loss by the
    sum of the weights would only rescale the loss by a batch-dependent factor
    without making positives count more than negatives.)

    Args:
        outputs: probabilities in [0, 1], any shape with N elements.
        targets: 0/1 labels with N elements.

    Returns:
        Scalar tensor holding the weighted mean loss.
    """
    outputs = outputs.reshape(-1, 1)  # Ensure outputs have shape (N, 1)
    targets = targets.reshape(-1, 1).to(outputs.dtype)  # Same shape and dtype as outputs
    weights = 1 + 4 * targets
    return F.binary_cross_entropy(outputs, targets, weight=weights)

## Building the Training functions

In [None]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train `model` for one pass over `dataloader` and report running metrics.

    Args:
        model: network returning one probability per image.
        optimizer: optimizer updating the model parameters.
        scheduler: optional LR scheduler whose `step` accepts a metric; may be None.
        dataloader: yields `(images, targets)` batches.
        device: device the batches are moved to.
        epoch: epoch number, used for the progress bar only.

    Returns:
        (epoch_loss, epoch_auroc, epoch_acc, epoch_recall): batch-size-weighted
        means of the per-batch loss, partial AUROC (max_fpr=0.8), accuracy and
        recall at a 0.5 threshold. All four are 0.0 if the loader yields nothing.
    """
    model.train()
    torch.cuda.empty_cache()
    gc.collect()

    dataset_size = 0
    running_loss = 0.0
    running_auroc = 0.0
    running_corrects = 0.0
    running_recall = 0.0
    # Defined up front so an empty dataloader returns zeros instead of raising.
    epoch_loss = epoch_auroc = epoch_acc = epoch_recall = 0.0

    bar = tqdm(dataloader, total=len(dataloader))
    for images, targets in bar:
        images = images.to(device, dtype=torch.float)
        targets = targets.to(device, dtype=torch.float).unsqueeze(1)

        batch_size = images.size(0)

        outputs = model(images).squeeze()

        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Flatten both to 1-D: squeeze() turns a batch of one into a 0-d value,
        # which the sklearn metrics reject.
        outputs_cpu = outputs.detach().cpu().numpy().reshape(-1)
        targets_cpu = targets.cpu().numpy().reshape(-1)

        try:
            auroc = float(roc_auc_score(targets_cpu, outputs_cpu, max_fpr=0.8))
        except ValueError:
            # Undefined for this batch (e.g. only one class present).
            auroc = 0.0
        if np.isnan(auroc):
            # Keep an undefined score from poisoning the running mean.
            auroc = 0.0

        preds = (outputs_cpu > 0.5).astype(int)
        accuracy = accuracy_score(targets_cpu, preds)
        recall = recall_score(targets_cpu, preds)

        running_loss += (loss.item() * batch_size)
        running_auroc += (auroc * batch_size)
        running_corrects += (accuracy * batch_size)
        running_recall += (recall * batch_size)
        dataset_size += batch_size

        epoch_loss   = running_loss     / dataset_size
        epoch_auroc  = running_auroc    / dataset_size
        epoch_acc    = running_corrects / dataset_size
        epoch_recall = running_recall   / dataset_size

        # NOTE(review): the scheduler is stepped after every batch with the running
        # mean loss. Metric-driven schedulers are usually stepped once per epoch —
        # confirm this cadence matches the scheduler's patience setting.
        if scheduler is not None:
            scheduler.step(epoch_loss)

        # Accuracy and recall are fractions; the '%' format spec scales them to percent.
        bar.set_postfix(Epoch=epoch, Train_Acc=f"{epoch_acc:.2%}", Train_Recall=f"{epoch_recall:.2%}", Train_Loss=epoch_loss, Train_Auroc=epoch_auroc,
                        LR=optimizer.param_groups[0]['lr'])
    gc.collect()

    return epoch_loss, epoch_auroc, epoch_acc, epoch_recall


In [None]:
# Disabling gradient computation and saving memory
@torch.inference_mode()
def valid_one_epoch(model, dataloader, device, epoch, optimizer):
    """Evaluate `model` over `dataloader` without gradients and report running metrics.

    Args:
        model: network returning one probability per image.
        dataloader: yields `(images, targets)` batches.
        device: device the batches are moved to.
        epoch: epoch number, used for the progress bar only.
        optimizer: read only to display the current learning rate.

    Returns:
        (epoch_loss, epoch_auroc, epoch_acc, epoch_recall): batch-size-weighted
        means of the per-batch loss, partial AUROC (max_fpr=0.8), accuracy and
        recall at a 0.5 threshold. All four are 0.0 if the loader yields nothing.
    """
    model.eval()

    dataset_size = 0
    running_loss = 0.0
    running_auroc = 0.0
    running_corrects = 0.0
    running_recall = 0.0
    # Defined up front so an empty dataloader returns zeros instead of raising.
    epoch_loss = epoch_auroc = epoch_acc = epoch_recall = 0.0

    bar = tqdm(dataloader, total=len(dataloader))
    for images, targets in bar:
        images = images.to(device, dtype=torch.float)
        targets = targets.to(device, dtype=torch.float).unsqueeze(1)

        batch_size = images.size(0)

        outputs = model(images).squeeze()

        loss = criterion(outputs, targets)

        # Flatten both to 1-D: squeeze() turns a batch of one into a 0-d value,
        # which the sklearn metrics reject.
        outputs_cpu = outputs.detach().cpu().numpy().reshape(-1)
        targets_cpu = targets.cpu().numpy().reshape(-1)

        try:
            auroc = float(roc_auc_score(targets_cpu, outputs_cpu, max_fpr=0.8))
        except ValueError:
            # Undefined for this batch (e.g. only one class present).
            auroc = 0.0
        if np.isnan(auroc):
            # Keep an undefined score from poisoning the running mean.
            auroc = 0.0

        preds = (outputs_cpu > 0.5).astype(int)

        accuracy = accuracy_score(targets_cpu, preds)
        recall = recall_score(targets_cpu, preds)

        running_loss += (loss.item() * batch_size)
        running_auroc += (auroc * batch_size)
        running_corrects += (accuracy * batch_size)
        running_recall += (recall * batch_size)
        dataset_size += batch_size

        epoch_loss   = running_loss     / dataset_size
        epoch_auroc  = running_auroc    / dataset_size
        epoch_acc    = running_corrects / dataset_size
        epoch_recall = running_recall   / dataset_size

        # Accuracy and recall are fractions; the '%' format spec scales them to percent.
        bar.set_postfix(Epoch=epoch, Valid_Acc=f"{epoch_acc:.2%}", Valid_Recall=f"{epoch_recall:.2%}", Valid_Loss=epoch_loss, Valid_Auroc=epoch_auroc,
                        LR=optimizer.param_groups[0]['lr'])

    gc.collect()

    return epoch_loss, epoch_auroc, epoch_acc, epoch_recall

In [None]:
import time

def run_training(model, optimizer, scheduler, device, num_epochs, train_loader, valid_loader):
    """Train ``model`` for ``num_epochs`` epochs, validating and checkpointing after each.

    Args:
        model: Network to train.
        optimizer: Optimizer passed to ``train_one_epoch`` and read for the LR.
        scheduler: LR scheduler passed to ``train_one_epoch``.
        device: Device passed to the per-epoch functions.
        num_epochs: Number of epochs to run.
        train_loader: Training dataloader.
        valid_loader: Validation dataloader.

    Returns:
        Tuple ``(model, history)`` where ``history`` maps metric names
        ('Train Loss', 'Valid AUROC', 'lr', ...) to one value per epoch.

    Note:
        The "only save when AUROC improves" condition is deliberately disabled,
        so every epoch is checkpointed and the weights loaded at the end (and
        the reported "Best AUROC") are those of the last epoch.
    """
    # Confirm that it is running on GPU
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    start = time.time()

    # Deep copy of the initial weights; replaced after every epoch below
    best_model_wts = copy.deepcopy(model.state_dict())

    # Last recorded validation AUROC (starts at 0)
    best_epoch_auroc = 0

    # defaultdict(list) lets us append to a metric before its key exists
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()

        # Train for one epoch
        train_epoch_loss, train_epoch_auroc, train_epoch_acc, train_epoch_recall = train_one_epoch(
            model, optimizer, scheduler,
            dataloader=train_loader,
            device=device, epoch=epoch)
        # Validate for one epoch
        val_epoch_loss, val_epoch_auroc, val_epoch_acc, val_epoch_recall = valid_one_epoch(
            model, valid_loader, optimizer=optimizer, device=device,
            epoch=epoch)

        # Save the loss and scores; each key receives its own metric
        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)
        history['Train AUROC'].append(train_epoch_auroc)
        history['Valid AUROC'].append(val_epoch_auroc)
        history['Train Acc'].append(train_epoch_acc)
        history['Valid Acc'].append(val_epoch_acc)
        history['Train Recall'].append(train_epoch_recall)
        history['Valid Recall'].append(val_epoch_recall)
        history['lr'].append(optimizer.param_groups[0]['lr'])

        # Save the model if it's getting better results
        # if best_epoch_auroc <= val_epoch_auroc:
        if True : # improvement criterion intentionally disabled: checkpoint every epoch
            print(f"{best_epoch_auroc} Validation AUROC Improved ({best_epoch_auroc} ---> {val_epoch_auroc})")

            # Updates the tracked AUROC
            best_epoch_auroc = val_epoch_auroc

            # Deepcopy the weights
            best_model_wts = copy.deepcopy(model.state_dict())

            # Saves the weights in the working directory
            PATH = "/kaggle/working/AUROC{:.4f}_Loss{:.4f}_epoch{:.0f}.bin".format(val_epoch_auroc, val_epoch_loss, epoch)
            torch.save(model.state_dict(), PATH)

            print(f"Model Saved")

        print()

    end = time.time()

    # Display the training time
    time_elapsed = end - start # in seconds
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600 , (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best AUROC: {:.4f}".format(best_epoch_auroc))

    # Load the retained weights back into the model
    model.load_state_dict(best_model_wts)

    return model, history

## Cosine Annealing Learning Rate Formula

The learning rate $\eta_t$ at time $t$ is computed as:

$$
\eta_t = \eta_{\min} + \frac{1}{2} (\eta_{\max} - \eta_{\min}) \left(1 + \cos\left(\frac{T_{cur}}{T_{max}} \pi\right)\right)
$$

Where:
- $\eta_t$ is the learning rate at epoch $t$.
- $\eta_{\min}$ is the minimum learning rate (`eta_min`).
- $\eta_{\max}$ is the initial (maximum) learning rate (starting learning rate of the optimizer).
- $T_{cur}$ is the current number of iterations (epochs) completed.
- $T_{max}$ is the maximum number of iterations (epochs) for one cycle.


In [None]:
def init_opti_and_scheduler(model, lr=1e-4, weight_decay=1e-6, t_max=1400, eta_min=1e-6):
    """Create an Adam optimizer and a cosine-annealing LR scheduler for ``model``.

    The defaults reproduce the values previously hard-coded here.

    Args:
        model: Module whose parameters are optimized.
        lr: Initial (maximum) learning rate.
        weight_decay: Weight decay passed to Adam.
        t_max: ``T_max`` of ``CosineAnnealingLR``; the original note says this is
            2 times the number of steps.
        eta_min: Minimum learning rate reached by the schedule.

    Returns:
        Tuple ``(optimizer, scheduler)``.
    """
    optimizer = Adam(model.parameters(),
                     lr=lr,
                     weight_decay=weight_decay)

    scheduler = lr_scheduler.CosineAnnealingLR(
        optimizer,
        T_max=t_max,
        eta_min=eta_min)

    return optimizer, scheduler

# optimizer_temp, scheduler_temp = init_opti_and_scheduler(cnn_temp_model)

## Dataset Splitting

In [None]:
# Batch size shared by every DataLoader in the notebook.
batch_size = 64

# One shuffled training loader per CNN, built in order from its own dataset.
train_loader_model_1, train_loader_model_2, train_loader_model_3 = (
    DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    for dataset in (train_2024_model_1, train_2024_model_2, train_2024_model_3)
)

In [None]:
# Feature-engineer a copy of the held-out frame once; both datasets share it.
valid_engineered, _ = feature_engineering(untouchable_2024_df.copy())

# Dataset used for the CNN validation loop, built with subset=(0, 10_000).
cnn_validation_dataset = ImageLoaderWithMetadata(
    df=valid_engineered,
    file_hdf=train_image_path,
    transform=train_transform_no_augment,
    has_target=True,
    subset=(0, 10_000),
)
# Same frame without the subset argument.
validation_dataset = ImageLoaderWithMetadata(
    df=valid_engineered,
    file_hdf=train_image_path,
    transform=train_transform_no_augment,
    has_target=True,
)

cnn_validation_loader = DataLoader(
    cnn_validation_dataset, batch_size=batch_size, shuffle=True, num_workers=4
)
validation_loader = DataLoader(
    validation_dataset, batch_size=batch_size, shuffle=True, num_workers=4
)

## Launch Training

In [None]:
# Each CNN gets its own optimizer/scheduler pair, created in model order.
(
    (optimizer_1, scheduler_1),
    (optimizer_2, scheduler_2),
    (optimizer_3, scheduler_3),
) = map(init_opti_and_scheduler, (cnn_model_1, cnn_model_2, cnn_model_3))

In [None]:
# Train the three CNNs one after another: 5 epochs each, all validated on the
# same CNN validation loader.
print('Training Model 1 ...')
cnn_model_model_1, history_1 = run_training(
    model=cnn_model_1, optimizer=optimizer_1, scheduler=scheduler_1,
    device=device, num_epochs=5,
    train_loader=train_loader_model_1, valid_loader=cnn_validation_loader,
)

print('Training Model 2 ...')
cnn_model_model_2, history_2 = run_training(
    model=cnn_model_2, optimizer=optimizer_2, scheduler=scheduler_2,
    device=device, num_epochs=5,
    train_loader=train_loader_model_2, valid_loader=cnn_validation_loader,
)

print('Training Model 3 ...')
cnn_model_model_3, history_3 = run_training(
    model=cnn_model_3, optimizer=optimizer_3, scheduler=scheduler_3,
    device=device, num_epochs=5,
    train_loader=train_loader_model_3, valid_loader=cnn_validation_loader,
)

## CNN Training Results Visualization

In [None]:
# Only the third model's history is tabulated and written to disk.
history = pd.DataFrame(history_3)
history.to_csv(path_or_buf='history.csv', index=False)

In [None]:
# Accuracy curves (training vs. validation), one point per recorded epoch.
plt.plot(range(len(history)), history["Train Acc"].to_numpy(), label="Train Acc")
plt.plot(range(len(history)), history["Valid Acc"].to_numpy(), label="Valid Acc")
plt.xlim(0, len(history))
plt.ylim(0, 1)
plt.xlabel("epochs")
plt.ylabel("Accuracy")
plt.grid()
plt.legend()
plt.show()

In [None]:
# Loss curves (training vs. validation), one point per recorded epoch.
plt.plot(range(history.shape[0]), history["Train Loss"].values, label="Train Loss")
plt.plot(range(history.shape[0]), history["Valid Loss"].values, label="Valid Loss")
plt.xlabel("epochs")
plt.ylabel("Loss")
plt.xlim(0, history.shape[0])
# Scale the y-axis to the larger of the two curves so that a validation loss
# above the training loss is not clipped out of the figure.
plt.ylim(0, np.max(history[["Train Loss", "Valid Loss"]].values))
plt.grid()
plt.legend()
plt.show()

In [None]:
# Recall curves (training vs. validation), one point per recorded epoch.
plt.plot(range(len(history)), history["Train Recall"].to_numpy(), label="Train Recall")
plt.plot(range(len(history)), history["Valid Recall"].to_numpy(), label="Valid Recall")
plt.xlim(0, len(history))
plt.ylim(0, 1.0)
plt.xlabel("epochs")
plt.ylabel("Recall")
plt.grid()
plt.legend()
plt.show()

In [None]:
# AUROC curves (training vs. validation), one point per recorded epoch.
plt.plot(range(len(history)), history["Train AUROC"].to_numpy(), label="Train AUROC")
plt.plot(range(len(history)), history["Valid AUROC"].to_numpy(), label="Valid AUROC")
plt.xlim(0, len(history))
plt.ylim(0, 1.0)
plt.xlabel("epochs")
plt.ylabel("AUROC")
plt.grid()
plt.legend()
plt.show()

In [None]:
# Learning rate recorded at the end of each epoch.
plt.plot(range(len(history)), history["lr"].to_numpy(), label="lr")
plt.ylabel("lr")
plt.xlabel("epochs")
plt.grid()
plt.legend()
plt.show()

## Displaying Mistaken predictions

In [None]:
def get_mistaken_predictions(model, dataloader, device):
    """Collect the images that ``model`` misclassifies at a 0.5 threshold.

    Args:
        model: Binary classifier returning one score per image.
        dataloader: Iterable yielding ``(images, targets)`` batches.
        device: Device the batches are moved to before inference.

    Returns:
        Dict with keys ``'false_positives'`` and ``'false_negatives'``; each
        value is a list of ``(image, target)`` CPU tensors in dataloader order.
    """
    model.eval()
    mistaken_images = {'false_positives': [], 'false_negatives': []}

    with torch.inference_mode():
        for images, targets in dataloader:
            images = images.to(device, dtype=torch.float)
            targets = targets.to(device, dtype=torch.float)

            # reshape(-1) keeps a 1-D vector even for a batch of one sample,
            # where squeeze() would return a 0-d tensor with no len().
            preds = (model(images).reshape(-1) > 0.5).int()
            flat_targets = targets.reshape(-1)

            # Vectorized masks: one device sync per category instead of
            # several per sample.
            masks = (
                ('false_positives', (preds == 1) & (flat_targets == 0)),
                ('false_negatives', (preds == 0) & (flat_targets == 1)),
            )
            for key, mask in masks:
                for i in torch.nonzero(mask).flatten().tolist():
                    mistaken_images[key].append((images[i].cpu(), targets[i].cpu()))

    return mistaken_images


In [None]:
def display_mistaken_images(mistaken_images, category='false_positives', n=5):
    """Show up to ``n`` misclassified images of one category in a single row.

    Args:
        mistaken_images: Dict as returned by ``get_mistaken_predictions``.
        category: ``'false_positives'`` or any other key, which is titled as
            false negatives.
        n: Number of columns in the figure; at most ``n`` images are drawn.
    """
    images_to_display = mistaken_images[category][:n]

    # squeeze=False always returns a 2-D axes array, so n == 1 works too.
    fig, axs = plt.subplots(1, n, figsize=(20, 5), squeeze=False)
    axes = axs.ravel()

    # Hide every frame first so panels without an image stay blank.
    for ax in axes:
        ax.axis('off')

    if category == 'false_positives':
        title = 'Target: Benign \n Prediction : Malignant'
    else:
        title = 'Target: Malignant \n Prediction : Benign'

    for ax, (img, target) in zip(axes, images_to_display):
        if isinstance(img, torch.Tensor):
            # Channel-first tensor -> channel-last array expected by imshow
            img = img.permute(1, 2, 0).numpy()
        ax.imshow(img)
        ax.set_title(title, fontsize=16)

    plt.tight_layout()
    plt.show()

In [None]:
# Inspect the first model's errors on the CNN validation loader.
mistaken_images = get_mistaken_predictions(
    model=cnn_model_1, dataloader=cnn_validation_loader, device=device
)
print("Displaying false positives:")
display_mistaken_images(mistaken_images, category='false_positives', n=5)
print("Displaying false negatives:")
display_mistaken_images(mistaken_images, category='false_negatives', n=5)

# <b>6 <span style='color:#F1A424'>|</span> Metadata Gradient Boosting <span style='color:#F1A424'>|</span></b>

## AUC-ROC

In [None]:
def comp_score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str, min_tpr: float=0.80):
    """Return the partial AUC above ``min_tpr`` for ``submission`` against ``solution``.

    Labels and scores are both flipped (``|y - 1|`` and ``1 - p``) so that the
    region TPR >= ``min_tpr`` becomes the region FPR <= ``1 - min_tpr``, which
    is what ``roc_auc_score(max_fpr=...)`` measures. The standardized value it
    returns is then mapped back to an un-normalized area.

    ``row_id_column_name`` is accepted but not used.
    """
    flipped_truth = np.abs(np.asarray(solution.values) - 1)
    flipped_scores = 1.0 - np.asarray(submission.values)

    max_fpr = abs(1 - min_tpr)
    scaled_auc = roc_auc_score(flipped_truth, flipped_scores, max_fpr=max_fpr)

    # Undo the standardization: rescale from [0.5, 1] to [min_area, max_area]
    min_area = 0.5 * max_fpr**2
    return min_area + (max_fpr - min_area) / (1.0 - 0.5) * (scaled_auc - 0.5)

## CNN Predictions

In [None]:
@torch.inference_mode()
def GradBoost_predictions(model, dataloader, device):
    """Return the model scores for every sample of ``dataloader`` as a flat list.

    Args:
        model: Network returning one score per image (assumed; outputs are
            flattened, so a multi-column output would be interleaved).
        dataloader: Iterable yielding ``(images, target)`` batches; the target
            is ignored.
        device: Device the images are moved to.

    Returns:
        List of Python floats in the order the dataloader yields the samples.
    """
    model.eval()
    torch.cuda.empty_cache()
    predictions_list = []

    for images, _target in tqdm(dataloader, desc='Loading predictions...', total=len(dataloader)):
        images = images.to(device, dtype=torch.float)
        # reshape(-1) always yields a 1-D tensor; squeeze() collapsed a batch
        # of one sample to a scalar, which cannot be passed to list.extend().
        outputs = model(images).reshape(-1)
        predictions_list.extend(outputs.tolist())
    return predictions_list

In [None]:
# Re-read the raw training metadata from disk.
isic_2024_metadata_df = pd.read_csv(train_metadata_path)
# The boosting stage works on `untouchable_2024_df` (the same frame the CNN
# validation datasets are built from), not on the full metadata.
Grad_boost_full_df = untouchable_2024_df
# Grad_boost_full_df = isic_2024_metadata_df
# Feature-engineer a copy so the source frame is left as it is.
Grad_boost_full_df_engineered, _ = feature_engineering(Grad_boost_full_df.copy())

In [None]:
# Image dataset over the engineered frame, using the no-augmentation transform.
full_Grad_boost_dataset = ImageLoaderWithMetadata(df=Grad_boost_full_df_engineered, file_hdf=train_image_path, transform=train_transform_no_augment)
# Show the number of samples.
len(full_Grad_boost_dataset)

In [None]:
# Keep Full DataSet ??
# shuffle must stay False: the predictions collected from this loader are
# later paired positionally with `full_Grad_boost_dataset.isic_ids`, and the
# loader is iterated once per CNN. Shuffling would put every model's scores in
# a different random order and attach them to the wrong lesions.
GradBoost_Loader = DataLoader(full_Grad_boost_dataset, shuffle=False, batch_size=batch_size, num_workers=4)

In [None]:
# Score the same loader with each of the three CNNs; the resulting lists are
# used as extra features for the gradient-boosting models.
cnn_preds_1 = GradBoost_predictions(cnn_model_1, GradBoost_Loader, device)
cnn_preds_2 = GradBoost_predictions(cnn_model_2, GradBoost_Loader, device)
cnn_preds_3 = GradBoost_predictions(cnn_model_3, GradBoost_Loader, device)

In [None]:
# One row per image: the dataset's ids next to the three CNN scores. The
# columns are paired by position, so the prediction lists must be in the same
# order as `full_Grad_boost_dataset.isic_ids`.
cnn_predictions_df = pd.DataFrame({
    'isic_id': full_Grad_boost_dataset.isic_ids,
    'prediction_score_model_1': cnn_preds_1,
    'prediction_score_model_2': cnn_preds_2,
    'prediction_score_model_3': cnn_preds_3,
})

In [None]:
# Attach the CNN scores to the metadata of `untouchable_2024_df`, matching
# rows on `isic_id`.
# merged_df = pd.merge(isic_2024_metadata_df, cnn_predictions_df, on='isic_id')
merged_df = pd.merge(untouchable_2024_df, cnn_predictions_df, on='isic_id')
# Display the merged DataFrame
merged_df

In [None]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
# Feature-engineer a copy of the merged frame; the helper also returns the
# names of the numeric columns it created.
df_train, new_num_cols = feature_engineering(merged_df.copy())
num_cols = features_num + new_num_cols
# Model inputs: numeric features, categorical features and the three CNN scores.
train_cols = num_cols + features_cat + ['prediction_score_model_1','prediction_score_model_2','prediction_score_model_3']
# train_cols = num_cols + features_cat

In [None]:
def engineer_predictions(df):
    """Add pairwise ratios of the three CNN score columns to ``df``.

    The divisions are done in float32 on the module-level ``device``. The
    columns ``ratio_1_over_2``, ``ratio_2_over_3`` and ``ratio_3_over_1`` are
    written into ``df`` in place, and the same frame is returned.
    """
    def _column_on_device(name):
        # DataFrame column -> float32 tensor on the compute device
        return torch.tensor(df[name].values, device=device, dtype=torch.float32)

    score_1 = _column_on_device('prediction_score_model_1')
    score_2 = _column_on_device('prediction_score_model_2')
    score_3 = _column_on_device('prediction_score_model_3')

    ratios = (
        ('ratio_1_over_2', score_1 / score_2),
        ('ratio_2_over_3', score_2 / score_3),
        ('ratio_3_over_1', score_3 / score_1),
    )

    # Bring each ratio back to host memory and store it as a new column
    for column_name, ratio in ratios:
        df[column_name] = ratio.cpu().numpy()

    return df

In [None]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold

# Ordinal encoder for the categorical features. It is fitted on the training
# frame below and kept in `category_encoder`.
category_encoder = OrdinalEncoder(
    categories='auto', # The encoder will automatically determine the categories for each feature.
    dtype=int, # output them as integers
    handle_unknown='use_encoded_value', # The encoder will use a specified integer value for these unknown categories.
    unknown_value=-2, # which is -2 for unknown values
    encoded_missing_value=-1, # and -1 for encoded missing value
)

# Fit on the training categories, then overwrite each categorical column with
# its integer codes.
X_cat = category_encoder.fit_transform(df_train[features_cat])
for c, cat_col in enumerate(features_cat):
    df_train[cat_col] = X_cat[:, c]

# Turn +/-inf into NaN, then replace every NaN with 0.
df_train = df_train.replace([np.inf, -np.inf], np.nan).fillna(0)

# Standard scaling of every column in `train_cols` (this list also contains
# the encoded categorical columns and the CNN scores). The fitted scaler is
# kept in `scaler`.
scaler = StandardScaler()
df_train[train_cols] = scaler.fit_transform(df_train[train_cols])

# train_cols += ['ratio_1_over_2','ratio_2_over_3','ratio_3_over_1']

# df_train = engineer_predictions(df_train)

In [None]:
# Summary statistics of the scaled model inputs.
df_train[train_cols].describe()

In [None]:
# Preview the model input columns.
df_train[train_cols]

In [None]:
# Model inputs split by label: target 1 is malignant, target 0 is benign.
df_train_malignant = df_train.loc[df_train['target'] == 1, train_cols]
df_train_benign = df_train.loc[df_train['target'] == 0, train_cols]

In [None]:
# Number of folds actually trained on: half of the 10 GroupKFold splits
# created below (original note: 10*0.5 to prevent overfitting).
number_of_fold_trainings = 5 # 10*0.5 to prevent overfitting

In [None]:
gkf = GroupKFold(n_splits=10)

df_train["fold"] = -1 # Placeholder until every row receives its fold below
# gkf.split generates the cross-validation splits.
# groups=df_train["patient_id"] keeps all rows of a patient in the same fold,
# so a patient never appears in both the training and the validation part.
# train_idx / val_idx are arrays of row POSITIONS, so they are written with
# .iloc: this stays correct even if df_train's index is not 0..n-1.
for idx, (train_idx, val_idx) in enumerate(gkf.split(df_train, df_train["target"], groups=df_train["patient_id"])):
    df_train.iloc[val_idx, df_train.columns.get_loc("fold")] = idx # Assign a fold number from 0 to 9
    print(f"Fold {idx}: {len(val_idx)} validation samples")

## XGBoost

In [None]:
# Gamma : Specifies the minimum loss reduction required to make a split.
# Number of leaves 
# Lambda L2
# Bagging frequency
# min child samples 

In [None]:
%%time

import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
torch.cuda.empty_cache()

best_params_xgb = {
    'objective': 'binary:logistic', # Binary classification
    'eval_metric': 'logloss', # Loss function
    'colsample_bytree': 0.8684, 
    'lambda': 20, 
    'learning_rate': 0.6098, 
    'max_depth': 10, 
    'n_estimators': 2727, 
    'reg_alpha': 5.5721, 
    'reg_lambda': 25.9040, 
    'subsample': 1.0,
    'nthread': 4,
    'random_state': 42,
    'tree_method': 'gpu_hist',
    'verbosity': 0 # Silent mode
    
}

xgb_scores = []
xgb_models = []

for fold in range(0,number_of_fold_trainings):
    # If fold = 1
    _df_train = df_train[df_train["fold"] != fold].reset_index(drop=True) # _df_train contains every fold except 1
    _df_valid = df_train[df_train["fold"] == fold].reset_index(drop=True) # _df_valid is the first fold
    model = xgb.XGBClassifier(**best_params_xgb)
    model.fit(_df_train[train_cols], _df_train["target"]) 
    preds = model.predict_proba(_df_valid[train_cols])[:, 1]
    score = comp_score(_df_valid[["target"]], pd.DataFrame(preds, columns=["prediction"]), "")

    print(f"fold: {fold} - ROC AUC Score: {score:.5f}")
    xgb_scores.append(score)
    xgb_models.append(model)
    

In [None]:
# Average of the per-fold scores returned by comp_score (a partial AUC).
xgb_score = np.mean(xgb_scores)
print(f"XGBoost Average ROC AUC Score: {xgb_score:.5f}")

In [None]:
# Mean feature importance across the per-fold XGBoost models, sorted ascending
# so the most important feature ends up at the top of the horizontal bar chart.
importances = np.stack([model.feature_importances_ for model in xgb_models]).mean(axis=0)
df_imp = (
    pd.DataFrame({"feature": train_cols, "importance": importances})
    .sort_values("importance")
    .reset_index(drop=True)
)

plt.figure(figsize=(16, 12))
plt.barh(df_imp["feature"], df_imp["importance"])
plt.title("Feature Importance")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()

## CAT Model

In [None]:
%%time
import catboost as cb
torch.cuda.empty_cache()

cb_scores = []
cb_models = []

cb_params = {
    'objective': 'Logloss',
    "random_state": 42,
    "colsample_bylevel": 0.3,
    "iterations": 400,
    "learning_rate": 0.05,
    "max_depth": 8,
    "l2_leaf_reg": 5,
    "scale_pos_weight": 2,
    "verbose": 0,
}

for fold in tqdm(range(1,number_of_fold_trainings+1), desc= 'Training over Folds', total = 5):
    _df_train = df_train[df_train["fold"] != fold].reset_index(drop=True)
    _df_valid = df_train[df_train["fold"] == fold].reset_index(drop=True)
    #model = cb.CatBoostClassifier(**cb_params)
    model = VotingClassifier([(f"cb_{i}", cb.CatBoostClassifier(**cb_params)) for i in range(3)], voting="soft")
    model.fit(_df_train[train_cols], _df_train["target"])
    preds = model.predict_proba(_df_valid[train_cols])[:, 1]
    score = comp_score(_df_valid[["target"]], pd.DataFrame(preds, columns=["prediction"]), "")
    print(f"fold: {fold} - Partial AUC Score: {score:.5f}")
    cb_scores.append(score)
    cb_models.append(model)

In [None]:
# Average of the per-fold partial AUC scores of the CatBoost ensembles.
cb_score = np.mean(cb_scores)
print(f"CatBoost Score: {cb_score:.5f}")

In [None]:
'''
importances = np.mean([model.get_feature_importance() for model in cb_models], axis=0)
df_imp = pd.DataFrame({"feature": train_cols, "importance": importances}).sort_values("importance").reset_index(drop=True)

plt.figure(figsize=(16, 12))
plt.barh(df_imp["feature"], df_imp["importance"])
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("Feature Importances from CatBoost Model")
plt.show()
'''


## LightGBM

In [None]:
%%time
import lightgbm as lgb

lgb_params = {
    "objective": "binary",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "n_estimators": 200,
    'learning_rate': 0.05,    
    'lambda_l1': 0.0004, 
    'lambda_l2': 8.7652, 
    'num_leaves': 136, 
    'feature_fraction': 0.5392, 
    'bagging_fraction': 0.9577, 
    'bagging_freq': 6,
    'min_child_samples': 60,
    "device": "gpu"
}


lgb_scores = []
lgb_models = []

for fold in tqdm(range(1,number_of_fold_trainings+1), desc= 'Training over Folds', total = 5):
    _df_train = df_train[df_train["fold"] != fold].reset_index(drop=True)
    _df_valid = df_train[df_train["fold"] == fold].reset_index(drop=True)
    model = lgb.LGBMClassifier(**lgb_params)
    model = VotingClassifier([(f"lgb_{i}", lgb.LGBMClassifier(random_state=i, **lgb_params)) for i in range(7)], voting="soft")
    # model.fit(_df_train[train_cols], _df_train["target"])
    preds = model.predict_proba(_df_valid[train_cols])[:, 1]
    score = comp_score(_df_valid[["target"]], pd.DataFrame(preds, columns=["prediction"]), "")
    print(f"fold: {fold} - Partial AUC Score: {score:.5f}")
    lgb_scores.append(score)
    lgb_models.append(model)

In [None]:
# Average of the per-fold partial AUC scores of the LightGBM ensembles.
lgbm_score = np.mean(lgb_scores)
print(f"LGBM Score: {lgbm_score:.5f}")

In [None]:
'''
importances = np.mean([model.feature_importances_ for model in lgb_models], 0)
df_imp = pd.DataFrame({"feature": model.feature_name_, "importance": importances}).sort_values("importance").reset_index(drop=True)

plt.figure(figsize=(16, 12))
plt.barh(df_imp["feature"], df_imp["importance"])
plt.show()
'''

# <b>7 <span style='color:#F1A424'>|</span> Weighting predictions <span style='color:#F1A424'>|</span></b>

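Out-of-Fold (OOF) predictions are used in ensemble methods like stacking to create unbiased training data for the meta-model. Each base model predicts only on the fold it was not trained on, so the blending weights below are tuned on predictions for data the models have not seen. This keeps the weights from favouring a base model that simply overfits its training data.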

In [None]:
@torch.inference_mode()
def evaluation_predictions(model, dataloader, device, fold):
    """Return the scores of an image+metadata model for every sample of ``dataloader``.

    Args:
        model: Network called as ``model(images, metadata)`` and returning one
            score per sample (assumed; outputs are flattened).
        dataloader: Iterable yielding ``(images, metadata, target)`` batches;
            the target is ignored.
        device: Device the inputs are moved to.
        fold: Zero-based fold index, only used in the progress-bar label.

    Returns:
        List of Python floats in the order the dataloader yields the samples.
    """
    model.eval()
    torch.cuda.empty_cache()
    predictions_list = []

    for images, metadata, _target in tqdm(dataloader, total=len(dataloader), desc=f'Predicting fold [{fold+1}/5]'):
        images = images.to(device, dtype=torch.float)
        metadata = metadata.to(device, dtype=torch.float)
        # reshape(-1) always yields a 1-D tensor; squeeze() collapsed a batch
        # of one sample to a scalar, which cannot be passed to list.extend().
        outputs = model(images, metadata).reshape(-1)
        predictions_list.extend(outputs.tolist())
    return predictions_list

In [None]:
# Collect, fold by fold, the validation predictions of the three boosted
# models (columns: LGBM, CatBoost, XGBoost) together with the true labels.
train_preds = []
valid_preds = []
valid_targets = []

for fold in range(1, number_of_fold_trainings + 1):
    _df_train = df_train[df_train["fold"] != fold].reset_index(drop=True)
    _df_valid = df_train[df_train["fold"] == fold].reset_index(drop=True)

    # The model stored at position fold-1 is scored on fold `fold`
    lgb_model = lgb_models[fold - 1]
    cb_model = cb_models[fold - 1]
    xgb_model = xgb_models[fold - 1]

    # Positive-class probabilities of each model on the validation rows
    lgb_valid_preds = lgb_model.predict_proba(_df_valid[train_cols])[:, 1]
    cb_valid_preds = cb_model.predict_proba(_df_valid[train_cols])[:, 1]
    xgb_valid_preds = xgb_model.predict_proba(_df_valid[train_cols])[:, 1]

    # One row per validation sample, one column per model
    valid_preds.append(np.stack([lgb_valid_preds, cb_valid_preds, xgb_valid_preds], axis=1))
    valid_targets.append(_df_valid["target"].values)

# Concatenate the per-fold pieces into single arrays
valid_preds = np.concatenate(valid_preds, axis=0)
valid_targets = np.concatenate(valid_targets)

In [None]:
from sklearn.metrics import roc_auc_score
from itertools import product

# Grid of candidate weights: 0.0, 0.1, ..., 1.0
weights = np.arange(0, 1.1, 0.1)
best_score = 0
best_weights = (0.6, 0.4, 0) # (LGBM, CAT, XGB) - fallback if no combination qualifies
w1, w2, w3 = best_weights


for w1, w2, w3 in tqdm(product(weights, repeat=3), desc ='Testing every combination', total = len(weights) ** 3):
    # The grid values are floats (e.g. 0.30000000000000004), so an exact
    # `== 1` test silently rejects valid combinations; compare with a tolerance.
    if not np.isclose(w1 + w2 + w3, 1.0):
        continue
    # Constraints on the blend: small XGB share, LGBM below 0.6, no zero weight.
    # Checked before scoring so rejected combinations cost nothing.
    if not (w3 < 0.2 and w1 < 0.6 and 0.0 not in (w1, w2, w3)):
        continue
    meta_valid_preds = w1 * valid_preds[:, 0] + w2 * valid_preds[:, 1] + w3 * valid_preds[:, 2]
    score = roc_auc_score(valid_targets, meta_valid_preds)
    if score > best_score:
        best_score = score
        best_weights = (float(w1), float(w2), float(w3))

# Report the score of the selected weights, not of the last combination tried
print(f"Best weights (LGBM, CAT, XGB): {best_weights}, Best ROC AUC Score: {best_score:.5f}")


# <b>8 <span style='color:#F1A424'>|</span> Submission <span style='color:#F1A424'>|</span></b>

## Test MetaData loading and encoding

In [None]:
# Raw test-set metadata, read from the competition CSV.
metadata_and_features_test_df = pd.read_csv(test_metadata_path)

## Test ImageLoader

In [None]:
@torch.inference_mode()
def test_predictions(model, dataloader, device):
    """Return the model scores for every image of an unlabeled ``dataloader``.

    Args:
        model: Network returning one score per image (assumed; outputs are
            flattened).
        dataloader: Iterable yielding image batches only (no targets).
        device: Device the images are moved to.

    Returns:
        List of Python floats in the order the dataloader yields the samples.
        The first five scores of the first batch are printed as a sanity check.
    """
    model.eval()
    torch.cuda.empty_cache()
    predictions_list = []

    for step, images in enumerate(dataloader):
        images = images.to(device, dtype=torch.float)
        # reshape(-1) always yields a 1-D tensor; squeeze() collapsed a batch
        # of one sample to a scalar, which breaks both the slice below and
        # list.extend().
        outputs = model(images).reshape(-1)
        if step == 0:
            print(outputs.tolist()[:5])
        predictions_list.extend(outputs.tolist())
    return predictions_list

In [None]:
# Unlabeled test dataset (has_target=False), using the same no-augmentation
# transform as validation.
test_dataset = ImageLoaderWithMetadata(metadata_and_features_test_df, test_image_path, transform=train_transform_no_augment, has_target=False)
# shuffle=False keeps the predictions in the dataset's own order, which the
# later pairing with `test_dataset.isic_ids` relies on.
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size, num_workers=4)

## Getting CNN predictions

In [None]:
# Score the test loader with each of the three trained CNNs, in model order.
cnn_test_preds_1 = test_predictions(cnn_model_1, test_loader, device)
cnn_test_preds_2 = test_predictions(cnn_model_2, test_loader, device)
cnn_test_preds_3 = test_predictions(cnn_model_3, test_loader, device)

In [None]:
# One row per test image: the dataset's ids next to the three CNN scores
# (columns are paired by position).
cnn_test_df = pd.DataFrame({
    'isic_id': test_dataset.isic_ids,
    'prediction_score_model_1': cnn_test_preds_1,
    'prediction_score_model_2': cnn_test_preds_2,
    'prediction_score_model_3': cnn_test_preds_3,
})

In [None]:
# Preview the CNN scores on the test set.
cnn_test_df

In [None]:
# Attach the CNN scores to the test metadata, matching rows on `isic_id`.
merged_test_df = metadata_and_features_test_df.merge(cnn_test_df, on='isic_id')

## Getting Gradient Boosting Predictions

In [None]:
# Same input columns as used for training: numeric + categorical features and
# the three CNN scores.
train_cols = num_cols + features_cat + ['prediction_score_model_1','prediction_score_model_2','prediction_score_model_3']

In [None]:
df_test, new_num_cols = feature_engineering(merged_test_df.copy())

# Reuse the encoder that was fitted on the training data. Re-fitting on the
# test set would assign category codes from the test values alone, so the same
# category could receive a different integer than the one the models were
# trained on. Categories never seen in training are encoded as -2
# (the encoder's `unknown_value`).
X_cat = category_encoder.transform(df_test[features_cat])
for c, cat_col in enumerate(features_cat):
    df_test[cat_col] = X_cat[:, c]

# Turn +/-inf into NaN, then replace every NaN with 0 (same as for training).
df_test = df_test.replace([np.inf, -np.inf], np.nan).fillna(0)

# Apply the training-set mean/std. Calling fit_transform here would re-fit the
# scaler on the test rows and shift every feature relative to what the models
# learned.
df_test[train_cols] = scaler.transform(df_test[train_cols])

# train_cols = num_cols + features_cat + ['ratio_1_over_2','ratio_2_over_3','ratio_3_over_1']

# df_test = engineer_predictions(df_test)

In [None]:
# Preview the model input columns of the test set.
df_test[train_cols]

In [None]:
# For each library, average the positive-class probabilities of its per-fold
# models on the test set.
lgb_preds = np.stack([model.predict_proba(df_test[train_cols])[:, 1] for model in lgb_models]).mean(axis=0)
cb_preds = np.stack([model.predict_proba(df_test[train_cols])[:, 1] for model in cb_models]).mean(axis=0)
xgb_preds = np.stack([model.predict_proba(df_test[train_cols])[:, 1] for model in xgb_models]).mean(axis=0)

In [None]:
# Blend the three boosted predictions with the weights selected on the
# out-of-fold predictions (order: LGBM, CatBoost, XGBoost).
w1, w2, w3 = best_weights
# final_preds = (w1 * lgb_preds + w2 * cb_preds + w3 * xgb_preds)*0.66 + 0.34 * cnn_test_preds

final_preds = (w1 * lgb_preds + w2 * cb_preds + w3 * xgb_preds)

## Submitting

In [None]:
# Fill the sample submission with the blended predictions.
# NOTE(review): values are assigned by position, which assumes the sample
# submission rows are in the same order as `df_test` - confirm.
df_sub = pd.read_csv("/kaggle/input/isic-2024-challenge/sample_submission.csv")
df_sub["target"] = final_preds
df_sub

In [None]:
# Write the submission file without the index column.
df_sub.to_csv("submission.csv", index=False)