In [1]:
import torch
import numpy as np
import xgboost as xgb
from pathlib import Path
from sklearn.model_selection import StratifiedGroupKFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import pandas as pd
from PIL import Image
import os
from torchvision.models import resnet50,resnet101
import numpy as np 
from sklearn.model_selection import train_test_split
import pandas as pd
import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2
from torch.optim.lr_scheduler import OneCycleLR
from sklearn.metrics import roc_auc_score, classification_report,precision_recall_fscore_support,roc_curve, auc
from tqdm import tqdm
import albumentations as A
from pathlib import Path
import shutil
from typing import List, Dict, Optional, Generator
import logging
import json
from datetime import datetime
import matplotlib.pyplot as plt
import polars as pl
from sklearn.preprocessing import OneHotEncoder
import h5py
import timm

ModuleNotFoundError: No module named 'imblearn'

In [None]:
class ISICModel(nn.Module):
    """timm backbone whose ``head`` attribute is replaced by a small MLP classifier.

    The replacement head is Linear(512) -> BatchNorm -> SiLU -> Dropout(0.3)
    -> Linear(256) -> BatchNorm -> SiLU -> Dropout(0.3) -> Linear(num_classes).

    Args:
        model_name: Architecture name passed to ``timm.create_model``.
        num_classes: Output width of the final linear layer.
        drop_path_rate: Passed through to ``timm.create_model``.
        drop_rate: Passed through to ``timm.create_model``.
        pretrained: Passed through to ``timm.create_model``.
        checkpoint_path: Passed through to ``timm.create_model``.
    """

    def __init__(self, model_name, num_classes=1, drop_path_rate=0, drop_rate=0, pretrained=True, checkpoint_path=None):
        super().__init__()
        backbone = timm.create_model(
            model_name,
            pretrained=pretrained,
            checkpoint_path=checkpoint_path,
            drop_rate=drop_rate,
            drop_path_rate=drop_path_rate,
        )

        # Assemble the head layer by layer; the module order (and therefore the
        # ``head.<index>`` keys in the state dict) must stay exactly as below so
        # that previously saved checkpoints keep loading.
        head_layers = []
        fan_in = backbone.num_features
        for width in (512, 256):
            head_layers.append(nn.Linear(fan_in, width))
            head_layers.append(nn.BatchNorm1d(width))
            head_layers.append(nn.SiLU())
            head_layers.append(nn.Dropout(0.3))
            fan_in = width
        head_layers.append(nn.Linear(fan_in, num_classes))

        backbone.head = nn.Sequential(*head_layers)
        self.model = backbone

    def forward(self, images):
        """Run ``images`` through the backbone and the replacement head."""
        logits = self.model(images)
        return logits
class FineTuneResNet101(nn.Module):
    """ResNet-101 with its leading child modules frozen and a new MLP ``fc`` head.

    The backbone is created with ``weights=None`` (no pretrained weights are
    downloaded here); weights are expected to come from a state dict loaded
    by the caller.

    Args:
        num_classes: Output width of the final linear layer.
        freeze_layers: Number of leading entries of ``model.named_children()``
            whose parameters get ``requires_grad = False``.
    """

    def __init__(self, num_classes, freeze_layers=6):
        super().__init__()
        self.model = resnet101(weights=None)

        # Freeze the first ``freeze_layers`` top-level children of the backbone.
        layers = list(self.model.named_children())
        for _name, child in layers[:freeze_layers]:
            child.requires_grad_(False)

        # Replace the classifier; freshly created layers are trainable by default.
        num_ftrs = self.model.fc.in_features
        self.model.fc = nn.Sequential(
            nn.Linear(num_ftrs, 512),
            nn.BatchNorm1d(512),
            nn.SiLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.SiLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )
        # get_layer_stats() prints by itself and returns None, so it must not be
        # wrapped in print() (that emitted a stray "None" line).
        self.get_layer_stats()

    def get_layer_stats(self):
        """Print trainable vs. total parameter counts for each top-level child.

        Returns:
            None. The report is written to stdout.
        """
        for name, child in self.model.named_children():
            trainable_params = sum(p.numel() for p in child.parameters() if p.requires_grad)
            total_params = sum(p.numel() for p in child.parameters())
            print(f"{name}: {trainable_params}/{total_params} trainable parameters")

    def forward(self, x):
        """Return the logits produced by the wrapped ResNet-101 for ``x``."""
        return self.model(x)
class FineTuneResNet(nn.Module):
    """ResNet-50 with leading children frozen, ``layer3``/``layer4`` trainable, and a new MLP ``fc`` head.

    The backbone is created with ``weights=None`` (no pretrained weights are
    downloaded here).

    Args:
        num_classes: Output width of the final linear layer.
        freeze_layers: Number of leading entries of ``model.named_children()``
            whose parameters get ``requires_grad = False``. ``layer3`` and
            ``layer4`` are re-enabled afterwards regardless of this value.
    """

    def __init__(self, num_classes, freeze_layers=6):
        super().__init__()
        self.model = resnet50(weights=None)

        # Freeze the first ``freeze_layers`` top-level children of the backbone.
        layers = list(self.model.named_children())
        for _name, child in layers[:freeze_layers]:
            child.requires_grad_(False)

        # Always keep the two deepest residual stages trainable, even when
        # ``freeze_layers`` is large enough to have frozen them above.
        self.model.layer4.requires_grad_(True)
        self.model.layer3.requires_grad_(True)

        # Replace the classifier; freshly created layers are trainable by default.
        num_ftrs = self.model.fc.in_features
        self.model.fc = nn.Sequential(
            nn.Linear(num_ftrs, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )
        # get_layer_stats() prints by itself and returns None, so it must not be
        # wrapped in print() (that emitted a stray "None" line).
        self.get_layer_stats()

    def get_layer_stats(self):
        """Print trainable vs. total parameter counts for each top-level child.

        Returns:
            None. The report is written to stdout.
        """
        for name, child in self.model.named_children():
            trainable_params = sum(p.numel() for p in child.parameters() if p.requires_grad)
            total_params = sum(p.numel() for p in child.parameters())
            print(f"{name}: {trainable_params}/{total_params} trainable parameters")

    def forward(self, x):
        """Return the logits produced by the wrapped ResNet-50 for ``x``."""
        return self.model(x)
from torchvision.models import swin_t

class FineTuneSwin(nn.Module):
    """Swin-T with its leading child modules frozen and a new MLP ``head``.

    The backbone is created with ``weights=None`` (no pretrained weights are
    downloaded here).

    Args:
        num_classes: Output width of the final linear layer.
        freeze_layers: Number of leading entries of ``model.named_children()``
            whose parameters get ``requires_grad = False``. The head is
            replaced after freezing, so the new head is always trainable.
    """

    def __init__(self, num_classes, freeze_layers=6):
        super().__init__()
        self.model = swin_t(weights=None)

        # Freeze the first ``freeze_layers`` top-level children of the backbone.
        layers = list(self.model.named_children())
        for _name, child in layers[:freeze_layers]:
            child.requires_grad_(False)

        # Replace the classifier; freshly created layers are trainable by default.
        num_ftrs = self.model.head.in_features
        self.model.head = nn.Sequential(
            nn.Linear(num_ftrs, 512),
            nn.BatchNorm1d(512),
            nn.SiLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.SiLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )
        # get_layer_stats() prints by itself and returns None, so it must not be
        # wrapped in print() (that emitted a stray "None" line).
        self.get_layer_stats()

    def get_layer_stats(self):
        """Print trainable vs. total parameter counts for each top-level child.

        Returns:
            None. The report is written to stdout.
        """
        for name, child in self.model.named_children():
            trainable_params = sum(p.numel() for p in child.parameters() if p.requires_grad)
            total_params = sum(p.numel() for p in child.parameters())
            print(f"{name}: {trainable_params}/{total_params} trainable parameters")

    def forward(self, x):
        """Return the logits produced by the wrapped Swin-T for ``x``."""
        return self.model(x)

In [None]:
def read_data(path, num_cols, cat_cols):
    """Read a metadata CSV with polars and return a pandas frame indexed by ``isic_id``.

    Steps, in order:
      1. ``age_approx`` is cast to string, the literal ``'NA'`` is replaced by
         NaN, and the column is cast to Float64.
      2. NaN values in every Float64 column are filled with that column's median.
         (``fill_nan`` addresses NaN only — NOTE(review): confirm whether nulls
         also need handling for this data.)
      3. Every column in ``cat_cols`` is cast to polars ``Categorical``.
      4. The frame is converted to pandas and ``isic_id`` becomes the index.

    Args:
        path: CSV file to read.
        num_cols: Not used by this function; kept for call compatibility.
        cat_cols: Column names to cast to categorical.

    Returns:
        pandas.DataFrame indexed by ``isic_id``.
    """
    id_col = 'isic_id'

    frame = pl.read_csv(path)

    # Step 1: make age numeric.
    age_as_float = pl.col('age_approx').cast(pl.String).replace('NA', np.nan).cast(pl.Float64)
    frame = frame.with_columns(age_as_float)

    # Step 2: median-impute NaNs in all Float64 columns.
    median_imputed = pl.col(pl.Float64).fill_nan(pl.col(pl.Float64).median())
    frame = frame.with_columns(median_imputed)

    # Step 3: categorical dtypes for the categorical columns.
    frame = frame.with_columns(pl.col(cat_cols).cast(pl.Categorical))

    # Step 4: hand over to pandas.
    pandas_frame = frame.to_pandas()
    return pandas_frame.set_index(id_col)

def preprocess(df_train, df_test, feature_cols, cat_cols):
    """One-hot encode the categorical columns and register the new columns as features.

    The encoder is fitted on ``df_train[cat_cols]`` only and applied to both
    frames (``handle_unknown='ignore'``: categories unseen in training encode
    as all zeros). Each indicator is stored as a pandas ``Categorical`` column
    named ``onehot_<i>``.

    Both input frames are modified in place (the new columns are added to
    them) and are also returned.

    Args:
        df_train: Training frame; must contain ``cat_cols``.
        df_test: Test frame; must contain ``cat_cols``.
        feature_cols: Current feature column names. Not mutated; a copy is edited.
        cat_cols: Names of the raw categorical columns to encode.

    Returns:
        Tuple ``(df_train, df_test, feature_cols, new_cat_cols)`` where
        ``feature_cols`` has the raw categorical columns removed and the
        ``onehot_<i>`` columns appended, and ``new_cat_cols`` lists the
        ``onehot_<i>`` names.
    """
    feature_cols = feature_cols.copy()
    encoder = OneHotEncoder(sparse_output=False, dtype=np.int32, handle_unknown='ignore')
    encoder.fit(df_train[cat_cols])
    new_cat_cols = [f'onehot_{i}' for i in range(len(encoder.get_feature_names_out()))]

    df_train_encoded = encoder.transform(df_train[cat_cols])
    df_test_encoded = encoder.transform(df_test[cat_cols])

    # Pin the category set instead of letting pandas infer it from the data.
    # With inference, a column that happens to contain a single value (e.g. a
    # tiny test set where an indicator is all 1) would get categories [1], so
    # the value 1 would carry category code 0 and be indistinguishable from
    # "0" for consumers that read the codes (XGBoost with enable_categorical).
    indicator_levels = np.array([0, 1], dtype=np.int32)
    for i, col in enumerate(new_cat_cols):
        df_train[col] = pd.Categorical(df_train_encoded[:, i], categories=indicator_levels)
        df_test[col] = pd.Categorical(df_test_encoded[:, i], categories=indicator_levels)

    # Swap the raw categorical columns for their one-hot replacements.
    feature_cols = [col for col in feature_cols if col not in cat_cols]
    feature_cols.extend(new_cat_cols)

    return df_train, df_test, feature_cols, new_cat_cols


# Keyword arguments for xgb.XGBClassifier, grouped by purpose.
xgb_params = {
    # Backend / runtime
    "tree_method": "hist",
    "device": "cuda",
    "enable_categorical": True,
    "random_state": 42,
    # Boosting step and tree depth
    "learning_rate": 0.08501257473292347,
    "max_depth": 6,
    # Regularisation strengths
    "lambda": 8.879624125465703,
    "alpha": 0.6779926606782505,
    # Row and column subsampling
    "subsample": 0.6012681388711075,
    "colsample_bytree": 0.8437772277074493,
    "colsample_bylevel": 0.5476090898823716,
    "colsample_bynode": 0.9928601203635129,
    # Positive-class weight
    "scale_pos_weight": 3.29440313334688,
}

def get_image_predictions(models, dataloader, device, model_names, save_dir=None):
    """Score every batch of ``dataloader`` with each model and collect column-1 softmax outputs.

    When ``save_dir`` is given, each model's predictions are cached as
    ``<save_dir>/<model_name>_predictions.npy``; an existing cache file is
    loaded instead of re-running the model.
    NOTE(review): a cached file is trusted as-is — nothing checks that it was
    produced from the same data.

    Args:
        models: Iterable of models, paired positionally with ``model_names``.
        dataloader: Iterable yielding image batches that support ``.to(device)``.
        device: Device the batches are moved to before the forward pass.
        model_names: Names used in progress messages and cache file names.
        save_dir: Optional cache directory (created if missing).

    Returns:
        Array stacking the per-model prediction vectors along axis 1, i.e. one
        row per sample and one column per model.
    """
    caching = bool(save_dir)
    if caching:
        os.makedirs(save_dir, exist_ok=True)

    per_model = []
    for net, label in zip(models, model_names):
        cache_file = f'{save_dir}/{label}_predictions.npy' if caching else None

        if caching and os.path.exists(cache_file):
            scores = np.load(cache_file)
            print(f"Loaded existing predictions for {label} from {cache_file}")
            per_model.append(scores)
            continue

        net.eval()
        collected = []
        with torch.no_grad():
            for batch in tqdm(dataloader, desc=f'Getting predictions for {label}'):
                logits = net(batch.to(device))
                # Probability assigned to class index 1.
                positive = logits.softmax(dim=1)[:, 1]
                collected.extend(positive.cpu().numpy())
        scores = np.array(collected)

        if caching:
            np.save(cache_file, scores)
            print(f"Saved predictions for {label} to {cache_file}")
        per_model.append(scores)

    return np.stack(per_model, axis=1)

def prepare_feature_matrix(df, image_preds,feature_cols):
    """Append image-model predictions to ``df`` as ``model_pred_<i>`` columns.

    Rows are matched by position: row ``k`` of ``image_preds`` is attached to
    row ``k`` of ``df`` whatever ``df``'s index looks like.

    Args:
        df: Metadata frame with one row per sample.
        image_preds: 2-D array-like with one row per sample and one column per
            image model.
        feature_cols: Accepted for call compatibility; it is neither used nor
            modified here. Callers extend their own feature list.

    Returns:
        New DataFrame: ``df`` followed by the ``model_pred_<i>`` columns.

    Raises:
        ValueError: If ``image_preds`` is not 2-D or its row count differs
            from ``len(df)``.
    """
    image_preds = np.asarray(image_preds)
    if image_preds.ndim != 2:
        raise ValueError(
            f"image_preds must be 2-D (samples x models), got shape {image_preds.shape}"
        )
    if image_preds.shape[0] != len(df):
        raise ValueError(
            f"image_preds has {image_preds.shape[0]} rows but df has {len(df)}"
        )

    pred_cols = [f'model_pred_{i}' for i in range(image_preds.shape[1])]
    # Reuse df's index: pd.concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would produce NaN-padded extra rows whenever df is not
    # indexed 0..n-1.
    image_pred_df = pd.DataFrame(image_preds, columns=pred_cols, index=df.index)
    return pd.concat([df, image_pred_df], axis=1)
def run_model_old_xgb(df_train, df_test, xgb_params, feature_cols, reduce=True, columns_to_drop=None, save_dir='xgboost_models'):
    """
    Train XGBoost pipelines with grouped, stratified 5-fold CV and optionally save them.

    For every fold, Gaussian noise (std 0.1) is added to the training copies of
    the ``model_pred_*`` columns, then a pipeline of RandomOverSampler(0.003)
    -> RandomUnderSampler(0.01) -> XGBClassifier is fitted and scored on the
    untouched validation fold with ``custom_metric_raw``.

    Args:
        df_train: Training DataFrame; must contain ``target``, ``patient_id``
            (used as the CV group) and the feature columns.
        df_test: Test DataFrame. Not used by this function; kept for call
            compatibility.
        xgb_params: Keyword arguments for ``xgb.XGBClassifier``.
        feature_cols: List of feature columns.
        reduce: If True return the mean fold metric, otherwise the list of
            per-fold metrics.
        columns_to_drop: Feature columns to exclude from training.
        save_dir: Directory to save models (None to skip saving).

    Returns:
        ``(metric, models)`` where ``metric`` is a float or a list (see
        ``reduce``) and ``models`` is the list of fitted pipelines.
    """
    # joblib is imported locally so the function does not depend on a later
    # notebook cell having been run.
    import joblib

    columns_to_drop = [] if columns_to_drop is None else columns_to_drop
    # Loop-invariant column selections.
    used_cols = [c for c in feature_cols if c not in columns_to_drop]
    noisy_cols = [c for c in used_cols if c.startswith('model_pred_')]

    metric_list = []
    models = []
    group_col = 'patient_id'

    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    for seed_idx in range(1):
        random_seed = seed_idx * 10 + 88
        # Dedicated, seeded generator so the injected noise is reproducible
        # from run to run instead of depending on global NumPy RNG state.
        noise_rng = np.random.default_rng(random_seed)
        tsp = StratifiedGroupKFold(5, shuffle=True, random_state=random_seed)
        splits = tsp.split(df_train, y=df_train.target, groups=df_train[group_col])

        for fold_n, (train_index, val_index) in tqdm(enumerate(splits), total=tsp.get_n_splits()):
            # Explicit copy: the noise below must never write through to df_train
            # (and pandas would otherwise warn about assigning into a slice).
            train_slice_x = df_train.iloc[train_index][used_cols].copy()
            val_slice_x = df_train.iloc[val_index][used_cols]

            train_slice_y = df_train.iloc[train_index]['target']
            val_slice_y = df_train.iloc[val_index]['target']

            # Perturb the image-model predictions on the training side only.
            for col in noisy_cols:
                noise = noise_rng.normal(loc=0, scale=0.1, size=train_slice_x.shape[0])
                train_slice_x[col] = train_slice_x[col] + noise

            xgb_model = Pipeline([
                ('sampler_1', RandomOverSampler(sampling_strategy=0.003, random_state=random_seed)),
                ('sampler_2', RandomUnderSampler(sampling_strategy=0.01, random_state=random_seed)),
                ('classifier', xgb.XGBClassifier(**xgb_params)),
            ])

            xgb_model.fit(train_slice_x, train_slice_y)
            preds = xgb_model.predict_proba(val_slice_x)[:, 1]
            metric = custom_metric_raw(preds, val_slice_y.values)
            metric_list.append(metric)
            models.append(xgb_model)

            if save_dir:
                model_path = os.path.join(save_dir, f'xgb_model_seed{random_seed}_fold{fold_n}.joblib')
                joblib.dump(xgb_model, model_path)
                print(f"Saved model to {model_path}")

    if reduce:
        return np.mean(metric_list), models
    return metric_list, models
def load_model(model_class,path , device='cuda'):
    """Build one supported image classifier, load its weights, and put it in eval mode.

    Args:
        model_class: One of ``'FineTuneResNet101'``, ``'FineTuneSwin'`` or
            ``'eva_silu'`` (an ``ISICModel`` on ``eva02_small_patch14_224``).
            All are built with two output classes.
        path: File holding the state dict to load.
        device: Device the weights are mapped to and the model is moved to.

    Returns:
        The model on ``device``, in eval mode.

    Raises:
        ValueError: If ``model_class`` is not one of the supported names.
    """
    # Lazy builders: nothing is constructed until the name has been validated.
    builders = {
        'FineTuneResNet101': lambda: FineTuneResNet101(num_classes=2),
        'FineTuneSwin': lambda: FineTuneSwin(num_classes=2),
        'eva_silu': lambda: ISICModel(
            model_name='eva02_small_patch14_224',
            num_classes=2,
            pretrained=False,
        ),
    }
    if model_class not in builders:
        # Previously this only printed and then crashed with UnboundLocalError.
        raise ValueError(
            f'invalid model class {model_class}; expected one of {sorted(builders)}'
        )

    model = builders[model_class]()
    # NOTE(review): torch.load unpickles the file; only load checkpoints from a
    # trusted source (consider weights_only=True if the torch version supports it).
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.eval()
    return model

class CombinedDataset(Dataset):
    """Dataset yielding RGB images looked up by ``isic_id`` from an HDF5 file or a JPEG directory.

    Items are images only (no labels): the transformed image if a transform
    was given, otherwise the decoded RGB array.
    """

    def __init__(self, df, transform=None, hdf5_path=None, image_dir=None):
        """
        Args:
            df: DataFrame with an ``isic_id`` column; one item per row.
            transform: Albumentations-style callable invoked as
                ``transform(image=...)['image']``.
            hdf5_path: Path to HDF5 file (if using HDF5).
            image_dir: Path to image directory (if using individual files);
                images are expected at ``<image_dir>/<isic_id>.jpg``.

        Raises:
            ValueError: If both or neither source is given, or if the HDF5
                file lacks any ``isic_id`` present in ``df``.
        """
        self.df = df
        self.transform = transform
        self.hdf5_path = hdf5_path
        self.image_dir = image_dir

        if hdf5_path and image_dir:
            raise ValueError("Specify either hdf5_path or image_dir, not both")
        if not (hdf5_path or image_dir):
            raise ValueError("Must specify either hdf5_path or image_dir")

        if hdf5_path:
            # Fail fast: verify every requested id exists before any batch is drawn.
            with h5py.File(hdf5_path, 'r') as f:
                self.available_ids = set(f.keys())
                missing_ids = set(df['isic_id']) - self.available_ids
                if missing_ids:
                    raise ValueError(f"Missing {len(missing_ids)} images in HDF5 file")

    def __len__(self):
        """Number of rows in the backing DataFrame."""
        return len(self.df)

    def _load_hdf5_image(self, image_id):
        """Load single image from HDF5 file and return it as an RGB array.

        The file is opened per call, so no open handle is stored on the dataset.

        Raises:
            ValueError: If the stored bytes cannot be decoded as an image.
        """
        with h5py.File(self.hdf5_path, 'r') as f:
            binary_data = f[image_id][()]
            if isinstance(binary_data, str):
                binary_data = binary_data.encode()
            nparr = np.frombuffer(binary_data, np.uint8)
            image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imdecode signals failure by returning None, not by raising.
            raise ValueError(f"Could not decode image '{image_id}' from {self.hdf5_path}")
        return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    def _load_file_image(self, image_id):
        """Load single image from file system and return it as an RGB array.

        Raises:
            FileNotFoundError: If ``<image_dir>/<image_id>.jpg`` does not exist.
            ValueError: If the file exists but cannot be read as an image.
        """
        image_path = os.path.join(self.image_dir, f"{image_id}.jpg")
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"Image not found: {image_path}")
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None, not by raising.
            raise ValueError(f"Could not read image: {image_path}")
        return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    def __getitem__(self, idx):
        """Return the (optionally transformed) image for row position ``idx``."""
        # Index the column first so a full row Series is not built per item.
        image_id = self.df['isic_id'].iloc[idx]

        if self.hdf5_path:
            image = self._load_hdf5_image(image_id)
        else:
            image = self._load_file_image(image_id)
        if self.transform:
            image = self.transform(image=image)['image']

        return image

def create_dataloaders(df, batch_size=32, hdf5_path=None, image_dir=None, num_workers=4):
    """
    Build a non-shuffled DataLoader over ``CombinedDataset`` for inference.

    Images are resized to 224x224, normalised with the ImageNet mean/std, and
    converted to tensors. Exactly one of ``hdf5_path`` / ``image_dir`` must be
    supplied (enforced by ``CombinedDataset``).

    Returns:
        A single ``DataLoader`` (despite the plural function name).
    """
    steps = [
        A.Resize(224, 224),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ]
    pipeline = A.Compose(steps)

    images = CombinedDataset(df=df, transform=pipeline, hdf5_path=hdf5_path, image_dir=image_dir)

    # shuffle=False keeps batch order aligned with the row order of ``df``.
    loader_kwargs = dict(
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,
    )
    return DataLoader(images, **loader_kwargs)
def custom_metric_raw(y_hat, y_true):
    """Partial area under the ROC curve restricted to TPR >= 0.80.

    Labels and scores are both flipped so that the high-TPR region of the
    original problem becomes the low-FPR region (FPR <= 1 - 0.80) of the
    flipped one. ``roc_auc_score(..., max_fpr=...)`` returns a standardised
    value, which is mapped back to the raw (unscaled) area.

    Args:
        y_hat: Predicted scores for the positive class.
        y_true: Binary ground-truth labels supporting array arithmetic.

    Returns:
        The raw partial AUC as a float.
    """
    tpr_floor = 0.80
    max_fpr = abs(1 - tpr_floor)

    # Flip the problem: negatives become positives and scores are inverted.
    flipped_labels = abs(y_true - 1)
    flipped_scores = np.array([1.0 - score for score in y_hat])

    standardized = roc_auc_score(flipped_labels, flipped_scores, max_fpr=max_fpr)

    # Undo the standardisation: 0.5 maps to the chance-level area and 1.0 to
    # the full strip of width max_fpr.
    chance_area = 0.5 * max_fpr**2
    area_span = max_fpr - 0.5 * max_fpr**2
    return chance_area + area_span / (1.0 - 0.5) * (standardized - 0.5)

def get_model_predictions(models_list, df_test, feature_cols, columns_to_drop=None):
    """Rank-average the positive-class probabilities of several fitted models.

    Each model's ``predict_proba(...)[:, 1]`` is converted to percentile ranks
    (``rank(pct=True)``) within ``df_test``; the ranks are then averaged
    across models. The result therefore depends on which rows are scored
    together and is not a calibrated probability.

    Args:
        models_list: Sequence of fitted estimators exposing ``predict_proba``.
        df_test: Frame containing the feature columns.
        feature_cols: Feature column names, in the order the models expect.
        columns_to_drop: Optional feature names to exclude.

    Returns:
        1-D float array with one averaged percentile rank per row of ``df_test``.

    Raises:
        ValueError: If ``models_list`` is empty.
    """
    n_models = len(models_list)
    if n_models == 0:
        # Previously this surfaced as an opaque TypeError (None / 0).
        raise ValueError("models_list is empty; at least one model is required")

    columns_to_drop = [] if not columns_to_drop else columns_to_drop
    # Loop-invariant: select the model inputs once.
    used_cols = [col for col in feature_cols if col not in columns_to_drop]
    features = df_test[used_cols]

    rank_sum = np.zeros(len(features), dtype=np.float64)
    for model in models_list:
        positive_proba = model.predict_proba(features)[:, 1]
        rank_sum += pd.Series(positive_proba).rank(pct=True).to_numpy()

    return rank_sum / n_models
def custom_metric(estimator, X, y_true):
    """Scorer-style wrapper: evaluate ``custom_metric_raw`` on the estimator's class-1 probabilities for ``X``."""
    positive_scores = estimator.predict_proba(X)[:, 1]
    return custom_metric_raw(positive_scores, y_true)

In [None]:
import glob
import joblib
def main():
    """Run the full inference pipeline and write ``submission.csv``.

    Steps:
      1. Load the three image models onto the available device.
      2. Read train/test metadata and one-hot encode the categorical columns
         (the encoder is fitted on the training metadata).
      3. Score the test images with every image model and append the scores
         to the test frame as ``model_pred_<i>`` columns.
      4. Load all ``*.joblib`` XGBoost pipelines and rank-average their
         predictions into the ``target`` column.

    Raises:
        FileNotFoundError: If no ``*.joblib`` model is found.
    """
    num_cols = [
        'age_approx', 'clin_size_long_diam_mm', 'tbp_lv_A', 'tbp_lv_Aext',
        'tbp_lv_B', 'tbp_lv_Bext', 'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H',
        'tbp_lv_Hext', 'tbp_lv_L', 'tbp_lv_Lext', 'tbp_lv_areaMM2',
        'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean', 'tbp_lv_deltaA',
        'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm',
        'tbp_lv_eccentricity', 'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence',
        'tbp_lv_norm_border', 'tbp_lv_norm_color', 'tbp_lv_perimeterMM',
        'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt',
        'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y',
        'tbp_lv_z'
    ]
    cat_cols = ['sex', 'anatom_site_general', 'tbp_tile_type', 'tbp_lv_location',
                'tbp_lv_location_simple', 'attribution']

    # --- 1. Image models -------------------------------------------------
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_paths = {
        'FineTuneResNet101': '/kaggle/input/resnet101_model.pth/pytorch/default/1/resnet101_model.pth',
        'FineTuneSwin': '/kaggle/input/swin/pytorch/default/1/swin_model.pth',
        'eva_silu': '/kaggle/input/evaa/pytorch/default/1/eva_silu_model.pth'
    }
    model_names = list(model_paths)
    # Pass the detected device through: load_model defaults to 'cuda', which
    # fails at torch.load(map_location=...) on a CPU-only host.
    image_models = [
        load_model(model_class, path, device=device)
        for model_class, path in model_paths.items()
    ]

    # --- 2. Metadata -----------------------------------------------------
    test_path = '/kaggle/input/isic-2024-challenge/test-metadata.csv'
    df_test = read_data(test_path, num_cols, cat_cols).reset_index()
    train_path = '/kaggle/input/isic-2024-challenge/train-metadata.csv'
    df_train = read_data(train_path, num_cols, cat_cols).reset_index()
    df_train, df_test, feature_cols, new_cat_cols = preprocess(
        df_train, df_test, num_cols, cat_cols
    )

    # --- 3. Image-model scores as extra features ---------------------------
    test_dataloader = create_dataloaders(
        df=df_test,
        hdf5_path='/kaggle/input/isic-2024-challenge/test-image.hdf5',
        batch_size=32
    )
    if image_models:
        test_preds = get_image_predictions(
            image_models, test_dataloader, device, model_names
        )
        df_test = prepare_feature_matrix(df_test, test_preds, feature_cols)
        # Derive the column names from the prediction matrix rather than
        # hard-coding three of them, so the list follows the number of models.
        feature_cols = feature_cols + [
            f'model_pred_{i}' for i in range(test_preds.shape[1])
        ]
    print(df_test.columns)

    # --- 4. XGBoost ensemble ----------------------------------------------
    xgb_model_dir = '/kaggle/input/xgb/pytorch/default/1'
    xgb_models = []
    for model_path in sorted(glob.glob(os.path.join(xgb_model_dir, '*.joblib'))):
        # NOTE(review): joblib.load unpickles the file; only load trusted artifacts.
        xgb_models.append(joblib.load(model_path))
        print(f"Loaded model from {model_path}")
    if not xgb_models:
        raise FileNotFoundError(f"No .joblib models found in {xgb_model_dir}")

    df_test = df_test.drop(columns=['patient_id'])

    predictions = get_model_predictions(xgb_models, df_test, feature_cols)

    df_test['target'] = predictions
    df_test[['isic_id', 'target']].to_csv('submission.csv', index=False)
    print(df_test[['isic_id', 'target']])
    print("Submission saved to submission.csv")

# Run the inference pipeline when this cell/script is executed directly
# (the guard keeps main() from running if the code is imported as a module).
if __name__ == "__main__":
    main()

conv1: 0/9408 trainable parameters
bn1: 0/128 trainable parameters
relu: 0/0 trainable parameters
maxpool: 0/0 trainable parameters
layer1: 0/215808 trainable parameters
layer2: 0/1219584 trainable parameters
layer3: 26090496/26090496 trainable parameters
layer4: 14964736/14964736 trainable parameters
avgpool: 0/0 trainable parameters
fc: 1182466/1182466 trainable parameters
None


  model.load_state_dict(torch.load(path, map_location=device))


features: 0/27517818 trainable parameters
norm: 0/1536 trainable parameters
permute: 0/0 trainable parameters
avgpool: 0/0 trainable parameters
flatten: 0/0 trainable parameters
head: 527106/527106 trainable parameters
None


In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  self.pid = os.fork()
In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

  self.pid = os.fork()
Getting predictions for FineTuneResNet101: 100%|██████████| 1/1 [00:01<00:00,  1.05s/it]
Getting predictions for FineTuneSwin: 100%|██████████

Index(['isic_id', 'patient_id', 'age_approx', 'sex', 'anatom_site_general',
       'clin_size_long_diam_mm', 'image_type', 'tbp_tile_type', 'tbp_lv_A',
       'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext', 'tbp_lv_C', 'tbp_lv_Cext',
       'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L', 'tbp_lv_Lext', 'tbp_lv_areaMM2',
       'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean', 'tbp_lv_deltaA',
       'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLB',
       'tbp_lv_deltaLBnorm', 'tbp_lv_eccentricity', 'tbp_lv_location',
       'tbp_lv_location_simple', 'tbp_lv_minorAxisMM',
       'tbp_lv_nevi_confidence', 'tbp_lv_norm_border', 'tbp_lv_norm_color',
       'tbp_lv_perimeterMM', 'tbp_lv_radial_color_std_max', 'tbp_lv_stdL',
       'tbp_lv_stdLExt', 'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle',
       'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z', 'attribution', 'copyright_license',
       'onehot_0', 'onehot_1', 'onehot_2', 'onehot_3', 'onehot_4', 'onehot_5',
       'onehot_6', 'onehot_7', 'onehot_8', 'onehot