# 1. Overview

This is a notebook for training models to submit predictions to the "Diabetes Prediction Challenge" Kaggle competition ([playground-series-s5e12](https://www.kaggle.com/competitions/playground-series-s5e12)).

Synthetic data is used for this playground competition. The objective is to predict, for each patient in the test set, the probability that the patient will be diagnosed with diabetes.

# 2. Setup

## 2.1 Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import copy
import optuna
import os
import hashlib as hl # for StackingEstimator
import inspect # for StackingEstimator
import random
import warnings
from catboost import CatBoostClassifier
from enum import Enum
from pathlib import Path # for StackingPredictionsRetriever
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression # for meta model
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from types import FunctionType
from xgboost import XGBClassifier, XGBRFClassifier

# Keep cell outputs readable by silencing noisy warning categories raised by the libraries.
warnings.simplefilter('ignore', category=RuntimeWarning)
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter('ignore', category=UserWarning)

# pandas display settings for inspecting wide frames in the notebook
pd.options.display.max_colwidth = None  # Display full column content
pd.options.display.max_rows = None  # Display all rows
pd.options.display.width = 1000  # Set larger display width

## 2.2 Reproducibility

For reproducibility of results, fixed (arbitrarily chosen) numbers are used as random seeds.

In [2]:
# Seeds available to the notebook; the first entry seeds the global RNGs below.
RANDOM_SEEDS = [11, 42]
# Seed Python's `random` module, NumPy's global RNG and PyTorch.
random.seed(RANDOM_SEEDS[0])
np.random.seed(RANDOM_SEEDS[0])
torch.manual_seed(RANDOM_SEEDS[0])
# Seed the CUDA generators as well when a GPU is available.
if torch.cuda.is_available():
    torch.cuda.manual_seed(RANDOM_SEEDS[0])
    torch.cuda.manual_seed_all(RANDOM_SEEDS[0])

## 2.3 Device

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 2.4 DataFrames

Read the data provided for the competition into dataframes.

In [4]:
INPUT_DIR = '/kaggle/input'

# target column
target_col = "diagnosed_diabetes"

# read the competition files, using the `id` column as the index of each dataframe
orig_train_data = pd.read_csv(
    Path(INPUT_DIR, 'playground-series-s5e12', 'train.csv'),
    index_col='id',
)
orig_test_data = pd.read_csv(
    Path(INPUT_DIR, 'playground-series-s5e12', 'test.csv'),
    index_col='id',
)

# 3. Exploratory Data Analysis

In [5]:
# Plots in this section (e.g. the KDE plots) take time to generate, so they are skipped by default;
# set to False to generate the plots.
SKIP_PLOTS = True

In [6]:
# Summary statistics of the training data (the output below covers the numeric columns, target included).
orig_train_data.describe()

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
count,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0
mean,50.359734,2.072411,80.230803,5.963695,7.0022,6.012733,25.874684,0.858766,116.294193,75.440924,70.167749,186.818801,53.823214,102.905854,123.08185,0.149401,0.18199,0.030324,0.623296
std,11.65552,1.048189,51.195071,1.463336,0.901907,2.022707,2.860705,0.03798,11.01039,6.825775,6.938722,16.730832,8.266545,19.022416,24.739397,0.356484,0.385837,0.171478,0.48456
min,19.0,1.0,1.0,0.1,3.1,0.6,15.1,0.68,91.0,51.0,42.0,117.0,21.0,51.0,31.0,0.0,0.0,0.0,0.0
25%,42.0,1.0,49.0,5.0,6.4,4.6,23.9,0.83,108.0,71.0,65.0,175.0,48.0,89.0,106.0,0.0,0.0,0.0,0.0
50%,50.0,2.0,71.0,6.0,7.0,6.0,25.9,0.86,116.0,75.0,70.0,187.0,54.0,103.0,123.0,0.0,0.0,0.0,1.0
75%,58.0,3.0,96.0,7.0,7.6,7.4,27.8,0.88,124.0,80.0,75.0,199.0,59.0,116.0,139.0,0.0,0.0,0.0,1.0
max,89.0,9.0,747.0,9.9,9.9,16.5,38.4,1.05,163.0,104.0,101.0,289.0,90.0,205.0,290.0,1.0,1.0,1.0,1.0


In [7]:
# Summary statistics of the test data, for comparison against the training data above.
orig_test_data.describe()

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history
count,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0
mean,50.432397,2.089693,92.349087,5.945838,6.997795,6.011278,25.881906,0.859007,116.374117,75.396013,70.04835,187.30862,53.813557,103.416083,123.53848,0.15292,0.18441,0.03311
std,11.938741,1.066214,62.187399,1.481068,0.914693,2.060472,2.894289,0.038523,11.252146,6.95034,7.090543,18.413053,8.398126,20.571855,28.965441,0.359911,0.387819,0.178924
min,19.0,1.0,1.0,0.1,3.1,0.6,15.1,0.69,91.0,51.0,42.0,107.0,22.0,51.0,31.0,0.0,0.0,0.0
25%,42.0,1.0,51.0,5.0,6.4,4.6,23.9,0.83,108.0,71.0,65.0,174.0,48.0,89.0,104.0,0.0,0.0,0.0
50%,50.0,2.0,77.0,6.0,7.0,6.0,25.9,0.86,116.0,75.0,70.0,187.0,54.0,103.0,123.0,0.0,0.0,0.0
75%,59.0,3.0,115.0,7.0,7.6,7.4,27.8,0.89,124.0,80.0,75.0,200.0,60.0,117.0,142.0,0.0,0.0,0.0
max,89.0,9.0,748.0,9.9,9.9,15.9,38.3,1.05,170.0,104.0,101.0,285.0,91.0,226.0,290.0,1.0,1.0,1.0


In [8]:
# split the column names of the training data by dtype
numeric_col_names = orig_train_data.select_dtypes(include='number').columns.to_series()
categorical_col_names = orig_train_data.select_dtypes(include='object').columns.to_series()

# every column must have landed in exactly one of the two groups
assert len(numeric_col_names) + len(categorical_col_names) == orig_train_data.shape[1]

# the target is numeric but is not a feature, so exclude it from the numeric column names
numeric_col_names = numeric_col_names.drop(target_col)

In [9]:
# count the missing values per column for both datasets
for dataset_name, dataset in zip(('Train data', 'Test data'), (orig_train_data, orig_test_data)):
    missing_counts = dataset.isnull().sum()
    # header, counts and a trailing blank line, one per line
    print(f"##### {dataset_name} missing values #####", missing_counts, "", sep="\n")

##### Train data missing values #####
age                                   0
alcohol_consumption_per_week          0
physical_activity_minutes_per_week    0
diet_score                            0
sleep_hours_per_day                   0
screen_time_hours_per_day             0
bmi                                   0
waist_to_hip_ratio                    0
systolic_bp                           0
diastolic_bp                          0
heart_rate                            0
cholesterol_total                     0
hdl_cholesterol                       0
ldl_cholesterol                       0
triglycerides                         0
gender                                0
ethnicity                             0
education_level                       0
income_level                          0
smoking_status                        0
employment_status                     0
family_history_diabetes               0
hypertension_history                  0
cardiovascular_history                0
di

In [10]:
# list the distinct values of every categorical column, for both datasets
for dataset_name, dataset in zip(('Train data', 'Test data'), (orig_train_data, orig_test_data)):
    print(f"##### {dataset_name} categorical cols unique values #####")
    for categorical_col_name in categorical_col_names:
        distinct_values = dataset[categorical_col_name].unique()
        print(f"{categorical_col_name}:", distinct_values, sep="\n")
    print()

##### Train data categorical cols unique values #####
gender:
['Female' 'Male' 'Other']
ethnicity:
['Hispanic' 'White' 'Asian' 'Black' 'Other']
education_level:
['Highschool' 'Graduate' 'Postgraduate' 'No formal']
income_level:
['Lower-Middle' 'Upper-Middle' 'Low' 'Middle' 'High']
smoking_status:
['Current' 'Never' 'Former']
employment_status:
['Employed' 'Retired' 'Student' 'Unemployed']

##### Test data categorical cols unique values #####
gender:
['Female' 'Male' 'Other']
ethnicity:
['White' 'Hispanic' 'Black' 'Asian' 'Other']
education_level:
['Highschool' 'Graduate' 'Postgraduate' 'No formal']
income_level:
['Middle' 'Low' 'Lower-Middle' 'Upper-Middle' 'High']
smoking_status:
['Former' 'Never' 'Current']
employment_status:
['Employed' 'Unemployed' 'Retired' 'Student']



In [11]:
# KDE plots of target variable and numerical features
if not SKIP_PLOTS:
    kdeplot_col_names = [target_col, *numeric_col_names]

    # derive the grid from the number of plots instead of hard-coding the row count
    n_plot_cols = 2
    n_plot_rows = -(-len(kdeplot_col_names) // n_plot_cols)  # ceiling division

    plt.figure(figsize=(12, 24))
    for i, col in enumerate(kdeplot_col_names, start=1):
        plt.subplot(n_plot_rows, n_plot_cols, i)
        sns.kdeplot(data=orig_train_data, x=col, fill=True)
        plt.title(f"KDE plot of {col}")

    # lay out the figure once, after every subplot (and its title) exists;
    # doing this per subplot repeats the layout work and misses the last title
    plt.tight_layout()
    plt.show()

In [12]:
# correlation heatmap of the numerical features
if not SKIP_PLOTS:
    numeric_corr = orig_train_data[numeric_col_names].corr()
    heatmap_options = dict(
        cmap='Reds',
        annot=True,
        linewidths=2,
        fmt='.2f',
        vmin=-1,
        vmax=1,
    )

    plt.figure(figsize=(12, 8))
    sns.heatmap(numeric_corr, **heatmap_options)
    plt.title('Correlation Matrix of Numerical Features', fontsize=18, pad=20)
    plt.show()

# 4. Data Pre-Processing

In [13]:
# Pre-process copies so that the dataframes read from the competition files stay unmodified.
train_data = orig_train_data.copy()
test_data = orig_test_data.copy()

## 4.1 Ordinal Encoding

In [14]:
# Each ordered categorical column gets an encoder with an explicit category order,
# so that the assigned integers reflect the natural ranking of the values.

# education level
education_level_encoder = OrdinalEncoder(categories=[['No formal', 'Highschool', 'Graduate', 'Postgraduate']])

# income level
income_level_encoder = OrdinalEncoder(categories=[['Low', 'Lower-Middle','Middle', 'Upper-Middle', 'High']])

# smoking status
smoking_status_encoder = OrdinalEncoder(categories=[['Never', 'Former', 'Current']])

# source column -> encoder; the encoded column is named `<source column>_encoded`
ordinal_encoders = {
    'education_level': education_level_encoder,
    'income_level': income_level_encoder,
    'smoking_status': smoking_status_encoder,
}

for source_col, encoder in ordinal_encoders.items():
    encoded_col_name = f'{source_col}_encoded'
    # fit on the training data only; the test data is transformed with the fitted encoder
    train_data[encoded_col_name] = encoder.fit_transform(train_data[[source_col]])
    test_data[encoded_col_name] = encoder.transform(test_data[[source_col]])

# drop original cols
train_data = train_data.drop(columns=list(ordinal_encoders))
test_data = test_data.drop(columns=list(ordinal_encoders))

# print out value maps to check assigned values are as expected
for source_col, encoder in ordinal_encoders.items():
    value_map = { category: i for i, category in enumerate(encoder.categories_[0]) }
    print(f"{source_col}_encoded:\n{value_map}")

education_level_encoded:
{'No formal': 0, 'Highschool': 1, 'Graduate': 2, 'Postgraduate': 3}
income_level_encoded:
{'Low': 0, 'Lower-Middle': 1, 'Middle': 2, 'Upper-Middle': 3, 'High': 4}
smoking_status_encoded:
{'Never': 0, 'Former': 1, 'Current': 2}


## 4.2 Data Cleaning

In [15]:
def fix_blood_pressure(df):
    """
    Swaps the systolic and diastolic blood pressure values of every row in which
    the diastolic value is greater than the systolic value.

    The dataframe is modified in place and also returned.

    :param df:
        A `pandas.DataFrame` with `systolic_bp` and `diastolic_bp` columns.
    """
    pressure_cols = ['systolic_bp', 'diastolic_bp']
    inverted_rows = df['systolic_bp'] < df['diastolic_bp']

    # read the two columns in reversed order and write them back in the regular order
    swapped_values = df.loc[inverted_rows, pressure_cols[::-1]].to_numpy()
    df.loc[inverted_rows, pressure_cols] = swapped_values
    return df

# Apply the blood pressure fix to both datasets (the function also modifies its argument in place).
train_data = fix_blood_pressure(train_data)
test_data = fix_blood_pressure(test_data)

## 4.3 Feature Generation

In [16]:
def add_generated_features(df):
    """
    Adds engineered features to `df`: log transforms, ratios and interactions,
    squared terms, a comorbidity count and a binned BMI category.

    The dataframe is modified in place and is also returned, so that the function
    can be used in the same way as the other pre-processing helpers.

    :param df:
        A `pandas.DataFrame` containing the columns referenced below
        (e.g. `triglycerides`, `hdl_cholesterol`, `systolic_bp`, `bmi`).
    :return:
        The same `pandas.DataFrame` instance with the generated columns added.
    """
    # log transforms for skewed data
    for col in ['triglycerides', 'ldl_cholesterol', 'cholesterol_total']:
        df[f'log_{col}'] = np.log1p(df[col])

    # medical ratios & interactions (the small constant avoids a division by zero)
    df['cholesterol_ratio'] = df['cholesterol_total'] / (df['hdl_cholesterol'] + 1e-5)
    df['ldl_hdl_ratio'] = df['ldl_cholesterol'] / (df['hdl_cholesterol'] + 1e-5)
    df['pulse_pressure'] = df['systolic_bp'] - df['diastolic_bp']
    df['mean_arterial_pressure'] = (df['systolic_bp'] + 2 * df['diastolic_bp']) / 3
    df['age_x_bmi'] = df['age'] * df['bmi']
    df['waist_x_bmi'] = df['waist_to_hip_ratio'] * df['bmi']
    df['family_history_diabetes_x_log_triglycerides'] = df['family_history_diabetes'] * df['log_triglycerides']
    df['hypertension_history_x_systolic_bp'] = df['hypertension_history'] * df['systolic_bp']
    df['activity_x_diet'] = df['physical_activity_minutes_per_week'] * df['diet_score']

    # squared
    df['age_sq'] = df['age'] ** 2
    df['bmi_sq'] = df['bmi'] ** 2
    df['waist_to_hip_ratio_sq'] = df['waist_to_hip_ratio'] ** 2
    df['systolic_bp_sq'] = df['systolic_bp'] ** 2

    # risk grouping
    df['comorbidity_count'] = (
        df['hypertension_history'] +
        df['cardiovascular_history'] +
        df['family_history_diabetes']
    )

    # binning: (-1, 25] -> 0, (25, 30] -> 1, (30, 100] -> 2
    df['bmi_cat'] = pd.cut(df['bmi'], bins=[-1, 25, 30, 100], labels=[0, 1, 2]).astype(int)
    # `bmi_cat` already holds the ordinal codes 0/1/2, so ordinal-encoding it with the
    # categories [0, 1, 2] is an identity mapping; a float cast yields the same values
    # without fitting an encoder
    df['bmi_cat_encoded'] = df['bmi_cat'].astype(float)

    return df

In [17]:
def add_kmeans_features(train_df, test_df, n_clusters):
    """
    Adds a `cluster_label` column to the training and test dataframes, holding the
    K-means cluster assigned to each row.

    The clustering is fitted on the standardized values of a fixed set of features,
    taken from the training and test rows combined. Both dataframes are modified in
    place and also returned.

    :param train_df:
        The training `pandas.DataFrame`; must contain the features used for clustering.
    :param test_df:
        The test `pandas.DataFrame`; must contain the features used for clustering.
    :param n_clusters:
        The number of clusters passed to `KMeans`.
    :return:
        The training and test dataframes (in that order) as a tuple.
    """
    cluster_inputs = [
        'age', 'bmi', 'mean_arterial_pressure', 'cholesterol_ratio', 'log_triglycerides'
    ]
    n_train_rows = len(train_df)

    # stack the training rows on top of the test rows and standardize them together
    stacked_inputs = pd.concat([train_df[cluster_inputs], test_df[cluster_inputs]], axis=0)
    standardized_inputs = StandardScaler().fit_transform(stacked_inputs)

    cluster_model = KMeans(n_clusters=n_clusters, random_state=RANDOM_SEEDS[0], n_init=10)
    assigned_clusters = cluster_model.fit_predict(standardized_inputs)

    # split the labels back by position; they are stored as objects rather than as numbers
    train_df['cluster_label'] = assigned_clusters[:n_train_rows].astype(object)
    test_df['cluster_label'] = assigned_clusters[n_train_rows:].astype(object)

    return train_df, test_df

In [18]:
# add generated features
add_generated_features(train_data)
add_generated_features(test_data)

# apply clustering
train_data, test_data = add_kmeans_features(train_data, test_data, n_clusters=7)

In [19]:
# Check the columns of the training data after feature generation.
train_data.columns

Index(['age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week', 'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi', 'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate', 'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides', 'gender', 'ethnicity', 'employment_status', 'family_history_diabetes', 'hypertension_history', 'cardiovascular_history', 'diagnosed_diabetes', 'education_level_encoded', 'income_level_encoded', 'smoking_status_encoded', 'log_triglycerides', 'log_ldl_cholesterol', 'log_cholesterol_total', 'cholesterol_ratio', 'ldl_hdl_ratio', 'pulse_pressure', 'mean_arterial_pressure', 'age_x_bmi', 'waist_x_bmi', 'family_history_diabetes_x_log_triglycerides', 'hypertension_history_x_systolic_bp', 'activity_x_diet', 'age_sq', 'bmi_sq', 'waist_to_hip_ratio_sq', 'systolic_bp_sq', 'comorbidity_count', 'bmi_cat', 'bmi_cat_encoded', 'cluster_label'], dtype='object')

## 4.4 Remaining Categorical Features

In [20]:
# Columns that are still non-numeric at this point are treated as categorical features.
# `category` is included in the selection so that re-running this cell (after the columns
# have already been converted) yields the same list instead of an empty one.
cat_features = (
    train_data
    .drop(target_col, axis=1)
    .select_dtypes(include=['object', 'category'])
    .columns
    .to_list()
)

for col in cat_features:
    # Take the set of categories from the training data and reuse it for the test data, so
    # that both dataframes share identical categories (and therefore identical category codes).
    # A test value that never occurs in the training data becomes missing (NaN).
    shared_dtype = pd.CategoricalDtype(categories=train_data[col].astype('category').cat.categories)
    train_data[col] = train_data[col].astype(shared_dtype)
    test_data[col] = test_data[col].astype(shared_dtype)

## 4.5 MLP Data Preparation

Since an MLP cannot handle categorical features directly and requires the data to be scaled, the training and test data that will be used for it are prepared as separate dataframes.

In [21]:
def prepare_mlp_data(df, cat_features, scaler=None, fit_scaler=True):
    """
    Returns a copy of `df` prepared for an MLP: the categorical features are one-hot
    encoded (dropping the first level of each) and every column other than the target
    column `diagnosed_diabetes` is standardized.

    :param df:
        The `pandas.DataFrame` to prepare. It is not modified.
    :param cat_features:
        A list with the names of the categorical columns to one-hot encode.
    :param scaler:
        An optional `StandardScaler`. When omitted, a new scaler is created and fitted
        on `df` itself.
    :param fit_scaler:
        Whether `scaler` should be fitted on `df` before transforming it. Pass `False`
        together with a scaler that was already fitted on the training data, so that
        other data is scaled with the training statistics; in that case the columns are
        also aligned to the columns the scaler was fitted on (columns missing from `df`
        are filled with 0, columns unknown to the scaler are dropped). Ignored when
        `scaler` is omitted.
    :return:
        The prepared `pandas.DataFrame`.
    """
    target_name = 'diagnosed_diabetes'
    encoded = pd.get_dummies(df.copy(), columns=cat_features, drop_first=True, dtype=int)

    if scaler is None:
        scaler = StandardScaler()
        fit_scaler = True

    if fit_scaler:
        cols_to_scale = [c for c in encoded.columns if c != target_name]
        encoded[cols_to_scale] = scaler.fit_transform(encoded[cols_to_scale])
    else:
        # reuse the column layout the scaler was fitted on; the target column (if any) is kept last
        cols_to_scale = list(scaler.feature_names_in_)
        cols_to_keep = cols_to_scale + [c for c in encoded.columns if c == target_name]
        encoded = encoded.reindex(columns=cols_to_keep, fill_value=0)
        encoded[cols_to_scale] = scaler.transform(encoded[cols_to_scale])

    return encoded

# fit the scaler on the training data only and reuse it for the test data, so that
# both datasets are scaled with the same statistics
mlp_scaler = StandardScaler()
train_data_mlp = prepare_mlp_data(train_data, cat_features, scaler=mlp_scaler)
test_data_mlp = prepare_mlp_data(test_data, cat_features, scaler=mlp_scaler, fit_scaler=False)

In [22]:
# Check the columns of the MLP training data after one-hot encoding.
train_data_mlp.columns

Index(['age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week', 'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi', 'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate', 'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides', 'family_history_diabetes', 'hypertension_history', 'cardiovascular_history', 'diagnosed_diabetes', 'education_level_encoded', 'income_level_encoded', 'smoking_status_encoded', 'log_triglycerides', 'log_ldl_cholesterol', 'log_cholesterol_total', 'cholesterol_ratio', 'ldl_hdl_ratio', 'pulse_pressure', 'mean_arterial_pressure', 'age_x_bmi', 'waist_x_bmi', 'family_history_diabetes_x_log_triglycerides', 'hypertension_history_x_systolic_bp', 'activity_x_diet', 'age_sq', 'bmi_sq', 'waist_to_hip_ratio_sq', 'systolic_bp_sq', 'comorbidity_count', 'bmi_cat', 'bmi_cat_encoded', 'gender_Male', 'gender_Other', 'ethnicity_Black', 'ethnicity_Hispanic', 'ethnicity_Other', 'ethnicity_White',
       'employmen

# 5. Stacking Initial Setup

We'll use stacking, an [ensemble learning](https://en.wikipedia.org/wiki/Ensemble_learning) strategy, to generate the predictions. Stacking requires gathering predictions from various base models (a.k.a. level-0 models) and feeding them as input features to a meta model (a.k.a. level-1 model). To streamline the process of experimenting with different combinations of base models, some helper classes are defined in this section. These classes can also be found [here](https://github.com/chuo-v/machine-learning-utils/blob/master/ensemble-learning/stacking/stacking_predictions_retriever.py) in one of my GitHub repositories, which I use to organize utilities I implemented for machine learning.

In [23]:
class StackingEstimator:
    """
    A class representing an estimator that will be used for stacking, an ensemble learning strategy.

    Intended to be used in conjunction with the `StackingPredictionsRetriever` class, which helps
    retrieve predictions for multiple instances of `StackingEstimator`; as the predictions are saved
    in files, on subsequent requests to retrieve predictions, even as the set of estimators has been
    modified, the `StackingPredictionsRetriever` class can determine the predictions of estimators
    that are non-stale and available (if any) by using the `get_hash_value` method of the
    `StackingEstimator` class to determine the relevance and staleness of any saved predictions.

    Proper usage of this class requires one important condition to be satisfied: the predictions made
    using the estimator are deterministic, i.e. they are exactly the same every time the estimator is
    run with the same inputs (`name`, `params_dict`, `feature_names`, `get_preds`).
    """
    # Class-level placeholders; `__init__` assigns `name`, `params_dict` and `feature_names`
    # on every instance.
    name = ""
    params_dict = {}
    feature_names = []
    # Not referenced by this class; the prediction function of an instance is stored in
    # `get_preds`. Kept so that the attributes of the class stay unchanged.
    get_predictions = lambda: None

    def __init__(
        self,
        name: str,
        feature_names: "list[str]",
        params_dict: dict,
        get_preds: FunctionType
    ):
        """
        Initializes a new instance of `StackingEstimator`.

        :param name:
            A string representing a name for the estimator. It is used for the column names of
            the training and test predictions for each estimator, and is also used as an input
            to calculate a hash value for the estimator. It is recommended to use a different
            name from the names used for other estimators passed to `StackingPredictionsRetriever`.
        :param feature_names:
            A list of strings representing the names of the features that will be used for the
            estimator. It will be passed as an argument to `get_preds`. Internally, it is only
            used as an input to calculate a hash value for the estimator.
        :param params_dict:
            A dictionary of parameters that will be specified for the estimator. It will be
            passed as an argument to `get_preds`. Internally, it is only used as an input
            to calculate a hash value for the estimator.
        :param get_preds:
            A function for getting the predictions for the estimator. It should only take two
            arguments: 'params_dict' and 'feature_names', and should return predictions for
            the training and test data (in that order) as a tuple of two `pandas.Series`.
        :raises ValueError:
            If any argument has an unexpected type, or if `get_preds` is not callable or
            does not take exactly the arguments 'params_dict' and 'feature_names'.
        """
        # parameter check
        if not isinstance(name, str):
            raise ValueError("`name` argument should be of type `str`")
        if not isinstance(feature_names, list):
            raise ValueError(f"`feature_names` argument for estimator \"{name}\" should be of type `list`")
        elif not all(isinstance(feature_name, str) for feature_name in feature_names):
            raise ValueError(f"`feature_names` argument for estimator \"{name}\" should only contain instances of `str`")
        if not isinstance(params_dict, dict):
            raise ValueError(f"`params_dict` argument for estimator \"{name}\" should be of type `dict`")
        # `inspect.signature` raises a `TypeError` for non-callables; report this the same
        # way as the other invalid arguments instead
        if not callable(get_preds):
            raise ValueError(f"`get_preds` argument for estimator \"{name}\" should be callable")
        get_preds_params = inspect.signature(get_preds).parameters.values()
        get_preds_param_names = [param.name for param in get_preds_params]
        if len(get_preds_param_names) != 2:
            raise ValueError(f"`get_preds` function for estimator \"{name}\" should take two arguments")
        elif "params_dict" not in get_preds_param_names:
            raise ValueError(f"`get_preds` function for estimator \"{name}\" should take a \"params_dict\" argument")
        elif "feature_names" not in get_preds_param_names:
            raise ValueError(f"`get_preds` function for estimator \"{name}\" should take a \"feature_names\" argument")

        self.name = name
        self.feature_names = feature_names
        self.params_dict = params_dict
        self.get_preds = get_preds

    def get_hash_value(self):
        """
        Calculates and returns a hash value (an MD5 hex digest) for the estimator using
        `name`, `feature_names` and `params_dict` as inputs.

        The feature names and the parameters are sorted first, so their order does not
        affect the hash value. The implementation of `get_preds` is not an input.
        """
        # NOTE(review): all parts are joined with "_", so inputs that themselves contain "_"
        # can produce the same string (e.g. features ["a_b"] vs. ["a", "b"]). The format is
        # left as is, since changing it would alter the hash values of saved predictions.
        feature_names_str = "_".join(sorted(self.feature_names))
        params_dict_str = "_".join(f"{key}-{value}" for (key, value) in sorted(self.params_dict.items()))
        hash_input_str = "_".join([self.name, feature_names_str, params_dict_str])
        md5_hash = hl.md5(hash_input_str.encode('utf-8')).hexdigest()
        return md5_hash

class StackingPredictionsRetriever:
    """
    A class for streamlining stacking (an ensemble learning strategy) that saves predictions
    from estimators to file so that when trying out different combinations of (base) estimators,
    the predictions that are not stale can be reused, saving the time of having the estimators
    make predictions again.

    Intended to be used in conjunction with the `StackingEstimator` class. The value returned by
    `get_hash_value` of `StackingEstimator` is used to determine the staleness and relevance of
    the predictions for an estimator. The implementation for making predictions using an estimator
    needs to be provided as a function to `get_preds` for `StackingEstimator`; when predictions
    need to be made using an estimator, this class will call `get_preds` for the
    `StackingEstimator` instance.

    Proper usage of this class requires one important condition to be satisfied: the predictions made
    using the estimators are deterministic, i.e. they are exactly the same every time a
    `StackingEstimator` instance is run with the same inputs.
    """
    # Class-level placeholders; `__init__` assigns all of these on every instance.
    estimators = []
    working_dir_path = ""
    train_preds_filename = ""
    test_preds_filename = ""
    preds_save_interval = 0

    def __init__(
        self,
        estimators: [StackingEstimator],
        working_dir_path: str,
        train_preds_filename: str = "train_preds",
        test_preds_filename: str = "test_preds",
        preds_save_interval: int = 5,
    ):
        """
        Initializes a new instance of `StackingPredictionsRetriever`.

        :param estimators:
            A list of `StackingEstimator` instances for which the class will retrieve predictions.
        :param working_dir_path:
            The path for the working directory where the files with predictions will be saved.
        :param train_preds_filename:
            The name of the file in which predictions for the training set will be stored.
        :param test_preds_filename:
            The name of the file in which predictions for the test set will be stored.
        :param preds_save_interval:
            An integer which specifies the interval at which predictions will be saved when
            `get_preds` is called, corresponding to the number of estimators whose predictions
            have been retrieved since the predictions were previously saved. Any estimators
            whose predictions are not stale and therefore were not required to make predictions
            again are not included in this number.
        """
        # parameter check
        if not isinstance(estimators, list):
            raise ValueError("`estimators` must be passed as a list")
        if not all(isinstance(e, StackingEstimator) for e in estimators):
            raise ValueError("`estimators` should only contain instances of `StackingEstimator`")
        if not isinstance(working_dir_path, str):
            raise ValueError("`working_dir_path` argument should be of type `str`")
        if not isinstance(preds_save_interval, int):
            raise ValueError("`preds_save_interval` argument should be of type `int`")

        self.estimators = estimators
        self.working_dir_path = working_dir_path
        self.train_preds_filename = train_preds_filename
        self.test_preds_filename = test_preds_filename
        self.preds_save_interval = preds_save_interval

    def get_train_preds_file_path(self):
        """
        Returns the file path for storing predictions for training data.
        """
        return Path(f"{self.working_dir_path}/{self.train_preds_filename}.csv")

    def get_test_preds_file_path(self):
        """
        Returns the file path for storing predictions for test data.
        """
        return Path(f"{self.working_dir_path}/{self.test_preds_filename}.csv")

    def get_current_train_and_test_preds(self):
        """
        Returns the current predictions for training and test data (in that order)
        as a tuple of two `pandas.DataFrame`.

        The predictions are attempted to be retrieved from the file paths returned
        by `get_train_preds_file_path` and `get_test_preds_file_path`; if there are
        any issues with doing so (e.g. file does not exist, dataframe is empty),
        empty dataframes will be returned instead.
        In the case an `pandas.errors.EmptyDataError` exception is raised when
        reading from a file, the corresponding file will be removed.
        """
        curr_train_preds = pd.DataFrame()
        curr_test_preds = pd.DataFrame()
        train_preds_file_path = self.get_train_preds_file_path()
        test_preds_file_path = self.get_test_preds_file_path()

        if train_preds_file_path.is_file():
            try:
                curr_train_preds = pd.read_csv(train_preds_file_path)
            except pd.errors.EmptyDataError:
                train_preds_file_path.unlink()
        if test_preds_file_path.is_file():
            try:
                curr_test_preds = pd.read_csv(test_preds_file_path)
            except pd.errors.EmptyDataError:
                test_preds_file_path.unlink()

        return curr_train_preds, curr_test_preds

    def get_preds(self):
        """
        Retrieves predictions from all estimators in `estimators`, storing them in
        two files at the file paths specified by `working_dir_path`,
        `train_preds_filename` and `test_preds_filename`.

        If non-stale (relevant) predictions are found for an estimator, retrieval
        of predictions by calling `get_preds` on the estimator will be skipped,
        and the existing predictions for the estimator will be kept.
        """
        print("[INFO] Getting predictions..")
        curr_train_preds, curr_test_preds = self.get_current_train_and_test_preds()

        preds_retrieved_count = 0
        num_preds_retrieved_but_not_yet_saved = 0
        estimators_skipped = []

        for estimator in self.estimators:
            estimator_hash_value = estimator.get_hash_value()
            estimator_name = f"{estimator.name} ({estimator_hash_value})"

            # skip retrieving predictions for estimator if non-stale predictions are already available
            train_preds_available = any(estimator_hash_value in col_name for col_name in curr_train_preds.columns)
            test_preds_available = any(estimator_hash_value in col_name for col_name in curr_test_preds.columns)
            if train_preds_available and test_preds_available:
                estimators_skipped += [estimator_name]
                continue

            print(f"[INFO] Getting predictions for estimator {estimator_name}")
            train_preds, test_preds = estimator.get_preds(estimator.params_dict, estimator.feature_names)
            if not isinstance(train_preds, pd.core.series.Series):
                raise ValueError("`train_preds` should be of type `pandas.Series`")
            if not isinstance(test_preds, pd.core.series.Series):
                raise ValueError("`test_preds` should be of type `pandas.Series`")
            curr_train_preds[estimator_name] = train_preds
            curr_test_preds[estimator_name] = test_preds
            preds_retrieved_count += 1

            # save predictions at an interval of `preds_save_interval`
            if preds_retrieved_count % self.preds_save_interval == 0:
                curr_train_preds.sort_index(axis=1).to_csv(self.get_train_preds_file_path(), index=False)
                curr_test_preds.sort_index(axis=1).to_csv(self.get_test_preds_file_path(), index=False)
                num_preds_retrieved_but_not_yet_saved = 0
                print("[INFO] Saved predictions")
            else:
                num_preds_retrieved_but_not_yet_saved += 1

        if estimators_skipped:
            estimators_skipped.sort()
            formatted_estimators = ", ".join(estimators_skipped)
            print(f"[INFO] Skipped retrieving predictions for following estimators as their current ones are not stale:\n{formatted_estimators}")

        if num_preds_retrieved_but_not_yet_saved != 0:
            curr_train_preds.sort_index(axis=1).to_csv(self.get_train_preds_file_path(), index=False)
            curr_test_preds.sort_index(axis=1).to_csv(self.get_test_preds_file_path(), index=False)
            print("[INFO] Saved predictions")

        print("[INFO] Finished getting all predictions")

    def sync_preds(self):
        """
        Prunes the stored predictions so that they only hold columns belonging to
        estimators that are currently in `estimators`.

        A column is kept when its name contains the hash value of at least one
        current estimator. Whenever columns are dropped, the affected file (located
        via `working_dir_path`, `train_preds_filename` and `test_preds_filename`)
        is rewritten without them.

        Predictions for estimators that are missing from the files are never added
        here; `get_preds` should be used for that purpose instead.
        """
        print("[INFO] Syncing predictions..")
        known_hash_values = [estimator.get_hash_value() for estimator in self.estimators]

        def is_orphaned(col_name):
            # orphaned = no current estimator's hash value appears in the column name
            for known_hash_value in known_hash_values:
                if known_hash_value in col_name:
                    return False
            return True

        stored_train_preds, stored_test_preds = self.get_current_train_and_test_preds()

        # (label used in the log messages, stored predictions, getter for the file to rewrite)
        sync_targets = (
            ("training", stored_train_preds, self.get_train_preds_file_path),
            ("test", stored_test_preds, self.get_test_preds_file_path),
        )
        for label, stored_preds, get_file_path in sync_targets:
            if stored_preds.empty:
                continue

            orphaned_col_names = [col_name for col_name in stored_preds.columns if is_orphaned(col_name)]
            if not orphaned_col_names:
                print(f"[INFO] No columns for {label} predictions were dropped")
                continue

            print(f"[INFO] Dropping columns for following estimators from {label} predictions:\n{orphaned_col_names}")
            stored_preds.drop(columns=orphaned_col_names, inplace=True)
            stored_preds.to_csv(get_file_path(), index=False)

        print("[INFO] Finished syncing predictions")

    def import_preds(self, input_dir_path):
        """
        Imports predictions stored at the two file paths at `input_dir_path` with
        `train_preds_filename` and `test_preds_filename` as their filenames. If no
        such files are found, or a file cannot be read, no predictions will be
        imported from it (a warning is printed for unreadable files).

        Only predictions for estimators specified in `estimators` will be imported.
        An input column belongs to an estimator when its name contains the estimator's
        hash value; the imported predictions are always stored under the estimator's
        current name, i.e. `"<name> (<hash value>)"`, even if the column was saved
        under a different name.
        Any predictions for estimators that were already available will be overwritten
        with predictions for the same estimators found in the files at `input_dir_path`.

        :param input_dir_path:
            The path to the directory for the training and test predictions files.
            The file names are expected to be the same as `train_preds_filename`
            and `test_preds_filename`
        """
        def read_preds_or_empty(file_path):
            # importing is best-effort: a missing or unreadable file means "nothing to import"
            if not file_path.is_file():
                return pd.DataFrame()
            try:
                return pd.read_csv(file_path)
            except Exception as error:
                print(f"[WARN] Could not read predictions from {file_path}: {error}")
                return pd.DataFrame()

        def find_col_name(preds, preferred_col_name, hash_value):
            # prefer the exact current name; otherwise fall back to the first column carrying the hash value
            if preferred_col_name in preds.columns:
                return preferred_col_name
            return next((col_name for col_name in preds.columns if hash_value in col_name), None)

        print("[INFO] Importing predictions..")
        curr_train_preds, curr_test_preds = self.get_current_train_and_test_preds()

        input_dir = Path(input_dir_path)
        input_train_preds = read_preds_or_empty(input_dir / f"{self.train_preds_filename}.csv")
        input_test_preds = read_preds_or_empty(input_dir / f"{self.test_preds_filename}.csv")

        estimators_with_imported_train_preds = []
        estimators_with_imported_test_preds = []
        for estimator in self.estimators:
            estimator_hash_value = estimator.get_hash_value()
            estimator_name = f"{estimator.name} ({estimator_hash_value})"
            train_col_name = find_col_name(input_train_preds, estimator_name, estimator_hash_value)
            test_col_name = find_col_name(input_test_preds, estimator_name, estimator_hash_value)

            if train_col_name is not None:
                curr_train_preds[estimator_name] = input_train_preds[train_col_name]
                estimators_with_imported_train_preds.append(estimator_name)
            if test_col_name is not None:
                curr_test_preds[estimator_name] = input_test_preds[test_col_name]
                estimators_with_imported_test_preds.append(estimator_name)

        if not estimators_with_imported_train_preds:
            print("[INFO] No train predictions were imported")
        else:
            curr_train_preds.sort_index(axis=1).to_csv(self.get_train_preds_file_path(), index=False)
            formatted_estimators = ", ".join(estimators_with_imported_train_preds)
            print(f"[INFO] {len(estimators_with_imported_train_preds)} train predictions were imported:\n{formatted_estimators}")
        if not estimators_with_imported_test_preds:
            print("[INFO] No test predictions were imported")
        else:
            curr_test_preds.sort_index(axis=1).to_csv(self.get_test_preds_file_path(), index=False)
            formatted_estimators = ", ".join(estimators_with_imported_test_preds)
            print(f"[INFO] {len(estimators_with_imported_test_preds)} test predictions were imported:\n{formatted_estimators}")

        print("[INFO] Finished importing predictions")

    def clear_preds(self):
        """
        Deletes the stored training and test predictions files, i.e. the two files
        whose paths are derived from `working_dir_path`, `train_preds_filename` and
        `test_preds_filename`. Files that do not exist are ignored.
        """
        stored_preds_file_paths = (
            self.get_train_preds_file_path(),
            self.get_test_preds_file_path(),
        )
        for stored_preds_file_path in stored_preds_file_paths:
            if stored_preds_file_path.is_file():
                stored_preds_file_path.unlink()

        print("[INFO] Finished clearing predictions")

Next, we'll simply create a variable for storing the estimators (`StackingEstimator` instances) that we'll pass to the `StackingPredictionsRetriever` class for getting all the predictions from our base models.

In [24]:
# `StackingEstimator` instances to pass to `StackingPredictionsRetriever`; the base model
# sections below append their estimators to this list
estimators = []

# 6. Base Model Definitions

Custom implementations of some of the base models that require them can be found in this section.

## 6.1 MLPClassifier

In [25]:
class MLPClassifier(BaseEstimator, ClassifierMixin):
    """
    Feed-forward binary classifier implemented with PyTorch.

    Every hidden layer is `Linear -> BatchNorm1d -> ReLU -> Dropout`; the network ends
    with a single sigmoid output unit. Training uses Adam with binary cross-entropy
    loss and, when validation data is passed to `fit`, early stopping on the
    validation loss.

    :param input_dim: Number of input features.
    :param hidden_layers: Sequence with the width of each hidden layer.
    :param dropout: Dropout probability applied after every hidden layer.
    :param learning_rate: Learning rate for Adam.
    :param batch_size: Mini-batch size for training (validation batches are twice as large).
    :param weight_decay: Weight decay for Adam.
    :param epochs: Maximum number of training epochs.
    :param device: Torch device on which the model and the tensors are placed.
    :param patience:
        Number of consecutive epochs without an improvement of the validation loss
        after which training stops. Only has an effect when validation data is given.
    """

    def __init__(self, input_dim, hidden_layers, dropout, learning_rate, batch_size, weight_decay, epochs, device, patience=10):
        self.input_dim = input_dim
        self.hidden_layers = hidden_layers
        self.dropout = dropout
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.weight_decay = weight_decay
        self.epochs = epochs
        self.device = device
        self.patience = patience
        self.model = None  # created by `fit`

    def build_model(self):
        """Builds the network described by the hyperparameters and moves it to `device`."""
        layers = []
        in_dim = self.input_dim
        for h_dim in self.hidden_layers:
            layers.append(nn.Linear(in_dim, h_dim))
            layers.append(nn.BatchNorm1d(h_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(self.dropout))
            in_dim = h_dim
        layers.append(nn.Linear(in_dim, 1))
        layers.append(nn.Sigmoid())
        return nn.Sequential(*layers).to(self.device)

    def fit(self, X_train, y_train, X_val=None, y_val=None):
        """
        Trains a freshly built model.

        :param X_train: Training features; an object with a `.values` array (e.g. a DataFrame).
        :param y_train: Training labels; an object with a `.values` array (e.g. a Series).
        :param X_val: Optional validation features, same kind of object as `X_train`.
        :param y_val: Optional validation labels, same kind of object as `y_train`.
        :return:
            `self`. When validation data is given, the weights of the epoch with the
            lowest validation loss are restored before returning.
        """
        X_train_t = torch.FloatTensor(X_train.values).to(self.device)
        y_train_t = torch.FloatTensor(y_train.values).to(self.device).unsqueeze(1)
        train_dataset = TensorDataset(X_train_t, y_train_t)

        # BatchNorm1d raises in training mode when a batch holds a single sample, so a
        # trailing batch of size one is dropped (only relevant when hidden layers exist)
        num_train_samples = len(train_dataset)
        drop_trailing_single_sample = (
            len(self.hidden_layers) > 0
            and num_train_samples > self.batch_size
            and num_train_samples % self.batch_size == 1
        )
        train_loader = DataLoader(
            train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            drop_last=drop_trailing_single_sample
        )

        val_loader = None
        if X_val is not None and y_val is not None:
            X_val_t = torch.FloatTensor(X_val.values).to(self.device)
            y_val_t = torch.FloatTensor(y_val.values).to(self.device).unsqueeze(1)
            val_dataset = TensorDataset(X_val_t, y_val_t)
            val_loader = DataLoader(val_dataset, batch_size=self.batch_size*2, shuffle=False)
        # an empty validation set yields no batches and therefore cannot drive early stopping
        use_early_stopping = val_loader is not None and len(val_loader) > 0

        self.model = self.build_model()
        criterion = nn.BCELoss()
        optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)

        best_val_loss = float('inf')
        patience_counter = 0
        best_model_state = None

        self.model.train()

        for epoch in range(self.epochs):
            for X_batch, y_batch in train_loader:
                optimizer.zero_grad()
                y_pred = self.model(X_batch)
                loss = criterion(y_pred, y_batch)
                loss.backward()
                optimizer.step()

            if use_early_stopping:
                self.model.eval() # switch to eval mode (disable dropout)
                val_loss = 0.0
                with torch.no_grad():
                    for X_v, y_v in val_loader:
                        val_pred = self.model(X_v)
                        val_loss += criterion(val_pred, y_v).item()

                # check for improvement (mean of the per-batch validation losses)
                avg_val_loss = val_loss / len(val_loader)

                if avg_val_loss < best_val_loss:
                    best_val_loss = avg_val_loss
                    patience_counter = 0
                    # save the best model weights
                    best_model_state = copy.deepcopy(self.model.state_dict())
                else:
                    patience_counter += 1

                self.model.train() # switch back to train mode

                if patience_counter >= self.patience:
                    break

        if best_model_state is not None:
            self.model.load_state_dict(best_model_state)

        return self

    def predict_proba(self, X):
        """
        Predicts class probabilities with the fitted model.

        :param X: Features; an object with a `.values` array (e.g. a DataFrame).
        :return:
            Array with two columns: the probability of class 0 (one minus the model
            output) and the probability of class 1 (the model output).
        """
        self.model.eval()
        X_t = torch.FloatTensor(X.values).to(self.device)
        with torch.no_grad():
            preds = self.model(X_t).cpu().numpy()
        return np.column_stack((1 - preds, preds))

# 7. Base Model Hyperparameter Tuning

In [26]:
# to skip hyperparameter tuning when it's not needed; set to `False` to do the tuning
# (checked by the cell below that runs the Optuna study)
SKIP_BASE_MODEL_HYPERPARAMETER_TUNING = True

# value set for early stopping for base models that support it; this value will be used for actual model training as well
# (passed as `early_stopping_rounds` to the CatBoostClassifier and XGBClassifier models)
BASE_MODEL_EARLY_STOPPING_ROUNDS = 100

In [27]:
class BaseModelOptunaStudyEstimator(Enum):
    """
    Base model types that can be selected for an Optuna hyperparameter study.

    The value of each member is the name of the corresponding estimator class.
    """
    CATBOOSTCLASSIFIER = "CatBoostClassifier"
    XGBCLASSIFIER = "XGBClassifier"
    XGBRFCLASSIFIER = "XGBRFClassifier"
    MLPCLASSIFIER = "MLPClassifier"

Manually configure the values for the following variables for different studies.

In [28]:
# estimator to use for Optuna study
# (selects the hyperparameter search space, the model that is trained and the training data used by the objective)
BASE_MODEL_OPTUNA_STUDY_ESTIMATOR = BaseModelOptunaStudyEstimator.XGBCLASSIFIER

# maximum number of trials Optuna will conduct for the optimization
BASE_MODEL_OPTUNA_STUDY_NUM_TRIALS = 150

# number of splits to use for Stratified K-Fold Cross-Validation for Optuna study
# (the objective returns the ROC AUC averaged over this many folds)
BASE_MODEL_OPTUNA_STUDY_KFOLD_NUM_SPLITS = 3

In [29]:
def _suggest_catboostclassifier_params(trial):
    """Samples one set of CatBoostClassifier hyperparameters from `trial`."""
    params = {}
    params["learning_rate"] = trial.suggest_float("learning_rate", 0.005, 0.1, log=True)
    params["depth"] = trial.suggest_int("depth", 3, 10)
    params["l2_leaf_reg"] = trial.suggest_float("l2_leaf_reg", 1, 30)
    params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 30)
    params["random_strength"] = trial.suggest_float("random_strength", 0, 20)
    params["min_data_in_leaf"] = trial.suggest_int("min_data_in_leaf", 1, 20)
    return params

def _suggest_xgbclassifier_params(trial):
    """Samples one set of XGBClassifier hyperparameters from `trial`."""
    params = {}
    params["learning_rate"] = trial.suggest_float("learning_rate", 0.01, 0.05, log=True)
    params["max_depth"] = trial.suggest_int("max_depth", 4, 5)
    params["subsample"] = trial.suggest_float("subsample", 0.5, 0.7)
    params["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.8, 1.0)
    params["alpha"] = trial.suggest_float("alpha", 1e-3, 1.0, log=True)
    params["gamma"] = trial.suggest_float("gamma", 0.0, 0.5)
    params["lambda"] = trial.suggest_float("lambda", 1e-3, 10.0, log=True)
    params["min_child_weight"] = trial.suggest_int("min_child_weight", 10, 50)
    return params

def _suggest_xgbrfclassifier_params(trial):
    """Samples one set of XGBRFClassifier hyperparameters from `trial`."""
    params = {}
    params["n_estimators"] = trial.suggest_int("n_estimators", 100, 500)
    params["max_depth"] = trial.suggest_int("max_depth", 10, 25)
    params["subsample"] = trial.suggest_float("subsample", 0.4, 0.9)
    params["colsample_bynode"] = trial.suggest_float("colsample_bynode", 0.4, 0.9)
    params["reg_alpha"] = trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True)
    params["reg_lambda"] = trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True)
    return params

def _suggest_mlpclassifier_params(trial):
    """Samples one set of MLPClassifier hyperparameters from `trial`."""
    hidden_layer_choices = [
        (128, 64),
        (256, 128),
        (512, 256, 128),
        (128, 64, 32)
    ]
    params = {}
    params["hidden_layers"] = trial.suggest_categorical("hidden_layers", hidden_layer_choices)
    params["dropout"] = trial.suggest_float("dropout", 0.1, 0.4)
    params["learning_rate"] = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
    params["batch_size"] = trial.suggest_categorical("batch_size", [512, 1024, 2048])
    params["weight_decay"] = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True)
    return params

def get_base_model_optuna_params(trial, study_estimator):
    """
    Samples the hyperparameters of one trial for the given base model type.

    :param trial: Object providing `suggest_float`, `suggest_int` and `suggest_categorical`.
    :param study_estimator: `BaseModelOptunaStudyEstimator` member selecting the search space.
    :return: Dictionary mapping hyperparameter names to the values suggested by `trial`.
    :raises ValueError: If `study_estimator` is not a supported estimator.
    """
    if study_estimator == BaseModelOptunaStudyEstimator.CATBOOSTCLASSIFIER:
        return _suggest_catboostclassifier_params(trial)
    if study_estimator == BaseModelOptunaStudyEstimator.XGBCLASSIFIER:
        return _suggest_xgbclassifier_params(trial)
    if study_estimator == BaseModelOptunaStudyEstimator.XGBRFCLASSIFIER:
        return _suggest_xgbrfclassifier_params(trial)
    if study_estimator == BaseModelOptunaStudyEstimator.MLPCLASSIFIER:
        return _suggest_mlpclassifier_params(trial)
    raise ValueError("Unsupported optuna study estimator")

def get_base_model_predictions(study_estimator, trial_params, X_train_fold, y_train_fold, X_validation_fold, y_validation_fold):
    """
    Trains one base model on a training fold and predicts the validation fold.

    Besides its arguments, this function reads the module-level `RANDOM_SEEDS`,
    `BASE_MODEL_EARLY_STOPPING_ROUNDS`, `cat_features` (CatBoostClassifier only) and
    `DEVICE` (MLPClassifier only).

    :param study_estimator: `BaseModelOptunaStudyEstimator` member selecting the model type.
    :param trial_params: Hyperparameters for the model, passed to its constructor as keyword arguments.
    :param X_train_fold: Features of the training fold.
    :param y_train_fold: Labels of the training fold.
    :param X_validation_fold: Features of the validation fold.
    :param y_validation_fold:
        Labels of the validation fold; used for early stopping by every model except
        XGBRFClassifier, which is fitted on the training fold only.
    :return: Predicted probabilities of the positive class for the validation fold.
    :raises ValueError: If `study_estimator` is not a supported estimator.
    """
    if study_estimator == BaseModelOptunaStudyEstimator.CATBOOSTCLASSIFIER:
        model = CatBoostClassifier(
            **trial_params,
            iterations=30000,
            use_best_model=True,
            cat_features=cat_features,
            loss_function='Logloss',
            eval_metric='AUC',
            task_type='GPU' if torch.cuda.is_available() else 'CPU',
            devices='0',
            metric_period=1000,
            random_seed=RANDOM_SEEDS[0],
            verbose=False,
            allow_writing_files=False
        )
        model.fit(
            X_train_fold, y_train_fold,
            eval_set=(X_validation_fold, y_validation_fold),
            early_stopping_rounds=BASE_MODEL_EARLY_STOPPING_ROUNDS
        )
        return model.predict_proba(X_validation_fold)[:, 1]
    elif study_estimator == BaseModelOptunaStudyEstimator.XGBCLASSIFIER:
        model = XGBClassifier(
            **trial_params,
            n_estimators=30000,
            tree_method='hist' if torch.cuda.is_available() else 'auto',
            device='cuda' if torch.cuda.is_available() else 'cpu',
            enable_categorical=True,
            objective='binary:logistic',
            eval_metric='auc',
            # early stopping is configured on the estimator (as done for the actual base
            # model training); recent XGBoost releases no longer accept it in `fit`
            early_stopping_rounds=BASE_MODEL_EARLY_STOPPING_ROUNDS,
            n_jobs=-1,
            random_state=RANDOM_SEEDS[0],
            verbosity=0
        )
        model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_validation_fold, y_validation_fold)],
            verbose=False
        )
        return model.predict_proba(X_validation_fold)[:, 1]
    elif study_estimator == BaseModelOptunaStudyEstimator.XGBRFCLASSIFIER:
        model = XGBRFClassifier(
            **trial_params,
            tree_method='hist' if torch.cuda.is_available() else 'auto',
            device='cuda' if torch.cuda.is_available() else 'cpu',
            enable_categorical=True,
            learning_rate=1.0,
            n_jobs=-1,
            random_state=RANDOM_SEEDS[0],
            verbosity=0  # XGBoost's parameter is `verbosity`; `verbose` is not a model parameter
        )
        model.fit(X_train_fold, y_train_fold)
        return model.predict_proba(X_validation_fold)[:, 1]
    elif study_estimator == BaseModelOptunaStudyEstimator.MLPCLASSIFIER:
        model = MLPClassifier(
            **trial_params,
            input_dim=X_train_fold.shape[1],
            epochs=100,
            patience=10,
            device=DEVICE
        )
        model.fit(
            X_train_fold, y_train_fold,
            X_val=X_validation_fold, y_val=y_validation_fold
        )
        return model.predict_proba(X_validation_fold)[:, 1]
    else:
        raise ValueError("Unsupported optuna study estimator")

def base_model_optuna_study_objective(trial):
    """
    Objective for the base model Optuna study: the ROC AUC of the estimator selected by
    `BASE_MODEL_OPTUNA_STUDY_ESTIMATOR`, averaged over the folds of a stratified K-fold
    split of the training data.

    The MLPClassifier is evaluated on `train_data_mlp`, every other estimator on
    `train_data`. The ROC AUC of each fold is reported to `trial`, so the trial can be
    pruned between folds.

    :param trial: Optuna trial used for sampling hyperparameters and for pruning.
    :return: Sum of the fold ROC AUC values divided by `BASE_MODEL_OPTUNA_STUDY_KFOLD_NUM_SPLITS`.
    :raises optuna.TrialPruned: If `trial.should_prune()` is true after a fold.
    """
    sampled_params = get_base_model_optuna_params(trial, BASE_MODEL_OPTUNA_STUDY_ESTIMATOR)

    tunes_mlp = BASE_MODEL_OPTUNA_STUDY_ESTIMATOR == BaseModelOptunaStudyEstimator.MLPCLASSIFIER
    study_data = train_data_mlp if tunes_mlp else train_data
    study_features = study_data.drop(target_col, axis=1)
    study_labels = study_data[target_col]

    splitter = StratifiedKFold(
        n_splits=BASE_MODEL_OPTUNA_STUDY_KFOLD_NUM_SPLITS,
        shuffle=True,
        random_state=RANDOM_SEEDS[0]
    )

    fold_scores = []
    for fold_number, (fit_rows, eval_rows) in enumerate(splitter.split(study_features, study_labels)):
        eval_labels = study_labels.iloc[eval_rows]
        eval_pred_proba = get_base_model_predictions(
            BASE_MODEL_OPTUNA_STUDY_ESTIMATOR,
            sampled_params,
            study_features.iloc[fit_rows], study_labels.iloc[fit_rows],
            study_features.iloc[eval_rows], eval_labels
        )
        fold_score = roc_auc_score(eval_labels, eval_pred_proba)
        fold_scores.append(fold_score)

        # let the pruner decide after every fold whether the trial is worth finishing
        trial.report(fold_score, step=fold_number)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return sum(fold_scores) / BASE_MODEL_OPTUNA_STUDY_KFOLD_NUM_SPLITS

In [30]:
# Runs the Optuna study for the estimator selected by `BASE_MODEL_OPTUNA_STUDY_ESTIMATOR`
# and prints the best trial, unless tuning is skipped.
if SKIP_BASE_MODEL_HYPERPARAMETER_TUNING:
    print("Skipped base model hyperparameter tuning")
else:
    print(f"Started base model hyperparameter tuning for {BASE_MODEL_OPTUNA_STUDY_ESTIMATOR.value}")
    # the sampler is seeded so that a study can be reproduced after a kernel restart
    sampler = optuna.samplers.TPESampler(n_ei_candidates=48, multivariate=True, seed=RANDOM_SEEDS[0])
    study = optuna.create_study(sampler=sampler, direction="maximize")
    study.optimize(base_model_optuna_study_objective, n_trials=BASE_MODEL_OPTUNA_STUDY_NUM_TRIALS)

    print(f"# trials finished: {len(study.trials)}")
    trial = study.best_trial
    print(f"Best trial AUC: {trial.value}")
    print("Best trial params:")
    for param_key, param_value in trial.params.items():
        print(f"- {param_key}: {param_value}")

Skipped base model hyperparameter tuning


# 8. Base Models

In [31]:
# number of splits to use for Stratified K-Fold Cross-Validation for base models
# (for every seed in `RANDOM_SEEDS`, each base model is trained once per split)
BASE_MODEL_KFOLD_NUM_SPLITS = 5

## 8.1 CatBoostClassifier

### 8.1.1 Helper Methods (CatBoostClassifier)

In [32]:
def get_catboostclassifier_preds(params_dict, feature_names):
    """
    Trains CatBoostClassifier models with stratified K-fold cross-validation, repeated
    for every seed in `RANDOM_SEEDS`, and collects their predictions.

    :param params_dict: Hyperparameters passed to `CatBoostClassifier` as keyword arguments.
    :param feature_names: Not used inside this function.
    :return:
        Tuple of two Series: the out-of-fold predictions for `train_data` averaged over
        the seeds, and the predictions for `test_data` averaged over all folds and seeds.
    """
    train_features = train_data.drop(target_col, axis=1)
    train_labels = train_data[target_col]
    num_seeds = len(RANDOM_SEEDS)

    oof_preds_sum = np.zeros(len(train_data))
    test_preds_sum = np.zeros(len(test_data))

    for seed in RANDOM_SEEDS:
        splitter = StratifiedKFold(n_splits=BASE_MODEL_KFOLD_NUM_SPLITS, shuffle=True, random_state=seed)
        oof_preds_for_seed = np.zeros(len(train_data))

        for fit_rows, eval_rows in splitter.split(train_features, train_labels):
            fit_features, fit_labels = train_features.iloc[fit_rows], train_labels.iloc[fit_rows]
            eval_features, eval_labels = train_features.iloc[eval_rows], train_labels.iloc[eval_rows]

            fold_model = CatBoostClassifier(
                **params_dict,
                use_best_model=True,
                cat_features=cat_features,
                loss_function='Logloss',
                eval_metric='AUC',
                task_type='GPU' if torch.cuda.is_available() else 'CPU',
                devices='0',
                metric_period=1000,
                random_seed=seed,
                verbose=False,
                allow_writing_files=False
            )
            # the validation fold doubles as the evaluation set for early stopping
            fold_model.fit(
                fit_features, fit_labels,
                eval_set=(eval_features, eval_labels),
                early_stopping_rounds=BASE_MODEL_EARLY_STOPPING_ROUNDS
            )

            oof_preds_for_seed[eval_rows] = np.array(fold_model.predict_proba(eval_features)[:, 1])
            test_preds_sum += np.array(fold_model.predict_proba(test_data)[:, 1])

        oof_preds_sum += oof_preds_for_seed

    averaged_oof_preds = oof_preds_sum / num_seeds
    averaged_test_preds = test_preds_sum / (BASE_MODEL_KFOLD_NUM_SPLITS * num_seeds)
    return pd.Series(averaged_oof_preds), pd.Series(averaged_test_preds)

def get_catboostclassifier_stacking_estimator(index, params_dict):
    """
    Creates the `StackingEstimator` for one CatBoostClassifier configuration.

    :param index: Number appended to the estimator name (`CatBoostClassifier_<index>`).
    :param params_dict: Hyperparameters of the configuration.
    :return:
        `StackingEstimator` named after `index`, carrying `params_dict`, the column names
        of `train_data` and `get_catboostclassifier_preds` as its prediction function.
    """
    estimator_name = f"CatBoostClassifier_{index}"
    column_names = train_data.columns.tolist()
    return StackingEstimator(
        name=estimator_name,
        params_dict=params_dict,
        feature_names=column_names,
        get_preds=get_catboostclassifier_preds,
    )

### 8.1.2 Add Estimators (CatBoostClassifier)

Add the CatBoostClassifier estimators to the list that `StackingPredictionsRetriever` will process. Their hyperparameters were found using Optuna.

In [33]:
# one StackingEstimator per tuned configuration, numbered from 1 in the order listed
estimators.extend([
    get_catboostclassifier_stacking_estimator(index=estimator_index, params_dict=tuned_params)
    for estimator_index, tuned_params in enumerate(
        [
            { # Optuna study AUC: 0.7261767336235222
                'iterations': 30000,
                'learning_rate': 0.03933473509871599,
                'depth': 3,
                'l2_leaf_reg': 14.932109771039046,
                'bagging_temperature': 0.13345806085697987,
                'random_strength': 7.486374538597635,
                'min_data_in_leaf': 2,
            },
            { # Optuna study AUC: 0.725842155230371
                'iterations': 30000,
                'learning_rate': 0.041779205681346576,
                'depth': 4,
                'l2_leaf_reg': 3.628892496718331,
                'bagging_temperature': 0.1922242909320177,
                'random_strength': 8.464699585881778,
                'min_data_in_leaf': 5,
            },
            { # Optuna study AUC: 0.7257614687804782
                'iterations': 30000,
                'learning_rate': 0.08955773312600926,
                'depth': 4,
                'l2_leaf_reg': 8.952470035979275,
                'bagging_temperature': 0.21150772067613666,
                'random_strength': 14.741499198080962,
                'min_data_in_leaf': 1,
            },
        ],
        start=1,
    )
])

## 8.2 XGBClassifier

### 8.2.1 Helper Methods (XGBClassifier)

In [34]:
def get_xgbclassifier_preds(params_dict, feature_names):
    """
    Trains XGBClassifier models with stratified K-fold cross-validation, repeated for
    every seed in `RANDOM_SEEDS`, and collects their predictions.

    :param params_dict: Hyperparameters passed to `XGBClassifier` as keyword arguments.
    :param feature_names: Not used inside this function.
    :return:
        Tuple of two Series: the out-of-fold predictions for `train_data` averaged over
        the seeds, and the predictions for `test_data` averaged over all folds and seeds.
    """
    train_features = train_data.drop(target_col, axis=1)
    train_labels = train_data[target_col]
    num_seeds = len(RANDOM_SEEDS)

    oof_preds_sum = np.zeros(len(train_data))
    test_preds_sum = np.zeros(len(test_data))

    for seed in RANDOM_SEEDS:
        splitter = StratifiedKFold(n_splits=BASE_MODEL_KFOLD_NUM_SPLITS, shuffle=True, random_state=seed)
        oof_preds_for_seed = np.zeros(len(train_data))

        for fit_rows, eval_rows in splitter.split(train_features, train_labels):
            fit_features, fit_labels = train_features.iloc[fit_rows], train_labels.iloc[fit_rows]
            eval_features, eval_labels = train_features.iloc[eval_rows], train_labels.iloc[eval_rows]

            fold_model = XGBClassifier(
                **params_dict,
                tree_method='hist' if torch.cuda.is_available() else 'auto',
                device='cuda' if torch.cuda.is_available() else 'cpu',
                enable_categorical=True,
                objective='binary:logistic',
                eval_metric='auc',
                early_stopping_rounds=BASE_MODEL_EARLY_STOPPING_ROUNDS,
                n_jobs=-1,
                random_state=seed,
                verbosity=0
            )
            # the validation fold doubles as the evaluation set for early stopping
            fold_model.fit(
                fit_features, fit_labels,
                eval_set=[(eval_features, eval_labels)],
                verbose=False
            )

            oof_preds_for_seed[eval_rows] = np.array(fold_model.predict_proba(eval_features)[:, 1])
            test_preds_sum += np.array(fold_model.predict_proba(test_data)[:, 1])

        oof_preds_sum += oof_preds_for_seed

    averaged_oof_preds = oof_preds_sum / num_seeds
    averaged_test_preds = test_preds_sum / (BASE_MODEL_KFOLD_NUM_SPLITS * num_seeds)
    return pd.Series(averaged_oof_preds), pd.Series(averaged_test_preds)

def get_xgbclassifier_stacking_estimator(index, params_dict):
    """
    Creates the `StackingEstimator` for one XGBClassifier configuration.

    :param index: Number appended to the estimator name (`XGBClassifier_<index>`).
    :param params_dict: Hyperparameters of the configuration.
    :return:
        `StackingEstimator` named after `index`, carrying `params_dict`, the column names
        of `train_data` and `get_xgbclassifier_preds` as its prediction function.
    """
    estimator_name = f"XGBClassifier_{index}"
    column_names = train_data.columns.tolist()
    return StackingEstimator(
        name=estimator_name,
        params_dict=params_dict,
        feature_names=column_names,
        get_preds=get_xgbclassifier_preds,
    )

### 8.2.2 Add Estimators (XGBClassifier)

Add the XGBClassifier estimators to the list that `StackingPredictionsRetriever` will process. Their hyperparameters were found using Optuna.

In [35]:
# one StackingEstimator per tuned configuration, numbered from 1 in the order listed
estimators.extend([
    get_xgbclassifier_stacking_estimator(index=estimator_index, params_dict=tuned_params)
    for estimator_index, tuned_params in enumerate(
        [
            { # Optuna study AUC: 0.7275219804910846
                'n_estimators': 30000,
                'learning_rate': 0.00985498815107458,
                'max_depth': 3,
                'subsample': 0.975836120137461,
                'colsample_bytree': 0.5411854284303592,
                'alpha': 9.940781978752474,
                'gamma': 0.008422323405815038,
                'lambda': 0.025214960531620187,
                'min_child_weight': 12,
            },
            { # Optuna study AUC: 0.7273817150393508
                'n_estimators': 30000,
                'learning_rate': 0.047179227853488916,
                'max_depth': 3,
                'subsample': 0.9561594029099818,
                'colsample_bytree': 0.5200809916944509,
                'alpha': 9.323686821094613,
                'gamma': 0.06513704074541844,
                'lambda': 0.07573405175712218,
                'min_child_weight': 14,
            },
            { # Optuna study AUC: 0.7274144144696422
                'n_estimators': 30000,
                'learning_rate': 0.06778303256075534,
                'max_depth': 3,
                'subsample': 0.9750702612583769,
                'colsample_bytree': 0.5164463777572837,
                'alpha': 6.677223824702266,
                'gamma': 0.06627215758548254,
                'lambda': 0.10239210156952944,
                'min_child_weight': 17,
            },
            { # Optuna study AUC: 0.7263868488191946
                'n_estimators': 30000,
                'learning_rate': 0.00992002978574334,
                'max_depth': 6,
                'subsample': 0.6885700003314461,
                'colsample_bytree': 0.5082842329050175,
                'alpha': 4.042835803115786,
                'gamma': 0.19033575052721494,
                'lambda': 1.4531584526994292,
                'min_child_weight': 79,
            },
            { # Optuna study AUC: 0.7261995858097773
                'n_estimators': 30000,
                'learning_rate': 0.005092159244819224,
                'max_depth': 8,
                'subsample': 0.6985482460232558,
                'colsample_bytree': 0.5002716122370332,
                'alpha': 0.5442317401534714,
                'gamma': 0.9101677712528158,
                'lambda': 1.4849248721792976,
                'min_child_weight': 86,
            },
        ],
        start=1,
    )
])

## 8.3 XGBRFClassifier

### 8.3.1 Helper Methods (XGBRFClassifier)

In [36]:
def get_xgbrfclassifier_preds(params_dict, feature_names):
    """
    Trains XGBRFClassifier models with stratified K-fold cross-validation, repeated for
    every seed in `RANDOM_SEEDS`, and collects their predictions. The models are trained
    on `train_data_mlp` and predict `test_data_mlp`.

    :param params_dict: Hyperparameters passed to `XGBRFClassifier` as keyword arguments.
    :param feature_names: Not used inside this function.
    :return:
        Tuple of two Series: the out-of-fold predictions for `train_data_mlp` averaged
        over the seeds, and the predictions for `test_data_mlp` averaged over all folds
        and seeds.
    """
    # the feature/label split does not depend on the seed or the fold, so it is done once
    train_features = train_data_mlp.drop(target_col, axis=1)
    train_labels = train_data_mlp[target_col]

    oof_preds_accumulator = np.zeros(len(train_data_mlp))
    test_preds_accumulator = np.zeros(len(test_data_mlp))

    for random_seed in RANDOM_SEEDS:
        skf = StratifiedKFold(n_splits=BASE_MODEL_KFOLD_NUM_SPLITS, shuffle=True, random_state=random_seed)

        seed_oof_preds = np.zeros(len(train_data_mlp))

        for train_indices, validation_indices in skf.split(train_features, train_labels):
            X_train_fold = train_features.iloc[train_indices]
            X_val_fold = train_features.iloc[validation_indices]
            y_train_fold = train_labels.iloc[train_indices]

            model = XGBRFClassifier(
                **params_dict,
                tree_method='hist' if torch.cuda.is_available() else 'auto',
                device='cuda' if torch.cuda.is_available() else 'cpu',
                learning_rate=1.0,
                n_jobs=-1,
                random_state=random_seed,
                verbosity=0  # XGBoost's parameter is `verbosity`; `verbose` is not a model parameter
            )

            model.fit(X_train_fold, y_train_fold)
            seed_oof_preds[validation_indices] = model.predict_proba(X_val_fold)[:, 1]
            test_preds_accumulator += model.predict_proba(test_data_mlp)[:, 1]

        oof_preds_accumulator += seed_oof_preds

    final_oof_preds = oof_preds_accumulator / len(RANDOM_SEEDS)
    final_test_preds = test_preds_accumulator / (BASE_MODEL_KFOLD_NUM_SPLITS * len(RANDOM_SEEDS))

    return pd.Series(final_oof_preds), pd.Series(final_test_preds)

def get_xgbrfclassifier_stacking_estimator(index, params_dict):
    """
    Creates the `StackingEstimator` for one XGBRFClassifier configuration.

    :param index: Number appended to the estimator name (`XGBRFClassifier_<index>`).
    :param params_dict: Hyperparameters of the configuration.
    :return:
        `StackingEstimator` named after `index`, carrying `params_dict`, the column names
        of `train_data_mlp` and `get_xgbrfclassifier_preds` as its prediction function.
    """
    estimator_name = f"XGBRFClassifier_{index}"
    column_names = train_data_mlp.columns.tolist()
    return StackingEstimator(
        name=estimator_name,
        params_dict=params_dict,
        feature_names=column_names,
        get_preds=get_xgbrfclassifier_preds,
    )

### 8.3.2 Add Estimators (XGBRFClassifier)

In [37]:
# one StackingEstimator per enabled configuration, numbered from 1 in the order listed
estimators.extend([
    get_xgbrfclassifier_stacking_estimator(index=estimator_index, params_dict=tuned_params)
    for estimator_index, tuned_params in enumerate(
        [
            { # Optuna study AUC: 0.7093869754986559
                'n_estimators': 385,
                'max_depth': 16,
                'subsample': 0.5315714592374865,
                'colsample_bynode': 0.8874353486231008,
                'reg_alpha': 0.0054867256964063835,
                'reg_lambda': 0.007252617838414462,
            },
            # disabled configuration (would become index 2)
            # { # Optuna study AUC: 0.7092307295106085
            #     'n_estimators': 415,
            #     'max_depth': 16,
            #     'subsample': 0.4857798345845685,
            #     'colsample_bynode': 0.8965128787444345,
            #     'reg_alpha': 0.18172738357674292,
            #     'reg_lambda': 0.004947196570367552,
            # },
        ],
        start=1,
    )
])

## 8.4 MLPClassifier

### 8.4.1 Helper Methods (MLPClassifier)

In [38]:
def get_mlpclassifier_preds(params_dict, feature_names):
    """
    Trains MLPClassifier models with stratified K-fold cross-validation, repeated for
    every seed in `RANDOM_SEEDS`, and collects their predictions. The models are trained
    on `train_data_mlp` and predict `test_data_mlp`.

    Each model is fitted with its validation fold as validation data, so that early
    stopping (`patience`) is active, as it is during hyperparameter tuning.

    :param params_dict: Hyperparameters passed to `MLPClassifier` as keyword arguments.
    :param feature_names: Not used inside this function.
    :return:
        Tuple of two Series: the out-of-fold predictions for `train_data_mlp` averaged
        over the seeds, and the predictions for `test_data_mlp` averaged over all folds
        and seeds.
    """
    # the feature/label split does not depend on the seed or the fold, so it is done once
    train_features = train_data_mlp.drop(target_col, axis=1)
    train_labels = train_data_mlp[target_col]

    oof_preds_accumulator = np.zeros(len(train_data_mlp))
    test_preds_accumulator = np.zeros(len(test_data_mlp))

    for random_seed in RANDOM_SEEDS:
        skf = StratifiedKFold(n_splits=BASE_MODEL_KFOLD_NUM_SPLITS, shuffle=True, random_state=random_seed)

        seed_oof_preds = np.zeros(len(train_data_mlp))

        for train_indices, validation_indices in skf.split(train_features, train_labels):
            X_train_fold = train_features.iloc[train_indices]
            X_validation_fold = train_features.iloc[validation_indices]
            y_train_fold = train_labels.iloc[train_indices]
            y_validation_fold = train_labels.iloc[validation_indices]

            model = MLPClassifier(
                **params_dict,
                input_dim=X_train_fold.shape[1],
                epochs=100,
                patience=10,
                device=DEVICE
            )
            # without validation data `patience` has no effect and all epochs are always run
            model.fit(
                X_train_fold, y_train_fold,
                X_val=X_validation_fold, y_val=y_validation_fold
            )

            y_validation_pred_proba = model.predict_proba(X_validation_fold)[:, 1]
            y_test_pred_proba = model.predict_proba(test_data_mlp)[:, 1]
            seed_oof_preds[validation_indices] = np.array(y_validation_pred_proba)
            test_preds_accumulator += np.array(y_test_pred_proba)

        oof_preds_accumulator += seed_oof_preds

    final_oof_preds = oof_preds_accumulator / len(RANDOM_SEEDS)
    final_test_preds = test_preds_accumulator / (BASE_MODEL_KFOLD_NUM_SPLITS * len(RANDOM_SEEDS))
    return pd.Series(final_oof_preds), pd.Series(final_test_preds)

def get_mlpclassifier_stacking_estimator(index, params_dict):
    """Wrap an MLPClassifier configuration in a StackingEstimator.

    Args:
        index: number appended to the estimator name (`MLPClassifier_<index>`).
        params_dict: MLPClassifier hyperparameters passed through to the estimator.

    Returns:
        A StackingEstimator whose predictions come from `get_mlpclassifier_preds`.
    """
    estimator_name = f"MLPClassifier_{index}"
    # NOTE(review): this is every column of train_data_mlp, which includes
    # target_col (get_mlpclassifier_preds drops it before fitting).
    mlp_column_names = train_data_mlp.columns.tolist()
    return StackingEstimator(
        name=estimator_name,
        params_dict=params_dict,
        feature_names=mlp_column_names,
        get_preds=get_mlpclassifier_preds
    )

### 8.4.2 Add Estimators (MLPClassifier)

In [39]:
# estimators += [
#     get_mlpclassifier_stacking_estimator(
#         index=1,
#         params_dict={ # Optuna study AUC: 0.6962418548652664
#             'hidden_layers': (128, 64, 32),
#             'dropout': 0.35993977676783095,
#             'learning_rate': 0.0022107162317045424,
#             'batch_size': 1024,
#             'weight_decay': 4.655295266533981e-06,
#         }
#     ),
# ]

# 9. Base Model Predictions

## 9.1 Get Base Model Predictions

In [40]:
def _base_model_column_sort_key(col_name):
    """Turn a prediction column label into an (estimator family, ordinal) pair.

    Example: "XGBClassifier_2 (cd03...)" -> ("XGBClassifier", 2), so columns
    are grouped by estimator family and ordered numerically within a family.
    """
    estimator_label = col_name.split(None, 1)[0]
    return col_name.split("_", 1)[0], int(estimator_label.rsplit("_", 1)[-1])


stacking_preds_retriever = StackingPredictionsRetriever(
    estimators=estimators,
    working_dir_path="/kaggle/working/",
    train_preds_filename="base_models_train_preds",
    test_preds_filename="base_models_test_preds",
    preds_save_interval=1
)
# Load previously saved predictions, sync them, then fetch predictions
# (exact semantics live in StackingPredictionsRetriever).
stacking_preds_retriever.import_preds("/kaggle/input/diabetes-prediction-challenge-base-model-preds/")
stacking_preds_retriever.sync_preds()
stacking_preds_retriever.get_preds()

base_model_train_preds, base_model_test_preds = stacking_preds_retriever.get_current_train_and_test_preds()

# Give both frames the same deterministic column order.
for preds_frame in (base_model_train_preds, base_model_test_preds):
    preds_frame.sort_index(
        axis=1,
        inplace=True,
        key=lambda index: index.map(_base_model_column_sort_key),
    )

[INFO] Importing predictions..
[INFO] 7 train predictions were imported:
CatBoostClassifier_1 (6eca2de83af2fea3676cc9382d0f1011), CatBoostClassifier_2 (b4347d306f6e52d59724382017b55f1b), CatBoostClassifier_3 (fad9fe0d13abe5377a9667ad2452550c), XGBClassifier_1 (3fd5db728b0f635ad3fb51e4ac3a5c8d), XGBClassifier_2 (cd0320203bc21b5b10db068b52204c49), XGBClassifier_3 (801128d9854f893393d31943ee000e82), XGBRFClassifier_1 (aca0224f49af4aff337e4a3bd4021211)
[INFO] 7 test predictions were imported:
CatBoostClassifier_1 (6eca2de83af2fea3676cc9382d0f1011), CatBoostClassifier_2 (b4347d306f6e52d59724382017b55f1b), CatBoostClassifier_3 (fad9fe0d13abe5377a9667ad2452550c), XGBClassifier_1 (3fd5db728b0f635ad3fb51e4ac3a5c8d), XGBClassifier_2 (cd0320203bc21b5b10db068b52204c49), XGBClassifier_3 (801128d9854f893393d31943ee000e82), XGBRFClassifier_1 (aca0224f49af4aff337e4a3bd4021211)
[INFO] Finished importing predictions
[INFO] Syncing predictions..
[INFO] No columns for training predictions were dropped
[IN

## 9.2 Base Models AUC

In [41]:
# OOF ROC AUC of every base model against the training target.
# Built in one shot with an explicit float dtype instead of growing a
# dtype-less empty Series by item assignment.
base_model_auc = pd.Series(
    {
        estimator: roc_auc_score(train_data[target_col], base_model_train_preds[estimator])
        for estimator in base_model_train_preds.columns
    },
    dtype=float,
)
base_model_auc.sort_values(ascending=False)

XGBClassifier_2 (cd0320203bc21b5b10db068b52204c49)         0.727895
XGBClassifier_3 (801128d9854f893393d31943ee000e82)         0.727864
XGBClassifier_1 (3fd5db728b0f635ad3fb51e4ac3a5c8d)         0.727811
XGBClassifier_4 (9d4a18809c7642a3fc3412c67d01204f)         0.727165
XGBClassifier_5 (2c37b7c40720943b1cc9d0e944d66e8f)         0.727058
CatBoostClassifier_1 (6eca2de83af2fea3676cc9382d0f1011)    0.726697
CatBoostClassifier_3 (fad9fe0d13abe5377a9667ad2452550c)    0.726597
CatBoostClassifier_2 (b4347d306f6e52d59724382017b55f1b)    0.726549
XGBRFClassifier_1 (aca0224f49af4aff337e4a3bd4021211)       0.710963
dtype: float64

# 10. Meta-Model

## 10.1 Meta-Model Hyperparameter Tuning

### 10.1.1 Tuning

In [42]:
# Set to `True` to skip the Optuna study for the meta model; the study's best
# params then stay empty and the default values in section 10.1.2 are used.
# Set to `False` to run the tuning & selection.
SKIP_META_MODEL_HYPERPARAMETER_TUNING = False

# maximum number of trials Optuna will conduct for the optimization
META_MODEL_OPTUNA_STUDY_NUM_TRIALS = 50

# number of splits to use for K-Fold Cross-Validation
META_MODEL_KFOLD_NUM_SPLITS = 5

# use different random seeds from the ones used to train base models
# (RANDOM_SEEDS) to avoid potential leakage or alignment artifacts from the
# original splits; one full K-Fold pass is run per seed and results are averaged
META_MODEL_RANDOM_SEEDS = [77, 99]

# fixed value passed as `early_stopping_rounds` to the meta model (XGBClassifier)
META_MODEL_EARLY_STOPPING_ROUNDS = 20

# best parameters found by the Optuna study for the meta model
# (left empty when tuning is skipped)
meta_model_optuna_study_best_params = {}

# parameters selected for the meta model (filled in section 10.1.2)
meta_model_params = {}

In [43]:
def get_meta_model_optuna_params(trial):
    """Draw one candidate set of meta-model hyperparameters from an Optuna trial.

    Args:
        trial: object exposing `suggest_int` / `suggest_float` (an Optuna trial).

    Returns:
        Dict mapping hyperparameter name to the value suggested by `trial`.
    """
    sampled = {}
    # The suggestion order is kept fixed on purpose.
    sampled['n_estimators'] = trial.suggest_int('n_estimators', 5, 40)
    sampled['learning_rate'] = trial.suggest_float('learning_rate', 0.1, 0.5)
    sampled['max_depth'] = trial.suggest_int('max_depth', 1, 2)
    sampled['subsample'] = trial.suggest_float('subsample', 0.95, 1.0)
    sampled['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.95, 1.0)
    # Regularisation strengths are searched on a log scale.
    sampled['alpha'] = trial.suggest_float('alpha', 1e-3, 1e-1, log=True)
    sampled['lambda'] = trial.suggest_float('lambda', 1e-3, 1e-1, log=True)
    sampled['gamma'] = trial.suggest_float('gamma', 0.0, 0.4)
    sampled['min_child_weight'] = trial.suggest_int('min_child_weight', 1, 10)
    return sampled

def meta_model_optuna_study_objective(trial):
    """Optuna objective: cross-validated ROC AUC of the meta model for one trial.

    For each seed in META_MODEL_RANDOM_SEEDS an XGBClassifier is fitted per
    stratified fold on the base-model OOF predictions, and the held-out fold
    is predicted. The per-seed OOF vectors are averaged before scoring.

    NOTE(review): each held-out fold is also the early-stopping `eval_set`,
    so the fold being scored influences where boosting stops.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC of the seed-averaged out-of-fold predictions.
    """
    trial_params = get_meta_model_optuna_params(trial)

    targets = train_data[target_col]
    num_rows = len(train_data)
    gpu_available = torch.cuda.is_available()

    oof_total = np.zeros(num_rows)

    for split_seed in META_MODEL_RANDOM_SEEDS:
        splitter = StratifiedKFold(n_splits=META_MODEL_KFOLD_NUM_SPLITS, shuffle=True, random_state=split_seed)
        oof_for_seed = np.zeros(num_rows)

        for fit_rows, holdout_rows in splitter.split(base_model_train_preds, targets):
            X_fit = base_model_train_preds.iloc[fit_rows]
            y_fit = targets.iloc[fit_rows]
            X_holdout = base_model_train_preds.iloc[holdout_rows]
            y_holdout = targets.iloc[holdout_rows]

            model = XGBClassifier(
                **trial_params,
                tree_method='hist' if gpu_available else 'auto',
                device='cuda' if gpu_available else 'cpu',
                objective='binary:logistic',
                eval_metric='auc',
                early_stopping_rounds=META_MODEL_EARLY_STOPPING_ROUNDS,
                n_jobs=-1,
                verbosity=0
            )
            model.fit(X_fit, y_fit, eval_set=[(X_holdout, y_holdout)], verbose=False)

            # Positive-class probability for the held-out rows.
            oof_for_seed[holdout_rows] = model.predict_proba(X_holdout)[:, 1]

        oof_total += oof_for_seed

    seed_averaged_oof = oof_total / len(META_MODEL_RANDOM_SEEDS)
    return roc_auc_score(targets, seed_averaged_oof)

In [44]:
if SKIP_META_MODEL_HYPERPARAMETER_TUNING:
    print("Skipped hyperparameter tuning for meta model")
else:
    print("Started hyperparameter tuning for meta model")
    # Seed the sampler so the sequence of suggested trials is repeatable
    # between runs; an unseeded TPESampler draws different trials every run.
    sampler = optuna.samplers.TPESampler(
        n_ei_candidates=48,
        multivariate=True,
        seed=META_MODEL_RANDOM_SEEDS[0],
    )
    study = optuna.create_study(sampler=sampler, direction="maximize")
    study.optimize(meta_model_optuna_study_objective, n_trials=META_MODEL_OPTUNA_STUDY_NUM_TRIALS)

    print(f"# trials finished: {len(study.trials)}")
    trial = study.best_trial
    # Kept at module level: the next section reads these to pick the final params.
    meta_model_optuna_study_best_params = study.best_params
    print(f"Best trial AUC: {trial.value}")
    print("Best trial params:")
    for param_key, param_value in meta_model_optuna_study_best_params.items():
        print(f"- {param_key}: {param_value}")

[I 2025-12-13 07:21:14,922] A new study created in memory with name: no-name-a95e01bf-dd35-4d53-9b4a-d00be31d7b27


Started hyperparameter tuning for meta model


[I 2025-12-13 07:21:22,050] Trial 0 finished with value: 0.7274594115199123 and parameters: {'n_estimators': 14, 'learning_rate': 0.39885227979576166, 'max_depth': 2, 'subsample': 0.9521177063497699, 'colsample_bytree': 0.9698546225661753, 'alpha': 0.016660447357274684, 'lambda': 0.04519704853514479, 'gamma': 0.3778902700586454, 'min_child_weight': 3}. Best is trial 0 with value: 0.7274594115199123.
[I 2025-12-13 07:21:29,715] Trial 1 finished with value: 0.7276890417285579 and parameters: {'n_estimators': 26, 'learning_rate': 0.4918849843584917, 'max_depth': 2, 'subsample': 0.9799467895883663, 'colsample_bytree': 0.9993643357409052, 'alpha': 0.06537237511552815, 'lambda': 0.004395589192906231, 'gamma': 0.39371118773846925, 'min_child_weight': 10}. Best is trial 1 with value: 0.7276890417285579.
[I 2025-12-13 07:21:36,686] Trial 2 finished with value: 0.727315268971535 and parameters: {'n_estimators': 10, 'learning_rate': 0.10998813726664572, 'max_depth': 2, 'subsample': 0.965300540168

# trials finished: 50
Best trial AUC: 0.727842348495999
Best trial params:
- n_estimators: 36
- learning_rate: 0.43216785872096874
- max_depth: 2
- subsample: 0.9893537447376771
- colsample_bytree: 0.9745543347061765
- alpha: 0.003693800287139703
- lambda: 0.005866554189751117
- gamma: 0.19827480659498736
- min_child_weight: 10


### 10.1.2 Tuned Hyperparameters

In [45]:
# Fallback values (most found from previous tuning/selection), used for any
# parameter the Optuna study did not provide (e.g. when tuning was skipped).
META_MODEL_DEFAULT_N_ESTIMATORS = 28
META_MODEL_DEFAULT_LEARNING_RATE = 0.1549619758916973
META_MODEL_DEFAULT_MAX_DEPTH = 2
META_MODEL_DEFAULT_SUBSAMPLE = 0.9628607950034527
META_MODEL_DEFAULT_COLSAMPLE_BY_TREE = 0.9883747386630549
META_MODEL_DEFAULT_ALPHA = 0.04355046472776896
META_MODEL_DEFAULT_GAMMA = 0.19182991978484562
META_MODEL_DEFAULT_LAMBDA = 0.0024107325767637354
META_MODEL_DEFAULT_MIN_CHILD_WEIGHT = 1

# Parameter name -> fallback, in the order the parameters are reported below.
_meta_model_default_by_param = {
    'n_estimators': META_MODEL_DEFAULT_N_ESTIMATORS,
    'learning_rate': META_MODEL_DEFAULT_LEARNING_RATE,
    'max_depth': META_MODEL_DEFAULT_MAX_DEPTH,
    'subsample': META_MODEL_DEFAULT_SUBSAMPLE,
    'colsample_bytree': META_MODEL_DEFAULT_COLSAMPLE_BY_TREE,
    'alpha': META_MODEL_DEFAULT_ALPHA,
    'gamma': META_MODEL_DEFAULT_GAMMA,
    'lambda': META_MODEL_DEFAULT_LAMBDA,
    'min_child_weight': META_MODEL_DEFAULT_MIN_CHILD_WEIGHT,
}

# Prefer the tuned value when the study produced one, else the fallback.
for _param_name, _fallback_value in _meta_model_default_by_param.items():
    meta_model_params[_param_name] = meta_model_optuna_study_best_params.get(_param_name, _fallback_value)

print(f"The following tuned parameters will be used for the meta model:")
for param_key, param_value in meta_model_params.items():
    print(f"- {param_key}: {param_value}")

The following tuned parameters will be used for the meta model:
- n_estimators: 36
- learning_rate: 0.43216785872096874
- max_depth: 2
- subsample: 0.9893537447376771
- colsample_bytree: 0.9745543347061765
- alpha: 0.003693800287139703
- gamma: 0.19827480659498736
- lambda: 0.005866554189751117
- min_child_weight: 10


## 10.2 Meta-Model Training

In [46]:
# Train the meta model with repeated stratified K-fold CV on the base-model
# predictions. Per fold we collect OOF predictions, test predictions and
# feature importances, then average them across folds and seeds.
meta_oof_preds_accumulator = np.zeros(len(train_data))
meta_test_preds_accumulator = np.zeros(len(test_data))
meta_train_feature_importances_accumulator = np.zeros(len(base_model_train_preds.columns))

meta_targets = train_data[target_col]
meta_gpu_available = torch.cuda.is_available()
# One model is fitted per (seed, fold) pair.
num_meta_fits = META_MODEL_KFOLD_NUM_SPLITS * len(META_MODEL_RANDOM_SEEDS)

for random_seed in META_MODEL_RANDOM_SEEDS:
    meta_skf = StratifiedKFold(n_splits=META_MODEL_KFOLD_NUM_SPLITS, shuffle=True, random_state=random_seed)
    seed_oof_preds = np.zeros(len(train_data))

    for fit_rows, holdout_rows in meta_skf.split(base_model_train_preds, meta_targets):
        X_holdout = base_model_train_preds.iloc[holdout_rows]
        y_holdout = meta_targets.iloc[holdout_rows]

        meta_model = XGBClassifier(
            **meta_model_params,
            tree_method='hist' if meta_gpu_available else 'auto',
            device='cuda' if meta_gpu_available else 'cpu',
            objective='binary:logistic',
            eval_metric='auc',
            early_stopping_rounds=META_MODEL_EARLY_STOPPING_ROUNDS,
            n_jobs=-1,
            verbosity=0
        )
        # The held-out fold doubles as the early-stopping evaluation set.
        meta_model.fit(
            base_model_train_preds.iloc[fit_rows],
            meta_targets.iloc[fit_rows],
            eval_set=[(X_holdout, y_holdout)],
            verbose=False
        )

        # Column 1 of predict_proba is the positive-class probability.
        seed_oof_preds[holdout_rows] = meta_model.predict_proba(X_holdout)[:, 1]
        meta_test_preds_accumulator += meta_model.predict_proba(base_model_test_preds)[:, 1]
        meta_train_feature_importances_accumulator += meta_model.feature_importances_

    meta_oof_preds_accumulator += seed_oof_preds

# Each training row gets one OOF prediction per seed; test predictions and
# importances get one contribution per fitted model.
final_meta_oof_preds = meta_oof_preds_accumulator / len(META_MODEL_RANDOM_SEEDS)
final_meta_test_preds = meta_test_preds_accumulator / num_meta_fits
meta_train_feature_importances = meta_train_feature_importances_accumulator / num_meta_fits

## 10.3 Meta-Model Feature Importances

In [47]:
# Averaged meta-model feature importances, labelled by base-model column
# and shown from most to least important.
meta_model_feature_importances = pd.Series(
    meta_train_feature_importances,
    index=base_model_train_preds.columns,
)
meta_model_feature_importances.sort_values(ascending=False)

XGBClassifier_2 (cd0320203bc21b5b10db068b52204c49)         0.328013
XGBClassifier_3 (801128d9854f893393d31943ee000e82)         0.263920
XGBClassifier_1 (3fd5db728b0f635ad3fb51e4ac3a5c8d)         0.258549
CatBoostClassifier_3 (fad9fe0d13abe5377a9667ad2452550c)    0.051370
XGBClassifier_4 (9d4a18809c7642a3fc3412c67d01204f)         0.037822
CatBoostClassifier_2 (b4347d306f6e52d59724382017b55f1b)    0.027154
CatBoostClassifier_1 (6eca2de83af2fea3676cc9382d0f1011)    0.017165
XGBClassifier_5 (2c37b7c40720943b1cc9d0e944d66e8f)         0.014543
XGBRFClassifier_1 (aca0224f49af4aff337e4a3bd4021211)       0.001464
dtype: float64

## 10.4 Final Adjustments to Test Predictions

In [48]:
def min_max_scale(preds):
    """Linearly rescale `preds` so its minimum maps to 0 and its maximum to 1.

    Args:
        preds: array-like exposing `.min()` / `.max()` and arithmetic
            (e.g. a NumPy array or pandas Series).

    Returns:
        The rescaled values, or `preds` itself (unchanged) when the maximum
        is not strictly greater than the minimum.
    """
    lowest, highest = preds.min(), preds.max()
    # Guard against a zero range, which would divide by zero.
    if not highest > lowest:
        return preds
    return (preds - lowest) / (highest - lowest)

# Min-max scale the final meta OOF/test predictions, then clip to [0, 1] just in
# case floating point math leaves values very slightly below 0 or above 1.
scaled_final_meta_oof_preds = np.clip(min_max_scale(final_meta_oof_preds), 0, 1)
scaled_final_meta_test_preds = np.clip(min_max_scale(final_meta_test_preds), 0, 1)

## 10.5 Meta-Model AUC

In [49]:
# ROC AUC of the scaled meta-model out-of-fold predictions on the training target.
meta_model_auc = roc_auc_score(
    y_true=train_data[target_col],
    y_score=scaled_final_meta_oof_preds,
)
print(meta_model_auc)

0.727842348495999


# 11. Submission

In [50]:
# Build the submission frame (test ids + scaled meta-model probabilities)
# and write it to disk without the DataFrame index.
submission_columns = {
    'id': test_data.index,
    target_col: scaled_final_meta_test_preds,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)
print('Submission file prepared.')

Submission file prepared.
