# 1. Overview

This is a notebook for training models to submit predictions to the "Diabetes Prediction Challenge" Kaggle competition ([playground-series-s5e12](https://www.kaggle.com/competitions/playground-series-s5e12)).

Synthetic data is used for this playground competition, and the objective is to, for each patient in the test set, predict the probability that the patient will be diagnosed with diabetes.

# 2. Setup

## 2.1 Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import optuna
import os
import hashlib as hl # for StackingEstimator
import inspect # for StackingEstimator
import random
import warnings
from catboost import CatBoostClassifier
from enum import Enum
from pathlib import Path # for StackingPredictionsRetriever
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder
from types import FunctionType
from xgboost import XGBClassifier

warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

pd.set_option('display.max_colwidth', None) # Display full column content
pd.set_option('display.max_rows', None) # Display all rows
pd.set_option('display.width', 1000) # Set larger display width

## 2.2 Reproducibility

For reproducibility of results, an arbitrary number will be used for the random seed.

In [2]:
RANDOM_SEEDS = [11, 42]
random.seed(RANDOM_SEEDS[0])
np.random.seed(RANDOM_SEEDS[0])
torch.manual_seed(RANDOM_SEEDS[0])
if torch.cuda.is_available():
    torch.cuda.manual_seed(RANDOM_SEEDS[0])
    torch.cuda.manual_seed_all(RANDOM_SEEDS[0])

## 2.3 DataFrames

Read the data provided for the competition into dataframes.

In [3]:
INPUT_DIR = '/kaggle/input'
orig_train_data = pd.read_csv(os.path.join(INPUT_DIR, 'playground-series-s5e12/train.csv'))
orig_test_data = pd.read_csv(os.path.join(INPUT_DIR, 'playground-series-s5e12/test.csv'))

# set index
orig_train_data.set_index('id', inplace=True)
orig_test_data.set_index('id', inplace=True)

# target column
target_col = "diagnosed_diabetes"

# 3. Exploratory Data Analysis

In [4]:
# to skip the generation of plots (e.g. KDE) in this section that take time; set to False to generate the plots 
SKIP_PLOTS = True

In [5]:
orig_train_data.describe()

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
count,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0,700000.0
mean,50.359734,2.072411,80.230803,5.963695,7.0022,6.012733,25.874684,0.858766,116.294193,75.440924,70.167749,186.818801,53.823214,102.905854,123.08185,0.149401,0.18199,0.030324,0.623296
std,11.65552,1.048189,51.195071,1.463336,0.901907,2.022707,2.860705,0.03798,11.01039,6.825775,6.938722,16.730832,8.266545,19.022416,24.739397,0.356484,0.385837,0.171478,0.48456
min,19.0,1.0,1.0,0.1,3.1,0.6,15.1,0.68,91.0,51.0,42.0,117.0,21.0,51.0,31.0,0.0,0.0,0.0,0.0
25%,42.0,1.0,49.0,5.0,6.4,4.6,23.9,0.83,108.0,71.0,65.0,175.0,48.0,89.0,106.0,0.0,0.0,0.0,0.0
50%,50.0,2.0,71.0,6.0,7.0,6.0,25.9,0.86,116.0,75.0,70.0,187.0,54.0,103.0,123.0,0.0,0.0,0.0,1.0
75%,58.0,3.0,96.0,7.0,7.6,7.4,27.8,0.88,124.0,80.0,75.0,199.0,59.0,116.0,139.0,0.0,0.0,0.0,1.0
max,89.0,9.0,747.0,9.9,9.9,16.5,38.4,1.05,163.0,104.0,101.0,289.0,90.0,205.0,290.0,1.0,1.0,1.0,1.0


In [6]:
orig_test_data.describe()

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,family_history_diabetes,hypertension_history,cardiovascular_history
count,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0
mean,50.432397,2.089693,92.349087,5.945838,6.997795,6.011278,25.881906,0.859007,116.374117,75.396013,70.04835,187.30862,53.813557,103.416083,123.53848,0.15292,0.18441,0.03311
std,11.938741,1.066214,62.187399,1.481068,0.914693,2.060472,2.894289,0.038523,11.252146,6.95034,7.090543,18.413053,8.398126,20.571855,28.965441,0.359911,0.387819,0.178924
min,19.0,1.0,1.0,0.1,3.1,0.6,15.1,0.69,91.0,51.0,42.0,107.0,22.0,51.0,31.0,0.0,0.0,0.0
25%,42.0,1.0,51.0,5.0,6.4,4.6,23.9,0.83,108.0,71.0,65.0,174.0,48.0,89.0,104.0,0.0,0.0,0.0
50%,50.0,2.0,77.0,6.0,7.0,6.0,25.9,0.86,116.0,75.0,70.0,187.0,54.0,103.0,123.0,0.0,0.0,0.0
75%,59.0,3.0,115.0,7.0,7.6,7.4,27.8,0.89,124.0,80.0,75.0,200.0,60.0,117.0,142.0,0.0,0.0,0.0
max,89.0,9.0,748.0,9.9,9.9,15.9,38.3,1.05,170.0,104.0,101.0,285.0,91.0,226.0,290.0,1.0,1.0,1.0


In [7]:
numeric_col_names = orig_train_data.select_dtypes(include='number').columns.to_series()
categorical_col_names = orig_train_data.select_dtypes(include='object').columns.to_series()
assert numeric_col_names.size + categorical_col_names.size == orig_train_data.shape[1]

# drop target column from numeric column names
numeric_col_names.drop(target_col, inplace=True)

In [8]:
for (dataset_name, dataset) in [('Train data', orig_train_data), ('Test data', orig_test_data)]:
    print(f"##### {dataset_name} missing values #####")
    print(dataset.isnull().sum())
    print()

##### Train data missing values #####
age                                   0
alcohol_consumption_per_week          0
physical_activity_minutes_per_week    0
diet_score                            0
sleep_hours_per_day                   0
screen_time_hours_per_day             0
bmi                                   0
waist_to_hip_ratio                    0
systolic_bp                           0
diastolic_bp                          0
heart_rate                            0
cholesterol_total                     0
hdl_cholesterol                       0
ldl_cholesterol                       0
triglycerides                         0
gender                                0
ethnicity                             0
education_level                       0
income_level                          0
smoking_status                        0
employment_status                     0
family_history_diabetes               0
hypertension_history                  0
cardiovascular_history                0
di

In [9]:
for (dataset_name, dataset) in [('Train data', orig_train_data), ('Test data', orig_test_data)]:
    print(f"##### {dataset_name} categorical cols unique values #####")
    for categorical_col_name in categorical_col_names:
        print(f"{categorical_col_name}:")
        print(dataset[categorical_col_name].unique())
    print()

##### Train data categorical cols unique values #####
gender:
['Female' 'Male' 'Other']
ethnicity:
['Hispanic' 'White' 'Asian' 'Black' 'Other']
education_level:
['Highschool' 'Graduate' 'Postgraduate' 'No formal']
income_level:
['Lower-Middle' 'Upper-Middle' 'Low' 'Middle' 'High']
smoking_status:
['Current' 'Never' 'Former']
employment_status:
['Employed' 'Retired' 'Student' 'Unemployed']

##### Test data categorical cols unique values #####
gender:
['Female' 'Male' 'Other']
ethnicity:
['White' 'Hispanic' 'Black' 'Asian' 'Other']
education_level:
['Highschool' 'Graduate' 'Postgraduate' 'No formal']
income_level:
['Middle' 'Low' 'Lower-Middle' 'Upper-Middle' 'High']
smoking_status:
['Former' 'Never' 'Current']
employment_status:
['Employed' 'Unemployed' 'Retired' 'Student']



In [10]:
# KDE plots of target variable and numerical features
if not SKIP_PLOTS:
    plt.figure(figsize=(12, 24))
    kdeplot_col_names = [target_col]
    kdeplot_col_names.extend(numeric_col_names)
    for i, col in enumerate(kdeplot_col_names, start=1):
        plt.subplot(10, 2, i)
        sns.kdeplot(data=orig_train_data, x=col, fill=True)
        plt.tight_layout()
        plt.title(f"KDE plot of {col}")

In [11]:
if not SKIP_PLOTS:
    plt.figure(figsize=(12, 8))
    sns.heatmap(
        orig_train_data[numeric_col_names].corr(),
        cmap='Reds',
        annot=True,
        linewidths=2,
        fmt='.2f',
        vmin=-1,
        vmax=1
    )
    plt.title('Correlation Matrix of Numerical Features', fontsize=18, pad=20)
    plt.show()

# 4. Data Pre-Processing

In [12]:
train_data = orig_train_data.copy()
test_data = orig_test_data.copy()

## 4.1 Ordinal Encoding

In [13]:
# education level
education_level_encoder = OrdinalEncoder(categories=[['No formal', 'Highschool', 'Graduate', 'Postgraduate']])
train_data['education_level_encoded'] = education_level_encoder.fit_transform(train_data[['education_level']])
test_data['education_level_encoded'] = education_level_encoder.fit_transform(test_data[['education_level']])

# income level
income_level_encoder = OrdinalEncoder(categories=[['Low', 'Lower-Middle','Middle', 'Upper-Middle', 'High']])
train_data['income_level_encoded'] = income_level_encoder.fit_transform(train_data[['income_level']])
test_data['income_level_encoded'] = income_level_encoder.fit_transform(test_data[['income_level']])

# smoking status
smoking_status_encoder = OrdinalEncoder(categories=[['Never', 'Former', 'Current']])
train_data['smoking_status_encoded'] = smoking_status_encoder.fit_transform(train_data[['smoking_status']])
test_data['smoking_status_encoded'] = smoking_status_encoder.fit_transform(test_data[['smoking_status']])

# drop original cols
for col in ['income_level', 'education_level', 'smoking_status']:
    train_data.drop(col, axis=1, inplace=True)
    test_data.drop(col, axis=1, inplace=True)

# print out value maps to check assigned values are as expected
for (encoded_col_name, encoder) in [
    ('education_level_encoded', education_level_encoder),
    ('income_level_encoded', income_level_encoder),
    ('smoking_status_encoded', smoking_status_encoder),
]:
    categories = encoder.categories_[0]
    value_map = { category: i for i, category in enumerate(categories) }
    print(f"{encoded_col_name}:\n{value_map}")

education_level_encoded:
{'No formal': 0, 'Highschool': 1, 'Graduate': 2, 'Postgraduate': 3}
income_level_encoded:
{'Low': 0, 'Lower-Middle': 1, 'Middle': 2, 'Upper-Middle': 3, 'High': 4}
smoking_status_encoded:
{'Never': 0, 'Former': 1, 'Current': 2}


## 4.3 Data Cleaning

In [14]:
def fix_blood_pressure(df):
    mask = df['diastolic_bp'] > df['systolic_bp']
    df.loc[mask, ['systolic_bp', 'diastolic_bp']] = (
        df.loc[mask, ['diastolic_bp', 'systolic_bp']].values
    )
    return df

train_data = fix_blood_pressure(train_data)
test_data = fix_blood_pressure(test_data)

## 4.4 Feature Generation

In [15]:
def add_generated_features(df):
    # medical ratios & interactions
    df['cholesterol_ratio'] = df['cholesterol_total'] / df['hdl_cholesterol']
    df['pulse_pressure'] = df['systolic_bp'] - df['diastolic_bp']
    df['age_bmi_interaction'] = df['bmi'] * df['age']

    # risk grouping
    df['comorbidity_count'] = (
        df['hypertension_history'] + 
        df['cardiovascular_history'] + 
        df['family_history_diabetes']
    )

    # log transforms for skewed data
    for col in ['triglycerides', 'ldl_cholesterol', 'cholesterol_total']:
        df[f'log_{col}'] = np.log1p(df[col])

    # binning
    df['bmi_cat'] = pd.cut(df['bmi'], bins=[-1, 25, 30, 100], labels=[0, 1, 2]).astype(int)

add_generated_features(train_data)
add_generated_features(test_data)

## 4.4 Remaining Categorical Features

In [16]:
cat_features = train_data.drop(target_col, axis=1).select_dtypes(include='object').columns.to_list()
if len(cat_features) > 0:
    for col in cat_features:
        train_data[col] = train_data[col].astype('category')
        test_data[col] = test_data[col].astype('category')

# 5. Stacking Initial Setup

We'll use stacking, an [ensemble learning](https://en.wikipedia.org/wiki/Ensemble_learning) strategy, to generate the predictions. As we'll need to gather predictions from various base models (a.k.a. level-0 models) to feed as input features to a meta model (a.k.a. level-1 model), in order to streamline the process of experimenting with different combinations of base models, some helper classes will be defined in this section. These classes can also be found [here](https://github.com/chuo-v/machine-learning-utils/blob/master/ensemble-learning/stacking/stacking_predictions_retriever.py) at one of my GitHub repositories used to organize some utilities I implemented for machine learning.

In [17]:
class StackingEstimator:
    """
    A class representing an estimator that will be used for stacking, an ensemble learning strategy.

    Intended to be used in conjunction with the `StackingPredictionsRetriever` class, which helps
    retrieve predictions for multiple instances of `StackingEstimator`; as the predictions are saved
    in files, on subsequent requests to retrieve predictions, even as the set of estimators has been
    modified, the `StackingPredictionsRetriever` class can determine the predictions of estimators
    that are non-stale and available (if any) by using the `get_hash` method of the `StackingEstimator`
    class to determine the relevance and staleness of any saved predictions.

    Proper usage of this class requires one important condition to be satisfied: the predictions made
    using the estimator are determinstic, i.e. they are exactly the same everytime the estimator is
    run with the same inputs (`name`, `params_dict`, `feature_names`, `get_predictions`).
    """
    name = ""
    params_dict = {}
    feature_names = []
    get_predictions = lambda: None

    def __init__(
        self,
        name: str,
        feature_names: [str],
        params_dict: {},
        get_preds: FunctionType
    ):
        """
        Initializes a new instance of `StackingEstimator`.

        :param name:
            A string representing a name for the estimator. It is used for the column names of
            the training and test predictions for each estimator, and is also used as an input
            to calculate a hash value for the estimator. It is recommended to use a different
            name from the names used for other estimators passed to `StackingPredictionsRetriever`.
        :param feature_names:
            A list of strings representing the names of the features that will be used for the
            estimator. It will be passed as an argument to `get_preds`. Internally, it is only
            used as an input to calculate a hash value for the estimator.
        :param params_dict:
            A dictionary of parameters that will be specified for the estimator. It will be
            passed as an argument to `get_preds`. Internally, it is only used as an input
            to calculate a hash value for the estimator.
        :param get_preds:
            A function for getting the predictions for the estimator. It should only take two
            arguments: 'params_dict' and 'feature_names', and should return predictions for
            the training and test data (in that order) as a tuple of two `pandas.Series`.
        """
        # parameter check
        if not isinstance(name, str):
            raise ValueError("`name` argument should be of type `str`")
        if not isinstance(feature_names, list):
            raise ValueError(f"`feature_names` argument for estimator \"{name}\" should be of type `list`")
        elif not all(isinstance(feature_name, str) for feature_name in feature_names):
            raise ValueError(f"`feature_names` argument for estimator \"{name}\" should only contain instances of `str`")
        if not isinstance(params_dict, dict):
            raise ValueError(f"`params_dict` argument for estimator \"{name}\" should be of type `dict`")
        get_preds_params = inspect.signature(get_preds).parameters.values()
        get_preds_param_names = [param.name for param in get_preds_params]
        if len(get_preds_param_names) != 2:
            raise ValueError(f"`get_preds` function for estimator \"{name}\" should take two arguments")
        elif "params_dict" not in get_preds_param_names:
            raise ValueError(f"`get_preds` function for estimator \"{name}\" should take a \"params_dict\" argument")
        elif "feature_names" not in get_preds_param_names:
            raise ValueError(f"`get_preds` function for estimator \"{name}\" should take a \"feature_names\" argument")

        self.name = name
        self.feature_names = feature_names
        self.params_dict = params_dict
        self.get_preds = get_preds

    def get_hash_value(self):
        """
        Calculates and returns a hash value for the estimator using
        `name`, `feature_names` and `params_dict` as inputs.
        """
        feature_names_str = "_".join(sorted(self.feature_names))
        params_dict_str = "_".join(f"{key}-{value}" for (key, value) in sorted(self.params_dict.items()))
        hash_input_str = "_".join([self.name, feature_names_str, params_dict_str])
        md5_hash = hl.md5(hash_input_str.encode('utf-8')).hexdigest()
        return md5_hash

class StackingPredictionsRetriever:
    """
    A class for streamlining stacking (an ensemble learning strategy) that saves predictions
    from estimators to file so that when trying out different combinations of (base) estimators,
    the predictions that are not stale can be reused, saving the time of having the estimators
    make predictions again.

    Intended to be used in conjunction with the `StackingEstimator` class. The `hash_value` of
    `StackingEstimator` is used to determine the staleness and relevance of the predictions for
    an estimator. The implementation for making predictions using an estimator needs to be
    provided as a function to `get_preds` for `StackingEstimator`; when predictions need to be
    made using an estimator, this class will call `get_preds` for the `StackingEstimator` instance.

    Proper usage of this class requires one important condition to be satisfied: the predictions made
    using the estimators are determinstic, i.e. they are exactly the same everytime a
    `StackingEstimator` instance is run with the same inputs.
    """
    estimators = []
    working_dir_path = ""
    train_preds_filename = ""
    test_preds_filename = ""
    preds_save_interval = 0

    def __init__(
        self,
        estimators: [StackingEstimator],
        working_dir_path: str,
        train_preds_filename: str = "train_preds",
        test_preds_filename: str = "test_preds",
        preds_save_interval: int = 5,
    ):
        """
        Initializes a new instance of `StackingPredictionsRetriever`.

        :param estimators:
            A list of `StackingEstimator` instances for which the class will retrieve predictions.
        :param working_dir_path:
            The path for the working directory where the files with predictions will be saved.
        :param train_preds_filename:
            The name of the file in which predictions for the training set will be stored.
        :param test_preds_filename:
            The name of the file in which predictions for the test set will be stored.
        :param preds_save_interval:
            An integer which specifies the interval at which predictions will be saved when
            `get_preds` is called, corresponding to the number of estimators whose predictions
            have been retrieved since the predictions were previously saved. Any estimators
            whose predictions are not stale and therefore were not required to make predictions
            again are not included in this number.
        """
        # parameter check
        if not isinstance(estimators, list):
            raise ValueError("`estimators` must be passed as a list")
        if not all(isinstance(e, StackingEstimator) for e in estimators):
            raise ValueError("`estimators` should only contain instances of `StackingEstimator`")
        if not isinstance(working_dir_path, str):
            raise ValueError("`working_dir_path` argument should be of type `str`")
        if not isinstance(preds_save_interval, int):
            raise ValueError("`preds_save_interval` argument should be of type `int`")

        self.estimators = estimators
        self.working_dir_path = working_dir_path
        self.train_preds_filename = train_preds_filename
        self.test_preds_filename = test_preds_filename
        self.preds_save_interval = preds_save_interval

    def get_train_preds_file_path(self):
        """
        Returns the file path for storing predictions for training data.
        """
        return Path(f"{self.working_dir_path}/{self.train_preds_filename}.csv")

    def get_test_preds_file_path(self):
        """
        Returns the file path for storing predictions for test data.
        """
        return Path(f"{self.working_dir_path}/{self.test_preds_filename}.csv")

    def get_current_train_and_test_preds(self):
        """
        Returns the current predictions for training and test data (in that order)
        as a tuple of two `pandas.DataFrame`.

        The predictions are attempted to be retrieved from the file paths returned
        by `get_train_preds_file_path` and `get_test_preds_file_path`; if there are
        any issues with doing so (e.g. file does not exist, dataframe is empty),
        empty dataframes will be returned instead.
        In the case an `pandas.errors.EmptyDataError` exception is raised when
        reading from a file, the corresponding file will be removed.
        """
        curr_train_preds = pd.DataFrame()
        curr_test_preds = pd.DataFrame()
        train_preds_file_path = self.get_train_preds_file_path()
        test_preds_file_path = self.get_test_preds_file_path()

        if train_preds_file_path.is_file():
            try:
                curr_train_preds = pd.read_csv(train_preds_file_path)
            except pd.errors.EmptyDataError:
                train_preds_file_path.unlink()
        if test_preds_file_path.is_file():
            try:
                curr_test_preds = pd.read_csv(test_preds_file_path)
            except pd.errors.EmptyDataError:
                test_preds_file_path.unlink()

        return curr_train_preds, curr_test_preds

    def get_preds(self):
        """
        Retrieves predictions from all estimators in `estimators`, storing them in
        two files at the file paths specified by `working_dir_path`,
        `train_preds_filename` and `test_preds_filename`.

        If non-stale (relevant) predictions are found for an estimator, retrieval
        of predictions by calling `get_preds` on the estimator will be skipped,
        and the existing predictions for the estimator will be kept.
        """
        print("[INFO] Getting predictions..")
        curr_train_preds, curr_test_preds = self.get_current_train_and_test_preds()

        preds_retrieved_count = 0
        num_preds_retrieved_but_not_yet_saved = 0
        estimators_skipped = []

        for estimator in self.estimators:
            estimator_hash_value = estimator.get_hash_value()
            estimator_name = f"{estimator.name} ({estimator_hash_value})"

            # skip retrieving predictions for estimator if non-stale predictions are already available
            train_preds_available = any(estimator_hash_value in col_name for col_name in curr_train_preds.columns)
            test_preds_available = any(estimator_hash_value in col_name for col_name in curr_test_preds.columns)
            if train_preds_available and test_preds_available:
                estimators_skipped += [estimator_name]
                continue

            print(f"[INFO] Getting predictions for estimator {estimator_name}")
            train_preds, test_preds = estimator.get_preds(estimator.params_dict, estimator.feature_names)
            if not isinstance(train_preds, pd.core.series.Series):
                raise ValueError("`train_preds` should be of type `pandas.Series`")
            if not isinstance(test_preds, pd.core.series.Series):
                raise ValueError("`test_preds` should be of type `pandas.Series`")
            curr_train_preds[estimator_name] = train_preds
            curr_test_preds[estimator_name] = test_preds
            preds_retrieved_count += 1

            # save predictions at an interval of `preds_save_interval`
            if preds_retrieved_count % self.preds_save_interval == 0:
                curr_train_preds.sort_index(axis=1).to_csv(self.get_train_preds_file_path(), index=False)
                curr_test_preds.sort_index(axis=1).to_csv(self.get_test_preds_file_path(), index=False)
                num_preds_retrieved_but_not_yet_saved = 0
                print("[INFO] Saved predictions")
            else:
                num_preds_retrieved_but_not_yet_saved += 1

        if estimators_skipped:
            estimators_skipped.sort()
            formatted_estimators = ", ".join(estimators_skipped)
            print(f"[INFO] Skipped retrieving predictions for following estimators as their current ones are not stale:\n{formatted_estimators}")

        if num_preds_retrieved_but_not_yet_saved != 0:
            curr_train_preds.sort_index(axis=1).to_csv(self.get_train_preds_file_path(), index=False)
            curr_test_preds.sort_index(axis=1).to_csv(self.get_test_preds_file_path(), index=False)
            print("[INFO] Saved predictions")

        print("[INFO] Finished getting all predictions")

    def sync_preds(self):
        """
        Syncs the predictions stored at the two file paths specified by
        `working_dir_path`, `train_preds_filename` and `test_preds_filename` by
        removing predictions for any estimator that is not currently in `estimators`.

        Note that new predictions for estimators that do not currently have predictions
        in the files will not be added; `get_preds` should be used for this purpose
        instead.
        """
        print("[INFO] Syncing predictions..")
        estimator_hash_values = [estimator.get_hash_value() for estimator in self.estimators]
        should_remove_col = lambda col_name: not any(hash_value in col_name for hash_value in estimator_hash_values)

        curr_train_preds, curr_test_preds = self.get_current_train_and_test_preds()

        if not curr_train_preds.empty:
            col_names_to_remove = [col_name for col_name in curr_train_preds.columns if should_remove_col(col_name)]
            if col_names_to_remove:
                print(f"[INFO] Dropping columns for following estimators from training predictions:\n{col_names_to_remove}")
                curr_train_preds.drop(columns=col_names_to_remove, inplace=True)
                curr_train_preds.to_csv(self.get_train_preds_file_path(), index=False)
            else:
                print(f"[INFO] No columns for training predictions were dropped")
        if not curr_test_preds.empty:
            col_names_to_remove = [col_name for col_name in curr_test_preds.columns if should_remove_col(col_name)]
            if col_names_to_remove:
                print(f"[INFO] Dropping columns for following estimators from test predictions:\n{col_names_to_remove}")
                curr_test_preds.drop(columns=col_names_to_remove, inplace=True)
                curr_test_preds.to_csv(self.get_test_preds_file_path(), index=False)
            else:
                print(f"[INFO] No columns for test predictions were dropped")

        print("[INFO] Finished syncing predictions")

    def import_preds(self, input_dir_path):
        """
        Imports predictions stored at the two file paths at `input_dir_path` with
        `train_preds_filename` and `test_preds_filename` as their filenames. If no
        such files are found, no predictions will be imported.

        Only predictions for estimators specified in `estimators` will be imported.
        Any predictions for estimators that were already available will be overwritten
        with predictions for the same estimators found in the files at `input_dir_path`.

        :param input_dir_path:
            The path to the directory for the training and test predictions files.
            The file names are expected to be the same as `train_preds_filename`
            and `test_preds_filename`
        """
        print("[INFO] Importing predictions..")
        curr_train_preds, curr_test_preds = self.get_current_train_and_test_preds()
        input_train_preds = pd.DataFrame()
        input_test_preds = pd.DataFrame()

        input_train_preds_path = Path(f"{input_dir_path}/{self.train_preds_filename}.csv")
        input_test_preds_path = Path(f"{input_dir_path}/{self.test_preds_filename}.csv")
        if input_train_preds_path.is_file():
            try:
                input_train_preds = pd.read_csv(input_train_preds_path)
            except: pass
        if input_test_preds_path.is_file():
            try:
                input_test_preds = pd.read_csv(input_test_preds_path)
            except: pass

        estimators_with_imported_train_preds = []
        estimators_with_imported_test_preds = []
        for estimator in self.estimators:
            estimator_hash_value = estimator.get_hash_value()
            estimator_name = f"{estimator.name} ({estimator_hash_value})"
            train_preds_available = any(estimator_hash_value in col_name for col_name in input_train_preds.columns)
            test_preds_available = any(estimator_hash_value in col_name for col_name in input_test_preds.columns)

            if train_preds_available:
                curr_train_preds[estimator_name] = input_train_preds[estimator_name]
                estimators_with_imported_train_preds += [estimator_name]
            if test_preds_available:
                curr_test_preds[estimator_name] = input_test_preds[estimator_name]
                estimators_with_imported_test_preds += [estimator_name]

        if not estimators_with_imported_train_preds:
            print("[INFO] No train predictions were imported")
        else:
            curr_train_preds.sort_index(axis=1).to_csv(self.get_train_preds_file_path(), index=False)
            formatted_estimators = ", ".join(estimators_with_imported_train_preds)
            print(f"[INFO] {len(estimators_with_imported_train_preds)} train predictions were imported:\n{formatted_estimators}")
        if not estimators_with_imported_test_preds:
            print("[INFO] No test predictions were imported")
        else:
            curr_test_preds.sort_index(axis=1).to_csv(self.get_test_preds_file_path(), index=False)
            formatted_estimators = ", ".join(estimators_with_imported_test_preds)
            print(f"[INFO] {len(estimators_with_imported_test_preds)} test predictions were imported:\n{formatted_estimators}")
        
        print("[INFO] Finished importing predictions")

    def clear_preds(self):
        """
        Removes all stored predictions by deleting the two files at filepaths specified
        by `working_dir_path`, `train_preds_filename` and `test_preds_filename`.
        """
        train_preds_file_path = self.get_train_preds_file_path()
        test_preds_file_path = self.get_test_preds_file_path()

        if train_preds_file_path.is_file():
            train_preds_file_path.unlink()
        if test_preds_file_path.is_file():
            test_preds_file_path.unlink()

        print("[INFO] Finished clearing predictions")

Next, we'll simply create a variable for storing the estimators (`StackingEstimator` instances) that we'll pass to the `StackingPredictionsRetriever` class for getting all the predictions from our base models.

In [18]:
estimators = []

# 6. Base Model Hyperparameter Tuning

In [19]:
# to skip hyperparameter tuning when it's not needed; set to `False` to do the tuning
SKIP_BASE_MODEL_HYPERPARAMETER_TUNING = True

In [20]:
class BaseModelOptunaStudyEstimator(Enum):
    CATBOOST = "CatBoost"
    XGBCLASSIFIER = "XGBClassifier"

Manually configure the values for the following variables for different studies.

In [21]:
# estimator to use for Optuna study
BASE_MODEL_OPTUNA_STUDY_ESTIMATOR = BaseModelOptunaStudyEstimator.XGBCLASSIFIER

# maximum number of trials Optuna will conduct for the optimization
BASE_MODEL_OPTUNA_STUDY_NUM_TRIALS = 500

# number of splits to use for Stratified K-Fold Cross-Validation for Optuna study
BASE_MODEL_OPTUNA_STUDY_KFOLD_NUM_SPLITS = 3

In [22]:
def get_base_model_optuna_params(trial, study_estimator):
    if study_estimator == BaseModelOptunaStudyEstimator.CATBOOST:
        return {
            "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1, log=True),
            "depth": trial.suggest_int("depth", 4, 10),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 30),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 30),
            'random_strength': trial.suggest_float('random_strength', 0, 20),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 20),
        }
    elif study_estimator == BaseModelOptunaStudyEstimator.XGBCLASSIFIER:
        return {
            'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'alpha': trial.suggest_float('alpha', 0.001, 10.0, log=True),
            'gamma': trial.suggest_float('gamma', 0.001, 10.0, log=True),
            'lambda': trial.suggest_float('lambda', 0.001, 5.0, log=True),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        }
    else:
        raise ValueError("Unsupported optuna study estimator")

def get_base_model_predictions(study_estimator, trial_params, X_train_fold, y_train_fold, X_validation_fold, y_validation_fold):
    if study_estimator == BaseModelOptunaStudyEstimator.CATBOOST:
        model = CatBoostClassifier(
            iterations=5000,
            learning_rate=trial_params['learning_rate'],
            depth=trial_params['depth'],
            l2_leaf_reg=trial_params['l2_leaf_reg'],
            bagging_temperature=trial_params['bagging_temperature'],
            random_strength=trial_params['random_strength'],
            min_data_in_leaf=trial_params['min_data_in_leaf'],
            od_type='Iter',
            od_wait=50,
            use_best_model=True,
            cat_features=cat_features,
            eval_metric='AUC',
            task_type='GPU' if torch.cuda.is_available() else 'CPU',
            devices='0',
            metric_period=1000,
            random_seed=RANDOM_SEEDS[0],
            verbose=False,
            allow_writing_files=False
        )
        model.fit(
            X_train_fold, y_train_fold,
            eval_set=(X_validation_fold, y_validation_fold),
            early_stopping_rounds=50
        )
        return model.predict_proba(X_validation_fold)[:, 1]
    elif study_estimator == BaseModelOptunaStudyEstimator.XGBCLASSIFIER:
        model = XGBClassifier(
            n_estimators=5000,
            learning_rate=trial_params['learning_rate'],
            max_depth=trial_params['max_depth'],
            subsample=trial_params['subsample'],
            colsample_bytree=trial_params['colsample_bytree'],
            alpha=trial_params['alpha'],
            gamma=trial_params['gamma'],
            reg_lambda=trial_params['lambda'],
            min_child_weight=trial_params['min_child_weight'],
            tree_method='gpu_hist' if torch.cuda.is_available() else 'auto',
            predictor='gpu_predictor' if torch.cuda.is_available() else 'cpu_predictor',
            device='cuda' if torch.cuda.is_available() else 'cpu',
            enable_categorical=True,
            eval_metric='auc',
            n_jobs=-1,
            random_state=RANDOM_SEEDS[0],
            verbosity=0
        )
        model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_validation_fold, y_validation_fold)],
            early_stopping_rounds=50,
            verbose=False
        )
        return model.predict_proba(X_validation_fold)[:, 1]
    else:
        raise ValueError("Unsupported optuna study estimator")

def base_model_optuna_study_objective(trial):
    base_model_params = get_base_model_optuna_params(trial, BASE_MODEL_OPTUNA_STUDY_ESTIMATOR)

    base_model_optuna_study_skf = StratifiedKFold(n_splits=BASE_MODEL_OPTUNA_STUDY_KFOLD_NUM_SPLITS, shuffle=True, random_state=RANDOM_SEEDS[0])
    base_model_optuna_study_skf_splits = base_model_optuna_study_skf.split(train_data.drop(target_col, axis=1), train_data[target_col])
    base_model_optuna_study_skf_enumeration = enumerate(base_model_optuna_study_skf_splits)

    total_roc_auc = 0

    for fold, (train_indices, validation_indices) in base_model_optuna_study_skf_enumeration:
        X_train_fold = train_data.drop(target_col, axis=1).iloc[train_indices]
        X_validation_fold = train_data.drop(target_col, axis=1).iloc[validation_indices]
        y_train_fold = train_data[target_col].iloc[train_indices]
        y_validation_fold = train_data[target_col].iloc[validation_indices]

        y_validation_pred_proba = get_base_model_predictions(
            BASE_MODEL_OPTUNA_STUDY_ESTIMATOR,
            base_model_params,
            X_train_fold, y_train_fold,
            X_validation_fold, y_validation_fold
        )
        roc_auc_fold = roc_auc_score(y_validation_fold, y_validation_pred_proba)
        total_roc_auc += roc_auc_fold

        trial.report(roc_auc_fold, step=fold)
        if trial.should_prune():
            raise optuna.TrialPruned()

    average_roc_auc = total_roc_auc / BASE_MODEL_OPTUNA_STUDY_KFOLD_NUM_SPLITS
    return average_roc_auc

In [23]:
if SKIP_BASE_MODEL_HYPERPARAMETER_TUNING:
    print("Skipped base model hyperparameter tuning")
else:
    print(f"Started base model hyperparameter tuning for {BASE_MODEL_OPTUNA_STUDY_ESTIMATOR.value}")
    sampler = optuna.samplers.TPESampler(n_ei_candidates=24, multivariate=True)
    study = optuna.create_study(sampler=sampler, direction="maximize")
    study.optimize(base_model_optuna_study_objective, n_trials=BASE_MODEL_OPTUNA_STUDY_NUM_TRIALS)
    
    print(f"# trials finished: {len(study.trials)}")
    trial = study.best_trial
    print(f"Best trial AUC: {trial.value}")
    print(f"Best trial params:")
    for param_key, param_value in trial.params.items():
        print(f"- {param_key}: {param_value}")

Skipped base model hyperparameter tuning


# 7. Base Models

In [24]:
# number of splits to use for Stratified K-Fold Cross-Validation for base models
BASE_MODEL_KFOLD_NUM_SPLITS = 5

## 7.1 CatBoost

### 7.1.1 Helper Methods (CatBoost)

In [25]:
def get_catboost_preds(params_dict, feature_names):
    oof_preds = np.zeros(len(train_data))
    all_test_preds_total = np.zeros(len(test_data))

    for random_seed in RANDOM_SEEDS:
        skf = StratifiedKFold(n_splits=BASE_MODEL_KFOLD_NUM_SPLITS, shuffle=True, random_state=random_seed)
        skf_splits = skf.split(train_data.drop(target_col, axis=1), train_data[target_col])
        skf_enumeration = enumerate(skf_splits)
    
        for fold, (train_indices, validation_indices) in skf_enumeration:
            X_train_fold = train_data.drop(target_col, axis=1).iloc[train_indices]
            X_validation_fold = train_data.drop(target_col, axis=1).iloc[validation_indices]
            y_train_fold = train_data[target_col].iloc[train_indices]
            y_validation_fold = train_data[target_col].iloc[validation_indices]
        
            model = CatBoostClassifier(
                iterations=params_dict['iterations'],
                learning_rate=params_dict['learning_rate'],
                depth=params_dict['depth'],
                l2_leaf_reg=params_dict['l2_leaf_reg'],
                bagging_temperature=params_dict['bagging_temperature'],
                random_strength=params_dict['random_strength'],
                min_data_in_leaf=params_dict['min_data_in_leaf'],
                od_type='Iter',
                od_wait=50,
                use_best_model=True,
                cat_features=cat_features,
                eval_metric='AUC',
                task_type='GPU' if torch.cuda.is_available() else 'CPU',
                devices='0',
                metric_period=1000,
                random_seed=random_seed,
                verbose=False,
                allow_writing_files=False
            )
        
            model.fit(
                X_train_fold, y_train_fold,
                eval_set=(X_validation_fold, y_validation_fold),
                early_stopping_rounds=50
            )
            y_validation_pred_proba = model.predict_proba(X_validation_fold)[:, 1]
            oof_preds[validation_indices] = np.array(y_validation_pred_proba)
        
            y_test_pred_proba = model.predict_proba(test_data)[:, 1]
            all_test_preds_total += np.array(y_test_pred_proba)

    test_preds = all_test_preds_total / (BASE_MODEL_KFOLD_NUM_SPLITS * len(RANDOM_SEEDS))
    return pd.Series(oof_preds), pd.Series(test_preds)

def get_catboost_stacking_estimator(index, params_dict):
    return StackingEstimator(
        name=f"CatBoost_{index}",
        params_dict=params_dict,
        feature_names=train_data.columns.tolist(),
        get_preds=get_catboost_preds
    )

### 7.1.2 Add Estimators (CatBoost)

Add CatBoost estimators to list that StackingPredictionsRetriever will process. Hyperparameters were found using Optuna.

In [26]:
estimators += [
    get_catboost_stacking_estimator(
        index=1,
        params_dict={ # Optuna study AUC: 0.725842155230371
            'iterations': 5000,
            'learning_rate': 0.041779205681346576,
            'depth': 4,
            'l2_leaf_reg': 3.628892496718331,
            'bagging_temperature': 0.1922242909320177,
            'random_strength': 8.464699585881778,
            'min_data_in_leaf': 5,
        }
    ),
     get_catboost_stacking_estimator(
        index=2,
        params_dict={ # Optuna study AUC: 0.7257614687804782
            'iterations': 5000,
            'learning_rate': 0.08955773312600926,
            'depth': 4,
            'l2_leaf_reg': 8.952470035979275,
            'bagging_temperature': 0.21150772067613666,
            'random_strength': 14.741499198080962,
            'min_data_in_leaf': 1,
        }
    ),
]

## 7.2 XGBClassifier

### 7.2.1 Helper Methods (XGBClassifier)

In [27]:
def get_xgbclassifier_preds(params_dict, feature_names):
    oof_preds = np.zeros(len(train_data))
    all_test_preds_total = np.zeros(len(test_data))

    for random_seed in RANDOM_SEEDS:
        skf = StratifiedKFold(n_splits=BASE_MODEL_KFOLD_NUM_SPLITS, shuffle=True, random_state=random_seed)
        skf_splits = skf.split(train_data.drop(target_col, axis=1), train_data[target_col])
        skf_enumeration = enumerate(skf_splits)

        for fold, (train_indices, validation_indices) in skf_enumeration:
            X_train_fold = train_data.drop(target_col, axis=1).iloc[train_indices]
            X_validation_fold = train_data.drop(target_col, axis=1).iloc[validation_indices]
            y_train_fold = train_data[target_col].iloc[train_indices]
            y_validation_fold = train_data[target_col].iloc[validation_indices]

            model = XGBClassifier(
                n_estimators=params_dict['n_estimators'],
                learning_rate=params_dict['learning_rate'],
                max_depth=params_dict['max_depth'],
                subsample=params_dict['subsample'],
                colsample_bytree=params_dict['colsample_bytree'],
                alpha=params_dict['alpha'],
                gamma=params_dict['gamma'],
                reg_lambda=params_dict['lambda'],
                min_child_weight=params_dict['min_child_weight'],
                tree_method='gpu_hist' if torch.cuda.is_available() else 'auto',
                predictor='gpu_predictor' if torch.cuda.is_available() else 'cpu_predictor',
                device='cuda' if torch.cuda.is_available() else 'cpu',
                enable_categorical=True,
                eval_metric='auc',
                n_jobs=-1,
                random_state=random_seed,
                verbosity=0
            )
            model.fit(
                X_train_fold, y_train_fold,
                eval_set=[(X_validation_fold, y_validation_fold)],
                early_stopping_rounds=50,
                verbose=False
            )
    
            y_validation_pred_proba = model.predict_proba(X_validation_fold)[:, 1]
            oof_preds[validation_indices] = np.array(y_validation_pred_proba)
        
            y_test_pred_proba = model.predict_proba(test_data)[:, 1]
            all_test_preds_total += np.array(y_test_pred_proba)

    test_preds = all_test_preds_total / (BASE_MODEL_KFOLD_NUM_SPLITS * len(RANDOM_SEEDS))
    return pd.Series(oof_preds), pd.Series(test_preds)

def get_xgbclassifier_stacking_estimator(index, params_dict):
    return StackingEstimator(
        name=f"XGBClassifier_{index}",
        params_dict=params_dict,
        feature_names=train_data.columns.tolist(),
        get_preds=get_xgbclassifier_preds
    )

### 7.2.2 Add Estimators (XGBClassifier)

Add XGBClassifier estimators to list that StackingPredictionsRetriever will process. Hyperparameters were found using Optuna.

In [28]:
estimators += [
    get_xgbclassifier_stacking_estimator(
        index=1,
        params_dict={ # Optuna study AUC: 0.7273817150393508
            'n_estimators': 5000,
            'learning_rate': 0.047179227853488916,
            'max_depth': 3,
            'subsample': 0.9561594029099818,
            'colsample_bytree': 0.5200809916944509,
            'alpha': 9.323686821094613,
            'gamma': 0.06513704074541844,
            'lambda': 0.07573405175712218,
            'min_child_weight': 14,
        }
    ),
    get_xgbclassifier_stacking_estimator(
        index=2,
        params_dict={ # Optuna study AUC: 0.7274144144696422
            'n_estimators': 5000,
            'learning_rate': 0.06778303256075534,
            'max_depth': 3,
            'subsample': 0.9750702612583769,
            'colsample_bytree': 0.5164463777572837,
            'alpha': 6.677223824702266,
            'gamma': 0.06627215758548254,
            'lambda': 0.10239210156952944,
            'min_child_weight': 17,
        }
    ),
]

# 8. Base Model Predictions

## 8.1 Get Base Model Predictions

In [29]:
stacking_preds_retriever = StackingPredictionsRetriever(
    estimators=estimators,
    working_dir_path="/kaggle/working/",
    train_preds_filename="base_models_train_preds",
    test_preds_filename="base_models_test_preds",
    preds_save_interval=1
)
stacking_preds_retriever.import_preds("/kaggle/input/diabetes-prediction-challenge-base-model-preds/")
stacking_preds_retriever.sync_preds()
stacking_preds_retriever.get_preds()

base_model_train_preds, base_model_test_preds = stacking_preds_retriever.get_current_train_and_test_preds()
base_model_train_preds.sort_index(axis=1, inplace=True, key=lambda index: index.map(lambda col_name: (col_name.split("_")[0], int(col_name.split()[0].split("_")[-1]))))
base_model_test_preds.sort_index(axis=1, inplace=True, key=lambda index: index.map(lambda col_name: (col_name.split("_")[0], int(col_name.split()[0].split("_")[-1]))))

[INFO] Importing predictions..
[INFO] 3 train predictions were imported:
CatBoost_1 (7d0ae66db5ef1214c16dd78e84b3b8b6), CatBoost_2 (b8f1c39dd2e3eb94823bc2d0dec6ab9b), XGBClassifier_1 (9005143af2a6fd0d72e235e429ebcb70)
[INFO] 3 test predictions were imported:
CatBoost_1 (7d0ae66db5ef1214c16dd78e84b3b8b6), CatBoost_2 (b8f1c39dd2e3eb94823bc2d0dec6ab9b), XGBClassifier_1 (9005143af2a6fd0d72e235e429ebcb70)
[INFO] Finished importing predictions
[INFO] Syncing predictions..
[INFO] No columns for training predictions were dropped
[INFO] No columns for test predictions were dropped
[INFO] Finished syncing predictions
[INFO] Getting predictions..
[INFO] Getting predictions for estimator XGBClassifier_2 (ffd11b3c26b03e8e9386ea331f8311b4)
[INFO] Saved predictions
[INFO] Skipped retrieving predictions for following estimators as their current ones are not stale:
CatBoost_1 (7d0ae66db5ef1214c16dd78e84b3b8b6), CatBoost_2 (b8f1c39dd2e3eb94823bc2d0dec6ab9b), XGBClassifier_1 (9005143af2a6fd0d72e235e429eb

## 8.2 Base Models AUC

In [30]:
base_model_auc = pd.Series()
for estimator in base_model_train_preds.columns:
    base_model_auc[estimator] = roc_auc_score(train_data[target_col], base_model_train_preds[estimator])
base_model_auc.sort_values()

CatBoost_1 (7d0ae66db5ef1214c16dd78e84b3b8b6)         0.726079
CatBoost_2 (b8f1c39dd2e3eb94823bc2d0dec6ab9b)         0.726310
XGBClassifier_1 (9005143af2a6fd0d72e235e429ebcb70)    0.727542
XGBClassifier_2 (ffd11b3c26b03e8e9386ea331f8311b4)    0.727581
dtype: float64

# 9. Meta-Model

In [31]:
# number of splits to use for K-Fold Cross-Validation for meta model
META_MODEL_KFOLD_NUM_SPLITS = 5

In [32]:
meta_oof_preds = np.zeros(len(train_data))
meta_test_preds_total = np.zeros(len(test_data))
meta_train_feature_importances_total = np.zeros(len(base_model_train_preds.columns))

kfold = KFold(n_splits=META_MODEL_KFOLD_NUM_SPLITS, shuffle=True, random_state=RANDOM_SEEDS[0])
kfold_splits = kfold.split(train_data.drop(target_col, axis=1), train_data[target_col])
kfold_enumeration = enumerate(kfold_splits)

for fold, (train_indices, validation_indices) in kfold_enumeration:
    X_train_fold = base_model_train_preds.iloc[train_indices]
    X_validation_fold = base_model_train_preds.iloc[validation_indices]
    y_train_fold = train_data[target_col].iloc[train_indices]
    y_validation_fold = train_data[target_col].iloc[validation_indices]

    model = LogisticRegression()
    model.fit(X_train_fold, y_train_fold)
    y_validation_pred_proba = model.predict_proba(X_validation_fold)[:, 1]
    meta_oof_preds[validation_indices] = np.array(y_validation_pred_proba)

    meta_train_feature_importances_total += np.array(model.coef_[0])

    y_test_pred_proba = model.predict_proba(base_model_test_preds)[:, 1]
    meta_test_preds_total += np.array(y_test_pred_proba)

meta_train_feature_importances = meta_train_feature_importances_total / META_MODEL_KFOLD_NUM_SPLITS
meta_test_preds = meta_test_preds_total / META_MODEL_KFOLD_NUM_SPLITS

## 9.3 Meta-Model Feature Importances

In [33]:
meta_model_feature_importances = pd.Series(meta_train_feature_importances)
meta_model_feature_importances.index = base_model_train_preds.columns
meta_model_feature_importances.sort_values(ascending=False)

XGBClassifier_2 (ffd11b3c26b03e8e9386ea331f8311b4)    2.029517
XGBClassifier_1 (9005143af2a6fd0d72e235e429ebcb70)    1.920674
CatBoost_2 (b8f1c39dd2e3eb94823bc2d0dec6ab9b)         1.212187
CatBoost_1 (7d0ae66db5ef1214c16dd78e84b3b8b6)        -0.237199
dtype: float64

## 9.4 Meta-Model AUC

In [34]:
meta_model_auc = roc_auc_score(train_data[target_col], meta_oof_preds)
meta_model_auc

0.7277027093247483

# 10. Submission

In [35]:
# prepare submission
submission = pd.DataFrame({'id': test_data.index, target_col: meta_test_preds})
submission.to_csv('submission.csv', index=False)
print('Submission file prepared.')

Submission file prepared.
