# 1. Overview

This is a notebook for training models to submit predictions to the "Predicting Student Test Scores" Kaggle competition ([playground-series-s6e1](https://www.kaggle.com/competitions/playground-series-s6e1)).

Synthetic data is used for this playground competition, and the objective is to predict, for each student in the test set, the value of the `exam_score` variable (a regression task).

# 2. Setup

## 2.1 Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import copy
import lightgbm as lgb
import optuna
import os
import hashlib as hl # for StackingEstimator
import inspect # for StackingEstimator
import random
import warnings
from catboost import CatBoostRegressor
from enum import Enum
from pathlib import Path # for StackingPredictionsRetriever
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from types import FunctionType
from xgboost import XGBRegressor

# Silence noisy warning categories so that cell outputs stay readable.
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# pandas display options used for the tables printed in this notebook
pd.set_option('display.max_colwidth', None) # Display full column content
pd.set_option('display.max_rows', None) # Display all rows
pd.set_option('display.width', 1000) # Set larger display width


## 2.2 Reproducibility

In [2]:
# Candidate seeds; only the first one is applied to the global RNGs in this cell.
# NOTE(review): the remaining seeds are presumably used by later cells - confirm.
RANDOM_SEEDS = [2, 11, 42, 99, 121]
# seed Python's `random`, NumPy and PyTorch with the same value
random.seed(RANDOM_SEEDS[0])
np.random.seed(RANDOM_SEEDS[0])
torch.manual_seed(RANDOM_SEEDS[0])
# additionally seed the CUDA generators explicitly when a GPU is available
if torch.cuda.is_available():
    torch.cuda.manual_seed(RANDOM_SEEDS[0])
    torch.cuda.manual_seed_all(RANDOM_SEEDS[0])

## 2.3 Device

In [3]:
# use the GPU when CUDA is available, otherwise fall back to the CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 2.4 DataFrames

Read the data provided for the competition into dataframes.

In [4]:
INPUT_DIR = '/kaggle/input'

# name of the column to predict
target_col = "exam_score"

# load the competition files, using the `id` column as the index of each dataframe
orig_train_data = pd.read_csv(
    os.path.join(INPUT_DIR, 'playground-series-s6e1/train.csv')
).set_index('id')
orig_test_data = pd.read_csv(
    os.path.join(INPUT_DIR, 'playground-series-s6e1/test.csv')
).set_index('id')

# 3. Exploratory Data Analysis

In [5]:
# to skip the generation of plots (e.g. KDE) in this section that take time; set to False to generate the plots 
SKIP_PLOTS = True

In [6]:
orig_train_data.describe()

Unnamed: 0,age,study_hours,class_attendance,sleep_hours,exam_score
count,630000.0,630000.0,630000.0,630000.0,630000.0
mean,20.545821,4.002337,71.987261,7.072758,62.506672
std,2.260238,2.35988,17.430098,1.744811,18.916884
min,17.0,0.08,40.6,4.1,19.599
25%,19.0,1.97,57.0,5.6,48.8
50%,21.0,4.0,72.6,7.1,62.6
75%,23.0,6.05,87.2,8.6,76.3
max,24.0,7.91,99.4,9.9,100.0


In [7]:
orig_test_data.describe()

Unnamed: 0,age,study_hours,class_attendance,sleep_hours
count,270000.0,270000.0,270000.0,270000.0
mean,20.544137,4.003878,71.982509,7.07207
std,2.260452,2.357741,17.414695,1.745513
min,17.0,0.08,40.6,4.1
25%,19.0,1.98,57.0,5.6
50%,21.0,4.01,72.6,7.1
75%,23.0,6.05,87.2,8.6
max,24.0,7.91,99.4,9.9


In [8]:
# split the columns of the training data by dtype
numeric_col_names = orig_train_data.select_dtypes(include='number').columns.to_series()
categorical_col_names = orig_train_data.select_dtypes(include='object').columns.to_series()

# sanity check: every column is either numeric or categorical
assert len(numeric_col_names) + len(categorical_col_names) == len(orig_train_data.columns)

# the target is not a feature, so leave it out of the numeric column names
numeric_col_names = numeric_col_names.drop(target_col)

In [9]:
# report the number of missing values per column for both datasets
for (dataset_name, dataset) in [('Train data', orig_train_data), ('Test data', orig_test_data)]:
    # header, per-column null counts and a trailing blank line, one item per line
    print(f"##### {dataset_name} missing values #####", dataset.isnull().sum(), "", sep="\n")

##### Train data missing values #####
age                 0
gender              0
course              0
study_hours         0
class_attendance    0
internet_access     0
sleep_hours         0
sleep_quality       0
study_method        0
facility_rating     0
exam_difficulty     0
exam_score          0
dtype: int64

##### Test data missing values #####
age                 0
gender              0
course              0
study_hours         0
class_attendance    0
internet_access     0
sleep_hours         0
sleep_quality       0
study_method        0
facility_rating     0
exam_difficulty     0
dtype: int64



In [10]:
# report, for both datasets, the share (in percent) of each value of every categorical column
for (dataset_name, dataset) in [('Train data', orig_train_data), ('Test data', orig_test_data)]:
    print(f"##### {dataset_name} categorical cols unique values #####")
    for categorical_col_name in categorical_col_names:
        print(f"\n[{categorical_col_name}]")
        # relative frequencies as percentages rounded to two decimals, formatted as strings
        percentages = dataset[categorical_col_name].value_counts(normalize=True) * 100
        counts = percentages.round(2).astype(str) + '%'
        print(counts.reset_index(name='counts').to_string(index=False, header=False))
    print()

##### Train data categorical cols unique values #####

[gender]
 other 33.51%
  male 33.43%
female 33.07%

[course]
 b.tech 20.83%
   b.sc 17.71%
  b.com 17.61%
    bca 14.08%
    bba 12.01%
     ba  9.84%
diploma  7.92%

[internet_access]
yes 91.97%
 no  8.03%

[sleep_quality]
   poor 33.92%
   good 33.82%
average 32.26%

[study_method]
     coaching  20.9%
   self-study 20.81%
        mixed 19.54%
  group study 19.53%
online videos 19.22%

[facility_rating]
medium 33.98%
   low 33.71%
  high 32.31%

[exam_difficulty]
moderate 56.19%
    easy 28.02%
    hard 15.79%

##### Test data categorical cols unique values #####

[gender]
  male 33.59%
 other 33.37%
female 33.04%

[course]
 b.tech 20.91%
   b.sc 17.67%
  b.com 17.53%
    bca 14.11%
    bba 11.89%
     ba  9.95%
diploma  7.94%

[internet_access]
yes 92.1%
 no  7.9%

[sleep_quality]
   good 33.98%
   poor 33.96%
average 32.06%

[study_method]
     coaching  21.0%
   self-study 20.72%
  group study 19.55%
        mixed 19.52%
onlin

In [11]:
# KDE plots of target variable and numerical features (train data)
if not SKIP_PLOTS:
    # plot the target first, followed by every numeric feature
    kdeplot_col_names = [target_col, *numeric_col_names]
    plt.figure(figsize=(12, 24))
    for i, col in enumerate(kdeplot_col_names, start=1):
        plt.subplot(10, 2, i)
        sns.kdeplot(data=orig_train_data, x=col, fill=True)
        plt.title(f"KDE plot of {col}")
    # adjust the layout once, after every subplot has its title, so that all titles are
    # taken into account (and the layout is not recomputed on every iteration)
    plt.tight_layout()
    plt.show()

In [12]:
# KDE plots of numerical features (test data)
if not SKIP_PLOTS:
    plt.figure(figsize=(12, 24))
    for i, col in enumerate(numeric_col_names, start=1):
        plt.subplot(10, 2, i)
        sns.kdeplot(data=orig_test_data, x=col, fill=True)
        plt.title(f"KDE plot of {col}")
    # adjust the layout once, after every subplot has its title, so that all titles are
    # taken into account (and the layout is not recomputed on every iteration)
    plt.tight_layout()
    plt.show()

In [13]:
# correlation heatmap of the target variable and the numeric features (train data)
if not SKIP_PLOTS:
    # target first, followed by every numeric feature
    corr_matrix_col_names = [target_col, *numeric_col_names]
    plt.figure(figsize=(12, 8))
    sns.heatmap(
        data=orig_train_data[corr_matrix_col_names].corr(),
        vmin=-1,
        vmax=1,
        cmap='Reds',
        annot=True,
        fmt='.2f',
        linewidths=2,
    )
    plt.title('Correlation Matrix of Numeric Features and Target Variable', fontsize=18, pad=20)
    plt.show()

# 4. Data Pre-Processing

In [14]:
# work on copies so that the dataframes loaded from the competition files stay unmodified
# by the pre-processing steps below
train_data = orig_train_data.copy()
test_data = orig_test_data.copy()

## 4.1 Ordinal Encoding

In [15]:
# category -> integer rank for the columns whose values have a natural order
ordinal_mappings = {
    'sleep_quality':   {'poor': 0, 'average': 1, 'good': 2},
    'facility_rating': {'low': 0, 'medium': 1, 'high': 2},
    'exam_difficulty': {'easy': 0, 'moderate': 1, 'hard': 2},
    'internet_access': {'no': 0, 'yes': 1}
}
for col, mapping in ordinal_mappings.items():
    train_data[col] = train_data[col].map(mapping)
    test_data[col]  = test_data[col].map(mapping)
    # `Series.map` silently yields NaN for any value that is not a key of `mapping`
    # (e.g. an unexpected category, or already-encoded values when this cell is run a
    # second time without re-creating the copies); neither dataset has missing values
    # before the encoding, so fail loudly instead of propagating NaNs into the features
    assert train_data[col].notna().all(), f"unmapped values found in train column '{col}'"
    assert test_data[col].notna().all(), f"unmapped values found in test column '{col}'"

## 4.2 Feature Engineering

In [16]:
def add_generated_features(df, train_df_reference=None):
    """
    Adds the generated (engineered) features to `df` in place; nothing is returned.

    :param df:
        The dataframe to add the features to. The ordinally encoded columns
        (e.g. 'sleep_quality', 'exam_difficulty') are used in arithmetic, so they are
        expected to already be numeric.
    :param train_df_reference:
        The dataframe from which the group means, the scaler and the clusters are derived.
        Pass the training data when processing the test data so that no statistics are
        computed from the test data; when omitted, `df` itself is used.
    """
    reference_df = df if train_df_reference is None else train_df_reference

    # polynomial
    for col in ('study_hours', 'class_attendance'):
        df[f'{col}_sq'] = df[col].pow(2)
    df['sleep_parabola'] = (df['sleep_hours'] - 7.5).pow(2)

    # log
    df['study_hours_log'] = np.log1p(df['study_hours'])

    # interactions (pairwise products); the first pair can be read as "total dedication"
    interaction_pairs = [
        ('study_hours', 'class_attendance'),
        ('study_hours', 'sleep_hours'),
        ('study_hours', 'sleep_quality'),
        ('study_hours', 'exam_difficulty'),
        ('study_hours', 'internet_access'),
        ('sleep_hours', 'sleep_quality'),
        ('class_attendance', 'exam_difficulty'),
        ('class_attendance', 'facility_rating'),
    ]
    for left_col, right_col in interaction_pairs:
        df[f'{left_col}_x_{right_col}'] = df[left_col] * df[right_col]

    # ratios (small constant added to the denominator to avoid division by zero)
    df['study_hours_per_class_attendance'] = df['study_hours'] / (df['class_attendance'] + 1e-5)

    # group-based relative features: value minus the reference mean of the row's group
    relative_feature_specs = [
        # (new column, value column, grouping column)
        ('study_hours_vs_difficulty_mean', 'study_hours', 'exam_difficulty'),
        ('attendance_vs_sleep_mean', 'class_attendance', 'sleep_quality'),
        ('sleep_hours_vs_quality_mean', 'sleep_hours', 'sleep_quality'),
    ]
    for new_col, value_col, group_col in relative_feature_specs:
        group_means = reference_df.groupby(group_col)[value_col].mean()
        df[new_col] = df[value_col] - df[group_col].map(group_means)

    # composite indices
    df['resource_score'] = df['facility_rating'] + df['internet_access']
    df['rest_index'] = df['sleep_quality'] + (df['sleep_hours'] / 2.0)
    df['total_advantage_index'] = df['resource_score'] + df['sleep_quality']

    # unsupervised clustering: scaler and clusters are fitted on the reference data only,
    # then applied to `df`
    kmeans_cols = ['study_hours', 'class_attendance', 'sleep_hours']
    scaler = StandardScaler().fit(reference_df[kmeans_cols])
    kmeans = KMeans(n_clusters=5, random_state=RANDOM_SEEDS[0], n_init=10)
    kmeans.fit(scaler.transform(reference_df[kmeans_cols]))
    df['student_profile_cluster'] = kmeans.predict(scaler.transform(df[kmeans_cols]))
    
    

In [17]:
# add generated features
add_generated_features(train_data)
add_generated_features(test_data, train_df_reference=train_data)

In [18]:
train_data.columns

Index(['age', 'gender', 'course', 'study_hours', 'class_attendance', 'internet_access', 'sleep_hours', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty', 'exam_score', 'study_hours_sq', 'class_attendance_sq', 'sleep_parabola', 'study_hours_log', 'study_hours_x_class_attendance', 'study_hours_x_sleep_hours', 'study_hours_x_sleep_quality', 'study_hours_x_exam_difficulty', 'study_hours_x_internet_access', 'sleep_hours_x_sleep_quality', 'class_attendance_x_exam_difficulty', 'class_attendance_x_facility_rating', 'study_hours_per_class_attendance', 'study_hours_vs_difficulty_mean', 'attendance_vs_sleep_mean', 'sleep_hours_vs_quality_mean', 'resource_score', 'rest_index', 'total_advantage_index', 'student_profile_cluster'], dtype='object')

In [19]:
test_data.columns

Index(['age', 'gender', 'course', 'study_hours', 'class_attendance', 'internet_access', 'sleep_hours', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty', 'study_hours_sq', 'class_attendance_sq', 'sleep_parabola', 'study_hours_log', 'study_hours_x_class_attendance', 'study_hours_x_sleep_hours', 'study_hours_x_sleep_quality', 'study_hours_x_exam_difficulty', 'study_hours_x_internet_access', 'sleep_hours_x_sleep_quality', 'class_attendance_x_exam_difficulty', 'class_attendance_x_facility_rating', 'study_hours_per_class_attendance', 'study_hours_vs_difficulty_mean', 'attendance_vs_sleep_mean', 'sleep_hours_vs_quality_mean', 'resource_score', 'rest_index', 'total_advantage_index', 'student_profile_cluster'], dtype='object')

## 4.3 Remaining Categorical Features

In [20]:
# 'category' is included in addition to 'object' so that re-running this cell (after the
# columns have already been converted below) yields the same list instead of an empty one
cat_features = (
    train_data
    .drop(columns=target_col)
    .select_dtypes(include=['object', 'category'])
    .columns
    .to_list()
)
# convert the remaining (non-ordinal) categorical columns to the pandas `category` dtype
for col in cat_features:
    train_data[col] = train_data[col].astype('category')
    test_data[col] = test_data[col].astype('category')
cat_features

['gender', 'course', 'study_method']

# 5. Stacking Initial Setup

We'll use stacking, an [ensemble learning](https://en.wikipedia.org/wiki/Ensemble_learning) strategy, to generate the predictions. As we'll need to gather predictions from various base models (a.k.a. level-0 models) to feed as input features to a meta model (a.k.a. level-1 model), in order to streamline the process of experimenting with different combinations of base models, some helper classes will be defined in this section. These classes can also be found [here](https://github.com/chuo-v/machine-learning-utils/blob/master/ensemble-learning/stacking/stacking_predictions_retriever.py) at one of my GitHub repositories used to organize some utilities I implemented for machine learning.

In [21]:
class StackingEstimator:
    """
    A class representing an estimator that will be used for stacking, an ensemble learning strategy.

    Intended to be used in conjunction with the `StackingPredictionsRetriever` class, which helps
    retrieve predictions for multiple instances of `StackingEstimator`; as the predictions are saved
    in files, on subsequent requests to retrieve predictions, even as the set of estimators has been
    modified, the `StackingPredictionsRetriever` class can determine the predictions of estimators
    that are non-stale and available (if any) by using the `get_hash_value` method of the
    `StackingEstimator` class to determine the relevance and staleness of any saved predictions.

    Proper usage of this class requires one important condition to be satisfied: the predictions made
    using the estimator are deterministic, i.e. they are exactly the same every time the estimator is
    run with the same inputs (`name`, `params_dict`, `feature_names`, `get_preds`).

    Note that `get_preds` is not an input of the hash value (see `get_hash_value`), so changing only
    the implementation of `get_preds` does not change the hash value of the estimator.
    """
    # Instance attributes, all assigned in `__init__`. They are only declared here (without
    # values) so that no mutable default object is shared between instances.
    name: str
    params_dict: dict
    feature_names: list
    get_preds: FunctionType

    def __init__(
        self,
        name: str,
        feature_names: list,
        params_dict: dict,
        get_preds: FunctionType
    ):
        """
        Initializes a new instance of `StackingEstimator`.

        :param name:
            A string representing a name for the estimator. It is used for the column names of
            the training and test predictions for each estimator, and is also used as an input
            to calculate a hash value for the estimator. It is recommended to use a different
            name from the names used for other estimators passed to `StackingPredictionsRetriever`.
        :param feature_names:
            A list of strings representing the names of the features that will be used for the
            estimator. It will be passed as an argument to `get_preds`. Internally, it is only
            used as an input to calculate a hash value for the estimator.
        :param params_dict:
            A dictionary of parameters that will be specified for the estimator. It will be
            passed as an argument to `get_preds`. Internally, it is only used as an input
            to calculate a hash value for the estimator.
        :param get_preds:
            A function for getting the predictions for the estimator. It should only take two
            arguments: 'params_dict' and 'feature_names', and should return predictions for
            the training and test data (in that order) as a tuple of two `pandas.Series`.
        :raises ValueError:
            If any of the arguments does not satisfy the requirements described above.
        """
        # parameter check
        if not isinstance(name, str):
            raise ValueError("`name` argument should be of type `str`")
        if not isinstance(feature_names, list):
            raise ValueError(f"`feature_names` argument for estimator \"{name}\" should be of type `list`")
        elif not all(isinstance(feature_name, str) for feature_name in feature_names):
            raise ValueError(f"`feature_names` argument for estimator \"{name}\" should only contain instances of `str`")
        if not isinstance(params_dict, dict):
            raise ValueError(f"`params_dict` argument for estimator \"{name}\" should be of type `dict`")
        # `inspect.signature` raises a `TypeError` for non-callables; check explicitly so that
        # every invalid argument is reported with a `ValueError`
        if not callable(get_preds):
            raise ValueError(f"`get_preds` argument for estimator \"{name}\" should be callable")
        get_preds_param_names = list(inspect.signature(get_preds).parameters)
        if len(get_preds_param_names) != 2:
            raise ValueError(f"`get_preds` function for estimator \"{name}\" should take two arguments")
        elif "params_dict" not in get_preds_param_names:
            raise ValueError(f"`get_preds` function for estimator \"{name}\" should take a \"params_dict\" argument")
        elif "feature_names" not in get_preds_param_names:
            raise ValueError(f"`get_preds` function for estimator \"{name}\" should take a \"feature_names\" argument")

        self.name = name
        # shallow copies, so that the caller mutating the passed-in list/dict afterwards
        # (e.g. reusing one dict for several estimators) cannot change this estimator's hash value
        self.feature_names = list(feature_names)
        self.params_dict = dict(params_dict)
        self.get_preds = get_preds

    def get_hash_value(self):
        """
        Calculates and returns a hash value (MD5 hex digest) for the estimator using
        `name`, `feature_names` and `params_dict` as inputs.

        Feature names and parameters are sorted first, so their order does not affect the
        hash value. The way the hash input is built determines which saved predictions are
        recognized, so changing it makes previously saved predictions unrecognizable.
        """
        feature_names_str = "_".join(sorted(self.feature_names))
        params_dict_str = "_".join(f"{key}-{value}" for (key, value) in sorted(self.params_dict.items()))
        hash_input_str = "_".join([self.name, feature_names_str, params_dict_str])
        return hl.md5(hash_input_str.encode('utf-8')).hexdigest()

class StackingPredictionsRetriever:
    """
    A class for streamlining stacking (an ensemble learning strategy) that saves predictions
    from estimators to file so that when trying out different combinations of (base) estimators,
    the predictions that are not stale can be reused, saving the time of having the estimators
    make predictions again.

    Intended to be used in conjunction with the `StackingEstimator` class. The value returned by
    `get_hash_value` of `StackingEstimator` is used to determine the staleness and relevance of the
    predictions for an estimator. The implementation for making predictions using an estimator
    needs to be provided as a function to `get_preds` for `StackingEstimator`; when predictions need
    to be made using an estimator, this class will call `get_preds` for the `StackingEstimator`
    instance.

    Predictions are stored by row position (the files are written without an index), so every
    estimator is expected to return its predictions in the same row order.

    Proper usage of this class requires one important condition to be satisfied: the predictions made
    using the estimators are deterministic, i.e. they are exactly the same every time a
    `StackingEstimator` instance is run with the same inputs.
    """
    # Instance attributes, all assigned in `__init__`. They are only declared here (without
    # values) so that no mutable default object is shared between instances.
    estimators: list
    working_dir_path: str
    train_preds_filename: str
    test_preds_filename: str
    preds_save_interval: int

    def __init__(
        self,
        estimators: "list[StackingEstimator]",
        working_dir_path: str,
        train_preds_filename: str = "train_preds",
        test_preds_filename: str = "test_preds",
        preds_save_interval: int = 5,
    ):
        """
        Initializes a new instance of `StackingPredictionsRetriever`.

        :param estimators:
            A list of `StackingEstimator` instances for which the class will retrieve predictions.
        :param working_dir_path:
            The path for the working directory where the files with predictions will be saved.
            The directory is created when predictions are saved if it does not exist yet.
        :param train_preds_filename:
            The name of the file in which predictions for the training set will be stored.
        :param test_preds_filename:
            The name of the file in which predictions for the test set will be stored.
        :param preds_save_interval:
            A positive integer which specifies the interval at which predictions will be saved
            when `get_preds` is called, corresponding to the number of estimators whose predictions
            have been retrieved since the predictions were previously saved. Any estimators
            whose predictions are not stale and therefore were not required to make predictions
            again are not included in this number.
        :raises ValueError:
            If any of the arguments does not satisfy the requirements described above.
        """
        # parameter check
        if not isinstance(estimators, list):
            raise ValueError("`estimators` must be passed as a list")
        if not all(isinstance(e, StackingEstimator) for e in estimators):
            raise ValueError("`estimators` should only contain instances of `StackingEstimator`")
        if not isinstance(working_dir_path, str):
            raise ValueError("`working_dir_path` argument should be of type `str`")
        if not isinstance(preds_save_interval, int):
            raise ValueError("`preds_save_interval` argument should be of type `int`")
        # an interval of 0 would otherwise only fail later, in the middle of `get_preds`
        if preds_save_interval < 1:
            raise ValueError("`preds_save_interval` argument should be a positive integer")

        self.estimators = estimators
        self.working_dir_path = working_dir_path
        self.train_preds_filename = train_preds_filename
        self.test_preds_filename = test_preds_filename
        self.preds_save_interval = preds_save_interval

    # ----- private helpers -----

    @staticmethod
    def _read_working_preds(file_path):
        """
        Reads the predictions stored at `file_path`; returns an empty dataframe if the file
        does not exist or has no data (in the latter case the file is removed).
        """
        if not file_path.is_file():
            return pd.DataFrame()
        try:
            return pd.read_csv(file_path)
        except pd.errors.EmptyDataError:
            file_path.unlink()
            return pd.DataFrame()

    @staticmethod
    def _read_input_preds(file_path):
        """
        Best-effort read of predictions to import from `file_path`; returns an empty
        dataframe (after logging the reason) if the file is missing or cannot be read.
        """
        if not file_path.is_file():
            return pd.DataFrame()
        try:
            return pd.read_csv(file_path)
        except Exception as error:
            print(f"[WARN] Could not read predictions from {file_path}: {error}")
            return pd.DataFrame()

    @staticmethod
    def _save_preds(preds, file_path, sort_columns=True):
        """
        Writes `preds` to `file_path` without the index (optionally with the columns sorted by
        name), creating the parent directory first if needed.
        """
        file_path.parent.mkdir(parents=True, exist_ok=True)
        if sort_columns:
            preds = preds.sort_index(axis=1)
        preds.to_csv(file_path, index=False)

    @staticmethod
    def _find_col_name(preds, hash_value):
        """
        Returns the name of the first column of `preds` containing `hash_value`, or `None`.
        """
        return next((col_name for col_name in preds.columns if hash_value in col_name), None)

    @staticmethod
    def _set_preds_column(preds, col_name, values, description):
        """
        Stores `values` (a `pandas.Series`) as column `col_name` of `preds` by row position.

        The files are written without an index, so dataframes read back from them always have
        a default index; aligning on the index of `values` would then produce NaNs (or silently
        drop rows) whenever `values` carries another index, e.g. the ids of the test data.
        """
        if len(preds.columns) > 0 and len(preds) != len(values):
            raise ValueError(
                f"{description} predictions for \"{col_name}\" have {len(values)} rows, "
                f"but the current predictions have {len(preds)} rows"
            )
        preds[col_name] = values.to_numpy()

    # ----- public API -----

    def get_train_preds_file_path(self):
        """
        Returns the file path for storing predictions for training data.
        """
        return Path(self.working_dir_path) / f"{self.train_preds_filename}.csv"

    def get_test_preds_file_path(self):
        """
        Returns the file path for storing predictions for test data.
        """
        return Path(self.working_dir_path) / f"{self.test_preds_filename}.csv"

    def get_current_train_and_test_preds(self):
        """
        Returns the current predictions for training and test data (in that order)
        as a tuple of two `pandas.DataFrame`.

        The predictions are attempted to be retrieved from the file paths returned
        by `get_train_preds_file_path` and `get_test_preds_file_path`; if a file does
        not exist or contains no data, an empty dataframe is returned for it instead.
        In the case a `pandas.errors.EmptyDataError` exception is raised when
        reading from a file, the corresponding file will be removed.
        """
        curr_train_preds = self._read_working_preds(self.get_train_preds_file_path())
        curr_test_preds = self._read_working_preds(self.get_test_preds_file_path())
        return curr_train_preds, curr_test_preds

    def get_preds(self):
        """
        Retrieves predictions from all estimators in `estimators`, storing them in
        two files at the file paths specified by `working_dir_path`,
        `train_preds_filename` and `test_preds_filename`.

        If non-stale (relevant) predictions are found for an estimator, retrieval
        of predictions by calling `get_preds` on the estimator will be skipped,
        and the existing predictions for the estimator will be kept.

        :raises ValueError:
            If an estimator does not return two `pandas.Series`, or if the number of rows
            of its predictions differs from that of the predictions already stored.
        """
        print("[INFO] Getting predictions..")
        curr_train_preds, curr_test_preds = self.get_current_train_and_test_preds()
        train_preds_file_path = self.get_train_preds_file_path()
        test_preds_file_path = self.get_test_preds_file_path()

        num_preds_retrieved_but_not_yet_saved = 0
        estimators_skipped = []

        for estimator in self.estimators:
            estimator_hash_value = estimator.get_hash_value()
            estimator_name = f"{estimator.name} ({estimator_hash_value})"

            # skip retrieving predictions for estimator if non-stale predictions are already available
            train_preds_available = self._find_col_name(curr_train_preds, estimator_hash_value) is not None
            test_preds_available = self._find_col_name(curr_test_preds, estimator_hash_value) is not None
            if train_preds_available and test_preds_available:
                estimators_skipped.append(estimator_name)
                continue

            print(f"[INFO] Getting predictions for estimator {estimator_name}")
            train_preds, test_preds = estimator.get_preds(estimator.params_dict, estimator.feature_names)
            if not isinstance(train_preds, pd.Series):
                raise ValueError("`train_preds` should be of type `pandas.Series`")
            if not isinstance(test_preds, pd.Series):
                raise ValueError("`test_preds` should be of type `pandas.Series`")
            self._set_preds_column(curr_train_preds, estimator_name, train_preds, "train")
            self._set_preds_column(curr_test_preds, estimator_name, test_preds, "test")
            num_preds_retrieved_but_not_yet_saved += 1

            # save predictions at an interval of `preds_save_interval`
            if num_preds_retrieved_but_not_yet_saved >= self.preds_save_interval:
                self._save_preds(curr_train_preds, train_preds_file_path)
                self._save_preds(curr_test_preds, test_preds_file_path)
                num_preds_retrieved_but_not_yet_saved = 0
                print("[INFO] Saved predictions")

        if estimators_skipped:
            estimators_skipped.sort()
            formatted_estimators = ", ".join(estimators_skipped)
            print(f"[INFO] Skipped retrieving predictions for following estimators as their current ones are not stale:\n{formatted_estimators}")

        # save whatever was retrieved since the last interval-based save
        if num_preds_retrieved_but_not_yet_saved != 0:
            self._save_preds(curr_train_preds, train_preds_file_path)
            self._save_preds(curr_test_preds, test_preds_file_path)
            print("[INFO] Saved predictions")

        print("[INFO] Finished getting all predictions")

    def sync_preds(self):
        """
        Syncs the predictions stored at the two file paths specified by
        `working_dir_path`, `train_preds_filename` and `test_preds_filename` by
        removing predictions for any estimator that is not currently in `estimators`.

        Note that new predictions for estimators that do not currently have predictions
        in the files will not be added; `get_preds` should be used for this purpose
        instead.
        """
        print("[INFO] Syncing predictions..")
        estimator_hash_values = [estimator.get_hash_value() for estimator in self.estimators]

        def should_remove_col(col_name):
            # a column is kept only if it belongs to one of the current estimators
            return not any(hash_value in col_name for hash_value in estimator_hash_values)

        curr_train_preds, curr_test_preds = self.get_current_train_and_test_preds()

        if not curr_train_preds.empty:
            col_names_to_remove = [col_name for col_name in curr_train_preds.columns if should_remove_col(col_name)]
            if col_names_to_remove:
                print(f"[INFO] Dropping columns for following estimators from training predictions:\n{col_names_to_remove}")
                curr_train_preds = curr_train_preds.drop(columns=col_names_to_remove)
                self._save_preds(curr_train_preds, self.get_train_preds_file_path(), sort_columns=False)
            else:
                print("[INFO] No columns for training predictions were dropped")
        if not curr_test_preds.empty:
            col_names_to_remove = [col_name for col_name in curr_test_preds.columns if should_remove_col(col_name)]
            if col_names_to_remove:
                print(f"[INFO] Dropping columns for following estimators from test predictions:\n{col_names_to_remove}")
                curr_test_preds = curr_test_preds.drop(columns=col_names_to_remove)
                self._save_preds(curr_test_preds, self.get_test_preds_file_path(), sort_columns=False)
            else:
                print("[INFO] No columns for test predictions were dropped")

        print("[INFO] Finished syncing predictions")

    def import_preds(self, input_dir_path):
        """
        Imports predictions stored at the two file paths at `input_dir_path` with
        `train_preds_filename` and `test_preds_filename` as their filenames. If no
        such files are found (or they cannot be read), no predictions will be imported.

        Only predictions for estimators specified in `estimators` will be imported.
        Any predictions for estimators that were already available will be overwritten
        with predictions for the same estimators found in the files at `input_dir_path`.

        :param input_dir_path:
            The path to the directory for the training and test predictions files.
            The file names are expected to be the same as `train_preds_filename`
            and `test_preds_filename`
        :raises ValueError:
            If the number of rows of imported predictions differs from that of the
            predictions already stored.
        """
        print("[INFO] Importing predictions..")
        curr_train_preds, curr_test_preds = self.get_current_train_and_test_preds()
        input_train_preds = self._read_input_preds(Path(input_dir_path) / f"{self.train_preds_filename}.csv")
        input_test_preds = self._read_input_preds(Path(input_dir_path) / f"{self.test_preds_filename}.csv")

        estimators_with_imported_train_preds = []
        estimators_with_imported_test_preds = []
        for estimator in self.estimators:
            estimator_hash_value = estimator.get_hash_value()
            estimator_name = f"{estimator.name} ({estimator_hash_value})"
            # read from the column that actually matched the hash value, rather than assuming
            # that the matching column is named exactly `estimator_name`
            input_train_col_name = self._find_col_name(input_train_preds, estimator_hash_value)
            input_test_col_name = self._find_col_name(input_test_preds, estimator_hash_value)

            if input_train_col_name is not None:
                self._set_preds_column(curr_train_preds, estimator_name, input_train_preds[input_train_col_name], "train")
                estimators_with_imported_train_preds.append(estimator_name)
            if input_test_col_name is not None:
                self._set_preds_column(curr_test_preds, estimator_name, input_test_preds[input_test_col_name], "test")
                estimators_with_imported_test_preds.append(estimator_name)

        if not estimators_with_imported_train_preds:
            print("[INFO] No train predictions were imported")
        else:
            self._save_preds(curr_train_preds, self.get_train_preds_file_path())
            formatted_estimators = ", ".join(estimators_with_imported_train_preds)
            print(f"[INFO] {len(estimators_with_imported_train_preds)} train predictions were imported:\n{formatted_estimators}")
        if not estimators_with_imported_test_preds:
            print("[INFO] No test predictions were imported")
        else:
            self._save_preds(curr_test_preds, self.get_test_preds_file_path())
            formatted_estimators = ", ".join(estimators_with_imported_test_preds)
            print(f"[INFO] {len(estimators_with_imported_test_preds)} test predictions were imported:\n{formatted_estimators}")

        print("[INFO] Finished importing predictions")

    def clear_preds(self):
        """
        Removes all stored predictions by deleting the two files at filepaths specified
        by `working_dir_path`, `train_preds_filename` and `test_preds_filename`.
        """
        for preds_file_path in (self.get_train_preds_file_path(), self.get_test_preds_file_path()):
            if preds_file_path.is_file():
                preds_file_path.unlink()

        print("[INFO] Finished clearing predictions")

Next, we'll create a variable for storing the estimators (`StackingEstimator` instances) that we'll pass to the `StackingPredictionsRetriever` class for getting all the predictions from our base models.

In [22]:
estimators = []

# 6. Feature Sets

In [23]:
FEATURE_SET_1 = [
    # original features
    'gender',
    'course',
    'study_hours',
    'class_attendance',
    'internet_access',
    'sleep_hours',
    'sleep_quality',
    'study_method',
    'facility_rating',
    'exam_difficulty',
    # polynomial features
    'study_hours_sq',
    'class_attendance_sq',
    'sleep_parabola',
    # log features
    'study_hours_log',
    # interaction features
    'study_hours_x_class_attendance',
    'study_hours_x_sleep_hours',
    'study_hours_x_sleep_quality',
    'study_hours_x_exam_difficulty',
    'study_hours_x_internet_access',
    'sleep_hours_x_sleep_quality',
    'class_attendance_x_exam_difficulty',
    'class_attendance_x_facility_rating',
    # ratio features
    'study_hours_per_class_attendance',
]

FEATURE_SET_2 = [
    # same features as FEATURE_SET_1 (copied into a new, independent list)
    *FEATURE_SET_1,
    # new generated features not included in FEATURE_SET_1
    'study_hours_vs_difficulty_mean',
    'attendance_vs_sleep_mean',
    'sleep_hours_vs_quality_mean',
    'resource_score',
    'rest_index',
    'total_advantage_index',
    'student_profile_cluster',
]

# 7. Base Model Hyperparameter Tuning

In [24]:
# to skip hyperparameter tuning when it's not needed; set to `False` to do the tuning
SKIP_BASE_MODEL_HYPERPARAMETER_TUNING = True

# value set for early stopping for base models that support it; this value will be used for actual model training as well
BASE_MODEL_EARLY_STOPPING_ROUNDS = 300

In [25]:
class BaseModelOptunaStudyEstimator(Enum):
    """Estimator families that the base-model Optuna study can tune.

    Each member's value is the estimator's class name; the tuning cell prints it
    in its "Started base model hyperparameter tuning for ..." message.
    """
    CATBOOSTREGRESSOR = 'CatBoostRegressor'
    LGBMREGRESSOR = 'LGBMRegressor'
    XGBREGRESSOR = 'XGBRegressor'

In [26]:
# estimator to use for Optuna study
BASE_MODEL_OPTUNA_STUDY_ESTIMATOR = BaseModelOptunaStudyEstimator.LGBMREGRESSOR

# feature set to use for Optuna study
# (get_base_model_optuna_params only defines a FEATURE_SET_2 search space for LGBMRegressor;
# choosing CatBoostRegressor or XGBRegressor with FEATURE_SET_2 raises ValueError)
BASE_MODEL_OPTUNA_STUDY_FEATURE_SET = FEATURE_SET_2

# maximum number of trials Optuna will conduct for the optimization
BASE_MODEL_OPTUNA_STUDY_NUM_TRIALS = 200

# number of splits to use for K-Fold Cross-Validation for Optuna study
# (the study objective returns the mean RMSE over these folds)
BASE_MODEL_OPTUNA_STUDY_KFOLD_NUM_SPLITS = 3

In [27]:
def get_base_model_optuna_params(trial, study_estimator):
    """Sample one hyperparameter configuration for the requested estimator family.

    The search space depends on both `study_estimator` and the module-level
    BASE_MODEL_OPTUNA_STUDY_FEATURE_SET (compared by value against FEATURE_SET_1 /
    FEATURE_SET_2).

    Args:
        trial: Optuna trial whose `suggest_*` methods draw the values.
        study_estimator: a BaseModelOptunaStudyEstimator member.

    Returns:
        dict mapping hyperparameter names to the values suggested for this trial.

    Raises:
        ValueError: if the estimator is not supported, or no search space is defined
            for the estimator with the configured feature set.
    """
    feature_set = BASE_MODEL_OPTUNA_STUDY_FEATURE_SET

    if study_estimator == BaseModelOptunaStudyEstimator.CATBOOSTREGRESSOR:
        if feature_set != FEATURE_SET_1:
            raise ValueError(f"Search space for feature set for Optuna study not yet specified for CatBoostRegressor.")
        return {
            'learning_rate': trial.suggest_float('learning_rate', 0.010, 0.015),
            'depth': trial.suggest_categorical('depth', [6]),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 2.0, 4.0, log=True),
            'random_strength': trial.suggest_float('random_strength', 1.0, 2.5),
            'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.2, 0.3),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 85, 105),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 0.2),
        }

    if study_estimator == BaseModelOptunaStudyEstimator.LGBMREGRESSOR:
        # The two LGBM search spaces differ only in the num_leaves and
        # colsample_bytree bounds; everything else is shared.
        if feature_set == FEATURE_SET_1:
            num_leaves_low, num_leaves_high = 85, 105
            colsample_low, colsample_high = 0.15, 0.25
        elif feature_set == FEATURE_SET_2:
            num_leaves_low, num_leaves_high = 70, 110
            colsample_low, colsample_high = 0.10, 0.25
        else:
            raise ValueError(f"Search space for feature set for Optuna study not yet specified for LGBMRegressor.")
        return {
            'learning_rate': trial.suggest_float('learning_rate', 0.0045, 0.0065),
            'num_leaves': trial.suggest_int('num_leaves', num_leaves_low, num_leaves_high),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 15),
            'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10.0, log=True),
            'min_split_gain': trial.suggest_float('min_split_gain', 1e-3, 0.1, log=True),
            'subsample': trial.suggest_float('subsample', 0.85, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', colsample_low, colsample_high),
            'reg_alpha': trial.suggest_float('reg_alpha', 1.0, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 5.0, 40.0, log=True),
            'extra_trees': trial.suggest_categorical('extra_trees', [True, False]),
            'path_smooth': trial.suggest_float('path_smooth', 0.0, 5.0),
        }

    if study_estimator == BaseModelOptunaStudyEstimator.XGBREGRESSOR:
        if feature_set != FEATURE_SET_1:
            raise ValueError(f"Search space for feature set for Optuna study not yet specified for XGBRegressor.")
        return {
            'learning_rate': trial.suggest_float('learning_rate', 0.010, 0.013),
            'max_depth': trial.suggest_categorical('max_depth', [6]),
            'subsample': trial.suggest_float('subsample', 0.95, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.22, 0.26),
            'alpha': trial.suggest_float('alpha', 2.0, 8.0, log=True),
            'lambda': trial.suggest_float('lambda', 1e-4, 0.1, log=True),
            'gamma': trial.suggest_float('gamma', 1e-4, 0.01, log=True),
            'min_child_weight': trial.suggest_int('min_child_weight', 30, 37),
        }

    raise ValueError("Unsupported optuna study estimator")

def get_base_model_predictions(study_estimator, trial_params, X_train_fold, y_train_fold, X_validation_fold, y_validation_fold):
    """Fit one base model on a training fold and predict its validation fold.

    The validation fold is also supplied as the early-stopping evaluation set, with
    BASE_MODEL_EARLY_STOPPING_ROUNDS as the patience. Every model is seeded with
    RANDOM_SEEDS[0].

    Args:
        study_estimator: a BaseModelOptunaStudyEstimator member selecting the model family.
        trial_params: hyperparameters forwarded to the model constructor.
        X_train_fold, y_train_fold: features and target used for fitting.
        X_validation_fold, y_validation_fold: features and target of the held-out fold.

    Returns:
        The fitted model's (unclipped) predictions for X_validation_fold.

    Raises:
        ValueError: if `study_estimator` is not one of the supported members.
    """
    if study_estimator == BaseModelOptunaStudyEstimator.CATBOOSTREGRESSOR:
        model = CatBoostRegressor(
            **trial_params,
            iterations=30000,
            use_best_model=True,
            cat_features=cat_features,
            bootstrap_type='Bayesian',
            loss_function='RMSE',
            eval_metric='RMSE',
            task_type='GPU' if torch.cuda.is_available() else 'CPU',
            devices='0',
            random_seed=RANDOM_SEEDS[0],
            verbose=False,
            allow_writing_files=False
        )
        fit_options = {
            'eval_set': (X_validation_fold, y_validation_fold),
            'early_stopping_rounds': BASE_MODEL_EARLY_STOPPING_ROUNDS,
        }
    elif study_estimator == BaseModelOptunaStudyEstimator.LGBMREGRESSOR:
        model = lgb.LGBMRegressor(
            **trial_params,
            n_estimators=30000,
            objective='regression',
            metric='rmse',
            bagging_freq=1,
            verbose=-1,
            n_jobs=-1,
            random_state=RANDOM_SEEDS[0]
        )
        fit_options = {
            'eval_set': [(X_validation_fold, y_validation_fold)],
            'callbacks': [lgb.early_stopping(stopping_rounds=BASE_MODEL_EARLY_STOPPING_ROUNDS, verbose=0)],
        }
    elif study_estimator == BaseModelOptunaStudyEstimator.XGBREGRESSOR:
        gpu_available = torch.cuda.is_available()
        # XGBoost takes its early-stopping patience in the constructor, not in fit().
        model = XGBRegressor(
            **trial_params,
            n_estimators=30000,
            tree_method='hist' if gpu_available else 'auto',
            device='cuda' if gpu_available else 'cpu',
            enable_categorical=True,
            objective='reg:squarederror',
            eval_metric='rmse',
            early_stopping_rounds=BASE_MODEL_EARLY_STOPPING_ROUNDS,
            n_jobs=-1,
            random_state=RANDOM_SEEDS[0],
            verbosity=0
        )
        fit_options = {
            'eval_set': [(X_validation_fold, y_validation_fold)],
            'verbose': False,
        }
    else:
        raise ValueError("Unsupported optuna study estimator")

    model.fit(X_train_fold, y_train_fold, **fit_options)
    return model.predict(X_validation_fold)

def base_model_optuna_study_objective(trial):
    """Optuna objective: mean cross-validated RMSE of one sampled configuration.

    Samples hyperparameters for BASE_MODEL_OPTUNA_STUDY_ESTIMATOR, evaluates them with
    a shuffled K-fold split of `train_data` (seeded with RANDOM_SEEDS[0]) and reports
    each fold's RMSE to the trial so the study's pruner can stop it early.

    Args:
        trial: the Optuna trial being evaluated.

    Returns:
        The sum of the fold RMSEs divided by BASE_MODEL_OPTUNA_STUDY_KFOLD_NUM_SPLITS.

    Raises:
        optuna.TrialPruned: when the trial reports that it should be pruned.
    """
    sampled_params = get_base_model_optuna_params(trial, BASE_MODEL_OPTUNA_STUDY_ESTIMATOR)

    features = train_data[BASE_MODEL_OPTUNA_STUDY_FEATURE_SET]
    target = train_data[target_col]

    splitter = KFold(
        n_splits=BASE_MODEL_OPTUNA_STUDY_KFOLD_NUM_SPLITS,
        shuffle=True,
        random_state=RANDOM_SEEDS[0],
    )

    fold_scores = []
    for fold_number, (fit_rows, holdout_rows) in enumerate(splitter.split(features, target)):
        holdout_target = target.iloc[holdout_rows]

        holdout_pred = get_base_model_predictions(
            BASE_MODEL_OPTUNA_STUDY_ESTIMATOR,
            sampled_params,
            features.iloc[fit_rows], target.iloc[fit_rows],
            features.iloc[holdout_rows], holdout_target
        )
        # predictions are limited to the [0, 100] range before scoring
        holdout_pred = np.clip(holdout_pred, 0, 100)

        fold_score = np.sqrt(mean_squared_error(holdout_target, holdout_pred))
        fold_scores.append(fold_score)

        trial.report(fold_score, step=fold_number)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return sum(fold_scores) / BASE_MODEL_OPTUNA_STUDY_KFOLD_NUM_SPLITS

In [28]:
# Runs the base-model Optuna study unless tuning is switched off in the config cell.
if SKIP_BASE_MODEL_HYPERPARAMETER_TUNING:
    print("Skipped base model hyperparameter tuning")
else:
    print(f"Started base model hyperparameter tuning for {BASE_MODEL_OPTUNA_STUDY_ESTIMATOR.value}")
    # Seed the sampler like the rest of the notebook so repeated studies propose the
    # same sequence of configurations.
    sampler = optuna.samplers.TPESampler(n_ei_candidates=50, multivariate=True, seed=RANDOM_SEEDS[0])
    study = optuna.create_study(sampler=sampler, direction='minimize', study_name='base_model_study')
    study.optimize(base_model_optuna_study_objective, n_trials=BASE_MODEL_OPTUNA_STUDY_NUM_TRIALS)

    print(f"# trials finished: {len(study.trials)}")
    trial = study.best_trial
    # The objective returns the mean fold RMSE (minimized), so label the score as RMSE.
    print(f"Best trial RMSE: {trial.value}")
    print(f"Best trial params:")
    for param_key, param_value in trial.params.items():
        print(f"- {param_key}: {param_value}")

Skipped base model hyperparameter tuning


# 8. Base Model Candidates

This section specifies the base models that are candidates for the ensemble. Rather than relying only on the meta-model to filter out unhelpful base model predictions, Optuna studies will also be conducted to choose which base model predictions (the meta-model's feature set) are passed to the meta-model.

In [29]:
# number of splits to use for K-Fold Cross-Validation for base models
# (each get_*_preds helper repeats this K-fold once per entry in RANDOM_SEEDS)
BASE_MODEL_KFOLD_NUM_SPLITS = 5

## 8.1 CatBoostRegressor

### 8.1.1 Helper Methods (CatBoostRegressor)

In [30]:
def get_catboostregressor_preds(params_dict, feature_names):
    """Produce seed-averaged out-of-fold and test predictions for a CatBoostRegressor.

    For every seed in RANDOM_SEEDS a shuffled K-fold split (BASE_MODEL_KFOLD_NUM_SPLITS
    folds) of `train_data` is built and one model is fitted per fold. Each model
    predicts its validation fold and the full test set; predictions are clipped to
    [0, 100] before being accumulated.

    The validation fold is also the early-stopping eval set (with use_best_model=True),
    so the out-of-fold predictions are made on the rows that chose the stopping point.

    NOTE(review): StackingEstimator is defined outside this section; if its cache key
    covers this function's source, editing the function invalidates previously
    imported predictions - confirm before relying on the cache.

    Args:
        params_dict: keyword arguments forwarded to the CatBoostRegressor constructor.
        feature_names: columns of `train_data` / `test_data` used as model inputs.

    Returns:
        Tuple `(oof_preds, test_preds)` of pd.Series with a default integer index:
        out-of-fold predictions averaged over the seeds, and test predictions
        averaged over all folds and seeds.
    """
    X_train = train_data[feature_names]
    y_train = train_data[target_col]
    # Loop-invariant: select the test features once instead of once per fitted model.
    X_test = test_data[feature_names]

    oof_preds_accumulator = np.zeros(len(train_data))
    test_preds_accumulator = np.zeros(len(test_data))

    for random_seed in RANDOM_SEEDS:
        kf = KFold(n_splits=BASE_MODEL_KFOLD_NUM_SPLITS, shuffle=True, random_state=random_seed)

        # Every training row is in exactly one validation fold, so this fills completely.
        seed_oof_preds = np.zeros(len(train_data))

        for train_indices, validation_indices in kf.split(X_train, y_train):
            X_train_fold = X_train.iloc[train_indices]
            X_validation_fold = X_train.iloc[validation_indices]
            y_train_fold = y_train.iloc[train_indices]
            y_validation_fold = y_train.iloc[validation_indices]

            model = CatBoostRegressor(
                **params_dict,
                iterations=30000,
                use_best_model=True,
                cat_features=cat_features,
                bootstrap_type='Bayesian',
                loss_function='RMSE',
                eval_metric='RMSE',
                task_type='GPU' if torch.cuda.is_available() else 'CPU',
                devices='0',
                random_seed=random_seed,
                verbose=False,
                allow_writing_files=False
            )

            model.fit(
                X_train_fold, y_train_fold,
                eval_set=(X_validation_fold, y_validation_fold),
                early_stopping_rounds=BASE_MODEL_EARLY_STOPPING_ROUNDS
            )

            y_validation_pred = np.clip(model.predict(X_validation_fold), 0, 100)
            y_test_pred = np.clip(model.predict(X_test), 0, 100)

            seed_oof_preds[validation_indices] = y_validation_pred
            test_preds_accumulator += y_test_pred

        oof_preds_accumulator += seed_oof_preds

    # OOF: one prediction per row per seed. Test: one prediction per row per fold per seed.
    final_oof_preds = oof_preds_accumulator / len(RANDOM_SEEDS)
    final_test_preds = test_preds_accumulator / (BASE_MODEL_KFOLD_NUM_SPLITS * len(RANDOM_SEEDS))

    return pd.Series(final_oof_preds), pd.Series(final_test_preds)

def get_catboostregressor_stacking_estimator(index, params_dict, feature_names):
    """Wrap one CatBoostRegressor configuration in a StackingEstimator.

    Args:
        index: suffix that distinguishes this configuration; the estimator is named
            "CatBoostRegressor_<index>".
        params_dict: hyperparameters handed to get_catboostregressor_preds.
        feature_names: feature columns handed to get_catboostregressor_preds.

    Returns:
        A StackingEstimator whose `get_preds` is get_catboostregressor_preds.
    """
    estimator_name = f"CatBoostRegressor_{index}"
    return StackingEstimator(
        get_preds=get_catboostregressor_preds,
        feature_names=feature_names,
        params_dict=params_dict,
        name=estimator_name,
    )

### 8.1.2 Add Estimators (CatBoostRegressor)

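Add CatBoostRegressor estimators to the list that `StackingPredictionsRetriever` will process. Hyperparameters were found using Optuna. The only CatBoostRegressor candidate is currently commented out, so this cell adds no estimators.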

In [31]:
# CatBoostRegressor base models using FEATURE_SET_1
# The only candidate below is commented out, so this statement currently extends
# `estimators` with an empty list (no CatBoostRegressor takes part in the ensemble).
estimators += [
    #   get_catboostregressor_stacking_estimator(
    #     index=1,
    #     params_dict={ # Optuna study RMSE: 8.732405143185524
    #         'learning_rate': 0.012512877480828674,
    #         'depth': 6,
    #         'l2_leaf_reg': 2.785033525504829,
    #         'random_strength': 1.9972412072333539,
    #         'colsample_bylevel': 0.2578536380324921,
    #         'min_data_in_leaf': 93,
    #         'bagging_temperature': 0.1409287678742852,
    #     },
    #     feature_names=FEATURE_SET_1
    # ),
]

## 8.2 LGBMRegressor

### 8.2.1 Helper Methods (LGBMRegressor)

In [32]:
def get_lgbmregressor_preds(params_dict, feature_names):
    """Produce seed-averaged out-of-fold and test predictions for an LGBMRegressor.

    For every seed in RANDOM_SEEDS a shuffled K-fold split (BASE_MODEL_KFOLD_NUM_SPLITS
    folds) of `train_data` is built and one model is fitted per fold. Each model
    predicts its validation fold and the full test set; predictions are clipped to
    [0, 100] before being accumulated.

    The validation fold is also the early-stopping eval set, so the out-of-fold
    predictions are made on the rows that chose the stopping point.

    NOTE(review): StackingEstimator is defined outside this section; if its cache key
    covers this function's source, editing the function invalidates previously
    imported predictions - confirm before relying on the cache.

    Args:
        params_dict: keyword arguments forwarded to the LGBMRegressor constructor.
        feature_names: columns of `train_data` / `test_data` used as model inputs.

    Returns:
        Tuple `(oof_preds, test_preds)` of pd.Series with a default integer index:
        out-of-fold predictions averaged over the seeds, and test predictions
        averaged over all folds and seeds.
    """
    X_train = train_data[feature_names]
    y_train = train_data[target_col]
    # Loop-invariant: select the test features once instead of once per fitted model.
    X_test = test_data[feature_names]

    oof_preds_accumulator = np.zeros(len(train_data))
    test_preds_accumulator = np.zeros(len(test_data))

    for random_seed in RANDOM_SEEDS:
        kf = KFold(n_splits=BASE_MODEL_KFOLD_NUM_SPLITS, shuffle=True, random_state=random_seed)

        # Every training row is in exactly one validation fold, so this fills completely.
        seed_oof_preds = np.zeros(len(train_data))

        for train_indices, validation_indices in kf.split(X_train, y_train):
            X_train_fold = X_train.iloc[train_indices]
            X_validation_fold = X_train.iloc[validation_indices]
            y_train_fold = y_train.iloc[train_indices]
            y_validation_fold = y_train.iloc[validation_indices]

            model = lgb.LGBMRegressor(
                **params_dict,
                n_estimators=30000,
                objective='regression',
                metric='rmse',
                bagging_freq=1,
                verbose=-1,
                n_jobs=-1,
                random_state=random_seed
            )

            model.fit(
                X_train_fold, y_train_fold,
                eval_set=[(X_validation_fold, y_validation_fold)],
                callbacks=[lgb.early_stopping(stopping_rounds=BASE_MODEL_EARLY_STOPPING_ROUNDS, verbose=0)]
            )

            y_validation_pred = np.clip(model.predict(X_validation_fold), 0, 100)
            y_test_pred = np.clip(model.predict(X_test), 0, 100)

            seed_oof_preds[validation_indices] = y_validation_pred
            test_preds_accumulator += y_test_pred

        oof_preds_accumulator += seed_oof_preds

    # OOF: one prediction per row per seed. Test: one prediction per row per fold per seed.
    final_oof_preds = oof_preds_accumulator / len(RANDOM_SEEDS)
    final_test_preds = test_preds_accumulator / (BASE_MODEL_KFOLD_NUM_SPLITS * len(RANDOM_SEEDS))

    return pd.Series(final_oof_preds), pd.Series(final_test_preds)

def get_lgbmregressor_stacking_estimator(index, params_dict, feature_names):
    """Wrap one LGBMRegressor configuration in a StackingEstimator.

    Args:
        index: suffix that distinguishes this configuration; the estimator is named
            "LGBMRegressor_<index>".
        params_dict: hyperparameters handed to get_lgbmregressor_preds.
        feature_names: feature columns handed to get_lgbmregressor_preds.

    Returns:
        A StackingEstimator whose `get_preds` is get_lgbmregressor_preds.
    """
    estimator_name = f"LGBMRegressor_{index}"
    return StackingEstimator(
        get_preds=get_lgbmregressor_preds,
        feature_names=feature_names,
        params_dict=params_dict,
        name=estimator_name,
    )

### 8.2.2 Add Estimators (LGBMRegressor)

Add LGBMRegressor estimators to the list that `StackingPredictionsRetriever` will process. Hyperparameters were found using Optuna.

In [33]:
# LGBMRegressor base models using FEATURE_SET_1
# NOTE(review): several values below (e.g. num_leaves=56, min_child_samples=26 and
# reg_alpha=25.4 for index 1; learning_rate=0.00436 and reg_alpha=0.107 for index 2) lie
# outside the FEATURE_SET_1 LGBM ranges coded in get_base_model_optuna_params, and
# min_child_weight / extra_trees / path_smooth are not set - presumably these came from
# an earlier, different search space; confirm before re-tuning.
estimators += [
    get_lgbmregressor_stacking_estimator(
        index=1,
        params_dict={ # Optuna study RMSE: 8.731615849391675
            'learning_rate': 0.00560062116243721,
            'num_leaves': 56,
            'min_child_samples': 26,
            'min_split_gain': 0.04953245371789207,
            'subsample': 0.8713341159266043,
            'colsample_bytree': 0.1954098975745725,
            'reg_alpha': 25.4080280093216,
            'reg_lambda': 7.3219797767089885,
        },
        feature_names=FEATURE_SET_1
    ),
    get_lgbmregressor_stacking_estimator(
        index=2,
        params_dict={ # Optuna study RMSE: 8.731273044954213
            'learning_rate': 0.004361721228499642,
            'num_leaves': 100,
            'min_child_samples': 12,
            'min_split_gain': 0.07298489238351248,
            'subsample': 0.8791520668621439,
            'colsample_bytree': 0.18986486739419273,
            'reg_alpha': 0.10748714887376254,
            'reg_lambda': 10.19912390767223,
        },
        feature_names=FEATURE_SET_1
    ),
]

## 8.3 XGBRegressor

### 8.3.1 Helper Methods (XGBRegressor)

In [34]:
def get_xgbregressor_preds(params_dict, feature_names):
    """Produce seed-averaged out-of-fold and test predictions for an XGBRegressor.

    For every seed in RANDOM_SEEDS a shuffled K-fold split (BASE_MODEL_KFOLD_NUM_SPLITS
    folds) of `train_data` is built and one model is fitted per fold. Each model
    predicts its validation fold and the full test set; predictions are clipped to
    [0, 100] before being accumulated.

    The validation fold is also the early-stopping eval set, so the out-of-fold
    predictions are made on the rows that chose the stopping point.

    NOTE(review): StackingEstimator is defined outside this section; if its cache key
    covers this function's source, editing the function invalidates previously
    imported predictions - confirm before relying on the cache.

    Args:
        params_dict: keyword arguments forwarded to the XGBRegressor constructor.
        feature_names: columns of `train_data` / `test_data` used as model inputs.

    Returns:
        Tuple `(oof_preds, test_preds)` of pd.Series with a default integer index:
        out-of-fold predictions averaged over the seeds, and test predictions
        averaged over all folds and seeds.
    """
    X_train = train_data[feature_names]
    y_train = train_data[target_col]
    # Loop-invariant: select the test features once instead of once per fitted model.
    X_test = test_data[feature_names]

    oof_preds_accumulator = np.zeros(len(train_data))
    test_preds_accumulator = np.zeros(len(test_data))

    for random_seed in RANDOM_SEEDS:
        kf = KFold(n_splits=BASE_MODEL_KFOLD_NUM_SPLITS, shuffle=True, random_state=random_seed)

        # Every training row is in exactly one validation fold, so this fills completely.
        seed_oof_preds = np.zeros(len(train_data))

        for train_indices, validation_indices in kf.split(X_train, y_train):
            X_train_fold = X_train.iloc[train_indices]
            X_validation_fold = X_train.iloc[validation_indices]
            y_train_fold = y_train.iloc[train_indices]
            y_validation_fold = y_train.iloc[validation_indices]

            # XGBoost takes its early-stopping patience in the constructor, not in fit().
            model = XGBRegressor(
                **params_dict,
                n_estimators=30000,
                tree_method='hist' if torch.cuda.is_available() else 'auto',
                device='cuda' if torch.cuda.is_available() else 'cpu',
                enable_categorical=True,
                objective='reg:squarederror',
                eval_metric='rmse',
                early_stopping_rounds=BASE_MODEL_EARLY_STOPPING_ROUNDS,
                n_jobs=-1,
                random_state=random_seed,
                verbosity=0
            )

            model.fit(
                X_train_fold, y_train_fold,
                eval_set=[(X_validation_fold, y_validation_fold)],
                verbose=False
            )

            y_validation_pred = np.clip(model.predict(X_validation_fold), 0, 100)
            y_test_pred = np.clip(model.predict(X_test), 0, 100)

            seed_oof_preds[validation_indices] = y_validation_pred
            test_preds_accumulator += y_test_pred

        oof_preds_accumulator += seed_oof_preds

    # OOF: one prediction per row per seed. Test: one prediction per row per fold per seed.
    final_oof_preds = oof_preds_accumulator / len(RANDOM_SEEDS)
    final_test_preds = test_preds_accumulator / (BASE_MODEL_KFOLD_NUM_SPLITS * len(RANDOM_SEEDS))

    return pd.Series(final_oof_preds), pd.Series(final_test_preds)

def get_xgbregressor_stacking_estimator(index, params_dict, feature_names):
    """Wrap one XGBRegressor configuration in a StackingEstimator.

    Args:
        index: suffix that distinguishes this configuration; the estimator is named
            "XGBRegressor_<index>".
        params_dict: hyperparameters handed to get_xgbregressor_preds.
        feature_names: feature columns handed to get_xgbregressor_preds.

    Returns:
        A StackingEstimator whose `get_preds` is get_xgbregressor_preds.
    """
    estimator_name = f"XGBRegressor_{index}"
    return StackingEstimator(
        get_preds=get_xgbregressor_preds,
        feature_names=feature_names,
        params_dict=params_dict,
        name=estimator_name,
    )

### 8.3.2 Add Estimators (XGBRegressor)

Add XGBRegressor estimators to the list that `StackingPredictionsRetriever` will process. Hyperparameters were found using Optuna.

In [35]:
# XGBRegressor base models using FEATURE_SET_1
# NOTE(review): several values below (e.g. learning_rate=0.01359, min_child_weight=25 and
# gamma=0.0415 for index 1) lie outside the FEATURE_SET_1 XGB ranges coded in
# get_base_model_optuna_params - presumably these came from earlier, wider search
# spaces; confirm before re-tuning.
estimators += [
    get_xgbregressor_stacking_estimator(
        index=1,
        params_dict={ # Optuna study RMSE: 8.726746380604618
            'learning_rate': 0.013587597098853266,
            'max_depth': 6,
            'subsample': 0.9572735014614386,
            'colsample_bytree': 0.23913914702804767,
            'alpha': 1.4405674568559026,
            'lambda': 0.039839278291000785,
            'gamma': 0.041457934625352785,
            'min_child_weight': 25,
        },
        feature_names=FEATURE_SET_1
    ),
    get_xgbregressor_stacking_estimator(
        index=2,
        params_dict={ # Optuna study RMSE: 8.72559295676817
            'learning_rate': 0.011042630421555327,
            'max_depth': 6,
            'subsample': 0.9599956732090699,
            'colsample_bytree': 0.25445172888708584,
            'alpha': 0.6887980139572322,
            'lambda': 0.05097488457198369,
            'gamma': 0.07390409174758057,
            'min_child_weight': 38,
        },
        feature_names=FEATURE_SET_1
    ),
    get_xgbregressor_stacking_estimator(
        index=3,
        params_dict={ # Optuna study RMSE: 8.725078961771361
            'learning_rate': 0.008078244339287369,
            'max_depth': 6,
            'subsample': 0.9163689142226277,
            'colsample_bytree': 0.2296982964930553,
            'alpha': 0.19395772739611541,
            'lambda': 0.04919117142485752,
            'gamma': 0.16665026636261474,
            'min_child_weight': 43,
        },
        feature_names=FEATURE_SET_1
    ),
    get_xgbregressor_stacking_estimator(
        index=4,
        params_dict={ # Optuna study RMSE: 8.725332819931655
            'learning_rate': 0.009581863710458615,
            'max_depth': 6,
            'subsample': 0.9731807527392785,
            'colsample_bytree': 0.22833370318746365,
            'alpha': 0.48389973296874966,
            'lambda': 0.7147942393033277,
            'gamma': 0.020976364139759063,
            'min_child_weight': 38,
        },
        feature_names=FEATURE_SET_1
    ),
    get_xgbregressor_stacking_estimator(
        index=5,
        params_dict={ # Optuna study RMSE: 8.726063500294797
            'learning_rate': 0.01372187453137857,
            'max_depth': 6,
            'subsample': 0.977043107173594,
            'colsample_bytree': 0.23095579097825794,
            'alpha': 2.551583903037864,
            'lambda': 0.13885611044124127,
            'gamma': 0.00021296247007217191,
            'min_child_weight': 33,
        },
        feature_names=FEATURE_SET_1
    ),
    get_xgbregressor_stacking_estimator(
        index=6,
        params_dict={ # Optuna study RMSE: 8.725542960628767
            'learning_rate': 0.010016586025401255,
            'max_depth': 6,
            'subsample': 0.9522750478052623,
            'colsample_bytree': 0.23393985284449337,
            'alpha': 2.1874877683283414,
            'lambda': 0.025727886413125782,
            'gamma': 0.0007752478566528677,
            'min_child_weight': 32,
        },
        feature_names=FEATURE_SET_1
    ),
    get_xgbregressor_stacking_estimator(
        index=7,
        params_dict={ # Optuna study RMSE: 8.72542385443002
            'learning_rate': 0.011352870555104531,
            'max_depth': 6,
            'subsample': 0.9590160606833864,
            'colsample_bytree': 0.23374922886341426,
            'alpha': 4.446672845051681,
            'lambda': 0.03219358347958651,
            'gamma': 0.00015288138932193678,
            'min_child_weight': 35,
        },
        feature_names=FEATURE_SET_1
    ),
]

## 8.4 Number of Base Models

In [36]:
# Sanity check: a count higher than the number of uncommented entries above means an
# `estimators += [...]` cell was run more than once without resetting `estimators`.
print(f"Total number of base models: {len(estimators)}")

Total number of base models: 9


# 9. Base Model Predictions

## 9.1 Get Base Model Predictions

In [37]:
def _estimator_column_sort_key(col_name):
    """Return (estimator family, numeric index) for a prediction column name.

    The first whitespace-separated token of the column name is the estimator name
    (e.g. "XGBRegressor_7"); its trailing number is compared as an int so that,
    for example, index 10 sorts after index 2.
    """
    estimator_name = col_name.split()[0]
    return (col_name.split("_")[0], int(estimator_name.split("_")[-1]))


def _estimator_columns_sort_key(index):
    """Adapter for `sort_index(key=...)`: apply the per-column key to a whole index."""
    return index.map(_estimator_column_sort_key)


stacking_preds_retriever = StackingPredictionsRetriever(
    estimators=estimators,
    working_dir_path="/kaggle/working/",
    train_preds_filename="base_models_train_preds",
    test_preds_filename="base_models_test_preds",
    preds_save_interval=1
)
# Clear, then import the previously saved predictions, then fetch predictions.
# NOTE(review): StackingPredictionsRetriever is defined outside this section; presumably
# get_preds() only computes estimators missing from the imported files - confirm.
stacking_preds_retriever.clear_preds()
stacking_preds_retriever.import_preds("/kaggle/input/predicting-student-test-scores-base-model-preds/")
stacking_preds_retriever.get_preds()

base_model_train_preds, base_model_test_preds = stacking_preds_retriever.get_current_train_and_test_preds()
# Order the prediction columns by estimator family, then by numeric estimator index.
base_model_train_preds.sort_index(axis=1, inplace=True, key=_estimator_columns_sort_key)
base_model_test_preds.sort_index(axis=1, inplace=True, key=_estimator_columns_sort_key)

[INFO] Finished clearing predictions
[INFO] Importing predictions..
[INFO] 9 train predictions were imported:
LGBMRegressor_1 (bc61f24c868323d36835d4301ce2690f), LGBMRegressor_2 (7edbc68d7e91bf012f62e14a3c8ce24a), XGBRegressor_1 (d5b33f920572395ea591f85395372949), XGBRegressor_2 (1651e6cc0bc873c5f37ae6eebceb2002), XGBRegressor_3 (7c033486efe2592e85d22287e98391fd), XGBRegressor_4 (7afd382667da316172eb1fc14749b8b8), XGBRegressor_5 (2ed31731d23c20a3feaad9d83527263c), XGBRegressor_6 (b7ab8f03fb6215815e39cc574693db7b), XGBRegressor_7 (f3a7f891ae0b2a9289432074da2d0433)
[INFO] 9 test predictions were imported:
LGBMRegressor_1 (bc61f24c868323d36835d4301ce2690f), LGBMRegressor_2 (7edbc68d7e91bf012f62e14a3c8ce24a), XGBRegressor_1 (d5b33f920572395ea591f85395372949), XGBRegressor_2 (1651e6cc0bc873c5f37ae6eebceb2002), XGBRegressor_3 (7c033486efe2592e85d22287e98391fd), XGBRegressor_4 (7afd382667da316172eb1fc14749b8b8), XGBRegressor_5 (2ed31731d23c20a3feaad9d83527263c), XGBRegressor_6 (b7ab8f03fb6215

## 9.2 Base Models RMSE

In [38]:
# Out-of-fold RMSE of every base model against the training target.
# Built in one shot with an explicit float dtype rather than by enlarging an untyped
# empty Series, whose default dtype depends on the pandas version.
base_model_rmse = pd.Series(
    {
        column: np.sqrt(mean_squared_error(train_data[target_col], base_model_train_preds[column]))
        for column in base_model_train_preds.columns
    },
    dtype='float64',
)
# Displayed lowest (best) RMSE first; `base_model_rmse` itself stays in column order.
base_model_rmse.sort_values()

XGBRegressor_2 (1651e6cc0bc873c5f37ae6eebceb2002)     8.706541
XGBRegressor_5 (2ed31731d23c20a3feaad9d83527263c)     8.706560
XGBRegressor_4 (7afd382667da316172eb1fc14749b8b8)     8.706637
XGBRegressor_7 (f3a7f891ae0b2a9289432074da2d0433)     8.706706
XGBRegressor_1 (d5b33f920572395ea591f85395372949)     8.707001
XGBRegressor_6 (b7ab8f03fb6215815e39cc574693db7b)     8.707044
XGBRegressor_3 (7c033486efe2592e85d22287e98391fd)     8.707424
LGBMRegressor_2 (7edbc68d7e91bf012f62e14a3c8ce24a)    8.715358
LGBMRegressor_1 (bc61f24c868323d36835d4301ce2690f)    8.715844
dtype: float64

# 10. Submission (Baseline)

In [39]:
# prepare submission
# Prediction columns are named "<estimator name> (<hash>)". Resolve the chosen model by
# its estimator name so the cell keeps working when the hash suffix changes.
SUBMISSION_ESTIMATOR_NAME = 'XGBRegressor_7'
submission_columns = [
    column for column in base_model_test_preds.columns
    if column.split()[0] == SUBMISSION_ESTIMATOR_NAME
]
if len(submission_columns) != 1:
    raise ValueError(
        f"Expected exactly one test prediction column for {SUBMISSION_ESTIMATOR_NAME}, "
        f"found {len(submission_columns)}: {submission_columns}"
    )

submission = pd.DataFrame({'id': test_data.index, target_col: base_model_test_preds[submission_columns[0]]})
submission.to_csv('submission.csv', index=False)
print('Submission file prepared.')

Submission file prepared.
