Strategy: Gradient Boosted Trees, specifically 'LightGBM'.

G. Ke et al., ‘LightGBM: A Highly Efficient Gradient Boosting Decision Tree’, in Advances in Neural Information Processing Systems, Curran Associates, Inc., 2017. [Online]. Available: https://proceedings.neurips.cc/paper_files/paper/2017/file/6449f44a102fde848669bdd9eb6b76fa-Paper.pdf

In [14]:
import numpy as np
import polars as pl
import pandas as pd

import lightgbm as lgb
import optuna
import shap

import category_encoders as ce
from feature_engine.encoding import RareLabelEncoder

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import time
import math
import warnings

warnings.filterwarnings(action='ignore')

In [15]:
# load cleaned datasets
dataset = pl.read_parquet("../data/clean/dataset.parquet")
sbmssn = pl.read_parquet("../data/clean/sbmssn.parquet")

dataset = dataset.to_pandas()
sbmssn = sbmssn.to_pandas()

In [16]:
# training - test split
train_data_x, test_data_x, train_data_y, test_data_y = train_test_split(
    dataset.drop(['target'], axis=1),
    dataset['target'],
    train_size=0.9,
    random_state=0
)

# training is splitted for tuning again
eval_data_x, valid_data_x, eval_data_y, valid_data_y = train_test_split(
    train_data_x,
    train_data_y,
    train_size=0.9,
    random_state=0
)

# lgbm data format
lgb_data_eval = lgb.Dataset(data=eval_data_x, label=eval_data_y)
lgb_data_valid = lgb.Dataset(data=valid_data_x, label=valid_data_y, reference=lgb_data_eval)

In [17]:
# optuna hyper-parameter optimization with TPE sampler
def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting': 'gbdt',
        'learning_rate': 0.01, # trial.suggest_float('learning_rate', 1e-4, 5e-1, log=True),
        'lambda_l1': 0, #trial.suggest_float('lambda_l1', 1e-4, 10, log=True),
        'lambda_l2': 0, #trial.suggest_float('lambda_l2', 1e-4, 10, log=True),
        'max_depth': trial.suggest_int('num_leaves', 5, 20),
        'num_leaves': 64, #trial.suggest_int('num_leaves', 16, 64),
        'min_data_in_leaf': 20,
        'min_gain_to_split': 0, 
        'feature_fraction_bynode': 0.6, #trial.suggest_float('feature_fraction_bynode', 0.6, 1.0),
        'feature_fraction': 1, #trial.suggest_float('feature_fraction_bynode', 0.6, 1.0),
        'bagging_fraction': 1, #trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'cat_l2': trial.suggest_float('cat_l2', 5, 15),
        'cat_smooth': 10, #trial.suggest_float('cat_smooth', 5, 15),
        'verbosity': -1,
        'seed': 0
    }

    cv = lgb.cv(
        params=params,
        num_boost_round=5000,
        train_set=lgb_data_eval,
        nfold=5,
        seed=0,
        eval_train_metric=True,
        stratified=False
    )

    optimal_rounds = np.argmin(cv['valid rmse-mean'])
    best_cv_score = min(cv['valid rmse-mean'])
    
    #valid_data_pred = gbm.predict(data=valid_data_x)
    #rmse = mean_squared_error(valid_data_y, valid_data_pred, squared=False)

    return best_cv_score


optuna.logging.set_verbosity(optuna.logging.INFO)

study_bo = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=0),
    direction='minimize'
)

study_bo.optimize(objective, n_trials=None, timeout=60*60*6, n_jobs=4, show_progress_bar=False)

print('Number of finished trials:', len(study_bo.trials))
print('Best trial:', study_bo.best_trial.params)

[32m[I 2023-04-26 22:35:45,104][0m A new study created in memory with name: no-name-6233170b-bf9c-412d-8ced-a258431a39b1[0m


In [None]:
print('Config: ', study_bo.best_params, end='\n\n')

Config:  {'feature_fraction_bynode': 0.6053645356352836, 'cat_smooth': 6.8681484572959155}



In [None]:
fig = optuna.visualization.plot_optimization_history(study_bo)
fig.show()

In [None]:
fig = optuna.visualization.plot_contour(study_bo)
fig.show()

In [None]:
# data for larger training data
lgb_data_train = lgb.Dataset(data=train_data_x, label=train_data_y)

In [None]:
# refit model on complete training dataset and check prediction error
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting': 'gbdt',
    'learning_rate': 0.005,
    'num_leaves': 40,
    'feature_fraction': 0.6,
    'bagging_fraction': 1,
    'cat_l2': 10,
    'cat_smooth': 10,
    'verbosity': -1,
    'seed': 0
}

#params.update(study_bo.best_params)
params.update({'num_iterations': 10000})

cv = lgb.cv(
    params=params,
    train_set=lgb_data_train,
    nfold=5,
    seed=0,
    eval_train_metric=True,
    callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)],
    stratified=False
)

In [None]:
# check results
len(cv['valid rmse-mean'])
np.argmin(cv['valid rmse-mean'])

1461

In [None]:
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting': 'gbdt',
    'learning_rate': 0.005,
    'num_leaves': 40,
    'feature_fraction': 0.6,
    'bagging_fraction': 1,
    'cat_l2': 10,
    'cat_smooth': 10,
    'verbosity': -1,
    'seed': 0
}

params.update({'num_iterations': np.argmin(cv['valid rmse-mean'])})

bst = lgb.train(
    params=params,
    train_set=lgb_data_train,
)

In [None]:
test_data_pred = bst.predict(data=test_data_x)
test_rmse = mean_squared_error(test_data_y, test_data_pred, squared=False)

print('Score', round(test_rmse, ndigits=6))

Score 9.423405


In [None]:
# save and load model
# bst.save_model('model.txt')
# bst = lgb.Booster(model_file='model.txt')  # init model