In [1]:
import numpy as np
import polars as pl
import pandas

import lightgbm as lgb
import optuna
import shap

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import time
import math
import warnings

# NOTE(review): this silences every Python warning for the rest of the session,
# including deprecation notices raised by the libraries imported above.
warnings.filterwarnings(action='ignore')

In [2]:
# Read the cleaned parquet tables: `dataset` feeds training and evaluation,
# `sbmssn` is only needed for the final predictions at the end.
sbmssn = pl.read_parquet("../data/clean/sbmssn.parquet")
dataset = pl.read_parquet("../data/clean/dataset.parquet")

# (rows, columns) of the modelling table
dataset.shape

(8585, 701)

In [3]:
# Separate features and label, converting both to pandas for scikit-learn / LightGBM.
features_pd = dataset.drop(['target']).to_pandas()
target_pd = dataset.get_column('target').to_pandas()

# Hold out 10% of the rows as the test set.
train_data_x, test_data_x, train_data_y, test_data_y = train_test_split(
    features_pd, target_pd, train_size=0.9, random_state=0
)

# The training part is split once more (80/20) for hyper-parameter tuning.
eval_data_x, valid_data_x, eval_data_y, valid_data_y = train_test_split(
    train_data_x, train_data_y, train_size=0.80, random_state=0
)

# LightGBM dataset wrappers; each held-out set references the set it is evaluated against.
lgb_data_train = lgb.Dataset(train_data_x, label=train_data_y)
lgb_data_test = lgb.Dataset(test_data_x, label=test_data_y, reference=lgb_data_train)

lgb_data_eval = lgb.Dataset(eval_data_x, label=eval_data_y)
lgb_data_valid = lgb.Dataset(valid_data_x, label=valid_data_y, reference=lgb_data_eval)

In [4]:
# optuna hyper-parameter optimization with TPE sampler
def objective(trial):
    """Train LightGBM on the tuning split and score it on the validation split.

    Args:
        trial: Optuna trial; it samples ``num_iterations``, ``cat_l2`` and
            ``cat_smooth``. Every other LightGBM parameter is held fixed.

    Returns:
        float: root mean squared error of the predictions for ``valid_data_x``
        against ``valid_data_y``.
    """
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting': 'gbdt',
        'learning_rate': 0.005, #trial.suggest_float('learning_rate', 1e-4, 1, log=True),
        'num_iterations': trial.suggest_int('num_iterations', 500, 5000),
        # 'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-4, 100.0),
        # 'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-4, 100.0),
        'num_leaves': 39, #trial.suggest_int('num_leaves', 16, 64),
        'min_data_in_leaf': 20,
        'feature_fraction': 0.6, #trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': 1, #trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'cat_l2': trial.suggest_float('cat_l2', 10, 15),
        'cat_smooth': trial.suggest_float('cat_smooth', 10, 15),
        'verbosity': -1,
        'seed': 0
    }

    # FIXME: should I use here separate validation sets for early stopping and generalization estimate?
    gbm = lgb.train(
        params=params,
        train_set=lgb_data_eval,
    )

    valid_data_pred = gbm.predict(data=valid_data_x)

    # RMSE is taken as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error was deprecated and later removed from scikit-learn,
    # whereas this form works on every version.
    rmse = float(np.sqrt(mean_squared_error(valid_data_y, valid_data_pred)))

    return rmse


# log every finished trial
optuna.logging.set_verbosity(optuna.logging.INFO)

# seeded TPE sampler so the sequence of sampled trials is repeatable
tpe_sampler = optuna.samplers.TPESampler(seed=0)
study_bo = optuna.create_study(direction='minimize', sampler=tpe_sampler)

# run 100 trials of the objective defined above
study_bo.optimize(objective, n_trials=100, show_progress_bar=False)

print('Number of finished trials:', len(study_bo.trials))
print('Best trial:', study_bo.best_trial.params)

[32m[I 2023-04-23 15:57:29,269][0m A new study created in memory with name: no-name-b30effc8-03e4-4822-8842-3b79bad8d6ac[0m
[32m[I 2023-04-23 15:57:59,589][0m Trial 0 finished with value: 9.382480193401314 and parameters: {'num_iterations': 2970, 'cat_l2': 13.575946831862097, 'cat_smooth': 13.01381688035822}. Best is trial 0 with value: 9.382480193401314.[0m
[32m[I 2023-04-23 15:58:40,743][0m Trial 1 finished with value: 9.38340048706442 and parameters: {'num_iterations': 2952, 'cat_l2': 12.118273996694523, 'cat_smooth': 13.22947056533328}. Best is trial 0 with value: 9.382480193401314.[0m
[32m[I 2023-04-23 15:59:19,049][0m Trial 2 finished with value: 9.379280284977224 and parameters: {'num_iterations': 2469, 'cat_l2': 14.458865003910399, 'cat_smooth': 14.818313802505147}. Best is trial 2 with value: 9.379280284977224.[0m
[32m[I 2023-04-23 15:59:52,059][0m Trial 3 finished with value: 9.378925625371592 and parameters: {'num_iterations': 2225, 'cat_l2': 13.958625190413322

Number of finished trials: 100
Best trial: {'num_iterations': 2510, 'cat_l2': 14.4311821240945, 'cat_smooth': 12.033524974358492}


In [None]:
# show the best hyper-parameter configuration found by the study
best_config = study_bo.best_params
print('Config: ', best_config, end='\n\n')

Config:  {'num_iterations': 2510, 'cat_l2': 14.4311821240945, 'cat_smooth': 12.033524974358492}



In [None]:
# optimization history of the study (objective value over the trials)
fig = optuna.visualization.plot_optimization_history(study=study_bo)
fig.show()

In [7]:
# contour plot of the objective over the sampled parameters
fig = optuna.visualization.plot_contour(study=study_bo)
fig.show()

In [8]:
# 10-fold cross-validation on the complete training dataset to find the
# number of boosting rounds.
#
# NOTE: boosting is 'gbdt', matching the tuning objective and the final model.
# LightGBM disables its early-stopping callback in 'dart' mode, so with 'dart'
# the callback below never fires and all rounds are always trained.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting': 'gbdt',
    'learning_rate': 0.005,
    'num_leaves': 39,
    'feature_fraction': 0.6,
    'bagging_fraction': 1,
    'cat_l2': 12.6,
    'cat_smooth': 13.7,
    'verbosity': -1,
    'seed': 0
}

# params.update(study_bo.best_params)
# upper bound on the number of boosting rounds; early stopping may end sooner
params.update({'num_iterations': 2000})

cv = lgb.cv(
    params=params,
    train_set=lgb_data_train,
    nfold=10,
    seed=0,
    # also record the training metric next to the validation metric
    eval_train_metric=True,
    callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)],
    stratified=False
)

In [9]:
# check results
# 0-based index of the round with the lowest mean validation RMSE
best_round_idx = int(np.argmin(cv['valid rmse-mean']))

# The entry at index i is the score after i + 1 boosting rounds, so the number
# of iterations to train is the index plus one. int() keeps a plain Python
# integer (not a numpy scalar) in the parameter dict.
params.update({'num_iterations': best_round_idx + 1})

best_round_idx

1999

In [10]:
# Parameters of the final model; these replace the `params` dict of the
# cross-validation cell entirely.
params = dict(
    objective='regression',
    metric='rmse',
    boosting='gbdt',
    num_iterations=2000,
    learning_rate=0.005,
    num_leaves=39,
    feature_fraction=0.6,
    bagging_fraction=1,
    cat_l2=14.4,
    cat_smooth=12,
    verbosity=-1,
    seed=0,
)

# fit on the complete training split
bst = lgb.train(params, lgb_data_train)

In [11]:
# evaluate the fitted model on the held-out test set
test_data_pred = bst.predict(data=test_data_x)

# RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
# deprecated and later removed from scikit-learn; this form works on every version.
test_rmse = float(np.sqrt(mean_squared_error(test_data_y, test_data_pred)))

print('Score', round(test_rmse, ndigits=6))

Score 13.271949


In [12]:
# fit a final model to complete training set and predict submission data


In [13]:
# save and load model
# bst.save_model('model.txt')
# bst = lgb.Booster(model_file='model.txt')  # init model

In [14]:
# SHAP explanation of the final model, computed for the training features
explainer = shap.TreeExplainer(model=bst)
shap_values = explainer.shap_values(X=train_data_x)

In [16]:
# force plot over all training set predictions, relative to the explainer's expected value
shap.plots.force(base_value=explainer.expected_value, shap_values=shap_values)

In [None]:
# summarize the effects of all the features
# `shap_values` is the plain array returned by `explainer.shap_values(...)`.
# `shap.plots.beeswarm` only accepts a `shap.Explanation` object, so the
# array-based `shap.summary_plot` is used here; it draws the same kind of
# summary from the array and the feature frame.
# NOTE: plotting requires matplotlib to be installed.
shap.summary_plot(shap_values, train_data_x)

ValueError: matplotlib is not installed so plotting is not available! Run `pip install matplotlib` to fix this.

In [None]:
# create final submission: one prediction per child_id
# Predict on exactly the columns the model was trained on, in the same order
# and converted to pandas the same way as the training features.
# NOTE(review): assumes `sbmssn` contains every training feature column — confirm.
sbmssn_x = sbmssn.select(list(train_data_x.columns)).to_pandas()

# Build a new frame rather than assigning into an undefined `chllng`, and keep
# the ids and the predictions in separate columns (previously the predictions
# overwrote `child_id`).
# NOTE(review): the prediction column is named 'target' after the label column
# of the training data — confirm the name the submission format expects.
chllng = pl.DataFrame({
    'child_id': sbmssn.get_column('child_id'),
    'target': bst.predict(sbmssn_x),
})