In [1]:
import numpy as np
import polars as pl
import pandas as pd

import lightgbm as lgb
import optuna

from feature_engine.selection import SelectBySingleFeaturePerformance

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import time
import math
import warnings

warnings.filterwarnings(action='ignore')

In [2]:
# load cleaned datasets
dataset = pl.read_parquet("../data/clean/dataset.parquet")
sbmssn = pl.read_parquet("../data/clean/sbmssn.parquet")

In [3]:
# transform to pandas
dataset_x = dataset.drop(['target']).to_pandas()
dataset_y = dataset.get_column('target').to_pandas()
newdata = sbmssn.drop('child_id').to_pandas()

In [4]:
# Split data into train-valid-test set
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

x_train, x_test, y_train, y_test = train_test_split(dataset_x, dataset_y, test_size=1 - train_ratio, random_state=0)
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state=0) 

In [5]:
# Define the categorical features
num_features = dataset_x.select_dtypes(exclude="category").columns
cat_features = list(dataset_x.select_dtypes(include="category").columns.values)

In [6]:
# get lightgbm data format
lgb_train = lgb.Dataset(data=x_train, label=y_train)
lgb_val = lgb.Dataset(data=x_val, label=y_val, reference=lgb_train)

In [7]:
# optuna hyper-parameter optimization with TPE sampler
def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting': 'gbdt',
        'learning_rate': 0.005, 
        'lambda_l1': 0, #trial.suggest_float('lambda_l1', 1e-4, 10, log=True),
        'lambda_l2': 0, #trial.suggest_float('lambda_l2', 1e-4, 10, log=True),
        'num_leaves': 32, 
        'min_data_in_leaf': 20,
        'feature_fraction_bynode': 1, #trial.suggest_float('feature_fraction_bynode', 0.5, 1.0),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': 1, #trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'cat_l2': trial.suggest_float('cat_l2', 5, 20),
        'cat_smooth': trial.suggest_float('cat_smooth', 5, 15),
        'verbosity': -1,
        'seed': 0
    }

    bst = lgb.train(
        params=params,
        num_boost_round=5000,
        train_set=lgb_train,
        valid_sets=lgb_val,
        callbacks=[lgb.early_stopping(stopping_rounds=5, verbose=False)],
    )

    y_test_pred = bst.predict(data=x_test)
    rmse = mean_squared_error(y_test, y_test_pred, squared=False)

    return rmse


optuna.logging.set_verbosity(optuna.logging.INFO)

study_bo = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=0),
    direction='minimize'
)

study_bo.optimize(objective, n_trials=None, timeout=60*45, n_jobs=1, show_progress_bar=False)

print('Number of finished trials:', len(study_bo.trials))
print('Best trial:', study_bo.best_trial.params)

[32m[I 2023-04-30 21:24:59,772][0m A new study created in memory with name: no-name-a1992050-3328-473d-bc7a-cdaefe968ae5[0m
[32m[I 2023-04-30 21:25:10,106][0m Trial 0 finished with value: 9.515610130422626 and parameters: {'feature_fraction': 0.7744067519636624, 'cat_l2': 15.727840495586292, 'cat_smooth': 11.027633760716439}. Best is trial 0 with value: 9.515610130422626.[0m
[32m[I 2023-04-30 21:25:24,156][0m Trial 1 finished with value: 9.524179986289083 and parameters: {'feature_fraction': 0.7724415914984484, 'cat_l2': 11.354821990083572, 'cat_smooth': 11.458941130666561}. Best is trial 0 with value: 9.515610130422626.[0m
[32m[I 2023-04-30 21:25:36,571][0m Trial 2 finished with value: 9.525116971193915 and parameters: {'feature_fraction': 0.7187936056313462, 'cat_l2': 18.376595011731197, 'cat_smooth': 14.636627605010293}. Best is trial 0 with value: 9.515610130422626.[0m
[32m[I 2023-04-30 21:25:47,802][0m Trial 3 finished with value: 9.499130436144508 and parameters: {'

Number of finished trials: 194
Best trial: {'feature_fraction': 0.7274049492997424, 'cat_l2': 7.338696749456784, 'cat_smooth': 9.793892656173774}


In [12]:
print('Config: ', study_bo.best_params, end='\n\n')

Config:  {'feature_fraction': 0.7274049492997424, 'cat_l2': 7.338696749456784, 'cat_smooth': 9.793892656173774}



In [9]:
fig = optuna.visualization.plot_optimization_history(study_bo)
fig.show()

In [10]:
fig = optuna.visualization.plot_contour(study_bo)
fig.show()

In [11]:
# save and load model
# bst.save_model('model.txt')
# bst = lgb.Booster(model_file='model.txt')  # init model