In [4]:
import numpy as np
import polars as pl

import lightgbm as lgb
import optuna

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import time
import math
import warnings
warnings.filterwarnings('ignore')

In [5]:
# load cleaned datasets
dataset = pl.read_parquet("../data/clean/dataset.parquet")
sbmssn = pl.read_parquet("../data/clean/sbmssn.parquet")

In [12]:
dataset.drop(['target']).to_pandas().dtypes

data_year                  float64
child_date                 float64
child_age                  float64
child_enrolment_date       float64
child_months_enrolment     float64
                            ...   
health_2                  category
health_3                  category
health_4                  category
health_5                  category
health_97                 category
Length: 700, dtype: object

In [8]:
# training - test split
train_data_x, test_data_x, train_data_y, test_data_y = train_test_split(
    dataset.drop(['target']).to_pandas(),
    dataset.get_column('target').to_pandas(),
    train_size=0.9,
    random_state=0
)

# training is splitted for tuning again
eval_data_x, valid_data_x, eval_data_y, valid_data_y = train_test_split(
    train_data_x,
    train_data_y,
    train_size=0.8,
    random_state=0
)

# lgbm data format
lgb_data_eval = lgb.Dataset(data=eval_data_x, label=eval_data_y)
lgb_data_valid = lgb.Dataset(data=valid_data_x, label=valid_data_y, reference=lgb_data_eval)

lgb_data_train = lgb.Dataset(data=train_data_x, label=train_data_y)
lgb_data_test = lgb.Dataset(data=test_data_x, label=test_data_y, reference=lgb_data_eval)

In [9]:
# optuna hyper-parameter optimization with TPE sampler
def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting': 'gbdt',
        'learning_rate': 0.01,
        # 'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-4, 100.0),
        # 'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-4, 100.0),
        # 'num_leaves': trial.suggest_int('num_leaves', 16, 256),
        # 'feature_fraction': trial.suggest_uniform('feature_fraction', 0.6, 1.0),
        # 'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.6, 1.0),
        'cat_l2': trial.suggest_uniform('cat_l2', 5, 15),
        'cat_smooth': trial.suggest_uniform('cat_smooth', 5, 15),
        'verbosity': -1,
        'seed': 0
    }

    # FIXME: should I use here separate validation sets for early stopping and generalization estimate?
    gbm = lgb.train(
        params=params,
        num_boost_round=5000,
        train_set=lgb_data_eval,
        valid_sets=[lgb_data_valid],
        callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)],
    )

    valid_data_pred = gbm.predict(data=valid_data_x)
    rmse = mean_squared_error(valid_data_y, valid_data_pred, squared=False)

    return rmse


optuna.logging.set_verbosity(optuna.logging.INFO)

study_bo = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=0),
    direction='minimize'
)
study_bo.optimize(objective, n_trials=100, n_jobs=4, show_progress_bar=True)

print('Number of finished trials:', len(study_bo.trials))
print('Best trial:', study_bo.best_trial.params)

[32m[I 2023-04-23 09:06:57,555][0m A new study created in memory with name: no-name-9d46a14d-0f4f-425c-a47d-ae5aaec8bb82[0m


  0%|          | 0/100 [00:00<?, ?it/s]

[33m[W 2023-04-23 09:06:59,884][0m Trial 2 failed with parameters: {'cat_l2': 9.474948771293828, 'cat_smooth': 8.561186147719306} because of the following error: OSError('exception: access violation reading 0x0000000000000128').[0m
Traceback (most recent call last):
  File "c:\Users\Daniel\Documents\GitHub\zindi-early-learning-predictors-py\.venv\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Daniel\AppData\Local\Temp\ipykernel_7312\308302607.py", line 20, in objective
    gbm = lgb.train(
  File "c:\Users\Daniel\Documents\GitHub\zindi-early-learning-predictors-py\.venv\lib\site-packages\lightgbm\engine.py", line 271, in train
    booster = Booster(params=params, train_set=train_set)
  File "c:\Users\Daniel\Documents\GitHub\zindi-early-learning-predictors-py\.venv\lib\site-packages\lightgbm\basic.py", line 2605, in __init__
    train_set.construct()
  File "c:\Users\Daniel\Documents\GitHub\zindi-early-learning

OSError: exception: access violation reading 0x0000000000000128

In [19]:
# refit model on complete training dataset and check prediction error
lgb_dataset = lgb.Dataset(
    data=train_data_x,
    label=train_data_y
)

bst = lgb.train(
    params=study_bo.best_params, 
    train_set=lgb_dataset
)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13306
[LightGBM] [Info] Number of data points in the train set: 7726, number of used features: 698
[LightGBM] [Info] Start training from score 48.700467


In [20]:
test_data_preds = bst.predict(data=test_data_x)
test_rmse = mean_squared_error(test_data_y, test_data_preds, squared=False)

print('Score', round(test_rmse, ndigits=6))

Score 9.731916


In [None]:
# fit a final model to complete training set and predict submission data


In [None]:
# save and load model
# bst.save_model('model.txt')
# bst = lgb.Booster(model_file='model.txt')  # init model