In [1]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import root_mean_squared_error
import optuna
import numpy as np
from dataset import Dataset

seed = 1234

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds = Dataset()
df_train, df_test = train_test_split(ds.df, test_size=.15, shuffle=True, random_state=seed)
ds_train = Dataset(df=df_train)
ds_test = Dataset(df=df_test)

In [10]:
# Data
X, y = ds_train()
X_test, y_test = ds_test()

kf = KFold(n_splits=5, shuffle=True, random_state=seed)

In [3]:
def objective(trial):
    fixed_max_iter = 1000

    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "max_leaf_nodes": trial.suggest_categorical("max_leaf_nodes", [15, 31, 63, 127]),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
        "l2_regularization": trial.suggest_float("l2_regularization", 0.0, 10.0),
        "early_stopping": True,
        "validation_fraction": 0.1,  # Let sklearn split off 10% internally
        "n_iter_no_change": 10,
        "max_iter": fixed_max_iter,
        "categorical_features": ds_train.categorical_features
    }

    model = MultiOutputRegressor(HistGradientBoostingRegressor(**params))

    cv_rmse = []
    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)  # internal early stopping on 10% of X_train
        preds = model.predict(X_val)
        rmse = root_mean_squared_error(y_val, preds)
        cv_rmse.append(rmse)

    return np.mean(cv_rmse)

In [6]:
# Run Optuna
study = optuna.create_study(direction="minimize", study_name='K-fold')
study.optimize(objective, n_trials=50, timeout=1800)

# Output results
print("Best RMSE:", study.best_value)
print("Best hyperparameters:")
for k, v in study.best_params.items():
    print(f"  {k}: {v}")

[I 2025-04-09 04:37:17,971] A new study created in memory with name: K-fold


[I 2025-04-09 04:37:28,565] Trial 0 finished with value: 15.148658769842806 and parameters: {'learning_rate': 0.14809263954639715, 'max_leaf_nodes': 31, 'min_samples_leaf': 2, 'l2_regularization': 8.237805262151138}. Best is trial 0 with value: 15.148658769842806.
[I 2025-04-09 04:37:36,396] Trial 1 finished with value: 15.188690693617867 and parameters: {'learning_rate': 0.12637278441308508, 'max_leaf_nodes': 15, 'min_samples_leaf': 8, 'l2_regularization': 7.424686425228803}. Best is trial 0 with value: 15.148658769842806.
[I 2025-04-09 04:37:43,776] Trial 2 finished with value: 15.097002148253342 and parameters: {'learning_rate': 0.16318909099461024, 'max_leaf_nodes': 15, 'min_samples_leaf': 4, 'l2_regularization': 4.30436135365296}. Best is trial 2 with value: 15.097002148253342.
[I 2025-04-09 04:37:53,841] Trial 3 finished with value: 15.185808415135181 and parameters: {'learning_rate': 0.10093360989108946, 'max_leaf_nodes': 15, 'min_samples_leaf': 18, 'l2_regularization': 0.259129

Best RMSE: 15.075496316417661
Best hyperparameters:
  learning_rate: 0.13295232895824177
  max_leaf_nodes: 31
  min_samples_leaf: 2
  l2_regularization: 9.000826977751071


In [32]:
params = {
    "early_stopping": False,
    "n_iter_no_change": 10,
    "max_iter": 200,
    "categorical_features": ds_train.categorical_features,
    #"verbose": 1
} | study.best_params
model = MultiOutputRegressor(HistGradientBoostingRegressor(**params))
model.fit(X, y)

In [33]:
preds_train = model.predict(X)
preds_test = model.predict(X_test)
rmse_train = root_mean_squared_error(y, preds_train)
rmse_test = root_mean_squared_error(y_test, preds_test)
print(rmse_train)
print(rmse_test)

13.786581572940042
15.326394351903105
