# Table of Contents

1. [Imports and definitions](#imports-and-definitions)
2. [Load data](#load-data)
3. [Prepare data](#prepare-data)
4. [Define objectives](#define-objectives)
5. [Start tuning](#start-tuning)
6. [Conclusion](#conclusion)

---

# Imports and definitions

In [4]:
from pathlib import Path
import time

import polars as pl
import numpy as np

from sklearn.model_selection import StratifiedKFold
import numpy as np
import xgboost as xgb
from sklearn.metrics import precision_recall_curve, f1_score


from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

import optuna

_ = pl.Config.set_tbl_cols(None)
_ = pl.Config.set_fmt_str_lengths(500)
_ = pl.Config.set_fmt_float("full")

In [5]:
import warnings
warnings.filterwarnings('ignore', category=RuntimeWarning, module='sklearn')

In [None]:
base_dir = Path('/workspaces/data-scientist-at-magenta')
code_dir = base_dir / 'notebooks'
data_dir = code_dir / "data"
features_dir = data_dir / 'features'
train_dir = data_dir / 'train'
db_dir = 'sqlite:///data/models/{}.db'


# Load data

In [4]:
%%time

train = pl.read_parquet(train_dir / 'data-v0-80.parquet')

CPU times: user 15 ms, sys: 7.74 ms, total: 22.7 ms
Wall time: 34.5 ms


# Prepare data

In [5]:
X = train.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
y = train.select('has_done_upselling')


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define objectives

In [6]:
# Compute the ratio of negative to positive instances in the target
ratio_negative_to_positive = (
    (y['has_done_upselling'] == False).sum() / (y['has_done_upselling'] == True).sum()
)
print("ratio_negative_to_positive:", ratio_negative_to_positive)

ratio_negative_to_positive: 13.186912573151268


In [8]:
def xgboost_objective(trial, X, y, skf, n_splits=5):
    '''
    XGBoost objective function using stratified cross-validation

    Args:
        trial: Optuna trial object
        n_splits: Number of folds for cross-validation (default: 5)
    '''

    cv_scores = []

    param = {
        'verbosity': 0,
        'n_jobs': 4,
        'early_stopping_rounds': 16,
        'eval_metric': 'aucpr',
        'scale_pos_weight': trial.suggest_float("scale_pos_weight", ratio_negative_to_positive * 0.7, ratio_negative_to_positive * 1.5, log=True),
        'objective': 'binary:logistic',
        'tree_method': 'hist',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),

        # L2 regularization weight.
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        # L1 regularization weight.
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        # sampling according to each tree.
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
    }

    if param['booster'] in ['gbtree', 'dart']:
        param['max_depth'] = trial.suggest_int('max_depth', 3, 20)
        param['min_child_weight'] = trial.suggest_int('min_child_weight', 2, 10)
        param['eta'] = trial.suggest_float('eta', 1e-8, 1.0, log=True)
        param['gamma'] = trial.suggest_float('gamma', 1e-8, 1.0, log=True)
        param['grow_policy'] = trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])

    if param['booster'] == 'dart':
        param['sample_type'] = trial.suggest_categorical('sample_type', ['uniform', 'weighted'])
        param['normalize_type'] = trial.suggest_categorical('normalize_type', ['tree', 'forest'])
        param['rate_drop'] = trial.suggest_float('rate_drop', 1e-8, 1.0, log=True)
        param['skip_drop'] = trial.suggest_float('skip_drop', 1e-8, 1.0, log=True)

    start_time = time.time()

    for train_idx, valid_idx in skf.split(X, y):
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()

        dtrain = xgb.DMatrix(train_x, label=train_y)
        dvalid = xgb.DMatrix(valid_x, label=valid_y)

        bst = xgb.train(param, dtrain)

        preds = bst.predict(dvalid)
        pred_labels = (preds >= 0.5).astype(int)
        f1 = f1_score(valid_y, pred_labels)

        cv_scores.append(f1)

    execution_time = time.time() - start_time

    trial.set_user_attr('execution_time', execution_time)

    return np.mean(cv_scores)


In [None]:
def random_forest_objective(trial, X, y, skf, n_splits=5):
    '''
    Random Forest objective function using stratified cross-validation

    Args:
        trial: Optuna trial object
        X: Feature matrix
        y: Target vector
        skf: Stratified K-Fold cross-validator
        n_splits: Number of folds for cross-validation (default: 5)
    '''

    cv_scores = []

    # Random Forest hyperparameters
    param = {
        'n_jobs': 4,
        'random_state': 42,
        'verbose': 0,

        # Core tree parameters
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5),

        # Feature sampling parameters
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'max_samples': trial.suggest_float('max_samples', 0.1, 1.0),

        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 10, 1000),

        # Class balancing
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample', None]),
    }

    rf = RandomForestClassifier(**param)

    start_time = time.time()

    for train_idx, valid_idx in skf.split(X, y):
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()

        rf.fit(train_x, train_y)

        preds = rf.predict(valid_x)

        f1 = f1_score(valid_y, preds)
        cv_scores.append(f1)

    execution_time = time.time() - start_time

    trial.set_user_attr('execution_time', execution_time)

    return np.mean(cv_scores)


In [None]:
def histgb_objective(trial, X, y, skf, n_splits=5):
    '''
    HistGradientBoostingClassifier objective function using stratified cross-validation

    Args:
        trial: Optuna trial object
        X: Feature matrix
        y: Target vector
        skf: Stratified K-Fold cross-validator
        n_splits: Number of folds for cross-validation (default: 5)
    '''

    cv_scores = []

    # HistGradientBoosting hyperparameters (simplified)
    param = {
        'random_state': 42,
        'verbose': 0,

        # Core boosting parameters
        'max_iter': trial.suggest_int('max_iter', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 10, 100),

        # Regularization
        'l2_regularization': trial.suggest_float('l2_regularization', 0.0, 1.0),
        'max_bins': trial.suggest_int('max_bins', 32, 255),

        # Early stopping
        'early_stopping': True,
        'n_iter_no_change': 10,
        'validation_fraction': 0.1,
    }

    hgb = HistGradientBoostingClassifier(**param)

    start_time = time.time()

    for train_idx, valid_idx in skf.split(X, y):
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()

        hgb.fit(train_x, train_y)

        preds = hgb.predict(valid_x) 
        f1 = f1_score(valid_y, preds)
        cv_scores.append(f1)

    execution_time = time.time() - start_time

    trial.set_user_attr('execution_time', execution_time)

    return np.mean(cv_scores)


In [11]:
def lightgbm_objective(trial, X, y, skf, n_splits=5):
    '''
    LightGBM objective function using stratified cross-validation

    Args:
        trial: Optuna trial object
        X: Feature matrix
        y: Target vector
        skf: Stratified K-Fold cross-validator
        n_splits: Number of folds for cross-validation (default: 5)
    '''

    cv_scores = []

    # LightGBM hyperparameters
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'verbosity': 0,
        'seed': 42,
        'num_threads': 4,
        'deterministic': True,

        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),

        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'scale_pos_weight': trial.suggest_float("scale_pos_weight", ratio_negative_to_positive * 0.7, ratio_negative_to_positive * 1.5, log=True),
    }

    start_time = time.time()

    # Perform stratified cross-validation
    for train_idx, valid_idx in skf.split(X, y):
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()

        train_data = lgb.Dataset(train_x, label=train_y)
        valid_data = lgb.Dataset(valid_x, label=valid_y, reference=train_data)

        model = lgb.train(
            param,
            train_data,
            valid_sets=[valid_data],
            num_boost_round=1000,
            callbacks=[lgb.early_stopping(50, verbose=False), lgb.log_evaluation(0)]
        )

        preds = model.predict(valid_x, num_iteration=model.best_iteration)
        pred_labels = (preds >= 0.5).astype(int)
        f1 = f1_score(valid_y, pred_labels)

        cv_scores.append(f1)

    execution_time = time.time() - start_time

    trial.set_user_attr('execution_time', execution_time)

    return np.mean(cv_scores)


In [12]:
def catboost_objective(trial, X, y, skf, n_splits=5):
    '''
    CatBoost objective function using stratified cross-validation

    Args:
        trial: Optuna trial object
        X: Feature matrix
        y: Target vector
        skf: Stratified K-Fold cross-validator
        n_splits: Number of folds for cross-validation (default: 5)
    '''

    cv_scores = []

    # CatBoost hyperparameters (simplified)
    param = {
        'random_seed': 42,
        'verbose': False,
        'allow_writing_files': False,
        'thread_count': 4,

        # Core boosting parameters
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),

        # Regularization and overfitting control
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 0.0, 10.0),
        'scale_pos_weight': trial.suggest_float("scale_pos_weight", ratio_negative_to_positive * 0.7, ratio_negative_to_positive * 1.5, log=True),

        # Early stopping
        'early_stopping_rounds': 50,
        'eval_metric': 'F1',
    }

    cb = CatBoostClassifier(**param)

    start_time = time.time()

    for train_idx, valid_idx in skf.split(X, y):
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()

        cb.fit(
            train_x, train_y,
            eval_set=(valid_x, valid_y),
            verbose=False
        )

        preds = cb.predict_proba(valid_x)[:, 1]
        pred_labels = (preds >= 0.5).astype(int)
        f1 = f1_score(valid_y, pred_labels)

        cv_scores.append(f1)

    execution_time = time.time() - start_time

    trial.set_user_attr('execution_time', execution_time)

    return np.mean(cv_scores)


# Start tuning

In [None]:
# Setup
n_trials = 150

In [14]:
# XGBoost study
xgb_study = optuna.create_study(
    study_name="xgboost_optimization_basef1",
    direction="maximize",
    storage=db_dir.format('xgb_study'),
    load_if_exists=True
)
xgb_study.optimize(lambda trial: xgboost_objective(trial, X, y, skf), n_trials=n_trials)

print(f"\nBest XGB score: {xgb_study.best_value}")
print(f"Best XGB params: {xgb_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in xgb_study.trials]
print(f"\nAverage execution time XGB: {np.mean(execution_times):.2f}s")
print(f"Total optimization time XGB: {sum(execution_times):.2f}s")

[I 2025-07-01 12:27:01,307] A new study created in RDB with name: xgboost_optimization_basef1
[I 2025-07-01 12:27:03,440] Trial 0 finished with value: 0.14007246446408964 and parameters: {'scale_pos_weight': 17.959628053601357, 'booster': 'dart', 'n_estimators': 1699, 'learning_rate': 0.03702882745302137, 'lambda': 7.691946948678017e-05, 'alpha': 6.395085473663006e-07, 'subsample': 0.9913005910443302, 'colsample_bytree': 0.6364757385383926, 'max_depth': 16, 'min_child_weight': 7, 'eta': 1.178524491092608e-06, 'gamma': 0.0038945294874322012, 'grow_policy': 'depthwise', 'sample_type': 'uniform', 'normalize_type': 'forest', 'rate_drop': 9.254210043058606e-05, 'skip_drop': 0.00014708455577775717}. Best is trial 0 with value: 0.14007246446408964.
[I 2025-07-01 12:27:03,796] Trial 1 finished with value: 0.13169233541648787 and parameters: {'scale_pos_weight': 16.094355707249722, 'booster': 'gblinear', 'n_estimators': 591, 'learning_rate': 0.011502412962882676, 'lambda': 0.00521101910364952, 


Best XGB score: 0.16792221234900015
Best XGB params: {'scale_pos_weight': 11.64847032776273, 'booster': 'gbtree', 'n_estimators': 1424, 'learning_rate': 0.18151466082451148, 'lambda': 0.9690368683449678, 'alpha': 0.002986728256573267, 'subsample': 0.8619703864495335, 'colsample_bytree': 0.9796706593574434, 'max_depth': 4, 'min_child_weight': 8, 'eta': 0.0007282719538668855, 'gamma': 1.2631117722324442e-06, 'grow_policy': 'lossguide'}

Average execution time XGB: 1.04s
Total optimization time XGB: 155.52s


In [15]:
# Random Forest study
rf_study = optuna.create_study(
    study_name="random_forest_optimization_basef1",
    direction="maximize",
    storage=db_dir.format('rf_study'),
    load_if_exists=True
)
rf_study.optimize(lambda trial: random_forest_objective(trial, X, y, skf), n_trials=n_trials)

print(f"\nBest RF score: {rf_study.best_value}")
print(f"Best RF params: {rf_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in rf_study.trials]
print(f"\nAverage execution time RF: {np.mean(execution_times):.2f}s")
print(f"Total optimization time RF: {sum(execution_times):.2f}s")

[I 2025-07-01 12:29:45,293] A new study created in RDB with name: random_forest_optimization_basef1
[I 2025-07-01 12:30:00,723] Trial 0 finished with value: 0.16108648285868837 and parameters: {'n_estimators': 347, 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 8, 'min_weight_fraction_leaf': 0.12163904033097872, 'max_features': 'sqrt', 'max_samples': 0.9965349584288773, 'max_leaf_nodes': 46, 'class_weight': 'balanced_subsample'}. Best is trial 0 with value: 0.16108648285868837.
[I 2025-07-01 12:30:04,771] Trial 1 finished with value: 0.1620349059254677 and parameters: {'n_estimators': 189, 'max_depth': 8, 'min_samples_split': 13, 'min_samples_leaf': 2, 'min_weight_fraction_leaf': 0.09352908659404302, 'max_features': 'log2', 'max_samples': 0.15765220523500748, 'max_leaf_nodes': 652, 'class_weight': 'balanced_subsample'}. Best is trial 1 with value: 0.1620349059254677.
[I 2025-07-01 12:30:32,683] Trial 2 finished with value: 0.16404267728012947 and parameters: {'n_estimators


Best RF score: 0.1669563556238734
Best RF params: {'n_estimators': 864, 'max_depth': 19, 'min_samples_split': 19, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.015327799848051129, 'max_features': 'sqrt', 'max_samples': 0.2900931234940456, 'max_leaf_nodes': 154, 'class_weight': 'balanced'}

Average execution time RF: 25.68s
Total optimization time RF: 3851.40s


In [16]:
# HistGradientBoosting study
histgb_study = optuna.create_study(
    study_name="histgb_optimization_basef1",
    direction="maximize",
    storage=db_dir.format('histgb_study'),
    load_if_exists=True
)
histgb_study.optimize(lambda trial: histgb_objective(trial, X, y, skf), n_trials=n_trials)

print(f"\nBest HistGB score: {histgb_study.best_value}")
print(f"Best HistGB params: {histgb_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in histgb_study.trials]
print(f"\nAverage execution time HistGB: {np.mean(execution_times):.2f}s")
print(f"Total optimization time HistGB: {sum(execution_times):.2f}s")

[I 2025-07-01 13:34:03,541] A new study created in RDB with name: histgb_optimization_basef1
[I 2025-07-01 13:34:14,063] Trial 0 finished with value: 0.0 and parameters: {'max_iter': 243, 'learning_rate': 0.031198398220887345, 'max_depth': 4, 'min_samples_leaf': 100, 'l2_regularization': 0.6668262373207421, 'max_bins': 92}. Best is trial 0 with value: 0.0.
[I 2025-07-01 13:34:40,117] Trial 1 finished with value: 0.0 and parameters: {'max_iter': 153, 'learning_rate': 0.022763706978379483, 'max_depth': 8, 'min_samples_leaf': 36, 'l2_regularization': 0.19230050007760757, 'max_bins': 245}. Best is trial 0 with value: 0.0.
[I 2025-07-01 13:35:18,463] Trial 2 finished with value: 0.0 and parameters: {'max_iter': 169, 'learning_rate': 0.013885655009320897, 'max_depth': 9, 'min_samples_leaf': 94, 'l2_regularization': 0.9642288953939487, 'max_bins': 143}. Best is trial 0 with value: 0.0.
[I 2025-07-01 13:35:37,803] Trial 3 finished with value: 0.0 and parameters: {'max_iter': 349, 'learning_rat


Best HistGB score: 0.0
Best HistGB params: {'max_iter': 243, 'learning_rate': 0.031198398220887345, 'max_depth': 4, 'min_samples_leaf': 100, 'l2_regularization': 0.6668262373207421, 'max_bins': 92}

Average execution time HistGB: 11.22s
Total optimization time HistGB: 1682.59s


In [17]:
# LightGBM study
lgb_study = optuna.create_study(
    study_name="lightgbm_optimization_basef1",
    direction="maximize",
    storage=db_dir.format('lgb_study'),
    load_if_exists=True
)
lgb_study.optimize(lambda trial: lightgbm_objective(trial, X, y, skf), n_trials=n_trials)

print(f"\nBest LightGBM score: {lgb_study.best_value}")
print(f"Best LightGBM params: {lgb_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in lgb_study.trials]
print(f"\nAverage execution time LightGBM: {np.mean(execution_times):.2f}s")
print(f"Total optimization time LightGBM: {sum(execution_times):.2f}s")

[I 2025-07-01 14:02:11,676] A new study created in RDB with name: lightgbm_optimization_basef1
[I 2025-07-01 14:02:16,893] Trial 0 finished with value: 0.0 and parameters: {'num_leaves': 108, 'learning_rate': 0.029141699139667094, 'feature_fraction': 0.5902644610241378, 'bagging_fraction': 0.4114176649942406, 'bagging_freq': 5, 'min_child_samples': 67, 'reg_alpha': 0.659622415665781, 'reg_lambda': 0.1884845018259208, 'scale_pos_weight': 19.233273742245892}. Best is trial 0 with value: 0.0.
[I 2025-07-01 14:02:23,470] Trial 1 finished with value: 0.0 and parameters: {'num_leaves': 126, 'learning_rate': 0.06618219019556781, 'feature_fraction': 0.6238563913707307, 'bagging_fraction': 0.8213643124276205, 'bagging_freq': 3, 'min_child_samples': 18, 'reg_alpha': 0.9321057313017755, 'reg_lambda': 0.8111553497258805, 'scale_pos_weight': 10.343745749945798}. Best is trial 0 with value: 0.0.
[I 2025-07-01 14:02:28,866] Trial 2 finished with value: 0.0 and parameters: {'num_leaves': 110, 'learnin



[I 2025-07-01 14:03:10,970] Trial 8 finished with value: 0.0 and parameters: {'num_leaves': 291, 'learning_rate': 0.025568941076938405, 'feature_fraction': 0.49067707828753565, 'bagging_fraction': 0.4192295700102029, 'bagging_freq': 2, 'min_child_samples': 62, 'reg_alpha': 0.7571830007752852, 'reg_lambda': 0.8115832478184745, 'scale_pos_weight': 13.718837745472912}. Best is trial 0 with value: 0.0.
[I 2025-07-01 14:03:16,197] Trial 9 finished with value: 0.0 and parameters: {'num_leaves': 96, 'learning_rate': 0.1864551472590932, 'feature_fraction': 0.5553212802708843, 'bagging_fraction': 0.7433528761768116, 'bagging_freq': 4, 'min_child_samples': 10, 'reg_alpha': 0.3704539926896133, 'reg_lambda': 0.054065977569773604, 'scale_pos_weight': 18.155412757656197}. Best is trial 0 with value: 0.0.
[I 2025-07-01 14:03:24,540] Trial 10 finished with value: 0.0 and parameters: {'num_leaves': 207, 'learning_rate': 0.0347231211546126, 'feature_fraction': 0.8161667876647327, 'bagging_fraction': 0.6



[I 2025-07-01 14:10:55,528] Trial 40 finished with value: 0.0 and parameters: {'num_leaves': 256, 'learning_rate': 0.2329498689852333, 'feature_fraction': 0.77637308435654, 'bagging_fraction': 0.5575377763012762, 'bagging_freq': 4, 'min_child_samples': 81, 'reg_alpha': 0.01891940347312257, 'reg_lambda': 0.9839315645159785, 'scale_pos_weight': 13.151218691180393}. Best is trial 15 with value: 0.08365995454795136.
[I 2025-07-01 14:11:23,261] Trial 41 finished with value: 0.06413074833004477 and parameters: {'num_leaves': 232, 'learning_rate': 0.26330618968572983, 'feature_fraction': 0.914177487685609, 'bagging_fraction': 0.8124732007723093, 'bagging_freq': 6, 'min_child_samples': 45, 'reg_alpha': 0.19686448429024997, 'reg_lambda': 0.5681719755788776, 'scale_pos_weight': 18.41552343667663}. Best is trial 15 with value: 0.08365995454795136.
[I 2025-07-01 14:11:42,609] Trial 42 finished with value: 0.04360635645447612 and parameters: {'num_leaves': 202, 'learning_rate': 0.24892932442560217,



[I 2025-07-01 14:21:12,239] Trial 86 finished with value: 0.0 and parameters: {'num_leaves': 210, 'learning_rate': 0.212921795223609, 'feature_fraction': 0.9414805894565605, 'bagging_fraction': 0.4600834931311478, 'bagging_freq': 6, 'min_child_samples': 95, 'reg_alpha': 0.9016069680552615, 'reg_lambda': 0.40704719040807713, 'scale_pos_weight': 14.463960072123728}. Best is trial 65 with value: 0.12437842284086162.
[I 2025-07-01 14:21:17,467] Trial 87 finished with value: 0.03910580218769876 and parameters: {'num_leaves': 170, 'learning_rate': 0.243855025820333, 'feature_fraction': 0.9004282604084886, 'bagging_fraction': 0.43956242003986223, 'bagging_freq': 7, 'min_child_samples': 90, 'reg_alpha': 0.8362610103487303, 'reg_lambda': 0.13760633684036733, 'scale_pos_weight': 15.3628955867332}. Best is trial 65 with value: 0.12437842284086162.
[I 2025-07-01 14:21:22,292] Trial 88 finished with value: 0.07564550961615588 and parameters: {'num_leaves': 154, 'learning_rate': 0.271999021175914, '



[I 2025-07-01 14:22:05,258] Trial 93 finished with value: 0.08469326516574867 and parameters: {'num_leaves': 185, 'learning_rate': 0.25144805958872535, 'feature_fraction': 0.9510727843163613, 'bagging_fraction': 0.4259900428016661, 'bagging_freq': 7, 'min_child_samples': 96, 'reg_alpha': 0.6931203281842412, 'reg_lambda': 0.4249404987709925, 'scale_pos_weight': 15.45998001371236}. Best is trial 65 with value: 0.12437842284086162.
[I 2025-07-01 14:22:10,645] Trial 94 finished with value: 0.0 and parameters: {'num_leaves': 176, 'learning_rate': 0.16618152848672205, 'feature_fraction': 0.9666467737583317, 'bagging_fraction': 0.4095246763754943, 'bagging_freq': 7, 'min_child_samples': 92, 'reg_alpha': 0.7164449656102765, 'reg_lambda': 0.4526278880811908, 'scale_pos_weight': 17.175422072030894}. Best is trial 65 with value: 0.12437842284086162.
[I 2025-07-01 14:22:16,817] Trial 95 finished with value: 0.0 and parameters: {'num_leaves': 204, 'learning_rate': 0.19139652739295288, 'feature_frac



[I 2025-07-01 14:22:28,206] Trial 97 finished with value: 0.12727512623294643 and parameters: {'num_leaves': 215, 'learning_rate': 0.27476346913980804, 'feature_fraction': 0.9491305219784584, 'bagging_fraction': 0.4602777277263444, 'bagging_freq': 7, 'min_child_samples': 100, 'reg_alpha': 0.6825510538446933, 'reg_lambda': 0.10930103913726716, 'scale_pos_weight': 15.8258764706305}. Best is trial 97 with value: 0.12727512623294643.
[I 2025-07-01 14:22:33,933] Trial 98 finished with value: 0.032608035885158 and parameters: {'num_leaves': 192, 'learning_rate': 0.23335025019956823, 'feature_fraction': 0.8863285282205181, 'bagging_fraction': 0.46026353880607307, 'bagging_freq': 6, 'min_child_samples': 90, 'reg_alpha': 0.6483187458768833, 'reg_lambda': 0.10699696105356604, 'scale_pos_weight': 15.876212353148194}. Best is trial 97 with value: 0.12727512623294643.
[I 2025-07-01 14:22:35,230] Trial 99 finished with value: 0.0 and parameters: {'num_leaves': 22, 'learning_rate': 0.2086842700956043



[I 2025-07-01 14:23:06,434] Trial 102 finished with value: 0.11569830574000609 and parameters: {'num_leaves': 242, 'learning_rate': 0.26207370751651227, 'feature_fraction': 0.9465153003642692, 'bagging_fraction': 0.4751548222627245, 'bagging_freq': 7, 'min_child_samples': 100, 'reg_alpha': 0.7660786537353719, 'reg_lambda': 0.5115139208098892, 'scale_pos_weight': 15.57499382215591}. Best is trial 97 with value: 0.12727512623294643.




[I 2025-07-01 14:23:20,954] Trial 103 finished with value: 0.09127500570276194 and parameters: {'num_leaves': 241, 'learning_rate': 0.2572495175534251, 'feature_fraction': 0.8610379631365852, 'bagging_fraction': 0.47729545182346084, 'bagging_freq': 7, 'min_child_samples': 43, 'reg_alpha': 0.8359047875818032, 'reg_lambda': 0.14824978060003516, 'scale_pos_weight': 15.441268286453147}. Best is trial 97 with value: 0.12727512623294643.




[I 2025-07-01 14:23:27,239] Trial 104 finished with value: 0.04788352591086333 and parameters: {'num_leaves': 247, 'learning_rate': 0.2502551950615619, 'feature_fraction': 0.857260629582973, 'bagging_fraction': 0.4715183454669805, 'bagging_freq': 7, 'min_child_samples': 100, 'reg_alpha': 0.8493559838833464, 'reg_lambda': 0.5167651030038042, 'scale_pos_weight': 14.245841440411859}. Best is trial 97 with value: 0.12727512623294643.
[I 2025-07-01 14:23:33,989] Trial 105 finished with value: 0.0 and parameters: {'num_leaves': 242, 'learning_rate': 0.010928472984695068, 'feature_fraction': 0.9011001448640134, 'bagging_fraction': 0.4514129188703532, 'bagging_freq': 7, 'min_child_samples': 42, 'reg_alpha': 0.9443930729479382, 'reg_lambda': 0.13408819767061045, 'scale_pos_weight': 15.24156111907403}. Best is trial 97 with value: 0.12727512623294643.
[I 2025-07-01 14:23:40,488] Trial 106 finished with value: 0.03772027383886028 and parameters: {'num_leaves': 226, 'learning_rate': 0.226901311299



[I 2025-07-01 14:26:56,183] Trial 123 finished with value: 0.09988658806390355 and parameters: {'num_leaves': 200, 'learning_rate': 0.24208797298605308, 'feature_fraction': 0.9584991562291042, 'bagging_fraction': 0.40006433752426185, 'bagging_freq': 7, 'min_child_samples': 91, 'reg_alpha': 0.9913354665993771, 'reg_lambda': 0.22470763637548252, 'scale_pos_weight': 17.499817982064652}. Best is trial 97 with value: 0.12727512623294643.
[I 2025-07-01 14:27:13,700] Trial 124 finished with value: 0.08938019969516028 and parameters: {'num_leaves': 176, 'learning_rate': 0.29721702793670324, 'feature_fraction': 0.9425950493057165, 'bagging_fraction': 0.45702896316877173, 'bagging_freq': 7, 'min_child_samples': 39, 'reg_alpha': 0.8673767841746708, 'reg_lambda': 0.3330379177367986, 'scale_pos_weight': 15.848856882039163}. Best is trial 97 with value: 0.12727512623294643.
[I 2025-07-01 14:27:20,309] Trial 125 finished with value: 0.09297609186583057 and parameters: {'num_leaves': 215, 'learning_ra


Best LightGBM score: 0.1307735191277124
Best LightGBM params: {'num_leaves': 159, 'learning_rate': 0.28054343791849756, 'feature_fraction': 0.9664319955784927, 'bagging_fraction': 0.46580090656089546, 'bagging_freq': 7, 'min_child_samples': 97, 'reg_alpha': 0.7231423095015118, 'reg_lambda': 0.1850374445599981, 'scale_pos_weight': 16.4198899310671}

Average execution time LightGBM: 11.18s
Total optimization time LightGBM: 1677.52s


In [18]:
# CatBoost study
cat_study = optuna.create_study(
    study_name="catboost_optimization_basef1",
    direction="maximize",
    storage=db_dir.format('cat_study'),
    load_if_exists=True
)
cat_study.optimize(lambda trial: catboost_objective(trial, X, y, skf), n_trials=n_trials)

print(f"\nBest CatBoost score: {cat_study.best_value}")
print(f"Best CatBoost params: {cat_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in cat_study.trials]
print(f"\nAverage execution time CatBoost: {np.mean(execution_times):.2f}s")
print(f"Total optimization time CatBoost: {sum(execution_times):.2f}s")

[I 2025-07-01 14:30:14,576] A new study created in RDB with name: catboost_optimization_basef1
[I 2025-07-01 14:30:18,512] Trial 0 finished with value: 0.16610686920550702 and parameters: {'iterations': 656, 'learning_rate': 0.28751902681286345, 'depth': 5, 'l2_leaf_reg': 6.037154408757308, 'bagging_temperature': 0.6508926878297332, 'random_strength': 6.709361692283231, 'scale_pos_weight': 10.806532250161368}. Best is trial 0 with value: 0.16610686920550702.
[I 2025-07-01 14:30:22,601] Trial 1 finished with value: 0.15874483996341976 and parameters: {'iterations': 338, 'learning_rate': 0.27287284990735466, 'depth': 5, 'l2_leaf_reg': 2.3103261813990636, 'bagging_temperature': 0.954540357978238, 'random_strength': 5.479479104260786, 'scale_pos_weight': 9.83575560089471}. Best is trial 0 with value: 0.16610686920550702.
[I 2025-07-01 14:30:25,037] Trial 2 finished with value: 0.14790919344226547 and parameters: {'iterations': 165, 'learning_rate': 0.07978975305390953, 'depth': 5, 'l2_leaf


Best CatBoost score: 0.16896046200347054
Best CatBoost params: {'iterations': 486, 'learning_rate': 0.13528485417531405, 'depth': 6, 'l2_leaf_reg': 4.832572033297634, 'bagging_temperature': 0.3319519715148951, 'random_strength': 0.0023388056511300936, 'scale_pos_weight': 11.52515681907139}

Average execution time CatBoost: 2.77s
Total optimization time CatBoost: 414.93s


# Conclusion

In [9]:
# Load all Optuna studies from their respective databases
xgb_study = optuna.load_study(
    study_name="xgboost_optimization_basef1",
    storage=db_dir.format('xgb_study')
)
rf_study = optuna.load_study(
    study_name="random_forest_optimization_basef1",
    storage=db_dir.format('rf_study')
)
histgb_study = optuna.load_study(
    study_name="histgb_optimization_basef1",
    storage=db_dir.format('histgb_study')
)
lgb_study = optuna.load_study(
    study_name="lightgbm_optimization_basef1",
    storage=db_dir.format('lgb_study')
)
cat_study = optuna.load_study(
    study_name="catboost_optimization_basef1",
    storage=db_dir.format('cat_study')
)

In [10]:
print(f"Best XGBoost score: {xgb_study.best_value:.3f}, Avg time: {np.mean([t.user_attrs.get('execution_time', 0) for t in xgb_study.trials]):.2f}s")
print(f"Best Random Forest score: {rf_study.best_value:.3f}, Avg time: {np.mean([t.user_attrs.get('execution_time', 0) for t in rf_study.trials]):.2f}s")
print(f"Best HistGB score: {histgb_study.best_value:.3f}, Avg time: {np.mean([t.user_attrs.get('execution_time', 0) for t in histgb_study.trials]):.2f}s")
print(f"Best LightGBM score: {lgb_study.best_value:.3f}, Avg time: {np.mean([t.user_attrs.get('execution_time', 0) for t in lgb_study.trials]):.2f}s")
print(f"Best CatBoost score: {cat_study.best_value:.3f}, Avg time: {np.mean([t.user_attrs.get('execution_time', 0) for t in cat_study.trials]):.2f}s")

Best XGBoost score: 0.168, Avg time: 1.04s
Best Random Forest score: 0.167, Avg time: 25.68s
Best HistGB score: 0.000, Avg time: 11.22s
Best LightGBM score: 0.131, Avg time: 11.18s
Best CatBoost score: 0.169, Avg time: 2.77s
