# Imports and definitions

# Table of Contents

1. [Imports and definitions](#imports-and-definitions)
2. [Load data](#load-data)
3. [Prepare data](#prepare-data)
4. [Define objectives](#define-objectives)
5. [Start tuning](#start-tuning)
6. [Conclusion](#conclusion)

---

In [29]:
from pathlib import Path
import time

import polars as pl
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_curve, f1_score

from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

import optuna

_ = pl.Config.set_tbl_cols(None)
_ = pl.Config.set_fmt_str_lengths(500)
_ = pl.Config.set_fmt_float("full")

In [2]:
import warnings
warnings.filterwarnings('ignore', category=RuntimeWarning, module='sklearn')

In [None]:
base_dir = Path('/workspaces/data-scientist-at-magenta')
code_dir = base_dir / 'notebooks'
data_dir = code_dir / "data"
features_dir = data_dir / 'features'
train_dir = data_dir / 'train'
db_dir = 'sqlite:///data/models/{}.db'

# Load data

In [4]:
%%time

train = pl.read_parquet(train_dir / 'data-v0-80.parquet')

CPU times: user 14 ms, sys: 14.7 ms, total: 28.7 ms
Wall time: 46.5 ms


# Prepare data

In [5]:
days_b_1 = train.filter(pl.col("contract_lifetime_days") < 1000)
days_b_2 = train.filter(pl.col("contract_lifetime_days") >= 1000)

In [6]:
X_1 = days_b_1.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling', 'contract_lifetime_days']))
y_1 = days_b_1.select('has_done_upselling')

X_2 = days_b_2.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling', 'contract_lifetime_days']))
y_2 = days_b_2.select('has_done_upselling')


In [7]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define objectives

In [8]:
# Compute the ratio of negative to positive instances in the target
ratio_negative_to_positive_b_1 = (
    (y_1['has_done_upselling'] == False).sum() / (y_1['has_done_upselling'] == True).sum()
)
print("ratio_negative_to_positive_b_1:", ratio_negative_to_positive_b_1)

ratio_negative_to_positive_b_1: 13.102610493667614


In [9]:
# Compute the ratio of negative to positive instances in the target
ratio_negative_to_positive_b_2 = (
    (y_2['has_done_upselling'] == False).sum() / (y_2['has_done_upselling'] == True).sum()
)
print("ratio_negative_to_positive_b_1:", ratio_negative_to_positive_b_2)

ratio_negative_to_positive_b_1: 13.371186440677967


In [10]:
def find_optimal_f1(valid_y, preds):
    # Find optimal threshold for F1
    precision, recall, thresholds = precision_recall_curve(valid_y, preds)
    f1_scores_thresh = 2 * (precision * recall) / (precision + recall + 1e-8)
    optimal_idx = np.argmax(f1_scores_thresh)
    optimal_threshold = thresholds[optimal_idx] if optimal_idx < len(thresholds) else 0.5
    
    # Make binary predictions using optimal threshold
    pred_labels = (preds >= optimal_threshold).astype(int)
    return f1_score(valid_y, pred_labels), optimal_threshold

In [11]:
def xgboost_objective(trial, X, y, skf, ratio_negative_to_positive, n_splits=5):
    '''
    XGBoost objective function using stratified cross-validation
    
    Args:
        trial: Optuna trial object
        n_splits: Number of folds for cross-validation (default: 5)
    '''
    
    cv_scores = []
    optimal_thresholds = []

    param = {
        'verbosity': 0,
        'n_jobs': 4,
        'early_stopping_rounds': 16,
        'eval_metric': 'aucpr',
        'scale_pos_weight': trial.suggest_float("scale_pos_weight", 1.0, ratio_negative_to_positive * 1.5, log=True),
        'objective': 'binary:logistic',
        'tree_method': 'hist',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),

        # L2 regularization weight.
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        # L1 regularization weight.
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        # sampling according to each tree.
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
    }

    if param['booster'] in ['gbtree', 'dart']:
        # maximum depth of the tree, signifies complexity of the tree.
        param['max_depth'] = trial.suggest_int('max_depth', 3, 20)
        # minimum child weight, larger the term more conservative the tree.
        param['min_child_weight'] = trial.suggest_int('min_child_weight', 2, 10)
        param['eta'] = trial.suggest_float('eta', 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param['gamma'] = trial.suggest_float('gamma', 1e-8, 1.0, log=True)
        param['grow_policy'] = trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])

    if param['booster'] == 'dart':
        param['sample_type'] = trial.suggest_categorical('sample_type', ['uniform', 'weighted'])
        param['normalize_type'] = trial.suggest_categorical('normalize_type', ['tree', 'forest'])
        param['rate_drop'] = trial.suggest_float('rate_drop', 1e-8, 1.0, log=True)
        param['skip_drop'] = trial.suggest_float('skip_drop', 1e-8, 1.0, log=True)
    
    start_time = time.time()
    
    # Return mean F1 score across all folds
    # Perform stratified cross-validation
    for train_idx, valid_idx in skf.split(X, y):
        # Split data for current fold
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()

        # Create DMatrix objects
        dtrain = xgb.DMatrix(train_x, label=train_y)
        dvalid = xgb.DMatrix(valid_x, label=valid_y)
        
        # Train model
        bst = xgb.train(param, dtrain)
        
        # Make predictions
        preds = bst.predict(dvalid)
        
        f1, optimal_threshold = find_optimal_f1(valid_y, preds)
        
        cv_scores.append(f1)
        optimal_thresholds.append(float(optimal_threshold))

    execution_time = time.time() - start_time
    
    trial.set_user_attr('execution_time', execution_time)
    trial.set_user_attr('optimal_thresholds', optimal_thresholds)
    trial.set_user_attr('threshold', float(np.mean(optimal_thresholds)))

    # Return mean F1 score across all folds
    return np.mean(cv_scores)

In [12]:
def random_forest_objective(trial, X, y, skf, n_splits=5):
    '''
    Random Forest objective function using stratified cross-validation

    Args:
        trial: Optuna trial object
        X: Feature matrix
        y: Target vector
        skf: Stratified K-Fold cross-validator
        n_splits: Number of folds for cross-validation (default: 5)
    '''

    cv_scores = []

    # Random Forest hyperparameters
    param = {
        'n_jobs': 4,
        'random_state': 42,
        'verbose': 0,

        # Core tree parameters
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5),

        # Feature sampling parameters
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'max_samples': trial.suggest_float('max_samples', 0.1, 1.0),

        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 10, 1000),

        # Class balancing
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample', None]),
    }

    rf = RandomForestClassifier(**param)

    start_time = time.time()

    for train_idx, valid_idx in skf.split(X, y):
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()

        rf.fit(train_x, train_y)

        preds = rf.predict(valid_x)

        f1 = f1_score(valid_y, preds)
        cv_scores.append(f1)

    execution_time = time.time() - start_time

    trial.set_user_attr('execution_time', execution_time)

    return np.mean(cv_scores)

In [13]:
def lightgbm_objective(trial, X, y, skf, ratio_negative_to_positive, n_splits=5):
    '''
    LightGBM objective function using stratified cross-validation
    
    Args:
        trial: Optuna trial object
        X: Feature matrix
        y: Target vector
        skf: Stratified K-Fold cross-validator
        n_splits: Number of folds for cross-validation (default: 5)
    '''
    
    cv_scores = []
    optimal_thresholds = []

    # LightGBM hyperparameters
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'verbosity': 0,
        'seed': 42,
        'num_threads': 4,
        'deterministic': True,
        
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'scale_pos_weight': trial.suggest_float("scale_pos_weight", ratio_negative_to_positive * 0.7, ratio_negative_to_positive * 1.5, log=True),

    }
    
    start_time = time.time()

    # Perform stratified cross-validation
    for train_idx, valid_idx in skf.split(X, y):
        # Split data for current fold
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()
        
        # Create LightGBM datasets
        train_data = lgb.Dataset(train_x, label=train_y)
        valid_data = lgb.Dataset(valid_x, label=valid_y, reference=train_data)
        
        # Train model with early stopping
        model = lgb.train(
            param,
            train_data,
            valid_sets=[valid_data],
            num_boost_round=1000,
            callbacks=[lgb.early_stopping(50, verbose=False), lgb.log_evaluation(0)]
        )
        
        # Make probability predictions
        preds = model.predict(valid_x, num_iteration=model.best_iteration)
        
        f1, optimal_threhsold = find_optimal_f1(valid_y, preds)
        
        cv_scores.append(f1)
        optimal_thresholds.append(optimal_threhsold)

    execution_time = time.time() - start_time
    
    trial.set_user_attr('execution_time', execution_time)
    trial.set_user_attr('optimal_thresholds', optimal_thresholds)
    trial.set_user_attr('threshold', np.mean(optimal_thresholds))
    
    # Return mean F1 score across all folds
    return np.mean(cv_scores)

In [14]:
def catboost_objective(trial, X, y, skf, ratio_negative_to_positive, n_splits=5):
    '''
    CatBoost objective function using stratified cross-validation
    
    Args:
        trial: Optuna trial object
        X: Feature matrix
        y: Target vector
        skf: Stratified K-Fold cross-validator
        n_splits: Number of folds for cross-validation (default: 5)
    '''
    
    cv_scores = []
    optimal_thresholds = []

    # CatBoost hyperparameters (simplified)
    param = {
        'random_seed': 42,
        'verbose': False,
        'allow_writing_files': False,
        'thread_count': 4,
        
        # Core boosting parameters
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
        
        # Regularization and overfitting control
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 0.0, 10.0),
        'scale_pos_weight': trial.suggest_float("scale_pos_weight", ratio_negative_to_positive * 0.7, ratio_negative_to_positive * 1.5, log=True),
        
        # Early stopping
        'early_stopping_rounds': 50,
        'eval_metric': 'F1',
    }
    
    # Create CatBoost classifier
    cb = CatBoostClassifier(**param)
    
    start_time = time.time()

    # Perform stratified cross-validation
    for train_idx, valid_idx in skf.split(X, y):
        # Split data for current fold
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()
        
        # Train model with validation set for early stopping
        cb.fit(
            train_x, train_y,
            eval_set=(valid_x, valid_y),
            verbose=False
        )
        
        # Make probability predictions
        preds = cb.predict_proba(valid_x)[:, 1]  # Get probability of positive class
    
        f1, optimal_threhsold = find_optimal_f1(valid_y, preds)
        
        cv_scores.append(f1)
        optimal_thresholds.append(optimal_threhsold)

    execution_time = time.time() - start_time
    
    trial.set_user_attr('execution_time', execution_time)
    trial.set_user_attr('optimal_thresholds', optimal_thresholds)
    trial.set_user_attr('threshold', np.mean(optimal_thresholds))

    # Return mean F1 score across all folds
    return np.mean(cv_scores)

In [27]:
def histgb_objective(trial, X, y, skf, n_splits=5):
    '''
    HistGradientBoostingClassifier objective function using stratified cross-validation

    Args:
        trial: Optuna trial object
        X: Feature matrix
        y: Target vector
        skf: Stratified K-Fold cross-validator
        n_splits: Number of folds for cross-validation (default: 5)
    '''

    cv_scores = []

    # HistGradientBoosting hyperparameters (simplified)
    param = {
        'random_state': 42,
        'verbose': 0,

        # Core boosting parameters
        'max_iter': trial.suggest_int('max_iter', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 10, 100),

        # Regularization
        'l2_regularization': trial.suggest_float('l2_regularization', 0.0, 1.0),
        'max_bins': trial.suggest_int('max_bins', 32, 255),

        # Early stopping
        'early_stopping': True,
        'n_iter_no_change': 10,
        'validation_fraction': 0.1,
    }

    hgb = HistGradientBoostingClassifier(**param)

    start_time = time.time()

    for train_idx, valid_idx in skf.split(X, y):
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()

        hgb.fit(train_x, train_y)

        preds = hgb.predict(valid_x) 
        f1 = f1_score(valid_y, preds)
        cv_scores.append(f1)

    execution_time = time.time() - start_time

    trial.set_user_attr('execution_time', execution_time)

    return np.mean(cv_scores)


# Start tuning

In [15]:
# Setup
n_trials = 100

## XGBoost

In [16]:
# XGBoost study
xgb_study = optuna.create_study(
    study_name="xgboost_optimization_days_b_1",
    direction="maximize",
    storage=db_dir.format('xgb_study'),
    load_if_exists=True
)
xgb_study.optimize(lambda trial: xgboost_objective(trial, X_1, y_1, skf, ratio_negative_to_positive_b_1), n_trials=n_trials)

print(f"\nBest XGB score: {xgb_study.best_value}")
print(f"Best XGB params: {xgb_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in xgb_study.trials]
print(f"\nAverage execution time XGB: {np.mean(execution_times):.2f}s")
print(f"Total optimization time XGB: {sum(execution_times):.2f}s")

thresholds = xgb_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = xgb_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best XGB trial: {thresholds}")
print(f"Mean threshold for best XGB trial: {mean_threshold}")

[I 2025-07-01 20:34:53,937] A new study created in RDB with name: xgboost_optimization_days_b_1
[I 2025-07-01 20:34:54,756] Trial 0 finished with value: 0.14003769041691402 and parameters: {'scale_pos_weight': 18.479952284574306, 'booster': 'gbtree', 'n_estimators': 654, 'learning_rate': 0.006411143003551537, 'lambda': 9.118687817584196e-05, 'alpha': 2.2492140506519278e-07, 'subsample': 0.21351022105060374, 'colsample_bytree': 0.47944257576928645, 'max_depth': 16, 'min_child_weight': 3, 'eta': 2.898254553062837e-06, 'gamma': 7.464783278168506e-07, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.14003769041691402.
[I 2025-07-01 20:34:55,779] Trial 1 finished with value: 0.1476309499964926 and parameters: {'scale_pos_weight': 3.142069555545725, 'booster': 'dart', 'n_estimators': 1106, 'learning_rate': 0.003645784651452519, 'lambda': 1.1300064914501678e-05, 'alpha': 0.002764232711791206, 'subsample': 0.46527857601916384, 'colsample_bytree': 0.588570411815096, 'max_depth': 17, '


Best XGB score: 0.17065123967693047
Best XGB params: {'scale_pos_weight': 1.7179604718936368, 'booster': 'gbtree', 'n_estimators': 280, 'learning_rate': 0.012817435438562848, 'lambda': 0.0013301634290872994, 'alpha': 0.00011369025223972573, 'subsample': 0.9435800449838017, 'colsample_bytree': 0.6927263365043989, 'max_depth': 3, 'min_child_weight': 2, 'eta': 1.945921035674146e-07, 'gamma': 0.002293388233350819, 'grow_policy': 'depthwise'}

Average execution time XGB: 0.46s
Total optimization time XGB: 45.59s

Optimal thresholds (per fold) for best XGB trial: [0.17111621797084808, 0.17236323654651642, 0.1709015965461731, 0.17225007712841034, 0.16932399570941925]
Mean threshold for best XGB trial: 0.17119102478027343


In [17]:
# XGBoost study
xgb_study = optuna.create_study(
    study_name="xgboost_optimization_days_b_2",
    direction="maximize",
    storage=db_dir.format('xgb_study'),
    load_if_exists=True
)
xgb_study.optimize(lambda trial: xgboost_objective(trial, X_2, y_2, skf, ratio_negative_to_positive_b_2), n_trials=n_trials)

print(f"\nBest XGB score: {xgb_study.best_value}")
print(f"Best XGB params: {xgb_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in xgb_study.trials]
print(f"\nAverage execution time XGB: {np.mean(execution_times):.2f}s")
print(f"Total optimization time XGB: {sum(execution_times):.2f}s")

thresholds = xgb_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = xgb_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best XGB trial: {thresholds}")
print(f"Mean threshold for best XGB trial: {mean_threshold}")

[I 2025-07-01 20:35:44,794] A new study created in RDB with name: xgboost_optimization_days_b_2
[I 2025-07-01 20:35:45,087] Trial 0 finished with value: 0.1469831430756449 and parameters: {'scale_pos_weight': 2.768542132754546, 'booster': 'gbtree', 'n_estimators': 717, 'learning_rate': 0.005288914144466376, 'lambda': 0.00032041654646144485, 'alpha': 0.1748781609383502, 'subsample': 0.24975874818534988, 'colsample_bytree': 0.8371696211183459, 'max_depth': 10, 'min_child_weight': 5, 'eta': 4.910227723740649e-05, 'gamma': 0.004526052838257007, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.1469831430756449.
[I 2025-07-01 20:35:45,201] Trial 1 finished with value: 0.13883919694743024 and parameters: {'scale_pos_weight': 2.4390254420857134, 'booster': 'gblinear', 'n_estimators': 313, 'learning_rate': 0.003428485113062843, 'lambda': 4.356730595973123e-06, 'alpha': 1.6228800985229064e-06, 'subsample': 0.7088638373643383, 'colsample_bytree': 0.8429491208227892}. Best is trial 0 wit


Best XGB score: 0.16044491090246993
Best XGB params: {'scale_pos_weight': 10.9651987015196, 'booster': 'dart', 'n_estimators': 1809, 'learning_rate': 0.005074296466231866, 'lambda': 2.0137422537048896e-07, 'alpha': 0.0009316548809837063, 'subsample': 0.8575950943773263, 'colsample_bytree': 0.7076785897731896, 'max_depth': 3, 'min_child_weight': 8, 'eta': 0.0003329644775209102, 'gamma': 0.0005589109188297087, 'grow_policy': 'depthwise', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 0.0002045216554197122, 'skip_drop': 1.8686724480487907e-07}

Average execution time XGB: 0.38s
Total optimization time XGB: 38.42s

Optimal thresholds (per fold) for best XGB trial: [0.4507444202899933, 0.4506193995475769, 0.4497022032737732, 0.45314839482307434, 0.4503605365753174]
Mean threshold for best XGB trial: 0.45091499090194703


## Random Forest

In [18]:
# Random Forest study
rf_study = optuna.create_study(
    study_name="random_forest_optimization_days_b_1",
    direction="maximize",
    storage=db_dir.format('rf_study'),
    load_if_exists=True
)
rf_study.optimize(lambda trial: random_forest_objective(trial, X_1, y_1, skf), n_trials=n_trials)  # Requires too much time to train one, reduce the number of trials

print(f"\nBest RF score: {rf_study.best_value}")
print(f"Best RF params: {rf_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in rf_study.trials]
print(f"\nAverage execution time RF: {np.mean(execution_times):.2f}s")
print(f"Total optimization time RF: {sum(execution_times):.2f}s")

[I 2025-07-01 20:36:29,040] A new study created in RDB with name: random_forest_optimization_days_b_1
[I 2025-07-01 20:36:33,724] Trial 0 finished with value: 0.16227303779668922 and parameters: {'n_estimators': 383, 'max_depth': 11, 'min_samples_split': 2, 'min_samples_leaf': 9, 'min_weight_fraction_leaf': 0.14255763994820214, 'max_features': 'sqrt', 'max_samples': 0.1712477599319395, 'max_leaf_nodes': 995, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.16227303779668922.
[I 2025-07-01 20:36:36,444] Trial 1 finished with value: 0.1580665247974721 and parameters: {'n_estimators': 140, 'max_depth': 10, 'min_samples_split': 16, 'min_samples_leaf': 7, 'min_weight_fraction_leaf': 0.4433519658144712, 'max_features': 'log2', 'max_samples': 0.49514447373655623, 'max_leaf_nodes': 26, 'class_weight': 'balanced_subsample'}. Best is trial 0 with value: 0.16227303779668922.
[I 2025-07-01 20:36:45,535] Trial 2 finished with value: 0.0 and parameters: {'n_estimators': 855, 'max_depth': 3


Best RF score: 0.16448486713725102
Best RF params: {'n_estimators': 269, 'max_depth': 16, 'min_samples_split': 14, 'min_samples_leaf': 6, 'min_weight_fraction_leaf': 0.3013479079171481, 'max_features': 'sqrt', 'max_samples': 0.5029503508562481, 'max_leaf_nodes': 695, 'class_weight': 'balanced_subsample'}

Average execution time RF: 7.49s
Total optimization time RF: 748.54s


In [19]:
# Random Forest study
rf_study = optuna.create_study(
    study_name="random_forest_optimization_days_b_2",
    direction="maximize",
    storage=db_dir.format('rf_study'),
    load_if_exists=True
)
rf_study.optimize(lambda trial: random_forest_objective(trial, X_2, y_2, skf), n_trials=n_trials)  # Requires too much time to train one, reduce the number of trials

print(f"\nBest RF score: {rf_study.best_value}")
print(f"Best RF params: {rf_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in rf_study.trials]
print(f"\nAverage execution time RF: {np.mean(execution_times):.2f}s")
print(f"Total optimization time RF: {sum(execution_times):.2f}s")

[I 2025-07-01 20:49:02,190] A new study created in RDB with name: random_forest_optimization_days_b_2
[I 2025-07-01 20:49:13,392] Trial 0 finished with value: 0.15539647752602603 and parameters: {'n_estimators': 989, 'max_depth': 3, 'min_samples_split': 17, 'min_samples_leaf': 5, 'min_weight_fraction_leaf': 0.23203282155393273, 'max_features': 'log2', 'max_samples': 0.30191585424109724, 'max_leaf_nodes': 405, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.15539647752602603.
[I 2025-07-01 20:49:35,612] Trial 1 finished with value: 0.1498252502322609 and parameters: {'n_estimators': 994, 'max_depth': 13, 'min_samples_split': 18, 'min_samples_leaf': 9, 'min_weight_fraction_leaf': 0.027955897616438796, 'max_features': 'sqrt', 'max_samples': 0.3954270931506458, 'max_leaf_nodes': 795, 'class_weight': 'balanced_subsample'}. Best is trial 0 with value: 0.15539647752602603.
[I 2025-07-01 20:49:41,936] Trial 2 finished with value: 0.13989513615808397 and parameters: {'n_estimators': 


Best RF score: 0.1572531695995849
Best RF params: {'n_estimators': 972, 'max_depth': 3, 'min_samples_split': 7, 'min_samples_leaf': 9, 'min_weight_fraction_leaf': 0.16732343046149278, 'max_features': 'sqrt', 'max_samples': 0.3595018515416882, 'max_leaf_nodes': 37, 'class_weight': 'balanced_subsample'}

Average execution time RF: 11.96s
Total optimization time RF: 1195.81s


## LightGBM

In [20]:
# LightGBM study
lgb_study = optuna.create_study(
    study_name="lightgbm_optimization_days_b_1",
    direction="maximize",
    storage=db_dir.format('lgb_study'),
    load_if_exists=True
)
lgb_study.optimize(lambda trial: lightgbm_objective(trial, X_1, y_1, skf, ratio_negative_to_positive_b_1), n_trials=n_trials)

print(f"\nBest LightGBM score: {lgb_study.best_value}")
print(f"Best LightGBM params: {lgb_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in lgb_study.trials]
print(f"\nAverage execution time LightGBM: {np.mean(execution_times):.2f}s")
print(f"Total optimization time LightGBM: {sum(execution_times):.2f}s")

thresholds = lgb_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = lgb_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best LightGBM trial: {thresholds}")
print(f"Mean threshold for best LightGBM trial: {mean_threshold}")

[I 2025-07-01 21:09:03,317] A new study created in RDB with name: lightgbm_optimization_days_b_1
[I 2025-07-01 21:09:15,023] Trial 0 finished with value: 0.15030903557885922 and parameters: {'num_leaves': 245, 'learning_rate': 0.021650907263648014, 'feature_fraction': 0.9951017037262793, 'bagging_fraction': 0.9822240449651949, 'bagging_freq': 1, 'min_child_samples': 73, 'reg_alpha': 0.448722172883421, 'reg_lambda': 0.45502901850352173, 'scale_pos_weight': 13.340847838068107}. Best is trial 0 with value: 0.15030903557885922.
[I 2025-07-01 21:09:18,938] Trial 1 finished with value: 0.16048768622616635 and parameters: {'num_leaves': 57, 'learning_rate': 0.14414676993990472, 'feature_fraction': 0.9242972183432776, 'bagging_fraction': 0.5838937037336906, 'bagging_freq': 4, 'min_child_samples': 43, 'reg_alpha': 0.5532384184361494, 'reg_lambda': 0.3090242461531838, 'scale_pos_weight': 16.579720906268882}. Best is trial 1 with value: 0.16048768622616635.
[I 2025-07-01 21:09:21,325] Trial 2 fin



[I 2025-07-01 21:11:06,422] Trial 8 finished with value: 0.14628131164584787 and parameters: {'num_leaves': 209, 'learning_rate': 0.22703708395569533, 'feature_fraction': 0.5351232719180633, 'bagging_fraction': 0.6797886179729312, 'bagging_freq': 1, 'min_child_samples': 58, 'reg_alpha': 0.5358843186024449, 'reg_lambda': 0.802473504845369, 'scale_pos_weight': 19.17192692897521}. Best is trial 2 with value: 0.1635918630438221.
[I 2025-07-01 21:11:11,387] Trial 9 finished with value: 0.15014910237121462 and parameters: {'num_leaves': 61, 'learning_rate': 0.09849894115645826, 'feature_fraction': 0.6018520123681114, 'bagging_fraction': 0.9401704596129785, 'bagging_freq': 2, 'min_child_samples': 8, 'reg_alpha': 0.29588450945926015, 'reg_lambda': 0.13473231378577977, 'scale_pos_weight': 12.735745774389235}. Best is trial 2 with value: 0.1635918630438221.
[I 2025-07-01 21:11:13,908] Trial 10 finished with value: 0.1639236137978886 and parameters: {'num_leaves': 24, 'learning_rate': 0.045692177



[I 2025-07-01 21:15:24,178] Trial 50 finished with value: 0.14270478074046405 and parameters: {'num_leaves': 214, 'learning_rate': 0.09499756073592425, 'feature_fraction': 0.972588659833595, 'bagging_fraction': 0.5116268783617088, 'bagging_freq': 7, 'min_child_samples': 73, 'reg_alpha': 0.42573695193952876, 'reg_lambda': 0.39853534712406424, 'scale_pos_weight': 14.38669182844304}. Best is trial 41 with value: 0.1673116420529576.
[I 2025-07-01 21:15:27,683] Trial 51 finished with value: 0.1647034049445109 and parameters: {'num_leaves': 34, 'learning_rate': 0.1407385384321375, 'feature_fraction': 0.8080087789130549, 'bagging_fraction': 0.9240171859486783, 'bagging_freq': 7, 'min_child_samples': 77, 'reg_alpha': 0.48656253024294305, 'reg_lambda': 0.3100395462902904, 'scale_pos_weight': 16.235374174101192}. Best is trial 41 with value: 0.1673116420529576.
[I 2025-07-01 21:15:31,890] Trial 52 finished with value: 0.16440017222492745 and parameters: {'num_leaves': 57, 'learning_rate': 0.1107


Best LightGBM score: 0.1676130080597768
Best LightGBM params: {'num_leaves': 35, 'learning_rate': 0.16814794125617394, 'feature_fraction': 0.8151925596377104, 'bagging_fraction': 0.9704150221294421, 'bagging_freq': 7, 'min_child_samples': 74, 'reg_alpha': 0.1927259390667393, 'reg_lambda': 0.549829547246727, 'scale_pos_weight': 16.354358307978327}

Average execution time LightGBM: 5.61s
Total optimization time LightGBM: 560.51s

Optimal thresholds (per fold) for best LightGBM trial: [0.2462700727507547, 0.24479724465190608, 0.23307806305368967, 0.25098213249044565, 0.21973897685538643]
Mean threshold for best LightGBM trial: 0.2389732979604365


In [21]:
# LightGBM study
lgb_study = optuna.create_study(
    study_name="lightgbm_optimization_days_b_2",
    direction="maximize",
    storage=db_dir.format('lgb_study'),
    load_if_exists=True
)
lgb_study.optimize(lambda trial: lightgbm_objective(trial, X_2, y_2, skf, ratio_negative_to_positive_b_2), n_trials=n_trials)

print(f"\nBest LightGBM score: {lgb_study.best_value}")
print(f"Best LightGBM params: {lgb_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in lgb_study.trials]
print(f"\nAverage execution time LightGBM: {np.mean(execution_times):.2f}s")
print(f"Total optimization time LightGBM: {sum(execution_times):.2f}s")

thresholds = lgb_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = lgb_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best LightGBM trial: {thresholds}")
print(f"Mean threshold for best LightGBM trial: {mean_threshold}")

[I 2025-07-01 21:18:29,949] A new study created in RDB with name: lightgbm_optimization_days_b_2




[I 2025-07-01 21:18:36,908] Trial 0 finished with value: 0.13742922964273818 and parameters: {'num_leaves': 243, 'learning_rate': 0.2842133733893914, 'feature_fraction': 0.7253327709013281, 'bagging_fraction': 0.48176956831903384, 'bagging_freq': 4, 'min_child_samples': 56, 'reg_alpha': 0.32229559825224174, 'reg_lambda': 0.11940787097931926, 'scale_pos_weight': 10.462315572853134}. Best is trial 0 with value: 0.13742922964273818.




[I 2025-07-01 21:18:43,381] Trial 1 finished with value: 0.13529608011837557 and parameters: {'num_leaves': 289, 'learning_rate': 0.11021786350239912, 'feature_fraction': 0.5963557038803489, 'bagging_fraction': 0.747296291023144, 'bagging_freq': 3, 'min_child_samples': 82, 'reg_alpha': 0.7810531529709013, 'reg_lambda': 0.7327830699484645, 'scale_pos_weight': 18.89364441656454}. Best is trial 0 with value: 0.13742922964273818.




[I 2025-07-01 21:18:51,126] Trial 2 finished with value: 0.13434503886569754 and parameters: {'num_leaves': 208, 'learning_rate': 0.27466835025920977, 'feature_fraction': 0.46641610500415875, 'bagging_fraction': 0.48117170876940685, 'bagging_freq': 5, 'min_child_samples': 59, 'reg_alpha': 0.5447552941118468, 'reg_lambda': 0.5290543607253976, 'scale_pos_weight': 11.501579898974546}. Best is trial 0 with value: 0.13742922964273818.




[I 2025-07-01 21:18:59,882] Trial 3 finished with value: 0.13405663294985332 and parameters: {'num_leaves': 154, 'learning_rate': 0.023021593335334135, 'feature_fraction': 0.4126741743074429, 'bagging_fraction': 0.9381888350492491, 'bagging_freq': 5, 'min_child_samples': 15, 'reg_alpha': 0.24022733408875685, 'reg_lambda': 0.9983502274008473, 'scale_pos_weight': 12.39171979172725}. Best is trial 0 with value: 0.13742922964273818.




[I 2025-07-01 21:19:05,501] Trial 4 finished with value: 0.1363103277371433 and parameters: {'num_leaves': 242, 'learning_rate': 0.09837135184266328, 'feature_fraction': 0.5796432601866877, 'bagging_fraction': 0.6844476587573898, 'bagging_freq': 4, 'min_child_samples': 85, 'reg_alpha': 0.8574964233843261, 'reg_lambda': 0.5336260663761154, 'scale_pos_weight': 15.718478727860075}. Best is trial 0 with value: 0.13742922964273818.




[I 2025-07-01 21:19:09,008] Trial 5 finished with value: 0.1371136975125351 and parameters: {'num_leaves': 76, 'learning_rate': 0.021898625013552823, 'feature_fraction': 0.6607333697688199, 'bagging_fraction': 0.4478236281300978, 'bagging_freq': 4, 'min_child_samples': 86, 'reg_alpha': 0.8901935600173504, 'reg_lambda': 0.8275859975775793, 'scale_pos_weight': 11.263662313634192}. Best is trial 0 with value: 0.13742922964273818.




[I 2025-07-01 21:19:13,234] Trial 6 finished with value: 0.13960834025401256 and parameters: {'num_leaves': 86, 'learning_rate': 0.0419501849105972, 'feature_fraction': 0.5976613552780692, 'bagging_fraction': 0.7691662652900274, 'bagging_freq': 4, 'min_child_samples': 57, 'reg_alpha': 0.8561410766587907, 'reg_lambda': 0.588363635678986, 'scale_pos_weight': 17.08119089781039}. Best is trial 6 with value: 0.13960834025401256.




[I 2025-07-01 21:19:20,014] Trial 7 finished with value: 0.13127980806417677 and parameters: {'num_leaves': 173, 'learning_rate': 0.02477177452489761, 'feature_fraction': 0.43354171298743127, 'bagging_fraction': 0.5656772498835041, 'bagging_freq': 2, 'min_child_samples': 57, 'reg_alpha': 0.05117872036073434, 'reg_lambda': 0.780388442558719, 'scale_pos_weight': 10.132284207670766}. Best is trial 6 with value: 0.13960834025401256.




[I 2025-07-01 21:19:26,169] Trial 8 finished with value: 0.134121503491672 and parameters: {'num_leaves': 214, 'learning_rate': 0.21123072202373963, 'feature_fraction': 0.5417252703163975, 'bagging_fraction': 0.6221171778710308, 'bagging_freq': 3, 'min_child_samples': 69, 'reg_alpha': 0.8604107129105619, 'reg_lambda': 0.25248828121635913, 'scale_pos_weight': 14.52240068886249}. Best is trial 6 with value: 0.13960834025401256.




[I 2025-07-01 21:19:30,895] Trial 9 finished with value: 0.13187901697240276 and parameters: {'num_leaves': 146, 'learning_rate': 0.016274484397555786, 'feature_fraction': 0.5113478405786946, 'bagging_fraction': 0.6591186342823034, 'bagging_freq': 2, 'min_child_samples': 95, 'reg_alpha': 0.738076816273844, 'reg_lambda': 0.9636809860047598, 'scale_pos_weight': 13.614112813865137}. Best is trial 6 with value: 0.13960834025401256.




[I 2025-07-01 21:19:32,821] Trial 10 finished with value: 0.15015406703612444 and parameters: {'num_leaves': 31, 'learning_rate': 0.048122920509557666, 'feature_fraction': 0.8958134730392663, 'bagging_fraction': 0.8411018784774279, 'bagging_freq': 7, 'min_child_samples': 29, 'reg_alpha': 0.5806033176757848, 'reg_lambda': 0.3134317375062985, 'scale_pos_weight': 19.25342388707973}. Best is trial 10 with value: 0.15015406703612444.
[I 2025-07-01 21:19:34,145] Trial 11 finished with value: 0.15100850068568855 and parameters: {'num_leaves': 20, 'learning_rate': 0.04826783399125169, 'feature_fraction': 0.9332742058639383, 'bagging_fraction': 0.8273757943507126, 'bagging_freq': 7, 'min_child_samples': 28, 'reg_alpha': 0.5470312018255601, 'reg_lambda': 0.3283107783207644, 'scale_pos_weight': 20.056393652074302}. Best is trial 11 with value: 0.15100850068568855.
[I 2025-07-01 21:19:35,859] Trial 12 finished with value: 0.1486282814783871 and parameters: {'num_leaves': 26, 'learning_rate': 0.052



[I 2025-07-01 21:21:21,393] Trial 34 finished with value: 0.13732279339778153 and parameters: {'num_leaves': 296, 'learning_rate': 0.24438477188696645, 'feature_fraction': 0.7069783572926555, 'bagging_fraction': 0.8030170875829881, 'bagging_freq': 5, 'min_child_samples': 32, 'reg_alpha': 0.2918185921960994, 'reg_lambda': 0.6294327013702754, 'scale_pos_weight': 12.44074132490048}. Best is trial 18 with value: 0.1532512574932455.




[I 2025-07-01 21:21:26,277] Trial 35 finished with value: 0.14495645125721657 and parameters: {'num_leaves': 98, 'learning_rate': 0.09308235555221231, 'feature_fraction': 0.9985040938563046, 'bagging_fraction': 0.9491274809781152, 'bagging_freq': 4, 'min_child_samples': 23, 'reg_alpha': 0.023218166355300246, 'reg_lambda': 0.1683075504236093, 'scale_pos_weight': 15.17908151713789}. Best is trial 18 with value: 0.1532512574932455.
[I 2025-07-01 21:21:35,916] Trial 36 finished with value: 0.13868912199639363 and parameters: {'num_leaves': 43, 'learning_rate': 0.2881860999059909, 'feature_fraction': 0.7478887399932844, 'bagging_fraction': 0.8596704849127614, 'bagging_freq': 5, 'min_child_samples': 76, 'reg_alpha': 0.11402383445017046, 'reg_lambda': 0.058635951658366825, 'scale_pos_weight': 16.640531118292653}. Best is trial 18 with value: 0.1532512574932455.
[I 2025-07-01 21:21:40,295] Trial 37 finished with value: 0.1416920394541163 and parameters: {'num_leaves': 78, 'learning_rate': 0.10



[I 2025-07-01 21:21:51,899] Trial 39 finished with value: 0.136077881177672 and parameters: {'num_leaves': 200, 'learning_rate': 0.05747799652401062, 'feature_fraction': 0.7984643117012681, 'bagging_fraction': 0.5442427304265771, 'bagging_freq': 4, 'min_child_samples': 42, 'reg_alpha': 0.26894595168840313, 'reg_lambda': 0.001640337288458124, 'scale_pos_weight': 13.768150347081264}. Best is trial 18 with value: 0.1532512574932455.




[I 2025-07-01 21:21:58,071] Trial 40 finished with value: 0.14195518911679322 and parameters: {'num_leaves': 138, 'learning_rate': 0.19161627073432264, 'feature_fraction': 0.7066209948200208, 'bagging_fraction': 0.9211960567273881, 'bagging_freq': 6, 'min_child_samples': 63, 'reg_alpha': 0.07500741822163284, 'reg_lambda': 0.2763328350190143, 'scale_pos_weight': 11.780288908482072}. Best is trial 18 with value: 0.1532512574932455.
[I 2025-07-01 21:22:01,439] Trial 41 finished with value: 0.15130615986690787 and parameters: {'num_leaves': 63, 'learning_rate': 0.07733750053797671, 'feature_fraction': 0.8256099403400541, 'bagging_fraction': 0.9409180333009712, 'bagging_freq': 6, 'min_child_samples': 8, 'reg_alpha': 0.1620815432707055, 'reg_lambda': 0.17076165351858452, 'scale_pos_weight': 15.282517682583567}. Best is trial 18 with value: 0.1532512574932455.
[I 2025-07-01 21:22:03,702] Trial 42 finished with value: 0.14861715569478245 and parameters: {'num_leaves': 41, 'learning_rate': 0.08


Best LightGBM score: 0.1532512574932455
Best LightGBM params: {'num_leaves': 59, 'learning_rate': 0.14802308897284228, 'feature_fraction': 0.8162034579535279, 'bagging_fraction': 0.9276862426700836, 'bagging_freq': 6, 'min_child_samples': 5, 'reg_alpha': 0.17305847344700126, 'reg_lambda': 0.02325507567358623, 'scale_pos_weight': 13.402168454752758}

Average execution time LightGBM: 4.19s
Total optimization time LightGBM: 419.21s

Optimal thresholds (per fold) for best LightGBM trial: [0.19674979600662598, 0.17455828737075663, 0.19525931910517513, 0.19075127367624847, 0.17028293650758844]
Mean threshold for best LightGBM trial: 0.18552032253327894


## Catboost

In [22]:
# CatBoost study
cat_study = optuna.create_study(
    study_name="catboost_optimization_days_b_1",
    direction="maximize",
    storage=db_dir.format('cat_study'),
    load_if_exists=True
)
cat_study.optimize(lambda trial: catboost_objective(trial, X_1, y_1, skf, ratio_negative_to_positive_b_1), n_trials=n_trials)

print(f"\nBest CatBoost score: {cat_study.best_value}")
print(f"Best CatBoost params: {cat_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in cat_study.trials]
print(f"\nAverage execution time CatBoost: {np.mean(execution_times):.2f}s")
print(f"Total optimization time CatBoost: {sum(execution_times):.2f}s")

thresholds = cat_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = cat_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best CatBoost trial: {thresholds}")
print(f"Mean threshold for best CatBoost trial: {mean_threshold}")

[I 2025-07-01 21:25:34,173] A new study created in RDB with name: catboost_optimization_days_b_1
[I 2025-07-01 21:25:39,785] Trial 0 finished with value: 0.16531395258472928 and parameters: {'iterations': 174, 'learning_rate': 0.03645179745563132, 'depth': 7, 'l2_leaf_reg': 7.329271140210006, 'bagging_temperature': 0.1740634426056148, 'random_strength': 4.872279116262831, 'scale_pos_weight': 12.920688689268191}. Best is trial 0 with value: 0.16531395258472928.
[I 2025-07-01 21:25:47,743] Trial 1 finished with value: 0.15102386057170544 and parameters: {'iterations': 763, 'learning_rate': 0.045708419107420004, 'depth': 10, 'l2_leaf_reg': 8.984454266765688, 'bagging_temperature': 0.5217299178122963, 'random_strength': 9.183971041120836, 'scale_pos_weight': 16.583896822120373}. Best is trial 0 with value: 0.16531395258472928.
[I 2025-07-01 21:25:53,349] Trial 2 finished with value: 0.16733906421735867 and parameters: {'iterations': 279, 'learning_rate': 0.20503184755265705, 'depth': 6, 'l


Best CatBoost score: 0.1727393104590596
Best CatBoost params: {'iterations': 431, 'learning_rate': 0.06790303154458478, 'depth': 5, 'l2_leaf_reg': 6.253787580374766, 'bagging_temperature': 0.681325297840026, 'random_strength': 0.8063588561160657, 'scale_pos_weight': 11.193332930268394}

Average execution time CatBoost: 3.36s
Total optimization time CatBoost: 336.39s

Optimal thresholds (per fold) for best CatBoost trial: [0.4996798380996801, 0.47748352740780187, 0.5056144610775708, 0.4972685388353668, 0.4709917874530761]
Mean threshold for best CatBoost trial: 0.4902076305746991


In [23]:
# CatBoost study
cat_study = optuna.create_study(
    study_name="catboost_optimization_days_b_2",
    direction="maximize",
    storage=db_dir.format('cat_study'),
    load_if_exists=True
)
cat_study.optimize(lambda trial: catboost_objective(trial, X_2, y_2, skf, ratio_negative_to_positive_b_2), n_trials=n_trials)

print(f"\nBest CatBoost score: {cat_study.best_value}")
print(f"Best CatBoost params: {cat_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in cat_study.trials]
print(f"\nAverage execution time CatBoost: {np.mean(execution_times):.2f}s")
print(f"Total optimization time CatBoost: {sum(execution_times):.2f}s")

thresholds = cat_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = cat_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best CatBoost trial: {thresholds}")
print(f"Mean threshold for best CatBoost trial: {mean_threshold}")

[I 2025-07-01 21:31:14,272] A new study created in RDB with name: catboost_optimization_days_b_2
[I 2025-07-01 21:31:17,667] Trial 0 finished with value: 0.15499391749979613 and parameters: {'iterations': 123, 'learning_rate': 0.10338706448484498, 'depth': 9, 'l2_leaf_reg': 7.967884419782507, 'bagging_temperature': 0.8087560981017219, 'random_strength': 1.3385032827024501, 'scale_pos_weight': 9.428868658006246}. Best is trial 0 with value: 0.15499391749979613.
[I 2025-07-01 21:31:19,196] Trial 1 finished with value: 0.15343546134048686 and parameters: {'iterations': 919, 'learning_rate': 0.16039550973529912, 'depth': 6, 'l2_leaf_reg': 6.3500043640462325, 'bagging_temperature': 0.7921936862271066, 'random_strength': 7.213418190659011, 'scale_pos_weight': 17.505248482019997}. Best is trial 0 with value: 0.15499391749979613.
[I 2025-07-01 21:31:20,429] Trial 2 finished with value: 0.1483701313094973 and parameters: {'iterations': 857, 'learning_rate': 0.2234183407819192, 'depth': 6, 'l2_l


Best CatBoost score: 0.16501432463710622
Best CatBoost params: {'iterations': 716, 'learning_rate': 0.020896790259801366, 'depth': 4, 'l2_leaf_reg': 6.2243232779799635, 'bagging_temperature': 0.45235282416776695, 'random_strength': 0.4663808306846739, 'scale_pos_weight': 13.088516532414157}

Average execution time CatBoost: 1.75s
Total optimization time CatBoost: 174.93s

Optimal thresholds (per fold) for best CatBoost trial: [0.5021554508693179, 0.5014255925362459, 0.511316446196657, 0.5012625956787097, 0.5227239149453411]
Mean threshold for best CatBoost trial: 0.5077768000452542


## HistGradientBoosting

In [30]:
# HistGradientBoosting study
histgb_study = optuna.create_study(
    study_name="histgb_optimization_days_b_1",
    direction="maximize",
    storage=db_dir.format('histgb_study'),
    load_if_exists=True
)
histgb_study.optimize(lambda trial: histgb_objective(trial, X_1, y_1, skf), n_trials=n_trials)

print(f"\nBest HistGB score: {histgb_study.best_value}")
print(f"Best HistGB params: {histgb_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in histgb_study.trials]
print(f"\nAverage execution time HistGB: {np.mean(execution_times):.2f}s")
print(f"Total optimization time HistGB: {sum(execution_times):.2f}s")

thresholds = histgb_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = histgb_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best HistGB trial: {thresholds}")
print(f"Mean threshold for best HistGB trial: {mean_threshold}")

[I 2025-07-01 23:03:23,464] Using an existing study with name 'histgb_optimization_days_b_1' instead of creating a new one.


[I 2025-07-01 23:03:25,165] Trial 1 finished with value: 0.0 and parameters: {'max_iter': 206, 'learning_rate': 0.036893208758396606, 'max_depth': 4, 'min_samples_leaf': 74, 'l2_regularization': 0.8463766089516452, 'max_bins': 123}. Best is trial 1 with value: 0.0.
[I 2025-07-01 23:03:26,487] Trial 2 finished with value: 0.0 and parameters: {'max_iter': 207, 'learning_rate': 0.19028589274716107, 'max_depth': 9, 'min_samples_leaf': 28, 'l2_regularization': 0.2466854821940505, 'max_bins': 119}. Best is trial 1 with value: 0.0.
[I 2025-07-01 23:03:27,379] Trial 3 finished with value: 0.0 and parameters: {'max_iter': 184, 'learning_rate': 0.16498330603376885, 'max_depth': 5, 'min_samples_leaf': 54, 'l2_regularization': 0.05692720642834259, 'max_bins': 55}. Best is trial 1 with value: 0.0.
[I 2025-07-01 23:03:28,586] Trial 4 finished with value: 0.000516795865633075 and parameters: {'max_iter': 388, 'learning_rate': 0.25772100235353257, 'max_depth': 6, 'min_samples_leaf': 27, 'l2_regulariza


Best HistGB score: 0.004043035862642303
Best HistGB params: {'max_iter': 173, 'learning_rate': 0.2901475514215775, 'max_depth': 9, 'min_samples_leaf': 10, 'l2_regularization': 0.06789886253741798, 'max_bins': 110}

Average execution time HistGB: 1.52s
Total optimization time HistGB: 153.26s

Optimal thresholds (per fold) for best HistGB trial: []
Mean threshold for best HistGB trial: None


In [31]:
# HistGradientBoosting study
histgb_study = optuna.create_study(
    study_name="histgb_optimization_days_b_2",
    direction="maximize",
    storage=db_dir.format('histgb_study'),
    load_if_exists=True
)
histgb_study.optimize(lambda trial: histgb_objective(trial, X_2, y_2, skf), n_trials=n_trials)

print(f"\nBest HistGB score: {histgb_study.best_value}")
print(f"Best HistGB params: {histgb_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in histgb_study.trials]
print(f"\nAverage execution time HistGB: {np.mean(execution_times):.2f}s")
print(f"Total optimization time HistGB: {sum(execution_times):.2f}s")

thresholds = histgb_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = histgb_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best HistGB trial: {thresholds}")
print(f"Mean threshold for best HistGB trial: {mean_threshold}")

[I 2025-07-01 23:05:59,998] A new study created in RDB with name: histgb_optimization_days_b_2
[I 2025-07-01 23:06:00,583] Trial 0 finished with value: 0.0 and parameters: {'max_iter': 369, 'learning_rate': 0.11583067017824922, 'max_depth': 4, 'min_samples_leaf': 19, 'l2_regularization': 0.25496787051070824, 'max_bins': 146}. Best is trial 0 with value: 0.0.
[I 2025-07-01 23:06:01,926] Trial 1 finished with value: 0.0 and parameters: {'max_iter': 122, 'learning_rate': 0.02038924430026663, 'max_depth': 3, 'min_samples_leaf': 85, 'l2_regularization': 0.4352512438946272, 'max_bins': 201}. Best is trial 0 with value: 0.0.
[I 2025-07-01 23:06:02,836] Trial 2 finished with value: 0.0 and parameters: {'max_iter': 463, 'learning_rate': 0.1404944859888819, 'max_depth': 7, 'min_samples_leaf': 19, 'l2_regularization': 0.18152548548352443, 'max_bins': 58}. Best is trial 0 with value: 0.0.
[I 2025-07-01 23:06:06,131] Trial 3 finished with value: 0.0 and parameters: {'max_iter': 148, 'learning_rate'


Best HistGB score: 0.0022535211267605635
Best HistGB params: {'max_iter': 100, 'learning_rate': 0.22352901585727447, 'max_depth': 6, 'min_samples_leaf': 10, 'l2_regularization': 0.21854538462755402, 'max_bins': 185}

Average execution time HistGB: 1.27s
Total optimization time HistGB: 127.33s

Optimal thresholds (per fold) for best HistGB trial: []
Mean threshold for best HistGB trial: None


# Conclusion

In [32]:
xgb_study = optuna.load_study(
    study_name="xgboost_optimization_days_b_1",
    storage=db_dir.format('xgb_study')
)
xgb_study_b2 = optuna.load_study(
    study_name="xgboost_optimization_days_b_2",
    storage=db_dir.format('xgb_study')
)
rf_study = optuna.load_study(
    study_name="random_forest_optimization_days_b_1",
    storage=db_dir.format('rf_study')
)
rf_study_b2 = optuna.load_study(
    study_name="random_forest_optimization_days_b_2",
    storage=db_dir.format('rf_study')
)
lgb_study = optuna.load_study(
    study_name="lightgbm_optimization_days_b_1",
    storage=db_dir.format('lgb_study')
)
lgb_study_b2 = optuna.load_study(
    study_name="lightgbm_optimization_days_b_2",
    storage=db_dir.format('lgb_study')
)
cat_study = optuna.load_study(
    study_name="catboost_optimization_days_b_1",
    storage=db_dir.format('cat_study')
)
cat_study_b2 = optuna.load_study(
    study_name="catboost_optimization_days_b_2",
    storage=db_dir.format('cat_study')
)
histgb_study = optuna.load_study(
    study_name="histgb_optimization_days_b_1",
    storage=db_dir.format('histgb_study')
)
histgb_study_b2 = optuna.load_study(
    study_name="histgb_optimization_days_b_2",
    storage=db_dir.format('histgb_study')
)


In [33]:
print("XGBoost days_b_1 best score:", xgb_study.best_value)
print("XGBoost days_b_2 best score:", xgb_study_b2.best_value)

print("Random Forest days_b_1 best score:", rf_study.best_value)
print("Random Forest days_b_2 best score:", rf_study_b2.best_value)

print("LightGBM days_b_1 best score:", lgb_study.best_value)
print("LightGBM days_b_2 best score:", lgb_study_b2.best_value)

print("CatBoost days_b_1 best score:", cat_study.best_value)
print("CatBoost days_b_2 best score:", cat_study_b2.best_value)

print("HistGB days_b_1 best score:", histgb_study.best_value)
print("HistGB days_b_2 best score:", histgb_study_b2.best_value)

XGBoost days_b_1 best score: 0.17065123967693047
XGBoost days_b_2 best score: 0.16044491090246993
Random Forest days_b_1 best score: 0.16448486713725102
Random Forest days_b_2 best score: 0.1572531695995849
LightGBM days_b_1 best score: 0.1676130080597768
LightGBM days_b_2 best score: 0.1532512574932455
CatBoost days_b_1 best score: 0.1727393104590596
CatBoost days_b_2 best score: 0.16501432463710622
HistGB days_b_1 best score: 0.004043035862642303
HistGB days_b_2 best score: 0.0022535211267605635
