# Table of Contents
1. [Imports and definitions](#imports-and-definitions)
2. [Load data](#load-data)
3. [Prepare data](#prepare-data)
4. [Define objectives](#define-objectives)
5. [Start tuning](#start-tuning)
6. [Conclusion](#conclusion)

# Imports and definitions

In [27]:
from pathlib import Path
import time

import polars as pl
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_curve, f1_score

from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

import optuna

_ = pl.Config.set_tbl_cols(None)
_ = pl.Config.set_fmt_str_lengths(500)
_ = pl.Config.set_fmt_float("full")

In [2]:
import warnings
warnings.filterwarnings('ignore', category=RuntimeWarning, module='sklearn')

In [None]:
base_dir = Path('/workspaces/data-scientist-at-magenta')
code_dir = base_dir / 'notebooks'
data_dir = code_dir / "data"
features_dir = data_dir / 'features'
train_dir = data_dir / 'train'
db_dir = 'sqlite:///data/models/{}.db'

# Load data

In [4]:
%%time

train = pl.read_parquet(train_dir / 'data-v0-80.parquet')

CPU times: user 15 ms, sys: 5.66 ms, total: 20.7 ms
Wall time: 24.6 ms


# Prepare data

In [5]:
data_b_1 = train.filter(pl.col("available_gb") < 25)
data_b_2 = train.filter(pl.col("available_gb") >= 25)

In [6]:
X_1 = data_b_1.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling', 'available_gb']))
y_1 = data_b_1.select('has_done_upselling')

X_2 = data_b_2.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling', 'available_gb']))
y_2 = data_b_2.select('has_done_upselling')


In [7]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define objectives

In [8]:
# Compute the ratio of negative to positive instances in the target
ratio_negative_to_positive_b_1 = (
    (y_1['has_done_upselling'] == False).sum() / (y_1['has_done_upselling'] == True).sum()
)
print("ratio_negative_to_positive_b_1:", ratio_negative_to_positive_b_1)

ratio_negative_to_positive_b_1: 10.563211319593972


In [9]:
# Compute the ratio of negative to positive instances in the target
ratio_negative_to_positive_b_2 = (
    (y_2['has_done_upselling'] == False).sum() / (y_2['has_done_upselling'] == True).sum()
)
print("ratio_negative_to_positive_b_1:", ratio_negative_to_positive_b_2)

ratio_negative_to_positive_b_1: 16.758793969849247


In [10]:
def find_optimal_f1(valid_y, preds):
    # Find optimal threshold for F1
    precision, recall, thresholds = precision_recall_curve(valid_y, preds)
    f1_scores_thresh = 2 * (precision * recall) / (precision + recall + 1e-8)
    optimal_idx = np.argmax(f1_scores_thresh)
    optimal_threshold = thresholds[optimal_idx] if optimal_idx < len(thresholds) else 0.5
    
    # Make binary predictions using optimal threshold
    pred_labels = (preds >= optimal_threshold).astype(int)
    return f1_score(valid_y, pred_labels), optimal_threshold

In [11]:
def xgboost_objective(trial, X, y, skf, ratio_negative_to_positive, n_splits=5):
    '''
    XGBoost objective function using stratified cross-validation
    
    Args:
        trial: Optuna trial object
        n_splits: Number of folds for cross-validation (default: 5)
    '''
    
    cv_scores = []
    optimal_thresholds = []

    param = {
        'verbosity': 0,
        'n_jobs': 4,
        'early_stopping_rounds': 16,
        'eval_metric': 'aucpr',
        'scale_pos_weight': trial.suggest_float("scale_pos_weight", 1.0, ratio_negative_to_positive * 1.5, log=True),
        'objective': 'binary:logistic',
        'tree_method': 'hist',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),

        # L2 regularization weight.
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        # L1 regularization weight.
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        # sampling according to each tree.
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
    }

    if param['booster'] in ['gbtree', 'dart']:
        # maximum depth of the tree, signifies complexity of the tree.
        param['max_depth'] = trial.suggest_int('max_depth', 3, 20)
        # minimum child weight, larger the term more conservative the tree.
        param['min_child_weight'] = trial.suggest_int('min_child_weight', 2, 10)
        param['eta'] = trial.suggest_float('eta', 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param['gamma'] = trial.suggest_float('gamma', 1e-8, 1.0, log=True)
        param['grow_policy'] = trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])

    if param['booster'] == 'dart':
        param['sample_type'] = trial.suggest_categorical('sample_type', ['uniform', 'weighted'])
        param['normalize_type'] = trial.suggest_categorical('normalize_type', ['tree', 'forest'])
        param['rate_drop'] = trial.suggest_float('rate_drop', 1e-8, 1.0, log=True)
        param['skip_drop'] = trial.suggest_float('skip_drop', 1e-8, 1.0, log=True)
    
    start_time = time.time()
    
    # Return mean F1 score across all folds
    # Perform stratified cross-validation
    for train_idx, valid_idx in skf.split(X, y):
        # Split data for current fold
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()

        # Create DMatrix objects
        dtrain = xgb.DMatrix(train_x, label=train_y)
        dvalid = xgb.DMatrix(valid_x, label=valid_y)
        
        # Train model
        bst = xgb.train(param, dtrain)
        
        # Make predictions
        preds = bst.predict(dvalid)
        
        f1, optimal_threshold = find_optimal_f1(valid_y, preds)
        
        cv_scores.append(f1)
        optimal_thresholds.append(float(optimal_threshold))

    execution_time = time.time() - start_time
    
    trial.set_user_attr('execution_time', execution_time)
    trial.set_user_attr('optimal_thresholds', optimal_thresholds)
    trial.set_user_attr('threshold', float(np.mean(optimal_thresholds)))

    # Return mean F1 score across all folds
    return np.mean(cv_scores)

In [12]:
def random_forest_objective(trial, X, y, skf, n_splits=5):
    '''
    Random Forest objective function using stratified cross-validation

    Args:
        trial: Optuna trial object
        X: Feature matrix
        y: Target vector
        skf: Stratified K-Fold cross-validator
        n_splits: Number of folds for cross-validation (default: 5)
    '''

    cv_scores = []

    # Random Forest hyperparameters
    param = {
        'n_jobs': 4,
        'random_state': 42,
        'verbose': 0,

        # Core tree parameters
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5),

        # Feature sampling parameters
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'max_samples': trial.suggest_float('max_samples', 0.1, 1.0),

        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 10, 1000),

        # Class balancing
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample', None]),
    }

    rf = RandomForestClassifier(**param)

    start_time = time.time()

    for train_idx, valid_idx in skf.split(X, y):
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()

        rf.fit(train_x, train_y)

        preds = rf.predict(valid_x)

        f1 = f1_score(valid_y, preds)
        cv_scores.append(f1)

    execution_time = time.time() - start_time

    trial.set_user_attr('execution_time', execution_time)

    return np.mean(cv_scores)

In [13]:
def lightgbm_objective(trial, X, y, skf, ratio_negative_to_positive, n_splits=5):
    '''
    LightGBM objective function using stratified cross-validation
    
    Args:
        trial: Optuna trial object
        X: Feature matrix
        y: Target vector
        skf: Stratified K-Fold cross-validator
        n_splits: Number of folds for cross-validation (default: 5)
    '''
    
    cv_scores = []
    optimal_thresholds = []

    # LightGBM hyperparameters
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'verbosity': 0,
        'seed': 42,
        'num_threads': 4,
        'deterministic': True,
        
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'scale_pos_weight': trial.suggest_float("scale_pos_weight", ratio_negative_to_positive * 0.7, ratio_negative_to_positive * 1.5, log=True),

    }
    
    start_time = time.time()

    # Perform stratified cross-validation
    for train_idx, valid_idx in skf.split(X, y):
        # Split data for current fold
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()
        
        # Create LightGBM datasets
        train_data = lgb.Dataset(train_x, label=train_y)
        valid_data = lgb.Dataset(valid_x, label=valid_y, reference=train_data)
        
        # Train model with early stopping
        model = lgb.train(
            param,
            train_data,
            valid_sets=[valid_data],
            num_boost_round=1000,
            callbacks=[lgb.early_stopping(50, verbose=False), lgb.log_evaluation(0)]
        )
        
        # Make probability predictions
        preds = model.predict(valid_x, num_iteration=model.best_iteration)
        
        f1, optimal_threhsold = find_optimal_f1(valid_y, preds)
        
        cv_scores.append(f1)
        optimal_thresholds.append(optimal_threhsold)

    execution_time = time.time() - start_time
    
    trial.set_user_attr('execution_time', execution_time)
    trial.set_user_attr('optimal_thresholds', optimal_thresholds)
    trial.set_user_attr('threshold', np.mean(optimal_thresholds))
    
    # Return mean F1 score across all folds
    return np.mean(cv_scores)

In [14]:
def catboost_objective(trial, X, y, skf, ratio_negative_to_positive, n_splits=5):
    '''
    CatBoost objective function using stratified cross-validation
    
    Args:
        trial: Optuna trial object
        X: Feature matrix
        y: Target vector
        skf: Stratified K-Fold cross-validator
        n_splits: Number of folds for cross-validation (default: 5)
    '''
    
    cv_scores = []
    optimal_thresholds = []

    # CatBoost hyperparameters (simplified)
    param = {
        'random_seed': 42,
        'verbose': False,
        'allow_writing_files': False,
        'thread_count': 4,
        
        # Core boosting parameters
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
        
        # Regularization and overfitting control
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 0.0, 10.0),
        'scale_pos_weight': trial.suggest_float("scale_pos_weight", ratio_negative_to_positive * 0.7, ratio_negative_to_positive * 1.5, log=True),
        
        # Early stopping
        'early_stopping_rounds': 50,
        'eval_metric': 'F1',
    }
    
    # Create CatBoost classifier
    cb = CatBoostClassifier(**param)
    
    start_time = time.time()

    # Perform stratified cross-validation
    for train_idx, valid_idx in skf.split(X, y):
        # Split data for current fold
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()
        
        # Train model with validation set for early stopping
        cb.fit(
            train_x, train_y,
            eval_set=(valid_x, valid_y),
            verbose=False
        )
        
        # Make probability predictions
        preds = cb.predict_proba(valid_x)[:, 1]  # Get probability of positive class
    
        f1, optimal_threhsold = find_optimal_f1(valid_y, preds)
        
        cv_scores.append(f1)
        optimal_thresholds.append(optimal_threhsold)

    execution_time = time.time() - start_time
    
    trial.set_user_attr('execution_time', execution_time)
    trial.set_user_attr('optimal_thresholds', optimal_thresholds)
    trial.set_user_attr('threshold', np.mean(optimal_thresholds))

    # Return mean F1 score across all folds
    return np.mean(cv_scores)

In [26]:
def histgb_objective(trial, X, y, skf, n_splits=5):
    '''
    HistGradientBoostingClassifier objective function using stratified cross-validation

    Args:
        trial: Optuna trial object
        X: Feature matrix
        y: Target vector
        skf: Stratified K-Fold cross-validator
        n_splits: Number of folds for cross-validation (default: 5)
    '''

    cv_scores = []

    # HistGradientBoosting hyperparameters (simplified)
    param = {
        'random_state': 42,
        'verbose': 0,

        # Core boosting parameters
        'max_iter': trial.suggest_int('max_iter', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 10, 100),

        # Regularization
        'l2_regularization': trial.suggest_float('l2_regularization', 0.0, 1.0),
        'max_bins': trial.suggest_int('max_bins', 32, 255),

        # Early stopping
        'early_stopping': True,
        'n_iter_no_change': 10,
        'validation_fraction': 0.1,
    }

    hgb = HistGradientBoostingClassifier(**param)

    start_time = time.time()

    for train_idx, valid_idx in skf.split(X, y):
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()

        hgb.fit(train_x, train_y)

        preds = hgb.predict(valid_x) 
        f1 = f1_score(valid_y, preds)
        cv_scores.append(f1)

    execution_time = time.time() - start_time

    trial.set_user_attr('execution_time', execution_time)

    return np.mean(cv_scores)


# Start tuning

In [15]:
# Setup
n_trials = 100

## XGBoost

In [16]:
# XGBoost study
xgb_study = optuna.create_study(
    study_name="xgboost_optimization_data_b_1",
    direction="maximize",
    storage=db_dir.format('xgb_study'),
    load_if_exists=True
)
xgb_study.optimize(lambda trial: xgboost_objective(trial, X_1, y_1, skf, ratio_negative_to_positive_b_1), n_trials=n_trials)

print(f"\nBest XGB score: {xgb_study.best_value}")
print(f"Best XGB params: {xgb_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in xgb_study.trials]
print(f"\nAverage execution time XGB: {np.mean(execution_times):.2f}s")
print(f"Total optimization time XGB: {sum(execution_times):.2f}s")

thresholds = xgb_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = xgb_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best XGB trial: {thresholds}")
print(f"Mean threshold for best XGB trial: {mean_threshold}")

[I 2025-07-01 20:39:25,738] A new study created in RDB with name: xgboost_optimization_data_b_1
[I 2025-07-01 20:39:26,391] Trial 0 finished with value: 0.16864998511580137 and parameters: {'scale_pos_weight': 4.7690869945527945, 'booster': 'dart', 'n_estimators': 189, 'learning_rate': 0.00106222025645763, 'lambda': 0.0032376355280505985, 'alpha': 4.572109113260734e-05, 'subsample': 0.320143569081284, 'colsample_bytree': 0.8568263748920281, 'max_depth': 8, 'min_child_weight': 5, 'eta': 0.013908924176063627, 'gamma': 1.8689300953003695e-05, 'grow_policy': 'depthwise', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 0.5213578393684243, 'skip_drop': 2.2725487262136113e-08}. Best is trial 0 with value: 0.16864998511580137.
[I 2025-07-01 20:39:30,123] Trial 1 finished with value: 0.16582888885200436 and parameters: {'scale_pos_weight': 3.327095182892596, 'booster': 'gbtree', 'n_estimators': 482, 'learning_rate': 0.004568761774668208, 'lambda': 0.0007213846210714697, 'alpha'


Best XGB score: 0.18482948619990122
Best XGB params: {'scale_pos_weight': 2.9363461462126463, 'booster': 'gbtree', 'n_estimators': 1814, 'learning_rate': 0.004433705610227668, 'lambda': 7.633781795471936e-07, 'alpha': 5.744704944269904e-07, 'subsample': 0.8957921916551345, 'colsample_bytree': 0.558825383992462, 'max_depth': 3, 'min_child_weight': 7, 'eta': 0.13593244551365197, 'gamma': 5.556919522307276e-06, 'grow_policy': 'lossguide'}

Average execution time XGB: 0.76s
Total optimization time XGB: 75.98s

Optimal thresholds (per fold) for best XGB trial: [0.24337531626224518, 0.24321404099464417, 0.2430262714624405, 0.24376191198825836, 0.24333372712135315]
Mean threshold for best XGB trial: 0.24334225356578826


In [17]:
# XGBoost study
xgb_study = optuna.create_study(
    study_name="xgboost_optimization_data_b_2",
    direction="maximize",
    storage=db_dir.format('xgb_study'),
    load_if_exists=True
)
xgb_study.optimize(lambda trial: xgboost_objective(trial, X_2, y_2, skf, ratio_negative_to_positive_b_2), n_trials=n_trials)

print(f"\nBest XGB score: {xgb_study.best_value}")
print(f"Best XGB params: {xgb_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in xgb_study.trials]
print(f"\nAverage execution time XGB: {np.mean(execution_times):.2f}s")
print(f"Total optimization time XGB: {sum(execution_times):.2f}s")

thresholds = xgb_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = xgb_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best XGB trial: {thresholds}")
print(f"Mean threshold for best XGB trial: {mean_threshold}")

[I 2025-07-01 20:40:48,227] A new study created in RDB with name: xgboost_optimization_data_b_2
[I 2025-07-01 20:40:48,502] Trial 0 finished with value: 0.1171181293162401 and parameters: {'scale_pos_weight': 13.613003022487161, 'booster': 'gblinear', 'n_estimators': 759, 'learning_rate': 0.003964951936174337, 'lambda': 0.00030280670471308777, 'alpha': 0.13848388290481137, 'subsample': 0.8506021460043437, 'colsample_bytree': 0.2099855759376137}. Best is trial 0 with value: 0.1171181293162401.
[I 2025-07-01 20:40:49,218] Trial 1 finished with value: 0.13670041824326312 and parameters: {'scale_pos_weight': 16.787696873771875, 'booster': 'gbtree', 'n_estimators': 1801, 'learning_rate': 0.05334729647626753, 'lambda': 1.109721395748742e-06, 'alpha': 0.40489300377272486, 'subsample': 0.8129010184958119, 'colsample_bytree': 0.5646173864323861, 'max_depth': 7, 'min_child_weight': 8, 'eta': 0.0009498416196220918, 'gamma': 0.08284491013448105, 'grow_policy': 'lossguide'}. Best is trial 1 with va


Best XGB score: 0.14598857123097292
Best XGB params: {'scale_pos_weight': 5.7541633101571605, 'booster': 'gbtree', 'n_estimators': 1508, 'learning_rate': 0.013383738548413026, 'lambda': 0.33421123289278576, 'alpha': 0.17256599828481023, 'subsample': 0.8879224701646143, 'colsample_bytree': 0.6171764116736405, 'max_depth': 5, 'min_child_weight': 8, 'eta': 0.001463200046312235, 'gamma': 1.3128085869174943e-07, 'grow_policy': 'lossguide'}

Average execution time XGB: 0.66s
Total optimization time XGB: 66.10s

Optimal thresholds (per fold) for best XGB trial: [0.27827638387680054, 0.2740398049354553, 0.27502986788749695, 0.27455195784568787, 0.27784812450408936]
Mean threshold for best XGB trial: 0.275949227809906


## Random Forest

In [18]:
# Random Forest study
rf_study = optuna.create_study(
    study_name="random_forest_optimization_data_b_1",
    direction="maximize",
    storage=db_dir.format('rf_study'),
    load_if_exists=True
)
rf_study.optimize(lambda trial: random_forest_objective(trial, X_1, y_1, skf), n_trials=n_trials)  # Requires too much time to train one, reduce the number of trials

print(f"\nBest RF score: {rf_study.best_value}")
print(f"Best RF params: {rf_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in rf_study.trials]
print(f"\nAverage execution time RF: {np.mean(execution_times):.2f}s")
print(f"Total optimization time RF: {sum(execution_times):.2f}s")

[I 2025-07-01 20:41:59,389] A new study created in RDB with name: random_forest_optimization_data_b_1
[I 2025-07-01 20:42:01,978] Trial 0 finished with value: 0.0 and parameters: {'n_estimators': 247, 'max_depth': 12, 'min_samples_split': 6, 'min_samples_leaf': 8, 'min_weight_fraction_leaf': 0.25334214774255115, 'max_features': 'log2', 'max_samples': 0.4344117628738135, 'max_leaf_nodes': 736, 'class_weight': None}. Best is trial 0 with value: 0.0.
[I 2025-07-01 20:42:10,435] Trial 1 finished with value: 0.16326271373870224 and parameters: {'n_estimators': 672, 'max_depth': 19, 'min_samples_split': 8, 'min_samples_leaf': 10, 'min_weight_fraction_leaf': 0.477558588917021, 'max_features': 'sqrt', 'max_samples': 0.7512880906423215, 'max_leaf_nodes': 361, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.16326271373870224.
[I 2025-07-01 20:42:17,030] Trial 2 finished with value: 0.17347026123175519 and parameters: {'n_estimators': 318, 'max_depth': 13, 'min_samples_split': 20, 'min


Best RF score: 0.1773281878830109
Best RF params: {'n_estimators': 281, 'max_depth': 20, 'min_samples_split': 16, 'min_samples_leaf': 7, 'min_weight_fraction_leaf': 0.05665906573653677, 'max_features': 'sqrt', 'max_samples': 0.9677676704774092, 'max_leaf_nodes': 91, 'class_weight': 'balanced'}

Average execution time RF: 9.14s
Total optimization time RF: 914.20s


In [19]:
# Random Forest study
rf_study = optuna.create_study(
    study_name="random_forest_optimization_data_b_2",
    direction="maximize",
    storage=db_dir.format('rf_study'),
    load_if_exists=True
)
rf_study.optimize(lambda trial: random_forest_objective(trial, X_2, y_2, skf), n_trials=n_trials)  # Requires too much time to train one, reduce the number of trials

print(f"\nBest RF score: {rf_study.best_value}")
print(f"Best RF params: {rf_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in rf_study.trials]
print(f"\nAverage execution time RF: {np.mean(execution_times):.2f}s")
print(f"Total optimization time RF: {sum(execution_times):.2f}s")

[I 2025-07-01 20:57:18,205] A new study created in RDB with name: random_forest_optimization_data_b_2
[I 2025-07-01 20:57:28,584] Trial 0 finished with value: 0.0 and parameters: {'n_estimators': 700, 'max_depth': 7, 'min_samples_split': 18, 'min_samples_leaf': 4, 'min_weight_fraction_leaf': 0.27910900861803073, 'max_features': 'log2', 'max_samples': 0.8431042778007121, 'max_leaf_nodes': 21, 'class_weight': None}. Best is trial 0 with value: 0.0.
[I 2025-07-01 20:57:31,600] Trial 1 finished with value: 0.12203254155124602 and parameters: {'n_estimators': 122, 'max_depth': 14, 'min_samples_split': 12, 'min_samples_leaf': 6, 'min_weight_fraction_leaf': 0.46583548645497724, 'max_features': 'sqrt', 'max_samples': 0.4345358468604899, 'max_leaf_nodes': 951, 'class_weight': 'balanced_subsample'}. Best is trial 1 with value: 0.12203254155124602.
[I 2025-07-01 20:57:35,153] Trial 2 finished with value: 0.0 and parameters: {'n_estimators': 229, 'max_depth': 13, 'min_samples_split': 2, 'min_sampl


Best RF score: 0.1383595802972294
Best RF params: {'n_estimators': 273, 'max_depth': 20, 'min_samples_split': 18, 'min_samples_leaf': 6, 'min_weight_fraction_leaf': 0.06614153129513929, 'max_features': 'log2', 'max_samples': 0.34369645260764337, 'max_leaf_nodes': 14, 'class_weight': 'balanced'}

Average execution time RF: 12.16s
Total optimization time RF: 1216.19s


## LightGBM

In [20]:
# LightGBM study
lgb_study = optuna.create_study(
    study_name="lightgbm_optimization_data_b_1",
    direction="maximize",
    storage=db_dir.format('lgb_study'),
    load_if_exists=True
)
lgb_study.optimize(lambda trial: lightgbm_objective(trial, X_1, y_1, skf, ratio_negative_to_positive_b_1), n_trials=n_trials)

print(f"\nBest LightGBM score: {lgb_study.best_value}")
print(f"Best LightGBM params: {lgb_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in lgb_study.trials]
print(f"\nAverage execution time LightGBM: {np.mean(execution_times):.2f}s")
print(f"Total optimization time LightGBM: {sum(execution_times):.2f}s")

thresholds = lgb_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = lgb_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best LightGBM trial: {thresholds}")
print(f"Mean threshold for best LightGBM trial: {mean_threshold}")

[I 2025-07-01 21:17:40,208] A new study created in RDB with name: lightgbm_optimization_data_b_1
[I 2025-07-01 21:17:47,752] Trial 0 finished with value: 0.16887512845699504 and parameters: {'num_leaves': 153, 'learning_rate': 0.16025422568485953, 'feature_fraction': 0.9446923866778121, 'bagging_fraction': 0.9301626100065288, 'bagging_freq': 2, 'min_child_samples': 62, 'reg_alpha': 0.01792522428806953, 'reg_lambda': 0.9635830385810966, 'scale_pos_weight': 14.636398713135574}. Best is trial 0 with value: 0.16887512845699504.




[I 2025-07-01 21:17:54,845] Trial 1 finished with value: 0.16478741372547578 and parameters: {'num_leaves': 202, 'learning_rate': 0.042581180159776404, 'feature_fraction': 0.6990513851699014, 'bagging_fraction': 0.6341355416864303, 'bagging_freq': 4, 'min_child_samples': 88, 'reg_alpha': 0.4521659696004331, 'reg_lambda': 0.32267536511528006, 'scale_pos_weight': 11.438690579096452}. Best is trial 0 with value: 0.16887512845699504.




[I 2025-07-01 21:18:04,911] Trial 2 finished with value: 0.16346272636440753 and parameters: {'num_leaves': 290, 'learning_rate': 0.2340212762763564, 'feature_fraction': 0.6116032182997887, 'bagging_fraction': 0.6821880417623803, 'bagging_freq': 2, 'min_child_samples': 70, 'reg_alpha': 0.5212308914662303, 'reg_lambda': 0.8108257794614798, 'scale_pos_weight': 8.865170627464416}. Best is trial 0 with value: 0.16887512845699504.




[I 2025-07-01 21:18:13,723] Trial 3 finished with value: 0.16010578457434493 and parameters: {'num_leaves': 179, 'learning_rate': 0.1717391719756504, 'feature_fraction': 0.44909130332720365, 'bagging_fraction': 0.6070447813217354, 'bagging_freq': 6, 'min_child_samples': 77, 'reg_alpha': 0.10396228987392597, 'reg_lambda': 0.4795592960557967, 'scale_pos_weight': 9.083313431724733}. Best is trial 0 with value: 0.16887512845699504.




[I 2025-07-01 21:18:17,129] Trial 4 finished with value: 0.16856246319916832 and parameters: {'num_leaves': 60, 'learning_rate': 0.07275284185676441, 'feature_fraction': 0.6423894328365048, 'bagging_fraction': 0.6195340514868241, 'bagging_freq': 4, 'min_child_samples': 65, 'reg_alpha': 0.5561726500702502, 'reg_lambda': 0.5307854848306438, 'scale_pos_weight': 10.707883221864368}. Best is trial 0 with value: 0.16887512845699504.
[I 2025-07-01 21:18:27,357] Trial 5 finished with value: 0.16569371196039792 and parameters: {'num_leaves': 204, 'learning_rate': 0.019029053699286414, 'feature_fraction': 0.8650587593919105, 'bagging_fraction': 0.5144310874300949, 'bagging_freq': 5, 'min_child_samples': 18, 'reg_alpha': 0.228733131615655, 'reg_lambda': 0.08811924030955576, 'scale_pos_weight': 12.158698301996386}. Best is trial 0 with value: 0.16887512845699504.
[I 2025-07-01 21:18:31,162] Trial 6 finished with value: 0.1601900037178458 and parameters: {'num_leaves': 58, 'learning_rate': 0.023759



[I 2025-07-01 21:18:47,222] Trial 8 finished with value: 0.1652295135087271 and parameters: {'num_leaves': 241, 'learning_rate': 0.05820402028952785, 'feature_fraction': 0.7747058695485092, 'bagging_fraction': 0.8059727206659004, 'bagging_freq': 1, 'min_child_samples': 75, 'reg_alpha': 0.7895871417873727, 'reg_lambda': 0.3097389659889974, 'scale_pos_weight': 8.454181433857794}. Best is trial 0 with value: 0.16887512845699504.
[I 2025-07-01 21:18:57,327] Trial 9 finished with value: 0.16085932706352857 and parameters: {'num_leaves': 181, 'learning_rate': 0.029925097784367465, 'feature_fraction': 0.4337097415710479, 'bagging_fraction': 0.4007395862797126, 'bagging_freq': 7, 'min_child_samples': 8, 'reg_alpha': 0.31036759144459236, 'reg_lambda': 0.14996822010130828, 'scale_pos_weight': 8.482571648065779}. Best is trial 0 with value: 0.16887512845699504.
[I 2025-07-01 21:19:03,963] Trial 10 finished with value: 0.17159874850151144 and parameters: {'num_leaves': 125, 'learning_rate': 0.1317



[I 2025-07-01 21:20:08,251] Trial 25 finished with value: 0.16730370711660286 and parameters: {'num_leaves': 258, 'learning_rate': 0.0681121718955026, 'feature_fraction': 0.8052684179724938, 'bagging_fraction': 0.850154680972166, 'bagging_freq': 3, 'min_child_samples': 46, 'reg_alpha': 0.8586588338653439, 'reg_lambda': 0.9177886925203205, 'scale_pos_weight': 14.747427649073487}. Best is trial 14 with value: 0.17546528551768542.
[I 2025-07-01 21:20:09,814] Trial 26 finished with value: 0.17484773343495597 and parameters: {'num_leaves': 20, 'learning_rate': 0.2021950438295218, 'feature_fraction': 0.9575170554957787, 'bagging_fraction': 0.9209983290621844, 'bagging_freq': 7, 'min_child_samples': 32, 'reg_alpha': 0.73118385286444, 'reg_lambda': 0.9972944375327045, 'scale_pos_weight': 11.582891294180104}. Best is trial 14 with value: 0.17546528551768542.
[I 2025-07-01 21:20:12,851] Trial 27 finished with value: 0.1717808261694244 and parameters: {'num_leaves': 52, 'learning_rate': 0.1979323



[I 2025-07-01 21:21:24,390] Trial 40 finished with value: 0.16859526047315174 and parameters: {'num_leaves': 290, 'learning_rate': 0.07075141495321767, 'feature_fraction': 0.9259649276000795, 'bagging_fraction': 0.6739357201812968, 'bagging_freq': 1, 'min_child_samples': 50, 'reg_alpha': 0.6302695482315922, 'reg_lambda': 0.8315819330467018, 'scale_pos_weight': 15.087763939560277}. Best is trial 14 with value: 0.17546528551768542.
[I 2025-07-01 21:21:25,891] Trial 41 finished with value: 0.17342210354623772 and parameters: {'num_leaves': 21, 'learning_rate': 0.1863790916616237, 'feature_fraction': 0.8887438296756073, 'bagging_fraction': 0.7353758415014641, 'bagging_freq': 7, 'min_child_samples': 20, 'reg_alpha': 0.9332034181815234, 'reg_lambda': 0.8771357878308957, 'scale_pos_weight': 12.680004868507348}. Best is trial 14 with value: 0.17546528551768542.
[I 2025-07-01 21:21:27,456] Trial 42 finished with value: 0.17320416149659096 and parameters: {'num_leaves': 20, 'learning_rate': 0.21



[I 2025-07-01 21:22:28,730] Trial 59 finished with value: 0.16774549498598285 and parameters: {'num_leaves': 125, 'learning_rate': 0.017640044500806727, 'feature_fraction': 0.8561326780644013, 'bagging_fraction': 0.40624220026524444, 'bagging_freq': 3, 'min_child_samples': 50, 'reg_alpha': 0.6955139946199159, 'reg_lambda': 0.9646084698265749, 'scale_pos_weight': 15.810850425415552}. Best is trial 52 with value: 0.1757823677256237.
[I 2025-07-01 21:22:32,610] Trial 60 finished with value: 0.17358604457292887 and parameters: {'num_leaves': 72, 'learning_rate': 0.09370544481036705, 'feature_fraction': 0.9423373706792851, 'bagging_fraction': 0.9813543288546777, 'bagging_freq': 2, 'min_child_samples': 59, 'reg_alpha': 0.8612885029453956, 'reg_lambda': 0.5672368276023139, 'scale_pos_weight': 14.931137020580524}. Best is trial 52 with value: 0.1757823677256237.
[I 2025-07-01 21:22:34,927] Trial 61 finished with value: 0.17482934821629742 and parameters: {'num_leaves': 33, 'learning_rate': 0.1



[I 2025-07-01 21:24:02,624] Trial 86 finished with value: 0.16471203132333065 and parameters: {'num_leaves': 264, 'learning_rate': 0.16455261980066846, 'feature_fraction': 0.5744747351714363, 'bagging_fraction': 0.8101037837135364, 'bagging_freq': 4, 'min_child_samples': 62, 'reg_alpha': 0.8994330554291693, 'reg_lambda': 0.6911699361449443, 'scale_pos_weight': 15.289408950012513}. Best is trial 83 with value: 0.17673974856872343.
[I 2025-07-01 21:24:05,752] Trial 87 finished with value: 0.17187839615921088 and parameters: {'num_leaves': 47, 'learning_rate': 0.11173433127952521, 'feature_fraction': 0.8873813869959777, 'bagging_fraction': 0.6945940023512105, 'bagging_freq': 5, 'min_child_samples': 49, 'reg_alpha': 0.7619216960390216, 'reg_lambda': 0.8698777137399959, 'scale_pos_weight': 14.887841734015444}. Best is trial 83 with value: 0.17673974856872343.
[I 2025-07-01 21:24:09,528] Trial 88 finished with value: 0.16048464487646516 and parameters: {'num_leaves': 61, 'learning_rate': 0.1


Best LightGBM score: 0.17673974856872343
Best LightGBM params: {'num_leaves': 25, 'learning_rate': 0.0967511128255157, 'feature_fraction': 0.7122555628392468, 'bagging_fraction': 0.833094723818265, 'bagging_freq': 4, 'min_child_samples': 44, 'reg_alpha': 0.9146243460231572, 'reg_lambda': 0.8623462257030498, 'scale_pos_weight': 14.777132923677819}

Average execution time LightGBM: 4.13s
Total optimization time LightGBM: 412.71s

Optimal thresholds (per fold) for best LightGBM trial: [0.14744296937080806, 0.14971743223225073, 0.15446339470960424, 0.15840697970650036, 0.14676918968320696]
Mean threshold for best LightGBM trial: 0.15135999314047408


In [21]:
# LightGBM study
lgb_study = optuna.create_study(
    study_name="lightgbm_optimization_data_b_2",
    direction="maximize",
    storage=db_dir.format('lgb_study'),
    load_if_exists=True
)
lgb_study.optimize(lambda trial: lightgbm_objective(trial, X_2, y_2, skf, ratio_negative_to_positive_b_2), n_trials=n_trials)

print(f"\nBest LightGBM score: {lgb_study.best_value}")
print(f"Best LightGBM params: {lgb_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in lgb_study.trials]
print(f"\nAverage execution time LightGBM: {np.mean(execution_times):.2f}s")
print(f"Total optimization time LightGBM: {sum(execution_times):.2f}s")

thresholds = lgb_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = lgb_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best LightGBM trial: {thresholds}")
print(f"Mean threshold for best LightGBM trial: {mean_threshold}")

[I 2025-07-01 21:24:38,089] A new study created in RDB with name: lightgbm_optimization_data_b_2
[I 2025-07-01 21:24:50,969] Trial 0 finished with value: 0.11991132081231752 and parameters: {'num_leaves': 273, 'learning_rate': 0.07611100584819296, 'feature_fraction': 0.8592798863066252, 'bagging_fraction': 0.6582527116599557, 'bagging_freq': 7, 'min_child_samples': 12, 'reg_alpha': 0.7298576015408886, 'reg_lambda': 0.12843308481274607, 'scale_pos_weight': 22.403728564820476}. Best is trial 0 with value: 0.11991132081231752.
[I 2025-07-01 21:24:58,922] Trial 1 finished with value: 0.12171938200330538 and parameters: {'num_leaves': 159, 'learning_rate': 0.03676120194318135, 'feature_fraction': 0.8258898707359733, 'bagging_fraction': 0.6158267294930256, 'bagging_freq': 2, 'min_child_samples': 10, 'reg_alpha': 0.3853478656064081, 'reg_lambda': 0.23958633493004478, 'scale_pos_weight': 17.858127356230714}. Best is trial 1 with value: 0.12171938200330538.
[I 2025-07-01 21:25:14,661] Trial 2 f



[I 2025-07-01 21:25:28,712] Trial 4 finished with value: 0.11956579540531831 and parameters: {'num_leaves': 230, 'learning_rate': 0.07041547632872262, 'feature_fraction': 0.7345917685502976, 'bagging_fraction': 0.5484207295901081, 'bagging_freq': 6, 'min_child_samples': 74, 'reg_alpha': 0.7834055261698892, 'reg_lambda': 0.3820662616691244, 'scale_pos_weight': 20.62404521708641}. Best is trial 3 with value: 0.13104441281112736.




[I 2025-07-01 21:25:37,679] Trial 5 finished with value: 0.12506034371495803 and parameters: {'num_leaves': 212, 'learning_rate': 0.1102923483156944, 'feature_fraction': 0.7272159572110598, 'bagging_fraction': 0.6792003011140132, 'bagging_freq': 3, 'min_child_samples': 22, 'reg_alpha': 0.7451713596013888, 'reg_lambda': 0.928567942336484, 'scale_pos_weight': 15.37325896089633}. Best is trial 3 with value: 0.13104441281112736.




[I 2025-07-01 21:25:43,603] Trial 6 finished with value: 0.10864728161561694 and parameters: {'num_leaves': 281, 'learning_rate': 0.04153991875762169, 'feature_fraction': 0.5111787945745373, 'bagging_fraction': 0.5416505289604088, 'bagging_freq': 7, 'min_child_samples': 84, 'reg_alpha': 0.5053213913809415, 'reg_lambda': 0.011792104578797535, 'scale_pos_weight': 12.86606308784254}. Best is trial 3 with value: 0.13104441281112736.




[I 2025-07-01 21:26:03,003] Trial 7 finished with value: 0.11828999761211353 and parameters: {'num_leaves': 162, 'learning_rate': 0.22925544809073295, 'feature_fraction': 0.43598875087379807, 'bagging_fraction': 0.6372628000528517, 'bagging_freq': 4, 'min_child_samples': 81, 'reg_alpha': 0.4691468273130994, 'reg_lambda': 0.2016548730201041, 'scale_pos_weight': 24.09115161897749}. Best is trial 3 with value: 0.13104441281112736.
[I 2025-07-01 21:26:18,896] Trial 8 finished with value: 0.12173234692567803 and parameters: {'num_leaves': 190, 'learning_rate': 0.2525244471231641, 'feature_fraction': 0.6229758067940588, 'bagging_fraction': 0.7029947113270015, 'bagging_freq': 1, 'min_child_samples': 16, 'reg_alpha': 0.4312443010696838, 'reg_lambda': 0.6586574642181976, 'scale_pos_weight': 17.68899364801786}. Best is trial 3 with value: 0.13104441281112736.
[I 2025-07-01 21:26:25,028] Trial 9 finished with value: 0.12280595865698674 and parameters: {'num_leaves': 182, 'learning_rate': 0.093175



[I 2025-07-01 21:29:00,601] Trial 56 finished with value: 0.12293782761288512 and parameters: {'num_leaves': 225, 'learning_rate': 0.013268537180922918, 'feature_fraction': 0.7451541698863409, 'bagging_fraction': 0.7968265312731944, 'bagging_freq': 4, 'min_child_samples': 60, 'reg_alpha': 0.10069468528270613, 'reg_lambda': 0.9649312485373172, 'scale_pos_weight': 14.604344043473265}. Best is trial 11 with value: 0.13666231306792492.
[I 2025-07-01 21:29:02,298] Trial 57 finished with value: 0.1346240380990849 and parameters: {'num_leaves': 33, 'learning_rate': 0.011570192017396924, 'feature_fraction': 0.9480635203922844, 'bagging_fraction': 0.9004167646345952, 'bagging_freq': 1, 'min_child_samples': 67, 'reg_alpha': 0.18193607694504213, 'reg_lambda': 0.8445919903949002, 'scale_pos_weight': 14.012702973454221}. Best is trial 11 with value: 0.13666231306792492.
[I 2025-07-01 21:29:05,074] Trial 58 finished with value: 0.12702086684822816 and parameters: {'num_leaves': 67, 'learning_rate': 



[I 2025-07-01 21:30:45,271] Trial 86 finished with value: 0.12405327462904989 and parameters: {'num_leaves': 262, 'learning_rate': 0.016724909849687476, 'feature_fraction': 0.836460865782067, 'bagging_fraction': 0.8107454314038882, 'bagging_freq': 2, 'min_child_samples': 65, 'reg_alpha': 0.04728487251380885, 'reg_lambda': 0.7989312073876224, 'scale_pos_weight': 13.423013949327864}. Best is trial 62 with value: 0.13752318958037033.
[I 2025-07-01 21:30:55,359] Trial 87 finished with value: 0.12434327301464879 and parameters: {'num_leaves': 199, 'learning_rate': 0.013481912173883406, 'feature_fraction': 0.9882484901478362, 'bagging_fraction': 0.8741362165005042, 'bagging_freq': 1, 'min_child_samples': 58, 'reg_alpha': 0.07568379826001584, 'reg_lambda': 0.7443934045952914, 'scale_pos_weight': 11.772158021918475}. Best is trial 62 with value: 0.13752318958037033.
[I 2025-07-01 21:30:57,749] Trial 88 finished with value: 0.13307784459166896 and parameters: {'num_leaves': 37, 'learning_rate':


Best LightGBM score: 0.13752318958037033
Best LightGBM params: {'num_leaves': 41, 'learning_rate': 0.011641235442562662, 'feature_fraction': 0.9998560404167512, 'bagging_fraction': 0.8363450781508417, 'bagging_freq': 1, 'min_child_samples': 64, 'reg_alpha': 0.17784386063180405, 'reg_lambda': 0.9895633511400649, 'scale_pos_weight': 12.704247420253942}

Average execution time LightGBM: 4.10s
Total optimization time LightGBM: 409.79s

Optimal thresholds (per fold) for best LightGBM trial: [0.06939455289415998, 0.061164487048838055, 0.06752951676658832, 0.06695679756822763, 0.06825439970313163]
Mean threshold for best LightGBM trial: 0.06665995079618912


## Catboost

In [22]:
# CatBoost study
cat_study = optuna.create_study(
    study_name="catboost_optimization_data_b_1",
    direction="maximize",
    storage=db_dir.format('cat_study'),
    load_if_exists=True
)
cat_study.optimize(lambda trial: catboost_objective(trial, X_1, y_1, skf, ratio_negative_to_positive_b_1), n_trials=n_trials)

print(f"\nBest CatBoost score: {cat_study.best_value}")
print(f"Best CatBoost params: {cat_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in cat_study.trials]
print(f"\nAverage execution time CatBoost: {np.mean(execution_times):.2f}s")
print(f"Total optimization time CatBoost: {sum(execution_times):.2f}s")

thresholds = cat_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = cat_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best CatBoost trial: {thresholds}")
print(f"Mean threshold for best CatBoost trial: {mean_threshold}")

[I 2025-07-01 21:31:32,612] A new study created in RDB with name: catboost_optimization_data_b_1
[I 2025-07-01 21:31:35,237] Trial 0 finished with value: 0.1725234604689935 and parameters: {'iterations': 671, 'learning_rate': 0.011110037217831858, 'depth': 6, 'l2_leaf_reg': 3.0772958269708095, 'bagging_temperature': 0.4103401653524479, 'random_strength': 3.1292384751671243, 'scale_pos_weight': 13.412225077455936}. Best is trial 0 with value: 0.1725234604689935.
[I 2025-07-01 21:31:41,167] Trial 1 finished with value: 0.16699348384079207 and parameters: {'iterations': 977, 'learning_rate': 0.012654261243366098, 'depth': 9, 'l2_leaf_reg': 8.6980872512937, 'bagging_temperature': 0.44530662629069995, 'random_strength': 7.9535645290687516, 'scale_pos_weight': 15.572994323519207}. Best is trial 0 with value: 0.1725234604689935.
[I 2025-07-01 21:31:45,406] Trial 2 finished with value: 0.17022022110295243 and parameters: {'iterations': 766, 'learning_rate': 0.021073002192104964, 'depth': 7, 'l


Best CatBoost score: 0.1836140450682995
Best CatBoost params: {'iterations': 556, 'learning_rate': 0.07987462623407432, 'depth': 5, 'l2_leaf_reg': 4.510112268644413, 'bagging_temperature': 0.3707876474615084, 'random_strength': 0.9974394579850405, 'scale_pos_weight': 11.182710282478563}

Average execution time CatBoost: 2.14s
Total optimization time CatBoost: 214.50s

Optimal thresholds (per fold) for best CatBoost trial: [0.5006713777645523, 0.5137569362288218, 0.5001157523605751, 0.5051561027544451, 0.5252722080709875]
Mean threshold for best CatBoost trial: 0.5089944754358763


In [23]:
# CatBoost study
cat_study = optuna.create_study(
    study_name="catboost_optimization_data_b_2",
    direction="maximize",
    storage=db_dir.format('cat_study'),
    load_if_exists=True
)
cat_study.optimize(lambda trial: catboost_objective(trial, X_2, y_2, skf, ratio_negative_to_positive_b_2), n_trials=n_trials)

print(f"\nBest CatBoost score: {cat_study.best_value}")
print(f"Best CatBoost params: {cat_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in cat_study.trials]
print(f"\nAverage execution time CatBoost: {np.mean(execution_times):.2f}s")
print(f"Total optimization time CatBoost: {sum(execution_times):.2f}s")

thresholds = cat_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = cat_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best CatBoost trial: {thresholds}")
print(f"Mean threshold for best CatBoost trial: {mean_threshold}")

[I 2025-07-01 21:35:10,522] A new study created in RDB with name: catboost_optimization_data_b_2
[I 2025-07-01 21:35:14,080] Trial 0 finished with value: 0.1371307111692157 and parameters: {'iterations': 469, 'learning_rate': 0.015408099690192799, 'depth': 8, 'l2_leaf_reg': 7.2587046709186716, 'bagging_temperature': 0.5459819304335254, 'random_strength': 8.119410345678741, 'scale_pos_weight': 22.165366537969383}. Best is trial 0 with value: 0.1371307111692157.
[I 2025-07-01 21:35:15,315] Trial 1 finished with value: 0.13396503540356336 and parameters: {'iterations': 934, 'learning_rate': 0.17151045898734252, 'depth': 4, 'l2_leaf_reg': 7.440088258254164, 'bagging_temperature': 0.6162901723341108, 'random_strength': 9.84545076035085, 'scale_pos_weight': 17.540168455366636}. Best is trial 0 with value: 0.1371307111692157.
[I 2025-07-01 21:35:17,060] Trial 2 finished with value: 0.1387469608593899 and parameters: {'iterations': 840, 'learning_rate': 0.01717709769223529, 'depth': 4, 'l2_lea


Best CatBoost score: 0.14768194898835646
Best CatBoost params: {'iterations': 813, 'learning_rate': 0.06235177583991131, 'depth': 6, 'l2_leaf_reg': 9.966596575819873, 'bagging_temperature': 0.9162366459959626, 'random_strength': 0.014186748275358041, 'scale_pos_weight': 15.525408600444193}

Average execution time CatBoost: 1.65s
Total optimization time CatBoost: 164.99s

Optimal thresholds (per fold) for best CatBoost trial: [0.5052702470453135, 0.5006558775551604, 0.516338341527177, 0.5106614449535009, 0.5037805250633571]
Mean threshold for best CatBoost trial: 0.5073412872289018


## HistGradientBoosting

In [28]:
# HistGradientBoosting study
histgb_study = optuna.create_study(
    study_name="histgb_optimization_data_b_1",
    direction="maximize",
    storage=db_dir.format('histgb_study'),
    load_if_exists=True
)
histgb_study.optimize(lambda trial: histgb_objective(trial, X_1, y_1, skf), n_trials=n_trials)

print(f"\nBest HistGB score: {histgb_study.best_value}")
print(f"Best HistGB params: {histgb_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in histgb_study.trials]
print(f"\nAverage execution time HistGB: {np.mean(execution_times):.2f}s")
print(f"Total optimization time HistGB: {sum(execution_times):.2f}s")

thresholds = histgb_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = histgb_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best HistGB trial: {thresholds}")
print(f"Mean threshold for best HistGB trial: {mean_threshold}")

[I 2025-07-01 23:09:48,128] A new study created in RDB with name: histgb_optimization_data_b_1


[I 2025-07-01 23:09:49,582] Trial 0 finished with value: 0.0 and parameters: {'max_iter': 144, 'learning_rate': 0.02628575686034557, 'max_depth': 3, 'min_samples_leaf': 32, 'l2_regularization': 0.24238514247779097, 'max_bins': 232}. Best is trial 0 with value: 0.0.
[I 2025-07-01 23:09:50,578] Trial 1 finished with value: 0.0 and parameters: {'max_iter': 457, 'learning_rate': 0.06098665243149286, 'max_depth': 4, 'min_samples_leaf': 95, 'l2_regularization': 0.9277376299415382, 'max_bins': 187}. Best is trial 0 with value: 0.0.
[I 2025-07-01 23:09:52,151] Trial 2 finished with value: 0.0 and parameters: {'max_iter': 187, 'learning_rate': 0.07250556852353457, 'max_depth': 7, 'min_samples_leaf': 75, 'l2_regularization': 0.779460465095678, 'max_bins': 149}. Best is trial 0 with value: 0.0.
[I 2025-07-01 23:09:54,613] Trial 3 finished with value: 0.0 and parameters: {'max_iter': 119, 'learning_rate': 0.033968841553651284, 'max_depth': 6, 'min_samples_leaf': 18, 'l2_regularization': 0.85586209


Best HistGB score: 0.0
Best HistGB params: {'max_iter': 144, 'learning_rate': 0.02628575686034557, 'max_depth': 3, 'min_samples_leaf': 32, 'l2_regularization': 0.24238514247779097, 'max_bins': 232}

Average execution time HistGB: 1.29s
Total optimization time HistGB: 129.07s

Optimal thresholds (per fold) for best HistGB trial: []
Mean threshold for best HistGB trial: None


In [29]:
# HistGradientBoosting study
histgb_study = optuna.create_study(
    study_name="histgb_optimization_data_b_2",
    direction="maximize",
    storage=db_dir.format('histgb_study'),
    load_if_exists=True
)
histgb_study.optimize(lambda trial: histgb_objective(trial, X_2, y_2, skf), n_trials=n_trials)

print(f"\nBest HistGB score: {histgb_study.best_value}")
print(f"Best HistGB params: {histgb_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in histgb_study.trials]
print(f"\nAverage execution time HistGB: {np.mean(execution_times):.2f}s")
print(f"Total optimization time HistGB: {sum(execution_times):.2f}s")

thresholds = histgb_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = histgb_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best HistGB trial: {thresholds}")
print(f"Mean threshold for best HistGB trial: {mean_threshold}")

[I 2025-07-01 23:12:00,007] A new study created in RDB with name: histgb_optimization_data_b_2
[I 2025-07-01 23:12:02,590] Trial 0 finished with value: 0.0 and parameters: {'max_iter': 105, 'learning_rate': 0.029474936479619022, 'max_depth': 8, 'min_samples_leaf': 43, 'l2_regularization': 0.5683099863912949, 'max_bins': 58}. Best is trial 0 with value: 0.0.
[I 2025-07-01 23:12:03,616] Trial 1 finished with value: 0.0 and parameters: {'max_iter': 339, 'learning_rate': 0.15937454526999043, 'max_depth': 9, 'min_samples_leaf': 49, 'l2_regularization': 0.10932777824854989, 'max_bins': 43}. Best is trial 0 with value: 0.0.
[I 2025-07-01 23:12:04,490] Trial 2 finished with value: 0.0 and parameters: {'max_iter': 355, 'learning_rate': 0.2404851922747016, 'max_depth': 10, 'min_samples_leaf': 85, 'l2_regularization': 0.3153247269310062, 'max_bins': 127}. Best is trial 0 with value: 0.0.
[I 2025-07-01 23:12:05,655] Trial 3 finished with value: 0.0 and parameters: {'max_iter': 251, 'learning_rate'


Best HistGB score: 0.0008368200836820083
Best HistGB params: {'max_iter': 402, 'learning_rate': 0.26967888214146024, 'max_depth': 4, 'min_samples_leaf': 21, 'l2_regularization': 0.04634693348665375, 'max_bins': 243}

Average execution time HistGB: 1.14s
Total optimization time HistGB: 114.42s

Optimal thresholds (per fold) for best HistGB trial: []
Mean threshold for best HistGB trial: None


# Conclusion

In [30]:
xgb_study = optuna.load_study(
    study_name="xgboost_optimization_data_b_1",
    storage=db_dir.format('xgb_study')
)
xgb_study_b2 = optuna.load_study(
    study_name="xgboost_optimization_data_b_2",
    storage=db_dir.format('xgb_study')
)
rf_study = optuna.load_study(
    study_name="random_forest_optimization_data_b_1",
    storage=db_dir.format('rf_study')
)
rf_study_b2 = optuna.load_study(
    study_name="random_forest_optimization_data_b_2",
    storage=db_dir.format('rf_study')
)
lgb_study = optuna.load_study(
    study_name="lightgbm_optimization_data_b_1",
    storage=db_dir.format('lgb_study')
)
lgb_study_b2 = optuna.load_study(
    study_name="lightgbm_optimization_data_b_2",
    storage=db_dir.format('lgb_study')
)
cat_study = optuna.load_study(
    study_name="catboost_optimization_data_b_1",
    storage=db_dir.format('cat_study')
)
cat_study_b2 = optuna.load_study(
    study_name="catboost_optimization_data_b_2",
    storage=db_dir.format('cat_study')
)
histgb_study = optuna.load_study(
    study_name="histgb_optimization_data_b_1",
    storage=db_dir.format('histgb_study')
)
histgb_study_b2 = optuna.load_study(
    study_name="histgb_optimization_data_b_2",
    storage=db_dir.format('histgb_study')
)


In [31]:
print("XGBoost data_b_1 best score:", xgb_study.best_value)
print("XGBoost data_b_2 best score:", xgb_study_b2.best_value)

print("Random Forest data_b_1 best score:", rf_study.best_value)
print("Random Forest data_b_2 best score:", rf_study_b2.best_value)

print("LightGBM data_b_1 best score:", lgb_study.best_value)
print("LightGBM data_b_2 best score:", lgb_study_b2.best_value)

print("CatBoost data_b_1 best score:", cat_study.best_value)
print("CatBoost data_b_2 best score:", cat_study_b2.best_value)

print("HistGB data_b_1 best score:", histgb_study.best_value)
print("HistGB data_b_2 best score:", histgb_study_b2.best_value)

XGBoost data_b_1 best score: 0.18482948619990122
XGBoost data_b_2 best score: 0.14598857123097292
Random Forest data_b_1 best score: 0.1773281878830109
Random Forest data_b_2 best score: 0.1383595802972294
LightGBM data_b_1 best score: 0.17673974856872343
LightGBM data_b_2 best score: 0.13752318958037033
CatBoost data_b_1 best score: 0.1836140450682995
CatBoost data_b_2 best score: 0.14768194898835646
HistGB data_b_1 best score: 0.0
HistGB data_b_2 best score: 0.0008368200836820083
