# Imports and definitions

# Table of Contents

1. [Imports and definitions](#imports-and-definitions)
2. [Load data](#load-data)
3. [Prepare data](#prepare-data)
4. [Define objectives](#define-objectives)
5. [Start tuning](#start-tuning)
6. [Conclusion](#conclusion)

---

In [None]:
from pathlib import Path
import time

import polars as pl
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_curve, f1_score

from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

import optuna

_ = pl.Config.set_tbl_cols(None)
_ = pl.Config.set_fmt_str_lengths(500)
_ = pl.Config.set_fmt_float("full")

In [2]:
import warnings
warnings.filterwarnings('ignore', category=RuntimeWarning, module='sklearn')

In [None]:
base_dir = Path('/workspaces/data-scientist-at-magenta')
code_dir = base_dir / 'notebooks'
data_dir = code_dir / "data"
features_dir = data_dir / 'features'
train_dir = data_dir / 'train'
db_dir = 'sqlite:///data/models/{}.db'

# Load data

In [4]:
%%time

train = pl.read_parquet(train_dir / 'data-v0-80.parquet')

CPU times: user 12.7 ms, sys: 8.67 ms, total: 21.4 ms
Wall time: 29.7 ms


# Prepare data

In [5]:
X = train.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
y = train.select('has_done_upselling')


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define objectives

In [6]:
# Compute the ratio of negative to positive instances in the target
ratio_negative_to_positive = (
    (y['has_done_upselling'] == False).sum() / (y['has_done_upselling'] == True).sum()
)
print("ratio_negative_to_positive:", ratio_negative_to_positive)

ratio_negative_to_positive: 13.186912573151268


In [7]:
def find_optimal_f1(valid_y, preds):
    # Find optimal threshold for F1
    precision, recall, thresholds = precision_recall_curve(valid_y, preds)
    f1_scores_thresh = 2 * (precision * recall) / (precision + recall + 1e-8)
    optimal_idx = np.argmax(f1_scores_thresh)
    optimal_threshold = thresholds[optimal_idx] if optimal_idx < len(thresholds) else 0.5
    
    # Make binary predictions using optimal threshold
    pred_labels = (preds >= optimal_threshold).astype(int)
    return f1_score(valid_y, pred_labels), optimal_threshold

In [None]:
def xgboost_objective(trial, X, y, skf, n_splits=5):
    '''
    XGBoost objective function using stratified cross-validation
    
    Args:
        trial: Optuna trial object
        n_splits: Number of folds for cross-validation (default: 5)
    '''
    
    cv_scores = []
    optimal_thresholds = []

    param = {
        'verbosity': 0,
        'n_jobs': 4,
        'early_stopping_rounds': 16,
        'eval_metric': 'aucpr',
        'scale_pos_weight': trial.suggest_float("scale_pos_weight", 1.0, ratio_negative_to_positive * 1.5, log=True),
        'objective': 'binary:logistic',
        'tree_method': 'hist',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),

        # L2 regularization weight.
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        # L1 regularization weight.
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        # sampling according to each tree.
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
    }

    if param['booster'] in ['gbtree', 'dart']:
        # maximum depth of the tree, signifies complexity of the tree.
        param['max_depth'] = trial.suggest_int('max_depth', 3, 20)
        # minimum child weight, larger the term more conservative the tree.
        param['min_child_weight'] = trial.suggest_int('min_child_weight', 2, 10)
        param['eta'] = trial.suggest_float('eta', 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param['gamma'] = trial.suggest_float('gamma', 1e-8, 1.0, log=True)
        param['grow_policy'] = trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])

    if param['booster'] == 'dart':
        param['sample_type'] = trial.suggest_categorical('sample_type', ['uniform', 'weighted'])
        param['normalize_type'] = trial.suggest_categorical('normalize_type', ['tree', 'forest'])
        param['rate_drop'] = trial.suggest_float('rate_drop', 1e-8, 1.0, log=True)
        param['skip_drop'] = trial.suggest_float('skip_drop', 1e-8, 1.0, log=True)
    
    start_time = time.time()
    
    # Return mean F1 score across all folds
    # Perform stratified cross-validation
    for train_idx, valid_idx in skf.split(X, y):
        # Split data for current fold
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()

        # Create DMatrix objects
        dtrain = xgb.DMatrix(train_x, label=train_y)
        dvalid = xgb.DMatrix(valid_x, label=valid_y)
        
        # Train model
        bst = xgb.train(param, dtrain)
        
        # Make predictions
        preds = bst.predict(dvalid)
        
        f1, optimal_threshold = find_optimal_f1(valid_y, preds)
        
        cv_scores.append(f1)
        optimal_thresholds.append(float(optimal_threshold))

    execution_time = time.time() - start_time
    
    trial.set_user_attr('execution_time', execution_time)
    trial.set_user_attr('optimal_thresholds', optimal_thresholds)
    trial.set_user_attr('threshold', float(np.mean(optimal_thresholds)))

    # Return mean F1 score across all folds
    return np.mean(cv_scores)

In [9]:
def random_forest_objective(trial, X, y, skf, n_splits=5):
    '''
    Random Forest objective function using stratified cross-validation
    
    Args:
        trial: Optuna trial object
        X: Feature matrix
        y: Target vector
        skf: Stratified K-Fold cross-validator
        n_splits: Number of folds for cross-validation (default: 5)
    '''
    
    cv_scores = []
    optimal_thresholds = []
    
    # Random Forest hyperparameters
    param = {
        'n_jobs': 4,
        'random_state': 42,
        'verbose': 0,
        
        # Core tree parameters
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5),
        
        # Feature sampling parameters
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'max_samples': trial.suggest_float('max_samples', 0.1, 1.0),
        
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 10, 1000),
        
        # Class balancing
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample', None]),
    }
    
    # Create Random Forest classifier
    rf = RandomForestClassifier(**param)
    
    start_time = time.time()
    
    # Perform stratified cross-validation
    for train_idx, valid_idx in skf.split(X, y):
        # Split data for current fold
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()
        
        # Train model
        rf.fit(train_x, train_y)
        
        # Make probability predictions
        preds = rf.predict_proba(valid_x)[:, 1]  # Get probability of positive class
        
        optimal_threhsold, f1 = find_optimal_f1(valid_y, preds)
        
        cv_scores.append(f1)
        optimal_thresholds.append(optimal_threhsold)

    execution_time = time.time() - start_time
    
    trial.set_user_attr('execution_time', execution_time)
    trial.set_user_attr('optimal_thresholds', optimal_thresholds)
    trial.set_user_attr('threshold', np.mean(optimal_thresholds))

    # Return mean F1 score across all folds
    return np.mean(cv_scores)

In [None]:
def histgb_objective(trial, X, y, skf, n_splits=5):
    '''
    HistGradientBoostingClassifier objective function using stratified cross-validation
    
    Args:
        trial: Optuna trial object
        X: Feature matrix
        y: Target vector
        skf: Stratified K-Fold cross-validator
        n_splits: Number of folds for cross-validation (default: 5)
    '''
    
    cv_scores = []
    optimal_thresholds = []

    # HistGradientBoosting hyperparameters (simplified)
    param = {
        'random_state': 42,
        'verbose': 0,
        
        # Core boosting parameters
        'max_iter': trial.suggest_int('max_iter', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 10, 100),
        
        # Regularization
        'l2_regularization': trial.suggest_float('l2_regularization', 0.0, 1.0),
        'max_bins': trial.suggest_int('max_bins', 32, 255),
        
        # Early stopping
        'early_stopping': True,
        'n_iter_no_change': 10,
        'validation_fraction': 0.1,
    }
    
    # Create HistGradientBoosting classifier
    hgb = HistGradientBoostingClassifier(**param)
    
    start_time = time.time()
    
    # Perform stratified cross-validation
    for train_idx, valid_idx in skf.split(X, y):
        # Split data for current fold
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()
        
        # Train model
        hgb.fit(train_x, train_y)
        
        # Make probability predictions
        preds = hgb.predict_proba(valid_x)[:, 1]  # Get probability of positive class
        
        f1, optimal_threhsold = find_optimal_f1(valid_y, preds)
        
        cv_scores.append(f1)
        optimal_thresholds.append(optimal_threhsold)

    execution_time = time.time() - start_time
    
    trial.set_user_attr('execution_time', execution_time)
    trial.set_user_attr('optimal_thresholds', optimal_thresholds)
    trial.set_user_attr('threshold', np.mean(optimal_thresholds))
    # Return mean F1 score across all folds
    return np.mean(cv_scores)


In [11]:
def lightgbm_objective(trial, X, y, skf, n_splits=5):
    '''
    LightGBM objective function using stratified cross-validation
    
    Args:
        trial: Optuna trial object
        X: Feature matrix
        y: Target vector
        skf: Stratified K-Fold cross-validator
        n_splits: Number of folds for cross-validation (default: 5)
    '''
    
    cv_scores = []
    optimal_thresholds = []

    # LightGBM hyperparameters
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'verbosity': 0,
        'seed': 42,
        'num_threads': 4,
        'deterministic': True,
        
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'scale_pos_weight': trial.suggest_float("scale_pos_weight", ratio_negative_to_positive * 0.7, ratio_negative_to_positive * 1.5, log=True),

    }
    
    start_time = time.time()

    # Perform stratified cross-validation
    for train_idx, valid_idx in skf.split(X, y):
        # Split data for current fold
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()
        
        # Create LightGBM datasets
        train_data = lgb.Dataset(train_x, label=train_y)
        valid_data = lgb.Dataset(valid_x, label=valid_y, reference=train_data)
        
        # Train model with early stopping
        model = lgb.train(
            param,
            train_data,
            valid_sets=[valid_data],
            num_boost_round=1000,
            callbacks=[lgb.early_stopping(50, verbose=False), lgb.log_evaluation(0)]
        )
        
        # Make probability predictions
        preds = model.predict(valid_x, num_iteration=model.best_iteration)
        
        f1, optimal_threhsold = find_optimal_f1(valid_y, preds)
        
        cv_scores.append(f1)
        optimal_thresholds.append(optimal_threhsold)

    execution_time = time.time() - start_time
    
    trial.set_user_attr('execution_time', execution_time)
    trial.set_user_attr('optimal_thresholds', optimal_thresholds)
    trial.set_user_attr('threshold', np.mean(optimal_thresholds))
    
    # Return mean F1 score across all folds
    return np.mean(cv_scores)

In [12]:
def catboost_objective(trial, X, y, skf, n_splits=5):
    '''
    CatBoost objective function using stratified cross-validation
    
    Args:
        trial: Optuna trial object
        X: Feature matrix
        y: Target vector
        skf: Stratified K-Fold cross-validator
        n_splits: Number of folds for cross-validation (default: 5)
    '''
    
    cv_scores = []
    optimal_thresholds = []

    # CatBoost hyperparameters (simplified)
    param = {
        'random_seed': 42,
        'verbose': False,
        'allow_writing_files': False,
        'thread_count': 4,
        
        # Core boosting parameters
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
        
        # Regularization and overfitting control
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 0.0, 10.0),
        'scale_pos_weight': trial.suggest_float("scale_pos_weight", ratio_negative_to_positive * 0.7, ratio_negative_to_positive * 1.5, log=True),
        
        # Early stopping
        'early_stopping_rounds': 50,
        'eval_metric': 'F1',
    }
    
    # Create CatBoost classifier
    cb = CatBoostClassifier(**param)
    
    start_time = time.time()

    # Perform stratified cross-validation
    for train_idx, valid_idx in skf.split(X, y):
        # Split data for current fold
        train_x = X[train_idx].to_numpy()
        valid_x = X[valid_idx].to_numpy()
        train_y = y[train_idx].to_numpy().ravel()
        valid_y = y[valid_idx].to_numpy().ravel()
        
        # Train model with validation set for early stopping
        cb.fit(
            train_x, train_y,
            eval_set=(valid_x, valid_y),
            verbose=False
        )
        
        # Make probability predictions
        preds = cb.predict_proba(valid_x)[:, 1]  # Get probability of positive class
    
        f1, optimal_threhsold = find_optimal_f1(valid_y, preds)
        
        cv_scores.append(f1)
        optimal_thresholds.append(optimal_threhsold)

    execution_time = time.time() - start_time
    
    trial.set_user_attr('execution_time', execution_time)
    trial.set_user_attr('optimal_thresholds', optimal_thresholds)
    trial.set_user_attr('threshold', np.mean(optimal_thresholds))

    # Return mean F1 score across all folds
    return np.mean(cv_scores)

# Start tuning

In [None]:
# Setup
n_trials = 300

In [14]:
# XGBoost study
xgb_study = optuna.create_study(
    study_name="xgboost_optimization",
    direction="maximize",
    storage=db_dir.format('xgb_study'),
    load_if_exists=True
)
xgb_study.optimize(lambda trial: xgboost_objective(trial, X, y, skf), n_trials=n_trials)

print(f"\nBest XGB score: {xgb_study.best_value}")
print(f"Best XGB params: {xgb_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in xgb_study.trials]
print(f"\nAverage execution time XGB: {np.mean(execution_times):.2f}s")
print(f"Total optimization time XGB: {sum(execution_times):.2f}s")

thresholds = xgb_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = xgb_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best XGB trial: {thresholds}")
print(f"Mean threshold for best XGB trial: {mean_threshold}")

[I 2025-07-01 12:26:20,253] A new study created in RDB with name: xgboost_optimization
[I 2025-07-01 12:26:21,943] Trial 0 finished with value: 0.1488694873073217 and parameters: {'scale_pos_weight': 11.229784196636132, 'booster': 'gbtree', 'n_estimators': 451, 'learning_rate': 0.004952918423745165, 'lambda': 0.011167118083157268, 'alpha': 0.00017434635977835942, 'subsample': 0.2518447923260811, 'colsample_bytree': 0.5736026425635383, 'max_depth': 11, 'min_child_weight': 7, 'eta': 0.00033215453038717334, 'gamma': 0.05121183696418969, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.1488694873073217.
[I 2025-07-01 12:26:23,450] Trial 1 finished with value: 0.13869514523736998 and parameters: {'scale_pos_weight': 12.562429002370926, 'booster': 'dart', 'n_estimators': 1941, 'learning_rate': 0.002908075744207783, 'lambda': 0.6708003330524983, 'alpha': 4.194932905730385e-08, 'subsample': 0.542352616394949, 'colsample_bytree': 0.3126386249602094, 'max_depth': 15, 'min_child_weight'


Best XGB score: 0.17084744962560208
Best XGB params: {'scale_pos_weight': 17.83349543431033, 'booster': 'gbtree', 'n_estimators': 747, 'learning_rate': 0.21471070011185667, 'lambda': 0.41678497077054427, 'alpha': 1.4481041360747245e-08, 'subsample': 0.9313989049125304, 'colsample_bytree': 0.9555845404311799, 'max_depth': 3, 'min_child_weight': 9, 'eta': 0.00898977016906825, 'gamma': 1.1705244964682773e-07, 'grow_policy': 'depthwise'}

Average execution time XGB: 0.53s
Total optimization time XGB: 157.81s

Optimal thresholds (per fold) for best XGB trial: [0.6141753196716309, 0.6047128438949585, 0.5944847464561462, 0.6122851371765137, 0.6036597490310669]
Mean threshold for best XGB trial: 0.6058635592460633


In [16]:
# Random Forest study
rf_study = optuna.create_study(
    study_name="random_forest_optimization",
    direction="maximize",
    storage=db_dir.format('rf_study'),
    load_if_exists=True
)
rf_study.optimize(lambda trial: random_forest_objective(trial, X, y, skf), n_trials=n_trials * 0.3)  # Requires too much time to train one, reduce the number of trials

print(f"\nBest RF score: {rf_study.best_value}")
print(f"Best RF params: {rf_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in rf_study.trials]
print(f"\nAverage execution time RF: {np.mean(execution_times):.2f}s")
print(f"Total optimization time RF: {sum(execution_times):.2f}s")

thresholds = rf_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = rf_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best RF trial: {thresholds}")
print(f"Mean threshold for best RF trial: {mean_threshold}")

[I 2025-07-01 13:11:32,478] Using an existing study with name 'random_forest_optimization' instead of creating a new one.
[I 2025-07-01 13:11:41,235] Trial 1 finished with value: 0.5118064172384977 and parameters: {'n_estimators': 146, 'max_depth': 5, 'min_samples_split': 18, 'min_samples_leaf': 9, 'min_weight_fraction_leaf': 0.08179119588825795, 'max_features': 'sqrt', 'max_samples': 0.813703384580415, 'max_leaf_nodes': 884, 'class_weight': 'balanced_subsample'}. Best is trial 1 with value: 0.5118064172384977.
[I 2025-07-01 13:11:46,762] Trial 2 finished with value: 0.50935451002572 and parameters: {'n_estimators': 249, 'max_depth': 8, 'min_samples_split': 3, 'min_samples_leaf': 10, 'min_weight_fraction_leaf': 0.08173748624195076, 'max_features': 'log2', 'max_samples': 0.2786155534293667, 'max_leaf_nodes': 518, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.5118064172384977.
[I 2025-07-01 13:12:01,172] Trial 3 finished with value: 0.07183990700941165 and parameters: {'n_es


Best RF score: 0.5237055765942193
Best RF params: {'n_estimators': 762, 'max_depth': 16, 'min_samples_split': 16, 'min_samples_leaf': 5, 'min_weight_fraction_leaf': 0.03592884702326953, 'max_features': 'sqrt', 'max_samples': 0.9648564911153312, 'max_leaf_nodes': 978, 'class_weight': 'balanced'}

Average execution time RF: 25.24s
Total optimization time RF: 2296.39s

Optimal thresholds (per fold) for best RF trial: [0.16935195130683853, 0.17213463996591394, 0.17247222745245716, 0.16337769619091327, 0.1719327041444399]
Mean threshold for best RF trial: 0.16985384381211258


In [17]:
# HistGradientBoosting study
histgb_study = optuna.create_study(
    study_name="histgb_optimization",
    direction="maximize",
    storage=db_dir.format('histgb_study'),
    load_if_exists=True
)
histgb_study.optimize(lambda trial: histgb_objective(trial, X, y, skf), n_trials=n_trials)

print(f"\nBest HistGB score: {histgb_study.best_value}")
print(f"Best HistGB params: {histgb_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in histgb_study.trials]
print(f"\nAverage execution time HistGB: {np.mean(execution_times):.2f}s")
print(f"Total optimization time HistGB: {sum(execution_times):.2f}s")

thresholds = histgb_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = histgb_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best HistGB trial: {thresholds}")
print(f"Mean threshold for best HistGB trial: {mean_threshold}")

[I 2025-07-01 13:49:53,062] A new study created in RDB with name: histgb_optimization
[I 2025-07-01 13:49:56,143] Trial 0 finished with value: 0.1680047130135688 and parameters: {'max_iter': 156, 'learning_rate': 0.09445854228022277, 'max_depth': 4, 'min_samples_leaf': 16, 'l2_regularization': 0.012865784213263454, 'max_bins': 65}. Best is trial 0 with value: 0.1680047130135688.
[I 2025-07-01 13:50:14,541] Trial 1 finished with value: 0.1664460511728095 and parameters: {'max_iter': 373, 'learning_rate': 0.01650717705130269, 'max_depth': 8, 'min_samples_leaf': 57, 'l2_regularization': 0.3308499813627913, 'max_bins': 88}. Best is trial 0 with value: 0.1680047130135688.
[I 2025-07-01 13:50:16,253] Trial 2 finished with value: 0.16525156603433785 and parameters: {'max_iter': 403, 'learning_rate': 0.25606127195501566, 'max_depth': 4, 'min_samples_leaf': 85, 'l2_regularization': 0.7001693874600848, 'max_bins': 92}. Best is trial 0 with value: 0.1680047130135688.
[I 2025-07-01 13:50:21,725] T


Best HistGB score: 0.1710877957692138
Best HistGB params: {'max_iter': 122, 'learning_rate': 0.011946188388124244, 'max_depth': 3, 'min_samples_leaf': 88, 'l2_regularization': 0.4294702816204709, 'max_bins': 231}

Average execution time HistGB: 6.02s
Total optimization time HistGB: 1806.09s

Optimal thresholds (per fold) for best HistGB trial: [0.0778369675525772, 0.08058412057141506, 0.07552058122068679, 0.08002428391277032, 0.0807247614465993]
Mean threshold for best HistGB trial: 0.07893814294080972


In [18]:
# LightGBM study
lgb_study = optuna.create_study(
    study_name="lightgbm_optimization",
    direction="maximize",
    storage=db_dir.format('lgb_study'),
    load_if_exists=True
)
lgb_study.optimize(lambda trial: lightgbm_objective(trial, X, y, skf), n_trials=n_trials)

print(f"\nBest LightGBM score: {lgb_study.best_value}")
print(f"Best LightGBM params: {lgb_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in lgb_study.trials]
print(f"\nAverage execution time LightGBM: {np.mean(execution_times):.2f}s")
print(f"Total optimization time LightGBM: {sum(execution_times):.2f}s")

thresholds = lgb_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = lgb_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best LightGBM trial: {thresholds}")
print(f"Mean threshold for best LightGBM trial: {mean_threshold}")

[I 2025-07-01 14:20:09,240] A new study created in RDB with name: lightgbm_optimization
[I 2025-07-01 14:20:11,899] Trial 0 finished with value: 0.13809650971835175 and parameters: {'num_leaves': 61, 'learning_rate': 0.2226968180540562, 'feature_fraction': 0.5668761148520542, 'bagging_fraction': 0.6381232030019665, 'bagging_freq': 7, 'min_child_samples': 65, 'reg_alpha': 0.3877490888786743, 'reg_lambda': 0.015518907516295699, 'scale_pos_weight': 12.217698235807323}. Best is trial 0 with value: 0.13809650971835175.
[I 2025-07-01 14:20:19,974] Trial 1 finished with value: 0.14782945544777817 and parameters: {'num_leaves': 276, 'learning_rate': 0.053793466485572934, 'feature_fraction': 0.8218528862870018, 'bagging_fraction': 0.8266402771649976, 'bagging_freq': 6, 'min_child_samples': 6, 'reg_alpha': 0.7751259624399588, 'reg_lambda': 0.9318850267990917, 'scale_pos_weight': 12.045023620557613}. Best is trial 1 with value: 0.14782945544777817.
[I 2025-07-01 14:20:26,867] Trial 2 finished wit



[I 2025-07-01 14:25:22,008] Trial 100 finished with value: 0.1494687299832107 and parameters: {'num_leaves': 235, 'learning_rate': 0.14020848016186835, 'feature_fraction': 0.9889593245161299, 'bagging_fraction': 0.647340254195768, 'bagging_freq': 3, 'min_child_samples': 92, 'reg_alpha': 0.365198038967475, 'reg_lambda': 0.8414391660090867, 'scale_pos_weight': 19.740872964956395}. Best is trial 74 with value: 0.16419194543061777.
[I 2025-07-01 14:25:23,829] Trial 101 finished with value: 0.16383871679411927 and parameters: {'num_leaves': 32, 'learning_rate': 0.07353654896764829, 'feature_fraction': 0.9149052535866148, 'bagging_fraction': 0.9208568273576023, 'bagging_freq': 4, 'min_child_samples': 82, 'reg_alpha': 0.2859032350515154, 'reg_lambda': 0.5479889874459456, 'scale_pos_weight': 18.372059844104033}. Best is trial 74 with value: 0.16419194543061777.
[I 2025-07-01 14:25:25,632] Trial 102 finished with value: 0.16406466499911615 and parameters: {'num_leaves': 32, 'learning_rate': 0.0


Best LightGBM score: 0.16471195540014044
Best LightGBM params: {'num_leaves': 26, 'learning_rate': 0.08528860622379723, 'feature_fraction': 0.8956981926222365, 'bagging_fraction': 0.9559230623715453, 'bagging_freq': 6, 'min_child_samples': 37, 'reg_alpha': 0.14610629452598145, 'reg_lambda': 0.4221755419320733, 'scale_pos_weight': 9.487821937576706}

Average execution time LightGBM: 2.49s
Total optimization time LightGBM: 747.41s

Optimal thresholds (per fold) for best LightGBM trial: [0.11438099653272862, 0.12032968486245649, 0.1152799328179485, 0.11421011807726807, 0.11503159906818734]
Mean threshold for best LightGBM trial: 0.1158464662717178


In [19]:
# CatBoost study
cat_study = optuna.create_study(
    study_name="catboost_optimization",
    direction="maximize",
    storage=db_dir.format('cat_study'),
    load_if_exists=True
)
cat_study.optimize(lambda trial: catboost_objective(trial, X, y, skf), n_trials=n_trials)

print(f"\nBest CatBoost score: {cat_study.best_value}")
print(f"Best CatBoost params: {cat_study.best_params}")

execution_times = [t.user_attrs.get('execution_time', 0) for t in cat_study.trials]
print(f"\nAverage execution time CatBoost: {np.mean(execution_times):.2f}s")
print(f"Total optimization time CatBoost: {sum(execution_times):.2f}s")

thresholds = cat_study.best_trial.user_attrs.get('optimal_thresholds', [])
mean_threshold = cat_study.best_trial.user_attrs.get('threshold', None)
print(f"\nOptimal thresholds (per fold) for best CatBoost trial: {thresholds}")
print(f"Mean threshold for best CatBoost trial: {mean_threshold}")

[I 2025-07-01 14:32:48,698] A new study created in RDB with name: catboost_optimization
[I 2025-07-01 14:32:54,037] Trial 0 finished with value: 0.15591391064601506 and parameters: {'iterations': 129, 'learning_rate': 0.024700252365271166, 'depth': 8, 'l2_leaf_reg': 7.481835857359528, 'bagging_temperature': 0.03947132271883147, 'random_strength': 7.708461206380756, 'scale_pos_weight': 18.258966634411536}. Best is trial 0 with value: 0.15591391064601506.
[I 2025-07-01 14:32:58,583] Trial 1 finished with value: 0.15863492769054038 and parameters: {'iterations': 709, 'learning_rate': 0.04231009385589839, 'depth': 8, 'l2_leaf_reg': 8.168210309936246, 'bagging_temperature': 0.9282554591216171, 'random_strength': 8.424558186477004, 'scale_pos_weight': 15.51055168642786}. Best is trial 1 with value: 0.15863492769054038.
[I 2025-07-01 14:33:05,658] Trial 2 finished with value: 0.153938126756624 and parameters: {'iterations': 204, 'learning_rate': 0.11040977292121909, 'depth': 10, 'l2_leaf_reg'


Best CatBoost score: 0.17278392562461428
Best CatBoost params: {'iterations': 465, 'learning_rate': 0.08736447231630849, 'depth': 4, 'l2_leaf_reg': 7.7262873316856835, 'bagging_temperature': 0.5819985986909341, 'random_strength': 1.6075799412158576, 'scale_pos_weight': 12.162201666957955}

Average execution time CatBoost: 2.77s
Total optimization time CatBoost: 831.84s

Optimal thresholds (per fold) for best CatBoost trial: [0.4919791165647181, 0.5222307523631675, 0.5070795606125506, 0.536935228445065, 0.5321055144588883]
Mean threshold for best CatBoost trial: 0.5180660344888779


# Conclusion

In [None]:
xgb_study = optuna.load_study(
    study_name="xgboost_optimization",
    storage=db_dir.format('xgb_study')
)
rf_study = optuna.load_study(
    study_name="random_forest_optimization",
    storage=db_dir.format('rf_study')
)
histgb_study = optuna.load_study(
    study_name="histgb_optimization",
    storage=db_dir.format('histgb_study')
)
lgb_study = optuna.load_study(
    study_name="lightgbm_optimization",
    storage=db_dir.format('lgb_study')
)
cat_study = optuna.load_study(
    study_name="catboost_optimization",
    storage=db_dir.format('cat_study')
)

In [20]:
print(f"Best XGBoost score: {xgb_study.best_value:.3f}, Avg time: {np.mean([t.user_attrs.get('execution_time', 0) for t in xgb_study.trials]):.2f}s")
print(f"Best Random Forest score: {rf_study.best_value:.3f}, Avg time: {np.mean([t.user_attrs.get('execution_time', 0) for t in rf_study.trials]):.2f}s")
print(f"Best HistGB score: {histgb_study.best_value:.3f}, Avg time: {np.mean([t.user_attrs.get('execution_time', 0) for t in histgb_study.trials]):.2f}s")
print(f"Best LightGBM score: {lgb_study.best_value:.3f}, Avg time: {np.mean([t.user_attrs.get('execution_time', 0) for t in lgb_study.trials]):.2f}s")
print(f"Best CatBoost score: {cat_study.best_value:.3f}, Avg time: {np.mean([t.user_attrs.get('execution_time', 0) for t in cat_study.trials]):.2f}s")

Best XGBoost score: 0.171, Avg time: 0.53s
Best Random Forest score: 0.524, Avg time: 25.24s
Best HistGB score: 0.171, Avg time: 6.02s
Best LightGBM score: 0.165, Avg time: 2.49s
Best CatBoost score: 0.173, Avg time: 2.77s
