# Imports and definitions

# Table of Contents

1. [Imports and definitions](#imports-and-definitions)
2. [Load data](#load-data)
3. [Prepare data](#prepare-data)
4. [Define objectives](#define-objectives)
5. [Start tuning](#start-tuning)
6. [Conclusion](#conclusion)

---

In [69]:
from pathlib import Path
import time

import polars as pl
import numpy as np

import numpy as np
import xgboost as xgb
from sklearn.metrics import f1_score

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.ensemble import StackingClassifier

import optuna

# Notebook-wide polars display settings. Each setter's return value is bound
# to `_` so the cell ends without displaying anything.
# NOTE(review): `None` is passed as the column count — confirm whether the
# installed polars version treats that as "show all columns" or "use the default".
_ = pl.Config.set_tbl_cols(None)
# Presumably raises the string-cell display length to 500 characters — verify.
_ = pl.Config.set_fmt_str_lengths(500)
# Presumably prints floats in full rather than in a shortened form — verify.
_ = pl.Config.set_fmt_float("full")

In [3]:
import warnings
# Install an "ignore" filter for RuntimeWarning messages whose originating
# module name matches 'sklearn'.
warnings.filterwarnings(
    action='ignore',
    category=RuntimeWarning,
    module='sklearn',
)

In [None]:
# Directory layout used by the notebook; everything hangs off `base_dir`.
base_dir = Path('/workspaces/data-scientist-at-magenta')
code_dir = base_dir.joinpath('notebooks')
data_dir = code_dir.joinpath("data")
features_dir, train_dir = (data_dir.joinpath(sub) for sub in ('features', 'train'))
# Not a directory: a storage-URL template, filled in with `.format(<db name>)`
# when the Optuna studies are loaded/created.
db_dir = 'sqlite:///data/models/{}.db'


# Load data

In [48]:
%%time

# Two parquet files from the training directory: `train_meta` feeds the
# meta-learner tuning further down, `train` is used to fit the base models.
train_meta = pl.read_parquet(train_dir.joinpath('data-meta-v0-50.parquet'))
train = pl.read_parquet(train_dir.joinpath('data-v0-80.parquet'))

CPU times: user 19.4 ms, sys: 7.76 ms, total: 27.1 ms
Wall time: 28.7 ms


# Prepare data

In [49]:
# Meta-learner data: features are every column except `rating_account_id`,
# `customer_id` and the target; the target is kept as a one-column frame.
X_meta = train_meta.select(
    pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling'])
)
y_meta = train_meta.select(pl.col('has_done_upselling'))

In [50]:
# Base-model data: same column split as for the meta-learner frame.
X = train.select(
    pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling'])
)
y = train.select(pl.col('has_done_upselling'))

In [51]:
# Five shuffled, stratified folds with a fixed seed; the same splitter object
# is handed to both meta-learner objectives.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Define objectives

In [63]:
def stacking_objective(trial, X, y, skf, base_models):
    """Optuna objective: mean cross-validated F1 of a stack with a Ridge meta-learner.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the ``RidgeClassifier`` hyperparameters
        ``alpha``, ``solver`` and ``class_weight``.
    X, y : frame-like
        Features and target. Rows are picked with ``X[idx]`` / ``y[idx]`` and
        converted with ``.to_numpy()`` (the target is flattened with ``.ravel()``).
    skf : cross-validator
        Any object whose ``split(X, y)`` yields ``(train_idx, valid_idx)`` pairs.
    base_models : list of (name, estimator)
        Base estimators that are ALREADY fitted. Because the stack is built
        with ``cv='prefit'`` they are used as they are and never refitted
        here; only the meta-learner is trained on each fold.
        NOTE(review): this is only sound if the base models were fitted on
        data disjoint from ``X`` — confirm against the caller.

    Returns
    -------
    float
        Mean F1 score over the folds, computed on hard class predictions.
        No decision-threshold search is performed.
    """
    # Ridge hyperparameters explored by the study.
    ridge_params = {
        'alpha': trial.suggest_float('alpha', 1e-4, 100.0, log=True),
        'solver': trial.suggest_categorical('solver', ['auto', 'svd', 'cholesky', 'lsqr']),
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced'])
    }

    # The meta-learner sees the base models' predict_proba outputs.
    stacking_clf = StackingClassifier(
        estimators=base_models,
        final_estimator=RidgeClassifier(**ridge_params),
        cv='prefit',  # base models must already be fitted; they are not refitted
        stack_method='predict_proba'
    )

    cv_scores = []

    for train_idx, valid_idx in skf.split(X, y):
        train_x, valid_x = X[train_idx], X[valid_idx]
        train_y, valid_y = y[train_idx], y[valid_idx]

        # Fit the stack on the training part of the fold.
        stacking_clf.fit(train_x.to_numpy(), train_y.to_numpy().ravel())

        # Hard class labels (not probabilities) for the held-out part.
        preds = stacking_clf.predict(valid_x.to_numpy())

        cv_scores.append(f1_score(valid_y.to_numpy().ravel(), preds))

    return np.mean(cv_scores)

In [79]:
def stacking_logistic_objective(trial, X, y, skf, base_models):
    """Optuna objective: mean cross-validated F1 of a stack with a logistic meta-learner.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the ``LogisticRegression`` hyperparameters ``C``,
        ``class_weight`` and ``max_iter``; ``penalty`` is fixed to ``'l2'``
        and the solver is left at the estimator's default.
    X, y : frame-like
        Features and target. Rows are picked with ``X[idx]`` / ``y[idx]`` and
        converted with ``.to_numpy()`` (the target is flattened with ``.ravel()``).
    skf : cross-validator
        Any object whose ``split(X, y)`` yields ``(train_idx, valid_idx)`` pairs.
    base_models : list of (name, estimator)
        Base estimators that are ALREADY fitted. Because the stack is built
        with ``cv='prefit'`` they are used as they are and never refitted
        here; only the meta-learner is trained on each fold.
        NOTE(review): this is only sound if the base models were fitted on
        data disjoint from ``X`` — confirm against the caller.

    Returns
    -------
    float
        Mean F1 score over the folds, computed on hard class predictions.
        No decision-threshold search is performed, so nothing about
        thresholds is recorded on the trial.
    """
    # Logistic-regression hyperparameters explored by the study.
    logistic_params = {
        'C': trial.suggest_float('C', 1e-4, 100.0, log=True),
        'penalty': 'l2',  # fixed, not tuned
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced']),
        'max_iter': trial.suggest_int('max_iter', 100, 1000)
    }

    # The meta-learner sees the base models' predict_proba outputs.
    stacking_clf = StackingClassifier(
        estimators=base_models,
        final_estimator=LogisticRegression(**logistic_params),
        cv='prefit',  # base models must already be fitted; they are not refitted
        stack_method='predict_proba'
    )

    cv_scores = []

    for train_idx, valid_idx in skf.split(X, y):
        train_x, valid_x = X[train_idx], X[valid_idx]
        train_y, valid_y = y[train_idx], y[valid_idx]

        # Fit the stack on the training part of the fold.
        stacking_clf.fit(train_x.to_numpy(), train_y.to_numpy().ravel())

        # Hard class labels for the held-out part.
        preds = stacking_clf.predict(valid_x.to_numpy())

        cv_scores.append(f1_score(valid_y.to_numpy().ravel(), preds))

    return np.mean(cv_scores)

# Start tuning

In [84]:
# Setup
# Number of trials each meta-learner study runs per execution of its cell.
n_trials = 100

# Re-open the stored tuning study of each base model. `db_dir` is a storage
# URL template; each study lives in its own database file.
xgb_study = optuna.load_study(
    study_name="xgboost_optimization",
    storage=db_dir.format('xgb_study')
)
rf_study = optuna.load_study(
    study_name="random_forest_optimization_basef1",
    storage=db_dir.format('rf_study')
)
histgb_study = optuna.load_study(
    study_name="histgb_optimization",
    storage=db_dir.format('histgb_study')
)
cat_study = optuna.load_study(
    study_name="catboost_optimization",
    storage=db_dir.format('cat_study')
)

# Best hyperparameters found by each study.
xgb_best_params = xgb_study.best_params
rf_best_params = rf_study.best_params
histgb_best_params = histgb_study.best_params
cat_best_params = cat_study.best_params

# Instantiate every base model with its best parameters.
xgb_model = xgb.XGBClassifier(**xgb_best_params)
rf_model = RandomForestClassifier(**rf_best_params)
histgb_model = HistGradientBoostingClassifier(**histgb_best_params)
cat_model = CatBoostClassifier(**cat_best_params, verbose=0)

# Convert the base-model training data once instead of once per fit.
X_train_np = X.to_numpy()
y_train_np = y.to_numpy().ravel()

# Fit the base models here: the stacking objectives use cv='prefit' and
# therefore expect already-fitted estimators.
xgb_model.fit(X_train_np, y_train_np)
rf_model.fit(X_train_np, y_train_np)
histgb_model.fit(X_train_np, y_train_np)
cat_model.fit(X_train_np, y_train_np)

# (name, fitted estimator) pairs handed to StackingClassifier.
base_models = [
    ("xgb", xgb_model),
    ("rf", rf_model),
    ("histgb", histgb_model),
    ("cat", cat_model)
]

In [87]:
def _ridge_meta_objective(trial):
    """Bind the meta-learner data, splitter and fitted base models to the Ridge objective."""
    return stacking_objective(trial, X_meta, y_meta, skf, base_models)


# `load_if_exists=True` resumes the stored study, so every run of this cell
# appends another `n_trials` trials to it.
meta_study = optuna.create_study(
    storage=db_dir.format('meta_learners_study'),
    study_name="meta_ridge_base_optimization",
    load_if_exists=True,
    direction="maximize",
)
meta_study.optimize(_ridge_meta_objective, n_trials=n_trials)

print(f"\nBest Stacking score: {meta_study.best_value}")
print(f"Best Stacking params: {meta_study.best_params}")

[I 2025-07-02 01:18:46,367] Using an existing study with name 'meta_ridge_base_optimization' instead of creating a new one.
[I 2025-07-02 01:18:48,574] Trial 105 finished with value: 0.1581749565659604 and parameters: {'alpha': 0.012436062963279641, 'solver': 'svd', 'class_weight': 'balanced'}. Best is trial 22 with value: 0.1581749565659604.
[I 2025-07-02 01:18:50,450] Trial 106 finished with value: 0.1581749565659604 and parameters: {'alpha': 0.01166932918496607, 'solver': 'svd', 'class_weight': 'balanced'}. Best is trial 22 with value: 0.1581749565659604.
[I 2025-07-02 01:18:52,447] Trial 107 finished with value: 0.15798777173061912 and parameters: {'alpha': 0.0240754273765649, 'solver': 'svd', 'class_weight': 'balanced'}. Best is trial 22 with value: 0.1581749565659604.
[I 2025-07-02 01:18:54,859] Trial 108 finished with value: 0.1571586081725586 and parameters: {'alpha': 0.007240289584056439, 'solver': 'svd', 'class_weight': 'balanced'}. Best is trial 22 with value: 0.158174956565


Best Stacking score: 0.1581749565659604
Best Stacking params: {'alpha': 0.011675306544253592, 'solver': 'svd', 'class_weight': 'balanced'}


In [None]:
def _logistic_meta_objective(trial):
    """Bind the meta-learner data, splitter and fitted base models to the logistic objective."""
    return stacking_logistic_objective(trial, X_meta, y_meta, skf, base_models)


# Stored in the same database as the Ridge meta-study, under its own name.
# The double underscore in the study name is kept exactly as is so that
# `load_if_exists=True` keeps finding the already stored study.
meta_study = optuna.create_study(
    storage=db_dir.format('meta_learners_study'),
    study_name="meta__logistic_base_optimization",
    load_if_exists=True,
    direction="maximize",
)
meta_study.optimize(_logistic_meta_objective, n_trials=n_trials)

print(f"\nBest Stacking score: {meta_study.best_value}")
print(f"Best Stacking params: {meta_study.best_params}")

[I 2025-07-02 01:12:09,665] A new study created in RDB with name: meta__logistic_base_optimization
[I 2025-07-02 01:12:11,455] Trial 0 finished with value: 0.0 and parameters: {'C': 0.001215388901974338, 'class_weight': None, 'max_iter': 954}. Best is trial 0 with value: 0.0.
[I 2025-07-02 01:12:13,250] Trial 1 finished with value: 0.0 and parameters: {'C': 2.166109755504578, 'class_weight': None, 'max_iter': 414}. Best is trial 0 with value: 0.0.
[I 2025-07-02 01:12:15,221] Trial 2 finished with value: 0.154624044163239 and parameters: {'C': 0.42004714962249096, 'class_weight': 'balanced', 'max_iter': 306}. Best is trial 2 with value: 0.154624044163239.
[I 2025-07-02 01:12:17,015] Trial 3 finished with value: 0.1560070220022802 and parameters: {'C': 20.034698103906024, 'class_weight': 'balanced', 'max_iter': 641}. Best is trial 3 with value: 0.1560070220022802.
[I 2025-07-02 01:12:18,788] Trial 4 finished with value: 0.15462676533703168 and parameters: {'C': 0.42821333727256844, 'clas


Best Stacking score: 0.15719259140971192
Best Stacking params: {'C': 0.13754162540171044, 'class_weight': 'balanced', 'max_iter': 458}


: 