# Table of Contents

1. [Imports and definitions](#imports-and-definitions)
2. [Load data and prepare data](#load-data-and-prepare-data)
3. [Single Meta Learner Evaluation](#single-meta-learner-evaluation)
4. [Segmented Meta Learners Evaluation](#segmented-meta-learners-evaluation)
5. [Final Meta Learner (Meta of Metas)](#final-meta-learner-meta-of-metas)
6. [Comprehensive Evaluation and Comparison](#comprehensive-evaluation-and-comparison)

---

# Imports and definitions

In [1]:
from pathlib import Path
import time
import pickle

import polars as pl
import numpy as np

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, precision_recall_curve, roc_curve, auc,
    confusion_matrix, classification_report, matthews_corrcoef
)

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

import optuna

_ = pl.Config.set_tbl_cols(None)
_ = pl.Config.set_fmt_str_lengths(500)
_ = pl.Config.set_fmt_float("full")

In [2]:
import warnings
warnings.filterwarnings('ignore', category=RuntimeWarning, module='sklearn')
warnings.filterwarnings('ignore', module='lightgbm')
warnings.filterwarnings('ignore', message='X does not have valid feature names, but LGBMClassifier was fitted with feature names')
warnings.filterwarnings('ignore', message='bagging_freq is set=6, subsample_freq=0 will be ignored')
warnings.filterwarnings('ignore', message='bagging_freq is set=7, subsample_freq=0 will be ignored')
warnings.filterwarnings('ignore', message='feature_fraction is set=*')
warnings.filterwarnings('ignore', message='bagging_fraction is set=*')

In [None]:
base_dir = Path('/workspaces/data-scientist-at-magenta')
code_dir = base_dir / 'notebooks'
data_dir = code_dir / "data"
features_dir = data_dir / 'features'
train_dir = data_dir / 'train'
test_dir = data_dir / 'test'
db_dir = 'sqlite:///data/models/{}.db'
artifacts_dir = data_dir / "models/artifacts"

In [4]:
# Helper function to load model artifacts
def load_model_artifact(path):
    with open(path, "rb") as f:
        return pickle.load(f)

In [5]:
def evaluate_classification_model(y_true, y_pred, y_proba=None,
                                  model_name="Model", pos_label=1,
                                  plot_results=True, print_result=True):
    """Comprehensive evaluation function for classification models."""
    y_true_np = y_true.to_numpy()
    y_pred_np = y_pred.to_numpy() 

    results = {'model_name': model_name}

    results['accuracy'] = accuracy_score(y_true_np, y_pred_np)
    results['precision'] = precision_score(y_true_np, y_pred_np, pos_label=pos_label, average='binary')
    results['recall'] = recall_score(y_true_np, y_pred_np, pos_label=pos_label, average='binary')
    results['f1_score'] = f1_score(y_true_np, y_pred_np, pos_label=pos_label, average='binary')
    results['matthews_corr'] = matthews_corrcoef(y_true_np, y_pred_np)

    cm = confusion_matrix(y_true_np, y_pred_np)
    results['confusion_matrix'] = cm
    results['tn'], results['fp'], results['fn'], results['tp'] = cm.ravel()

    if y_proba is not None:
        y_proba_np = y_proba.to_numpy()
        results['roc_auc'] = roc_auc_score(y_true_np, y_proba_np)
        results['pr_auc'] = auc(*precision_recall_curve(y_true_np, y_proba_np)[:2][::-1])

    if plot_results:
        plot_evaluation_results(y_true_np, y_pred_np, y_proba, model_name, results)

    if print_result:
        print_evaluation_summary(results)

    return results


def print_evaluation_summary(results):
    """Print a formatted summary of evaluation results."""
    print(f"\n{'='*60}")
    print(f"EVALUATION SUMMARY: {results['model_name']}")
    print(f"{'='*60}")

    print(f"\nCORE METRICS:")
    print(f"  Accuracy:      {results['accuracy']:.4f}")
    print(f"  Precision:     {results['precision']:.4f}")
    print(f"  Recall:        {results['recall']:.4f}")
    print(f"  F1 Score:      {results['f1_score']:.4f}")
    print(f"  Matthews CC:   {results['matthews_corr']:.4f}")

    if 'roc_auc' in results:
        print(f"\nPROBABILITY-BASED METRICS:")
        print(f"  ROC AUC:       {results['roc_auc']:.4f}")
        print(f"  PR AUC:        {results['pr_auc']:.4f}")

    print(f"\nCONFUSION MATRIX:")
    print(f"  TN: {results['tn']:>6} | FP: {results['fp']:>6}")
    print(f"  FN: {results['fn']:>6} | TP: {results['tp']:>6}")


def compare_models(models_results):
    """Compare multiple models and return a comparison DataFrame."""
    comparison_data = []

    for result in models_results:
        row = {
            'Model': result['model_name'],
            'Accuracy': result['accuracy'],
            'Precision': result['precision'],
            'Recall': result['recall'],
            'F1': result['f1_score'],
            'Matthews_CC': result['matthews_corr']
        }

        if 'roc_auc' in result:
            row.update({
                'ROC_AUC': result['roc_auc'],
                'PR_AUC': result['pr_auc']
            })

        comparison_data.append(row)

    comparison_df = pl.DataFrame(comparison_data)
    float_cols = [col for col, dtype in comparison_df.schema.items() if dtype == pl.Float64]
    comparison_df = comparison_df.with_columns([
        pl.col(col).round(4) for col in float_cols
    ])
    return comparison_df


def plot_evaluation_results(y_true, y_pred, y_proba, model_name, results):
    """Generate comprehensive evaluation plots using Plotly."""
    cm = results['confusion_matrix']
    fig_cm = go.Figure(data=go.Heatmap(
        z=cm,
        x=['Predicted 0', 'Predicted 1'],
        y=['Actual 0', 'Actual 1'],
        colorscale='Blues',
        text=cm,
        texttemplate="%{text}",
        textfont={"size": 20}
    ))
    fig_cm.update_layout(
        title=f'Confusion Matrix: {model_name}',
        xaxis_title='Predicted',
        yaxis_title='Actual',
        height=400, width=500
    )
    fig_cm.show()


def create_bins_split_features(X, y, split_configs, base_models, seed=42):
    """Create bins split features based on different column thresholds."""
    predictions_dict = {}
    
    for config in split_configs:
        name = config['name']
        column = config['column']
        threshold = config['threshold']
        
        model_b1, model_b2 = base_models[name]
        predictions = np.zeros(X.height)
        
        column_values = X.select(column).to_numpy().ravel()
        b1_mask = column_values < threshold
        b2_mask = ~b1_mask
        
        X_for_prediction = X.drop(column).with_row_index("idx")
        
        if np.any(b1_mask):
            b1_indices = np.where(b1_mask)[0]
            b1_X = X_for_prediction.filter(pl.col("idx").is_in(b1_indices)).drop("idx")
            b1_predictions = model_b1.predict(b1_X.to_numpy())
            predictions[b1_mask] = b1_predictions
        
        if np.any(b2_mask):
            b2_indices = np.where(b2_mask)[0]
            b2_X = X_for_prediction.filter(pl.col("idx").is_in(b2_indices)).drop("idx")
            b2_predictions = model_b2.predict(b2_X.to_numpy())
            predictions[b2_mask] = b2_predictions
        
        predictions_dict[name] = predictions
    
    predictions_dict['label'] = y.to_numpy().ravel()
    final_df = pl.DataFrame(predictions_dict)
    return final_df.sample(fraction=1.0, with_replacement=False, seed=seed)


def ridge_from_study_path(study_path, study_name):
    """Helper to extract best params and create RidgeClassifier"""
    study = optuna.load_study(study_name=study_name, storage=study_path)
    params = study.best_params.copy()
    valid_keys = {'alpha', 'solver', 'class_weight'}
    params = {k: v for k, v in params.items() if k in valid_keys}
    return RidgeClassifier(**params)

# Load data and prepare data

In [6]:
test = pl.read_parquet(test_dir / 'data-meta-v0-50.parquet')
X_test = test.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
y_test = test.select('has_done_upselling')

X_meta, X_final_meta, y_meta, y_final_meta = train_test_split(X_test, y_test, test_size=0.5, random_state=42)
train__final_meta = pl.concat([X_meta, y_meta], how='horizontal')
final_test = pl.concat([X_final_meta, y_final_meta], how='horizontal')

In [7]:
train_base = pl.read_parquet(train_dir / 'data-v0-80.parquet')
train_meta = pl.read_parquet(train_dir / 'data-meta-v0-50.parquet')


train = pl.concat([train_base, train_meta, train__final_meta])

In [8]:
X_train = train.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
y_train = train.select('has_done_upselling')

X_test = final_test.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
y_test = final_test.select('has_done_upselling')

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

Training set shape: (95000, 49)
Test set shape: (5000, 49)


# 1. Single Meta Learner Evaluation

In [9]:
# Load base model studies and create single meta learner
print("Loading base model studies...")
xgb_study = optuna.load_study(study_name="xgboost_optimization", storage=db_dir.format('xgb_study'))
rf_study = optuna.load_study(study_name="random_forest_optimization_basef1", storage=db_dir.format('rf_study'))
histgb_study = optuna.load_study(study_name="histgb_optimization", storage=db_dir.format('histgb_study'))
cat_study = optuna.load_study(study_name="catboost_optimization", storage=db_dir.format('cat_study'))
meta_study = optuna.load_study(study_name="meta_ridge_base_optimization", storage=db_dir.format('meta_learners_study'))

# Create and train base models
print("Training base models...")
X_np, y_np = X_train.to_numpy(), y_train.to_numpy().ravel()

xgb_model = xgb.XGBClassifier(**xgb_study.best_params).fit(X_np, y_np)
rf_model = RandomForestClassifier(**rf_study.best_params).fit(X_np, y_np)
histgb_model = HistGradientBoostingClassifier(**histgb_study.best_params).fit(X_np, y_np)
cat_model = CatBoostClassifier(**cat_study.best_params, verbose=0).fit(X_np, y_np)

# Create single stacking classifier
single_stacking_clf = StackingClassifier(
    estimators=[("xgb", xgb_model), ("rf", rf_model), ("histgb", histgb_model), ("cat", cat_model)],
    final_estimator=RidgeClassifier(**meta_study.best_params),
    cv='prefit', stack_method='predict_proba'
).fit(X_np, y_np)

print("Single meta learner trained!")

Loading base model studies...
Training base models...
Single meta learner trained!


In [10]:
# Evaluate single meta learner and individual base models
y_pred_single = single_stacking_clf.predict(X_test.to_numpy())
y_true_pl = pl.Series("y_true", y_test.to_numpy().ravel())
y_pred_single_pl = pl.Series("y_pred", y_pred_single)

results_single = evaluate_classification_model(
    y_true=y_true_pl, y_pred=y_pred_single_pl, y_proba=None,
    model_name="Single Meta Learner (Stacking)", plot_results=False, print_result=True
)

# Initialize results list with single meta learner
results_list = [results_single]

print("\n" + "="*60)
print("EVALUATING INDIVIDUAL BASE MODELS WITH BEST THRESHOLDS")
print("="*60)


best_thresholds = {
    'xgb': xgb_study.best_trial.user_attrs.get('threshold', 0.5),
    'cat': cat_study.best_trial.user_attrs.get('threshold', 0.5),
    'hgb': histgb_study.best_trial.user_attrs.get('threshold', 0.5)
}

# Evaluate XGBoost with best threshold
y_proba_xgb = xgb_model.predict_proba(X_test.to_numpy())[:, 1]
y_pred_xgb = (y_proba_xgb > best_thresholds['xgb']).astype(int)
results_xgb = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=pl.Series("y_pred", y_pred_xgb),
    y_proba=pl.Series("y_proba", y_proba_xgb), 
    model_name="XGBoost (Best Threshold)",
    plot_results=False, print_result=True
)
results_list.append(results_xgb)

# Evaluate Random Forest with base threshold
y_proba_rf = rf_model.predict_proba(X_test.to_numpy())[:, 1]
y_pred_rf = (y_proba_rf > 0.5).astype(int)
results_rf = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=pl.Series("y_pred", y_pred_rf),
    y_proba=pl.Series("y_proba", y_proba_rf),
    model_name="Random Forest (Base Threshold)",
    plot_results=False, print_result=True
)
results_list.append(results_rf)

# Evaluate HistGradientBoosting with best threshold
y_proba_histgb = histgb_model.predict_proba(X_test.to_numpy())[:, 1]
y_pred_histgb = (y_proba_histgb > best_thresholds['hgb']).astype(int)
results_histgb = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=pl.Series("y_pred", y_pred_histgb),
    y_proba=pl.Series("y_proba", y_proba_histgb),
    model_name="HistGradientBoosting (Best Threshold)",
    plot_results=False, print_result=True
)
results_list.append(results_histgb)

# Evaluate CatBoost with best threshold
y_proba_cat = cat_model.predict_proba(X_test.to_numpy())[:, 1]
y_pred_cat = (y_proba_cat > best_thresholds['cat']).astype(int)
results_cat = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=pl.Series("y_pred", y_pred_cat),
    y_proba=pl.Series("y_proba", y_proba_cat),
    model_name="CatBoost (Best Threshold)",
    plot_results=False, print_result=True
)
results_list.append(results_cat)



EVALUATION SUMMARY: Single Meta Learner (Stacking)

CORE METRICS:
  Accuracy:      0.6486
  Precision:     0.0916
  Recall:        0.4398
  F1 Score:      0.1516
  Matthews CC:   0.0567

CONFUSION MATRIX:
  TN:   3086 | FP:   1557
  FN:    200 | TP:    157

EVALUATING INDIVIDUAL BASE MODELS WITH BEST THRESHOLDS

EVALUATION SUMMARY: XGBoost (Best Threshold)

CORE METRICS:
  Accuracy:      0.7362
  Precision:     0.1012
  Recall:        0.3417
  F1 Score:      0.1561
  Matthews CC:   0.0652

PROBABILITY-BASED METRICS:
  ROC AUC:       0.5948
  PR AUC:        0.0924

CONFUSION MATRIX:
  TN:   3559 | FP:   1084
  FN:    235 | TP:    122

EVALUATION SUMMARY: Random Forest (Base Threshold)

CORE METRICS:
  Accuracy:      0.6344
  Precision:     0.1009
  Recall:        0.5210
  F1 Score:      0.1691
  Matthews CC:   0.0876

PROBABILITY-BASED METRICS:
  ROC AUC:       0.6335
  PR AUC:        0.1138

CONFUSION MATRIX:
  TN:   2986 | FP:   1657
  FN:    171 | TP:    186

EVALUATION SUMMARY: His

# 2. Segmented Meta Learners Evaluation

In [11]:
# Load pre-trained segmented models
print("Loading segmented models...")

# Age models
base_models_age_b1 = [
    ('xg_age_b1', load_model_artifact(artifacts_dir / "pre_xgb_age_b1.pkl")),
    ('rf_clf_age_b1', load_model_artifact(artifacts_dir / "pre_rf_age_b1.pkl")),
    ('cat_clf_age_b1', load_model_artifact(artifacts_dir / "pre_cat_age_b1.pkl")),
    ('histgb_age_b1', load_model_artifact(artifacts_dir / "pre_histgb_age_b1.pkl"))
]
base_models_age_b2 = [
    ('xg_age_b2', load_model_artifact(artifacts_dir / "pre_xgb_age_b2.pkl")),
    ('rf_clf_age_b2', load_model_artifact(artifacts_dir / "pre_rf_age_b2.pkl")),
    ('histgb_age_b2', load_model_artifact(artifacts_dir / "pre_histgb_age_b2.pkl"))

]

# Days models  
base_models_days_b1 = [
    ('rf_days_b1', load_model_artifact(artifacts_dir / "pre_rf_days_b1.pkl")),
    ('lgb_days_b1', load_model_artifact(artifacts_dir / "pre_lgb_days_b1.pkl")),
    ('cat_days_b1', load_model_artifact(artifacts_dir / "pre_cat_days_b1.pkl"))
]
base_models_days_b2 = [
    ('rf_days_b2', load_model_artifact(artifacts_dir / "pre_rf_days_b2.pkl")),
    ('lgb_days_b2', load_model_artifact(artifacts_dir / "pre_lgb_days_b2.pkl")),
    ('cat_days_b2', load_model_artifact(artifacts_dir / "pre_cat_days_b2.pkl"))
]

# Data models
base_models_data_b1 = [
    ('xgb_data_b1', load_model_artifact(artifacts_dir / "pre_xgb_data_b1.pkl")),
    ('cat_data_b1', load_model_artifact(artifacts_dir / "pre_cat_data_b1.pkl"))
]
base_models_data_b2 = [
    ('xgb_data_b2', load_model_artifact(artifacts_dir / "pre_xgb_data_b2.pkl")),
    ('cat_data_b2', load_model_artifact(artifacts_dir / "pre_cat_data_b2.pkl"))
]

print(f"Loaded models - Age: B1({len(base_models_age_b1)}), B2({len(base_models_age_b2)})")
print(f"                Days: B1({len(base_models_days_b1)}), B2({len(base_models_days_b2)})")  
print(f"                Data: B1({len(base_models_data_b1)}), B2({len(base_models_data_b2)})")

Loading segmented models...
Loaded models - Age: B1(4), B2(3)
                Days: B1(3), B2(3)
                Data: B1(2), B2(2)


In [12]:
# Create and train segmented stacking classifiers
print("Creating segmented stackers...")

# Create stackers
stacking_age_b1 = StackingClassifier(
    estimators=base_models_age_b1,
    final_estimator=ridge_from_study_path(db_dir.format('meta_learners_study'), "meta_ridge_age_b1_optimization"),
    cv='prefit', stack_method='predict_proba'
)
stacking_age_b2 = StackingClassifier(
    estimators=base_models_age_b2,
    final_estimator=ridge_from_study_path(db_dir.format('meta_learners_study'), "meta_ridge_age_b2_optimization"),
    cv='prefit', stack_method='predict_proba'
)
stacking_days_b1 = StackingClassifier(
    estimators=base_models_days_b1,
    final_estimator=ridge_from_study_path(db_dir.format('meta_learners_study'), "meta_ridge_age_b1_optimization"),
    cv='prefit', stack_method='predict_proba'
)
stacking_days_b2 = StackingClassifier(
    estimators=base_models_days_b2,
    final_estimator=ridge_from_study_path(db_dir.format('meta_learners_study'), "meta_ridge_age_b2_optimization"),
    cv='prefit', stack_method='predict_proba'
)
stacking_data_b1 = StackingClassifier(
    estimators=base_models_data_b1,
    final_estimator=ridge_from_study_path(db_dir.format('meta_learners_study'), "meta_ridge_age_b1_optimization"),
    cv='prefit', stack_method='predict_proba'
)
stacking_data_b2 = StackingClassifier(
    estimators=base_models_data_b2,
    final_estimator=ridge_from_study_path(db_dir.format('meta_learners_study'), "meta_ridge_age_b2_optimization"),
    cv='prefit', stack_method='predict_proba'
)

print("Segmented stackers created!")

Creating segmented stackers...
Segmented stackers created!


In [13]:
# Use the train data prepared at the beginning instead of creating new split
train_meta = train

# Create segment training data using the full train set
print("Preparing segment training data...")
data_age_b1 = train_meta.filter(pl.col("age") < 55).drop('age')
data_age_b2 = train_meta.filter(pl.col("age") >= 55).drop('age')

data_days_b1 = train_meta.filter(pl.col("contract_lifetime_days") < 1000).drop('contract_lifetime_days')
data_days_b2 = train_meta.filter(pl.col("contract_lifetime_days") >= 1000).drop('contract_lifetime_days')

data_data_b1 = train_meta.filter(pl.col("available_gb") < 25).drop('available_gb')
data_data_b2 = train_meta.filter(pl.col("available_gb") >= 25).drop('available_gb')

# Extract features and labels
X_train_age_b1 = data_age_b1.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
y_train_age_b1 = data_age_b1.select('has_done_upselling')
X_train_age_b2 = data_age_b2.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
y_train_age_b2 = data_age_b2.select('has_done_upselling')

X_train_days_b1 = data_days_b1.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
y_train_days_b1 = data_days_b1.select('has_done_upselling')
X_train_days_b2 = data_days_b2.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
y_train_days_b2 = data_days_b2.select('has_done_upselling')

X_train_data_b1 = data_data_b1.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
y_train_data_b1 = data_data_b1.select('has_done_upselling')
X_train_data_b2 = data_data_b2.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
y_train_data_b2 = data_data_b2.select('has_done_upselling')

print("Segment training data prepared!")

Preparing segment training data...
Segment training data prepared!


In [14]:
# Train segmented stackers
print("Training segmented stackers...")
stacking_age_b1.fit(X_train_age_b1.to_numpy(), y_train_age_b1.to_numpy().ravel())
stacking_age_b2.fit(X_train_age_b2.to_numpy(), y_train_age_b2.to_numpy().ravel())
stacking_days_b1.fit(X_train_days_b1.to_numpy(), y_train_days_b1.to_numpy().ravel())
stacking_days_b2.fit(X_train_days_b2.to_numpy(), y_train_days_b2.to_numpy().ravel())
stacking_data_b1.fit(X_train_data_b1.to_numpy(), y_train_data_b1.to_numpy().ravel())
stacking_data_b2.fit(X_train_data_b2.to_numpy(), y_train_data_b2.to_numpy().ravel())
print("All segmented stackers trained!")

Training segmented stackers...
All segmented stackers trained!


# 3. Final Meta Learner (Meta of Metas)

In [15]:
# Create and train final meta learner
print("Creating final meta learner...")
final_meta_study = optuna.load_study(
    study_name="meta_ridge_final_decision_optimization",
    storage=db_dir.format('meta_learners_study')
)

split_configs = [
    {'name': 'age', 'column': 'age', 'threshold': 55},
    {'name': 'days', 'column': 'contract_lifetime_days', 'threshold': 1000}, 
    {'name': 'data', 'column': 'available_gb', 'threshold': 25}
]

base_models_dict = {
    'age': (stacking_age_b1, stacking_age_b2),
    'days': (stacking_days_b1, stacking_days_b2),
    'data': (stacking_data_b1, stacking_data_b2)
}

# Train final meta learner on the split
meta_train_X_y = create_bins_split_features(X_train, y_train, split_configs, base_models_dict)
final_meta_params = {k: v for k, v in final_meta_study.best_params.items() if k in {'alpha', 'solver', 'class_weight'}}
final_meta_clf = RidgeClassifier(**final_meta_params)
final_meta_clf.fit(meta_train_X_y.select(['age', 'days', 'data']).to_numpy(), 
                   meta_train_X_y.select('label').to_numpy().ravel())

print("Final meta learner trained!")

Creating final meta learner...
Final meta learner trained!


# 4. Comprehensive Evaluation and Comparison

In [16]:
# Evaluate final meta learner
print("Evaluating final meta learner...")
meta_test_X_y = create_bins_split_features(X_test, y_test, split_configs, base_models_dict)
y_pred_final = final_meta_clf.predict(meta_test_X_y.select(['age', 'days', 'data']).to_numpy())

results_final = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=pl.Series("y_pred", y_pred_final),
    y_proba=None,
    model_name="Final Meta Learner (Meta of Metas)",
    plot_results=False,
    print_result=True
)

# Add final meta learner to results list 
results_list.append(results_final)

Evaluating final meta learner...

EVALUATION SUMMARY: Final Meta Learner (Meta of Metas)

CORE METRICS:
  Accuracy:      0.7270
  Precision:     0.0929
  Recall:        0.3221
  F1 Score:      0.1442
  Matthews CC:   0.0479

CONFUSION MATRIX:
  TN:   3520 | FP:   1123
  FN:    242 | TP:    115


In [17]:
print("\n" + "="*60)
print("EVALUATING SINGLE-DIMENSION SEGMENTED META LEARNERS")
print("="*60)

# Age-based meta learner using create_bins_split_features
age_split_config = [{'name': 'age', 'column': 'age', 'threshold': 55}]
age_base_models = {'age': (stacking_age_b1, stacking_age_b2)}

age_test_features = create_bins_split_features(X_test, y_test, age_split_config, age_base_models)
y_pred_age_meta = age_test_features.select('age').to_numpy().ravel()

results_age_meta = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=pl.Series("y_pred", y_pred_age_meta),
    y_proba=None, model_name="Age-based Meta Learner",
    plot_results=False, print_result=True
)
results_list.append(results_age_meta)

# Days-based meta learner using create_bins_split_features
days_split_config = [{'name': 'days', 'column': 'contract_lifetime_days', 'threshold': 1000}]
days_base_models = {'days': (stacking_days_b1, stacking_days_b2)}

days_test_features = create_bins_split_features(X_test, y_test, days_split_config, days_base_models)
y_pred_days_meta = days_test_features.select('days').to_numpy().ravel()

results_days_meta = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=pl.Series("y_pred", y_pred_days_meta),
    y_proba=None, model_name="Days-based Meta Learner",
    plot_results=False, print_result=True
)
results_list.append(results_days_meta)

# Data-based meta learner using create_bins_split_features
data_split_config = [{'name': 'data', 'column': 'available_gb', 'threshold': 25}]
data_base_models = {'data': (stacking_data_b1, stacking_data_b2)}

data_test_features = create_bins_split_features(X_test, y_test, data_split_config, data_base_models)
y_pred_data_meta = data_test_features.select('data').to_numpy().ravel()

results_data_meta = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=pl.Series("y_pred", y_pred_data_meta),
    y_proba=None, model_name="Data-based Meta Learner",
    plot_results=False, print_result=True
)
results_list.append(results_data_meta)

print("\n" + "="*60)
print("EVALUATING MULTI-DIMENSIONAL SEGMENTED META LEARNERS")
print("="*60)

# Age-Days combination using create_bins_split_features
age_days_split_config = [
    {'name': 'age', 'column': 'age', 'threshold': 55},
    {'name': 'days', 'column': 'contract_lifetime_days', 'threshold': 1000}
]
age_days_base_models = {
    'age': (stacking_age_b1, stacking_age_b2),
    'days': (stacking_days_b1, stacking_days_b2)
}

age_days_test_features = create_bins_split_features(X_test, y_test, age_days_split_config, age_days_base_models)
# Use age predictions as the final prediction for Age-Days meta learner
y_pred_age_days = age_days_test_features.select('age').to_numpy().ravel()

results_age_days = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=pl.Series("y_pred", y_pred_age_days),
    y_proba=None, model_name="Age-Days Meta Learner",
    plot_results=False, print_result=True
)
results_list.append(results_age_days)

# Age-Data combination using create_bins_split_features
age_data_split_config = [
    {'name': 'age', 'column': 'age', 'threshold': 55},
    {'name': 'data', 'column': 'available_gb', 'threshold': 25}
]
age_data_base_models = {
    'age': (stacking_age_b1, stacking_age_b2),
    'data': (stacking_data_b1, stacking_data_b2)
}

age_data_test_features = create_bins_split_features(X_test, y_test, age_data_split_config, age_data_base_models)
# Use age predictions as the final prediction for Age-Data meta learner
y_pred_age_data = age_data_test_features.select('age').to_numpy().ravel()

results_age_data = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=pl.Series("y_pred", y_pred_age_data),
    y_proba=None, model_name="Age-Data Meta Learner",
    plot_results=False, print_result=True
)
results_list.append(results_age_data)

# Days-Data combination using create_bins_split_features
days_data_split_config = [
    {'name': 'days', 'column': 'contract_lifetime_days', 'threshold': 1000},
    {'name': 'data', 'column': 'available_gb', 'threshold': 25}
]
days_data_base_models = {
    'days': (stacking_days_b1, stacking_days_b2),
    'data': (stacking_data_b1, stacking_data_b2)
}

days_data_test_features = create_bins_split_features(X_test, y_test, days_data_split_config, days_data_base_models)
# Use days predictions as the final prediction for Days-Data meta learner
y_pred_days_data = days_data_test_features.select('days').to_numpy().ravel()

results_days_data = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=pl.Series("y_pred", y_pred_days_data),
    y_proba=None, model_name="Days-Data Meta Learner",
    plot_results=False, print_result=True
)
results_list.append(results_days_data)


EVALUATING SINGLE-DIMENSION SEGMENTED META LEARNERS

EVALUATION SUMMARY: Age-based Meta Learner

CORE METRICS:
  Accuracy:      0.7078
  Precision:     0.1023
  Recall:        0.3978
  F1 Score:      0.1628
  Matthews CC:   0.0744

CONFUSION MATRIX:
  TN:   3397 | FP:   1246
  FN:    215 | TP:    142

EVALUATION SUMMARY: Days-based Meta Learner

CORE METRICS:
  Accuracy:      0.7270
  Precision:     0.0929
  Recall:        0.3221
  F1 Score:      0.1442
  Matthews CC:   0.0479

CONFUSION MATRIX:
  TN:   3520 | FP:   1123
  FN:    242 | TP:    115

EVALUATION SUMMARY: Data-based Meta Learner

CORE METRICS:
  Accuracy:      0.6052
  Precision:     0.0902
  Recall:        0.4986
  F1 Score:      0.1528
  Matthews CC:   0.0590

CONFUSION MATRIX:
  TN:   2848 | FP:   1795
  FN:    179 | TP:    178

EVALUATING MULTI-DIMENSIONAL SEGMENTED META LEARNERS

EVALUATION SUMMARY: Age-Days Meta Learner

CORE METRICS:
  Accuracy:      0.7078
  Precision:     0.1023
  Recall:        0.3978
  F1 Score:

In [18]:
# Final comparison and visualization
print("\n" + "="*80)
print("FINAL COMPARISON OF ALL META LEARNERS")
print("="*80)

comparison_df = compare_models(results_list)
print(comparison_df)

# Sort by F1 score
comparison_df_sorted = comparison_df.sort('F1', descending=True)
print("\n" + "="*80)
print("RANKED BY F1 SCORE:")
print("="*80)
print(comparison_df_sorted)

# Create performance visualization with bar plots
fig = go.Figure()
metrics = ['Accuracy', 'Precision', 'Recall', 'F1', 'Matthews_CC']
models = comparison_df_sorted['Model'].to_list()

colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']

for i, metric in enumerate(metrics):
    fig.add_trace(go.Bar(
        name=metric,
        x=models,
        y=comparison_df_sorted[metric].to_list(),
        marker_color=colors[i],
        yaxis='y',
        offsetgroup=i
    ))

fig.update_layout(
    title='Comprehensive Meta Learner Performance Comparison',
    xaxis_title='Meta Learner Models',
    yaxis_title='Score',
    barmode='group',  # This ensures bars are grouped, not stacked
    height=600,
    width=1200,
    xaxis_tickangle=-45,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )
)

fig.show()

print("\n" + "="*80)
print("EVALUATION COMPLETED!")
print("="*80)


FINAL COMPARISON OF ALL META LEARNERS
shape: (12, 8)
┌────────────────────────┬──────────┬───────────┬────────┬────────┬─────────────┬─────────┬────────┐
│ Model                  ┆ Accuracy ┆ Precision ┆ Recall ┆ F1     ┆ Matthews_CC ┆ ROC_AUC ┆ PR_AUC │
│ ---                    ┆ ---      ┆ ---       ┆ ---    ┆ ---    ┆ ---         ┆ ---     ┆ ---    │
│ str                    ┆ f64      ┆ f64       ┆ f64    ┆ f64    ┆ f64         ┆ f64     ┆ f64    │
╞════════════════════════╪══════════╪═══════════╪════════╪════════╪═════════════╪═════════╪════════╡
│ Single Meta Learner    ┆ 0.6486   ┆ 0.0916    ┆ 0.4398 ┆ 0.1516 ┆ 0.0567      ┆ null    ┆ null   │
│ (Stacking)             ┆          ┆           ┆        ┆        ┆             ┆         ┆        │
│ XGBoost (Best          ┆ 0.7362   ┆ 0.1012    ┆ 0.3417 ┆ 0.1561 ┆ 0.0652      ┆ 0.5948  ┆ 0.0924 │
│ Threshold)             ┆          ┆           ┆        ┆        ┆             ┆         ┆        │
│ Random Forest (Base    ┆ 0.6344   ┆


EVALUATION COMPLETED!


# Conclusion|

The meta-learning evaluation revealed that traditional ensemble methods still outperform complex meta-learning architectures for this customer upselling
task. CatBoost with optimized threshold achieved the highest F1-score (0.1797), while Random Forest provided the best balance of recall (0.521) and practical
deployment considerations. The sophisticated "meta of metas" approach and various segmented meta-learners failed to deliver meaningful improvements over individual
optimized base models, suggesting that the additional complexity of hierarchical meta-learning does not justify the marginal or absent performance gains for this
specific classification problem. We decided to proceed with Random Forest as our final model due to its superior recall performance (0.521), which is critical for 
identifying potential upselling opportunities and minimizing missed revenue.m