# Imports and definitions

In [6]:
from pathlib import Path

import polars as pl
import numpy as np

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, precision_recall_curve, roc_curve, auc,
    confusion_matrix, classification_report, matthews_corrcoef
)

from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, StackingClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression

import optuna

# Polars display settings for this notebook. Each result is bound to `_`,
# presumably so the cell does not echo the returned Config object.
# NOTE(review): `set_tbl_cols(None)` presumably resets the column limit to the
# Polars default rather than forcing every column to be shown - confirm.
_ = pl.Config.set_tbl_cols(None)
# Raise the displayed string length to 500 (unit presumably characters - confirm).
_ = pl.Config.set_fmt_str_lengths(500)
# Use the "full" float format when printing tables.
_ = pl.Config.set_fmt_float("full")

In [7]:
import warnings
# Ignore RuntimeWarning messages whose originating module name matches 'sklearn'.
warnings.filterwarnings('ignore', category=RuntimeWarning, module='sklearn')

In [8]:
# Project layout. NOTE(review): `base_dir` is a machine-specific absolute path;
# adjust it before running the notebook elsewhere.
base_dir = Path('/Users/danlab/code/magenta-task/')
code_dir = base_dir / 'notebooks'
data_dir = code_dir / "data"
features_dir = data_dir / 'features'
train_dir = data_dir / 'train'
test_dir = data_dir / 'test'
# Despite the name this is not a directory: it is a SQLite storage URL template
# whose `{}` placeholder is filled with a file stem via `db_dir.format(...)`
# when the Optuna studies are loaded.
# NOTE(review): the URL is not built from `base_dir`; it presumably resolves
# relative to the current working directory - confirm.
db_dir = 'sqlite:///data/models/{}.db'

In [28]:
def _as_numpy_1d(values):
    """Return *values* as a flat NumPy array.

    Objects exposing ``to_numpy()`` (e.g. a Polars Series) are converted through
    that method; any other array-like (list, tuple, ndarray) goes through
    ``np.asarray``.
    """
    to_numpy = getattr(values, "to_numpy", None)
    array = to_numpy() if callable(to_numpy) else values
    return np.asarray(array).ravel()


def evaluate_classification_model(y_true, y_pred, y_proba=None,
                                  model_name="Model", pos_label=1,
                                  plot_results=True, print_result=True):
    """
    Comprehensive evaluation function for binary classification models.

    Parameters:
    -----------
    y_true : Polars Series or array-like
        True labels
    y_pred : Polars Series or array-like
        Predicted labels
    y_proba : Polars Series or array-like, optional
        Predicted probabilities (or scores) for the positive class. When
        omitted, the ROC AUC / PR AUC metrics are not computed.
    model_name : str
        Name of the model for reporting
    pos_label : int or str
        Label of positive class
    plot_results : bool
        Whether to generate plots via ``plot_evaluation_results``
    print_result : bool
        Whether to print a text summary via ``print_evaluation_summary``

    Returns:
    --------
    dict : Dictionary containing all evaluation metrics. Keys: 'model_name',
        'accuracy', 'precision', 'recall', 'f1_score', 'matthews_corr',
        'confusion_matrix', 'tn', 'fp', 'fn', 'tp' and, when ``y_proba`` is
        given, 'roc_auc' and 'pr_auc'.

    Raises:
    -------
    ValueError
        If the confusion matrix is not 2x2 (the problem is not binary).

    Notes:
    ------
    'tn', 'fp', 'fn', 'tp' are read from the confusion matrix in sorted label
    order, so they match ``pos_label`` when it is the larger of the two labels
    (as with the default 0/1 encoding).
    """
    # Accept both Polars Series and plain array-likes, as documented.
    y_true_np = _as_numpy_1d(y_true)
    y_pred_np = _as_numpy_1d(y_pred)
    y_proba_np = None if y_proba is None else _as_numpy_1d(y_proba)

    results = {'model_name': model_name}

    results['accuracy'] = accuracy_score(y_true_np, y_pred_np)
    results['precision'] = precision_score(y_true_np, y_pred_np, pos_label=pos_label, average='binary')
    results['recall'] = recall_score(y_true_np, y_pred_np, pos_label=pos_label, average='binary')
    results['f1_score'] = f1_score(y_true_np, y_pred_np, pos_label=pos_label, average='binary')
    results['matthews_corr'] = matthews_corrcoef(y_true_np, y_pred_np)

    # When only one class shows up in the data (e.g. the model predicts a single
    # class on a one-class sample) sklearn would return a 1x1 matrix and the
    # tn/fp/fn/tp unpacking below would fail. Adding pos_label to the label set
    # keeps the matrix 2x2 in that case.
    present_labels = np.unique(np.concatenate([y_true_np, y_pred_np]))
    if present_labels.size < 2:
        cm_labels = np.unique(np.append(present_labels, pos_label))
        cm = confusion_matrix(y_true_np, y_pred_np, labels=cm_labels)
    else:
        cm = confusion_matrix(y_true_np, y_pred_np)

    if cm.shape != (2, 2):
        raise ValueError(
            f"Expected a binary problem (2x2 confusion matrix), got shape {cm.shape}"
        )

    results['confusion_matrix'] = cm
    results['tn'], results['fp'], results['fn'], results['tp'] = cm.ravel()

    if y_proba_np is not None:
        # Binarise against pos_label so the probability-based metrics agree with
        # precision/recall/F1 above regardless of how the labels are encoded.
        y_true_binary = (y_true_np == pos_label).astype(int)
        results['roc_auc'] = roc_auc_score(y_true_binary, y_proba_np)
        pr_precision, pr_recall, _ = precision_recall_curve(y_true_binary, y_proba_np)
        # Area under the PR curve: recall on the x axis, precision on the y axis.
        results['pr_auc'] = auc(pr_recall, pr_precision)

    # Generate plots (pass the NumPy probabilities so boolean-mask indexing in
    # the plotting code works for every input type).
    if plot_results:
        plot_evaluation_results(y_true_np, y_pred_np, y_proba_np, model_name, results)

    if print_result:
        print_evaluation_summary(results)

    return results


def _plot_input_to_numpy(values):
    """Return *values* as a flat NumPy array (uses ``to_numpy()`` when available)."""
    to_numpy = getattr(values, "to_numpy", None)
    array = to_numpy() if callable(to_numpy) else values
    return np.asarray(array).ravel()


def _confusion_matrix_figure(cm, model_name):
    """Build the confusion-matrix heatmap (axis labels assume a binary 0/1 problem)."""
    fig = go.Figure(data=go.Heatmap(
        z=cm,
        x=['Predicted 0', 'Predicted 1'],
        y=['Actual 0', 'Actual 1'],
        colorscale='Blues',
        text=cm,
        texttemplate="%{text}",
        textfont={"size": 20}
    ))
    fig.update_layout(
        title=f'Confusion Matrix: {model_name}',
        xaxis_title='Predicted',
        yaxis_title='Actual',
        xaxis_showgrid=False,
        yaxis_showgrid=False,
        height=400, width=500
    )
    return fig


def _classification_report_figure(y_true, y_pred, model_name):
    """Build a heatmap of per-class precision / recall / f1-score."""
    report_dict = classification_report(y_true, y_pred, output_dict=True)
    # Keep only the per-class entries; drop the aggregate rows.
    aggregate_keys = ('accuracy', 'macro avg', 'weighted avg')
    class_names = [key for key in report_dict if key not in aggregate_keys]
    metric_names = ['precision', 'recall', 'f1-score']

    # Rows are metrics, columns are classes.
    z_data = np.array(
        [[report_dict[cls][metric] for cls in class_names] for metric in metric_names],
        dtype=float,
    )

    fig = go.Figure(data=go.Heatmap(
        z=z_data,
        x=class_names,
        y=metric_names,
        colorscale='RdYlBu_r',
        text=np.around(z_data, decimals=3),
        texttemplate="%{text}",
        textfont={"size": 14}
    ))
    fig.update_layout(
        title=f'Classification Report: {model_name}',
        xaxis_title='Class',
        yaxis_title='Metric',
        xaxis_showgrid=False,
        yaxis_showgrid=False,
        height=400, width=600
    )
    return fig


def _roc_curve_figure(y_true, y_proba, model_name, results):
    """Build the ROC curve with the random-classifier diagonal."""
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines',
                             name=f'ROC (AUC = {results["roc_auc"]:.3f})',
                             line=dict(width=2)))
    fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines',
                             name='Random Classifier',
                             line=dict(dash='dash', color='grey')))
    fig.update_layout(
        title=f'ROC Curve: {model_name}',
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        hovermode='x unified',
        height=450, width=600
    )
    return fig


def _precision_recall_figure(y_true, y_proba, model_name, results):
    """Build the precision-recall curve with a baseline at the mean of y_true."""
    precision, recall, _ = precision_recall_curve(y_true, y_proba)
    baseline = np.mean(y_true)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=recall, y=precision, mode='lines',
                             name=f'PR (AUC = {results["pr_auc"]:.3f})',
                             line=dict(width=2)))
    fig.add_trace(go.Scatter(x=[0, 1], y=[baseline, baseline], mode='lines',
                             name='Baseline',
                             line=dict(dash='dash', color='grey')))
    fig.update_layout(
        title=f'Precision-Recall Curve: {model_name}',
        xaxis_title='Recall',
        yaxis_title='Precision',
        hovermode='x unified',
        height=450, width=600
    )
    return fig


def _prediction_distribution_figure(y_true, y_proba, model_name):
    """Build overlaid histograms of predicted probabilities for labels 0 and 1."""
    fig = go.Figure()
    fig.add_trace(go.Histogram(x=y_proba[y_true == 0], name='Negative Class',
                               marker_color='red', opacity=0.6, histnorm='probability density'))
    fig.add_trace(go.Histogram(x=y_proba[y_true == 1], name='Positive Class',
                               marker_color='blue', opacity=0.6, histnorm='probability density'))
    fig.update_layout(
        title=f'Prediction Distribution: {model_name}',
        xaxis_title='Predicted Probability',
        yaxis_title='Density',
        barmode='overlay',
        hovermode='x unified',
        height=450, width=600
    )
    return fig


def plot_evaluation_results(y_true, y_pred, y_proba, model_name, results):
    """Generate comprehensive evaluation plots using Plotly.

    Parameters:
    -----------
    y_true, y_pred : Polars Series or array-like
        True and predicted labels. The confusion-matrix axis labels and the
        prediction-distribution split assume labels encoded as 0 / 1.
    y_proba : Polars Series or array-like, or None
        Positive-class probabilities. When None, only the confusion matrix and
        classification report are drawn.
    model_name : str
        Used in every figure title.
    results : dict
        Must contain 'confusion_matrix'; must also contain 'roc_auc' and
        'pr_auc' when ``y_proba`` is given.

    Returns:
    --------
    list : The Plotly figures, in display order. Every figure is also shown
        via ``fig.show()``.
    """
    # Normalise to NumPy up front: the distribution plot relies on NumPy
    # boolean-mask indexing, which is not guaranteed for other container types.
    y_true = _plot_input_to_numpy(y_true)
    y_pred = _plot_input_to_numpy(y_pred)
    if y_proba is not None:
        y_proba = _plot_input_to_numpy(y_proba)

    figures = [
        _confusion_matrix_figure(results['confusion_matrix'], model_name),
        _classification_report_figure(y_true, y_pred, model_name),
    ]

    if y_proba is not None:
        figures.append(_roc_curve_figure(y_true, y_proba, model_name, results))
        figures.append(_precision_recall_figure(y_true, y_proba, model_name, results))
        figures.append(_prediction_distribution_figure(y_true, y_proba, model_name))

    for fig in figures:
        fig.show()

    return figures

def print_evaluation_summary(results):
    """Write a human-readable report of an evaluation ``results`` dict to stdout.

    The report has a banner with the model name, the core metrics, the
    probability-based metrics (only when 'roc_auc' is present) and the four
    confusion-matrix counts.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"EVALUATION SUMMARY: {results['model_name']}")
    print(f"{banner}")

    # (line prefix, results key) pairs; prefixes are padded so values line up.
    core_lines = (
        ("  Accuracy:      ", 'accuracy'),
        ("  Precision:     ", 'precision'),
        ("  Recall:        ", 'recall'),
        ("  F1 Score:      ", 'f1_score'),
        ("  Matthews CC:   ", 'matthews_corr'),
    )
    print("\nCORE METRICS:")
    for prefix, key in core_lines:
        print(f"{prefix}{results[key]:.4f}")

    if 'roc_auc' in results:
        probability_lines = (
            ("  ROC AUC:       ", 'roc_auc'),
            ("  PR AUC:        ", 'pr_auc'),
        )
        print("\nPROBABILITY-BASED METRICS:")
        for prefix, key in probability_lines:
            print(f"{prefix}{results[key]:.4f}")

    print("\nCONFUSION MATRIX:")
    true_neg, false_pos = results['tn'], results['fp']
    print(f"  TN: {true_neg:>6} | FP: {false_pos:>6}")
    false_neg, true_pos = results['fn'], results['tp']
    print(f"  FN: {false_neg:>6} | TP: {true_pos:>6}")
    

def compare_models(models_results):
    """Build a Polars DataFrame with one row of headline metrics per model.

    Each element of ``models_results`` is a results dict as returned by
    ``evaluate_classification_model``. 'ROC_AUC' and 'PR_AUC' are included for
    entries that carry a 'roc_auc' key. Float64 columns are rounded to four
    decimals.
    """
    # Output column name -> key in the results dict, in output order.
    column_to_key = {
        'Model': 'model_name',
        'Accuracy': 'accuracy',
        'Precision': 'precision',
        'Recall': 'recall',
        'F1': 'f1_score',
        'Matthews_CC': 'matthews_corr',
    }

    rows = []
    for entry in models_results:
        record = {column: entry[key] for column, key in column_to_key.items()}
        if 'roc_auc' in entry:
            record['ROC_AUC'] = entry['roc_auc']
            record['PR_AUC'] = entry['pr_auc']
        rows.append(record)

    table = pl.DataFrame(rows)
    rounded = [
        pl.col(name).round(4)
        for name, dtype in table.schema.items()
        if dtype == pl.Float64
    ]
    return table.with_columns(rounded)

# Load data

In [11]:
%%time

# Read the two training parquet files and concatenate them into one frame.
# NOTE(review): pl.concat is called with its default strategy, so both files
# presumably share the same schema - confirm.
train_base = pl.read_parquet(train_dir / 'data-v0-80.parquet')
train_meta = pl.read_parquet(train_dir / 'data-meta-v0-50.parquet')

train = pl.concat([train_base, train_meta])


CPU times: user 18.3 ms, sys: 9.26 ms, total: 27.5 ms
Wall time: 30.4 ms


In [12]:
%%time

# Read the test parquet file (same file name as the training "meta" file, but
# located under test_dir).
test = pl.read_parquet(test_dir / 'data-meta-v0-50.parquet')

CPU times: user 3.81 ms, sys: 3.48 ms, total: 7.28 ms
Wall time: 7.11 ms


# Prepare data

In [13]:
# Features: every column except the two identifier columns and the target.
X_train = train.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
# Target: kept as a single-column DataFrame (flattened to 1-D later).
y_train = train.select('has_done_upselling')

In [14]:
# Hold-out test set, split the same way as the training data:
# X holds the features (identifier and target columns excluded), y the target.

X = test.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
y = test.select('has_done_upselling')

In [15]:
# NumPy copies of the training data for the estimators; ravel() flattens the
# single-column target frame into a 1-D label vector.
X_np = X_train.to_numpy()
y_np = y_train.to_numpy().ravel()

# 1-D ground-truth labels of the hold-out set.
y_true_np = y.to_numpy().ravel()

# Evaluation

In [17]:
# Load the finished Optuna studies (one SQLite file per study; the file stem is
# substituted into the `db_dir` URL template).
xgb_study = optuna.load_study(
    study_name="xgboost_optimization",
    storage=db_dir.format('xgb_study')
)
rf_study = optuna.load_study(
    study_name="random_forest_optimization_basef1",
    storage=db_dir.format('rf_study')
)
histgb_study = optuna.load_study(
    study_name="histgb_optimization",
    storage=db_dir.format('histgb_study')
)
cat_study = optuna.load_study(
    study_name="catboost_optimization",
    storage=db_dir.format('cat_study')
)
meta_study = optuna.load_study(
    study_name="meta_ridge_base_optimization",
    storage=db_dir.format('meta_learners_study')
)

# Best hyper-parameters found by each study.
xgb_best_params = xgb_study.best_params
rf_best_params = rf_study.best_params
histgb_best_params = histgb_study.best_params
cat_best_params = cat_study.best_params
meta_params = meta_study.best_params

# Instantiate the base learners with their tuned hyper-parameters.
# NOTE(review): no random seed is set here unless the studies tuned one, so
# refits are presumably not reproducible - confirm.
xgb_model = xgb.XGBClassifier(**xgb_best_params)
rf_model = RandomForestClassifier(**rf_best_params)
histgb_model = HistGradientBoostingClassifier(**histgb_best_params)
cat_model = CatBoostClassifier(**cat_best_params, verbose=0)

# Fit every base learner on the full training set.
xgb_model.fit(X_np, y_np)
rf_model.fit(X_np, y_np)
histgb_model.fit(X_np, y_np)
cat_model.fit(X_np, y_np)


base_models = [
    ("xgb", xgb_model),
    ("rf", rf_model),
    ("histgb", histgb_model),
    ("cat", cat_model)
]

stacking_clf = StackingClassifier(
    estimators=base_models,  
    final_estimator=RidgeClassifier(**meta_params),
    cv='prefit',  # base models were already fitted above; only the final estimator is trained by fit()
    stack_method='predict_proba'
)

# NOTE(review): the final estimator is trained on predictions the prefit base
# models make for the same rows they were fitted on (X_np, y_np), which risks
# an over-confident meta-learner; consider a separate split for this step.
stacking_clf.fit(X_np, y_np)


0,1,2
,estimators,"[('xgb', ...), ('rf', ...), ...]"
,final_estimator,RidgeClassifi... solver='svd')
,cv,'prefit'
,stack_method,'predict_proba'
,n_jobs,
,passthrough,False
,verbose,0

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,'gbtree'
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9555845404311799
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,n_estimators,864
,criterion,'gini'
,max_depth,19
,min_samples_split,19
,min_samples_leaf,1
,min_weight_fraction_leaf,0.015327799848051129
,max_features,'sqrt'
,max_leaf_nodes,154
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,loss,'log_loss'
,learning_rate,0.011946188388124244
,max_iter,122
,max_leaf_nodes,31
,max_depth,3
,min_samples_leaf,88
,l2_regularization,0.4294702816204709
,max_features,1.0
,max_bins,231
,categorical_features,'from_dtype'

0,1,2
,alpha,0.011675306544253592
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,class_weight,'balanced'
,solver,'svd'
,positive,False
,random_state,


In [29]:
# Predict hold-out labels with the fitted stacking classifier via predict().
# NOTE(review): no decision threshold is computed or applied in this cell, and
# the label printed below ("Best XGBoost mean threshold") is followed by the
# repr of the stacking classifier rather than a threshold value.

print(f"Best XGBoost mean threshold: {stacking_clf}")
y_pred_np = stacking_clf.predict(X.to_numpy())

# Wrap the NumPy vectors as Polars Series for the evaluation helper.
y_true_pl = pl.Series("y_true", y_true_np)
y_pred_pl = pl.Series("y_pred", y_pred_np)


# Evaluate a single model
# y_proba is None, so ROC AUC / PR AUC and the probability plots are skipped.
print("Evaluating Stacking Classifier (Ridge):")
results_stacking = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=y_pred_pl,
    y_proba=None,
    model_name="Stacking Classifier (Ridge)",
    plot_results=True,
    print_result=True
)

Best XGBoost mean threshold: StackingClassifier(cv='prefit',
                   estimators=[('xgb',
                                XGBClassifier(alpha=1.4481041360747245e-08,
                                              base_score=None, booster='gbtree',
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=0.9555845404311799,
                                              device=None,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eta=0.00898977016906825,
                                              eval_metric=None,
                                              feature_types=None,
                                              feat...


EVALUATION SUMMARY: Stacking Classifier (Ridge)

CORE METRICS:
  Accuracy:      0.6579
  Precision:     0.0900
  Recall:        0.4227
  F1 Score:      0.1484
  Matthews CC:   0.0535

CONFUSION MATRIX:
  TN:   6281 | FP:   3014
  FN:    407 | TP:    298


Compared to the single models, stacking performs worse.