# Imports and definitions

In [1]:
from pathlib import Path

import polars as pl
import numpy as np

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, precision_recall_curve, roc_curve, auc,
    confusion_matrix, classification_report, matthews_corrcoef
)

from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

import optuna

_ = pl.Config.set_tbl_cols(None)
_ = pl.Config.set_fmt_str_lengths(500)
_ = pl.Config.set_fmt_float("full")

In [2]:
import warnings
warnings.filterwarnings('ignore', category=RuntimeWarning, module='sklearn')

In [3]:
base_dir = Path('/Users/danlab/code/magenta-task/')
code_dir = base_dir / 'notebooks'
data_dir = code_dir / "data"
features_dir = data_dir / 'features'
train_dir = data_dir / 'train'
test_dir = data_dir / 'test'
db_dir = 'sqlite:///data/models/{}.db'

In [4]:
def evaluate_classification_model(y_true, y_pred, y_proba=None,
                                  model_name="Model", pos_label=1,
                                  plot_results=True, print_result=True):
    """
    Comprehensive evaluation function for classification models.

    Parameters:
    -----------
    y_true : Polars Series or array-like
        True labels
    y_pred : Polars Series or array-like
        Predicted labels
    y_proba : Polars Series or array-like, optional
        Predicted probabilities for positive class
    model_name : str
        Name of the model for reporting
    pos_label : int or str
        Label of positive class
    plot_results : bool
        Whether to generate plots
    bootstrap_ci : bool
        Whether to compute bootstrap confidence intervals
    n_bootstrap : int
        Number of bootstrap samples
    confidence_level : float
        Confidence level for intervals

    Returns:
    --------
    dict : Dictionary containing all evaluation metrics
    """

    y_true_np = y_true.to_numpy()
    y_pred_np = y_pred.to_numpy() 
    y_proba_np = y_proba.to_numpy()

    results = {'model_name': model_name}

    results['accuracy'] = accuracy_score(y_true_np, y_pred_np)
    results['precision'] = precision_score(y_true_np, y_pred_np, pos_label=pos_label, average='binary')
    results['recall'] = recall_score(y_true_np, y_pred_np, pos_label=pos_label, average='binary')
    results['f1_score'] = f1_score(y_true_np, y_pred_np, pos_label=pos_label, average='binary')
    results['matthews_corr'] = matthews_corrcoef(y_true_np, y_pred_np)

    cm = confusion_matrix(y_true_np, y_pred_np)
    results['confusion_matrix'] = cm
    results['tn'], results['fp'], results['fn'], results['tp'] = cm.ravel()

    if y_proba_np is not None:
        results['roc_auc'] = roc_auc_score(y_true_np, y_proba_np)
        results['pr_auc'] = auc(*precision_recall_curve(y_true_np, y_proba_np)[:2][::-1])

    # Generate plots
    if plot_results:
        plot_evaluation_results(y_true_np, y_pred_np, y_proba_np, model_name, results)

    if print_result:
        print_evaluation_summary(results)

    return results


def plot_evaluation_results(y_true, y_pred, y_proba, model_name, results):
    """Generate comprehensive evaluation plots using Plotly."""
    figures = []

    # Confusion Matrix
    cm = results['confusion_matrix']
    fig_cm = go.Figure(data=go.Heatmap(
        z=cm,
        x=['Predicted 0', 'Predicted 1'],
        y=['Actual 0', 'Actual 1'],
        colorscale='Blues',
        text=cm,
        texttemplate="%{text}",
        textfont={"size": 20}
    ))
    fig_cm.update_layout(
        title=f'Confusion Matrix: {model_name}',
        xaxis_title='Predicted',
        yaxis_title='Actual',
        xaxis_showgrid=False,
        yaxis_showgrid=False,
        height=400, width=500
    )
    figures.append(fig_cm)

    # Classification Report as heatmap
    report_dict = classification_report(y_true, y_pred, output_dict=True)
    selected_classes = [key for key in report_dict if key not in ['accuracy', 'macro avg', 'weighted avg']]
    report_data = {
        'Metric': ['precision', 'recall', 'f1-score']
    }
    for cls in selected_classes:
        report_data[cls] = [
            report_dict[cls]['precision'],
            report_dict[cls]['recall'],
            report_dict[cls]['f1-score']
        ]

    df_report = pl.DataFrame(report_data)
    report_index = ['precision', 'recall', 'f1-score']
    df_report = df_report.with_columns(
        pl.Series("Metric", report_index).alias("Metric")
    )
    df_report = df_report.select(pl.col("Metric"), pl.exclude("Metric"))

    for col in df_report.columns:
        if col != "Metric":
            df_report = df_report.with_columns(pl.col(col).cast(pl.Float64))

    z_data = df_report.drop("Metric").to_numpy()
    x_labels = df_report.drop("Metric").columns
    y_labels = df_report["Metric"].to_list()

    fig_cr = go.Figure(data=go.Heatmap(
        z=z_data,
        x=x_labels,
        y=y_labels,
        colorscale='RdYlBu_r',
        text=np.around(z_data, decimals=3),
        texttemplate="%{text}",
        textfont={"size": 14}
    ))
    fig_cr.update_layout(
        title=f'Classification Report: {model_name}',
        xaxis_title='Class',
        yaxis_title='Metric',
        xaxis_showgrid=False,
        yaxis_showgrid=False,
        height=400, width=600
    )
    figures.append(fig_cr)

    if y_proba is not None:
        # ROC Curve
        fpr, tpr, _ = roc_curve(y_true, y_proba)
        fig_roc = go.Figure()
        fig_roc.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines',
                                     name=f'ROC (AUC = {results["roc_auc"]:.3f})',
                                     line=dict(width=2)))
        fig_roc.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines',
                                     name='Random Classifier',
                                     line=dict(dash='dash', color='grey')))
        fig_roc.update_layout(
            title=f'ROC Curve: {model_name}',
            xaxis_title='False Positive Rate',
            yaxis_title='True Positive Rate',
            hovermode='x unified',
            height=450, width=600
        )
        figures.append(fig_roc)

        # Precision-Recall Curve
        precision, recall, _ = precision_recall_curve(y_true, y_proba)
        fig_pr = go.Figure()
        fig_pr.add_trace(go.Scatter(x=recall, y=precision, mode='lines',
                                    name=f'PR (AUC = {results["pr_auc"]:.3f})',
                                    line=dict(width=2)))
        fig_pr.add_trace(go.Scatter(x=[0, 1], y=[np.mean(y_true), np.mean(y_true)], mode='lines',
                                    name='Baseline',
                                    line=dict(dash='dash', color='grey')))
        fig_pr.update_layout(
            title=f'Precision-Recall Curve: {model_name}',
            xaxis_title='Recall',
            yaxis_title='Precision',
            hovermode='x unified',
            height=450, width=600
        )
        figures.append(fig_pr)

        # Prediction Distribution
        fig_dist = go.Figure()
        fig_dist.add_trace(go.Histogram(x=y_proba[y_true == 0], name='Negative Class',
                                        marker_color='red', opacity=0.6, histnorm='probability density'))
        fig_dist.add_trace(go.Histogram(x=y_proba[y_true == 1], name='Positive Class',
                                        marker_color='blue', opacity=0.6, histnorm='probability density'))
        fig_dist.update_layout(
            title=f'Prediction Distribution: {model_name}',
            xaxis_title='Predicted Probability',
            yaxis_title='Density',
            barmode='overlay',
            hovermode='x unified',
            height=450, width=600
        )
        figures.append(fig_dist)

    for fig in figures:
        fig.show()

    return figures

def print_evaluation_summary(results):
    """Print a formatted summary of evaluation results."""
    print(f"\n{'='*60}")
    print(f"EVALUATION SUMMARY: {results['model_name']}")
    print(f"{'='*60}")

    print(f"\nCORE METRICS:")
    print(f"  Accuracy:      {results['accuracy']:.4f}")
    print(f"  Precision:     {results['precision']:.4f}")
    print(f"  Recall:        {results['recall']:.4f}")
    print(f"  F1 Score:      {results['f1_score']:.4f}")
    print(f"  Matthews CC:   {results['matthews_corr']:.4f}")

    if 'roc_auc' in results:
        print(f"\nPROBABILITY-BASED METRICS:")
        print(f"  ROC AUC:       {results['roc_auc']:.4f}")
        print(f"  PR AUC:        {results['pr_auc']:.4f}")

    print(f"\nCONFUSION MATRIX:")
    print(f"  TN: {results['tn']:>6} | FP: {results['fp']:>6}")
    print(f"  FN: {results['fn']:>6} | TP: {results['tp']:>6}")
    

def compare_models(models_results):
    """Compare multiple models and return a comparison DataFrame."""
    comparison_data = []

    for result in models_results:
        row = {
            'Model': result['model_name'],
            'Accuracy': result['accuracy'],
            'Precision': result['precision'],
            'Recall': result['recall'],
            'F1': result['f1_score'],
            'Matthews_CC': result['matthews_corr']
        }

        if 'roc_auc' in result:
            row.update({
                'ROC_AUC': result['roc_auc'],
                'PR_AUC': result['pr_auc']
            })

        comparison_data.append(row)

    comparison_df = pl.DataFrame(comparison_data)
    float_cols = [col for col, dtype in comparison_df.schema.items() if dtype == pl.Float64]
    comparison_df = comparison_df.with_columns([
        pl.col(col).round(4) for col in float_cols
    ])
    return comparison_df

In [5]:
def plot_model_scores(data):
    """
    Plot model scores (XGBoost, CatBoost, RandomForest, LightGBM, HistGradientBoosting) as lines and correctness as colored markers.

    Args:
        data (pl.DataFrame): DataFrame with columns ['xgb_score', 'cat_score', 'rf_score', 'lgb_score', 'hgb_score', 'xgb_pred', 'cat_pred', 'rf_pred', 'lgb_pred', 'hgb__pred'].
    """

    x = data.with_row_index()["index"].to_numpy()
    # Try to get label column, fallback to 'label' if present, else None
    label = data["label"].to_numpy() if "label" in data.columns else None

    def get_colors(pred_col):
        preds = data[pred_col].to_numpy()
        return np.where(preds == label, "green", "red")

    fig = make_subplots(
        rows=5, cols=1, shared_xaxes=True,
        subplot_titles=[
            "XGBoost Score", "CatBoost Score", "RandomForest Score", "LightGBM Score", "HistGradientBoosting Score"
        ]
    )

    # XGBoost
    fig.add_trace(
        go.Scatter(
            x=x,
            y=data["xgb_score"].to_numpy(),
            mode='lines',
            name='XGBoost Line',
            line=dict(color='royalblue'),
            showlegend=True
        ),
        row=1, col=1
    )
    fig.add_trace(
        go.Scatter(
            x=x,
            y=data["xgb_score"].to_numpy(),
            mode='markers',
            marker=dict(color=get_colors("xgb_pred")),
            name='XGBoost Correct/Incorrect',
            hovertemplate="index: %{x}<br>score: %{y:.3f}<br>label: %{customdata}",
            customdata=label,
            showlegend=True
        ),
        row=1, col=1
    )

    # CatBoost
    fig.add_trace(
        go.Scatter(
            x=x,
            y=data["cat_score"].to_numpy(),
            mode='lines',
            name='CatBoost Line',
            line=dict(color='orange'),
            showlegend=True
        ),
        row=2, col=1
    )
    fig.add_trace(
        go.Scatter(
            x=x,
            y=data["cat_score"].to_numpy(),
            mode='markers',
            marker=dict(color=get_colors("cat_pred")),
            name='CatBoost Correct/Incorrect',
            hovertemplate="index: %{x}<br>score: %{y:.3f}<br>label: %{customdata}",
            customdata=label,
            showlegend=True
        ),
        row=2, col=1
    )

    # RandomForest
    fig.add_trace(
        go.Scatter(
            x=x,
            y=data["rf_score"].to_numpy(),
            mode='lines',
            name='RandomForest Line',
            line=dict(color='green'),
            showlegend=True
        ),
        row=3, col=1
    )
    fig.add_trace(
        go.Scatter(
            x=x,
            y=data["rf_score"].to_numpy(),
            mode='markers',
            marker=dict(color=get_colors("rf_pred")),
            name='RandomForest Correct/Incorrect',
            hovertemplate="index: %{x}<br>score: %{y:.3f}<br>label: %{customdata}",
            customdata=label,
            showlegend=True
        ),
        row=3, col=1
    )

    # LightGBM
    fig.add_trace(
        go.Scatter(
            x=x,
            y=data["lgb_score"].to_numpy(),
            mode='lines',
            name='LightGBM Line',
            line=dict(color='purple'),
            showlegend=True
        ),
        row=4, col=1
    )
    fig.add_trace(
        go.Scatter(
            x=x,
            y=data["lgb_score"].to_numpy(),
            mode='markers',
            marker=dict(color=get_colors("lgb_pred")),
            name='LightGBM Correct/Incorrect',
            hovertemplate="index: %{x}<br>score: %{y:.3f}<br>label: %{customdata}",
            customdata=label,
            showlegend=True
        ),
        row=4, col=1
    )

    # HistGradientBoosting
    fig.add_trace(
        go.Scatter(
            x=x,
            y=data["hgb_score"].to_numpy() if "hgb_score" in data.columns else data["hgb__score"].to_numpy(),
            mode='lines',
            name='HistGradientBoosting Line',
            line=dict(color='#FFA15A'),
            showlegend=True
        ),
        row=5, col=1
    )
    fig.add_trace(
        go.Scatter(
            x=x,
            y=data["hgb_score"].to_numpy() if "hgb_score" in data.columns else data["hgb__score"].to_numpy(),
            mode='markers',
            marker=dict(color=get_colors("hgb__pred")),
            name='HistGradientBoosting Correct/Incorrect',
            hovertemplate="index: %{x}<br>score: %{y:.3f}<br>label: %{customdata}",
            customdata=label,
            showlegend=True
        ),
        row=5, col=1
    )

    fig.update_layout(height=1500, width=1600, title_text="Model Scores: Line + Correctness Scatter")
    fig.show()

# Load data

In [6]:
%%time

train = pl.read_parquet(train_dir / 'data-v0-80.parquet')

CPU times: user 14.6 ms, sys: 8.31 ms, total: 23 ms
Wall time: 23.4 ms


In [7]:
%%time

test = pl.read_parquet(test_dir / 'data-v0-20.parquet')

CPU times: user 4.49 ms, sys: 2.46 ms, total: 6.96 ms
Wall time: 7.77 ms


# Prepare data

In [8]:
days_b_1 = train.filter(pl.col("contract_lifetime_days") < 1000).drop('contract_lifetime_days')
days_b_2 = train.filter(pl.col("contract_lifetime_days") >= 1000).drop('contract_lifetime_days')

In [9]:
X_train_b1 = days_b_1.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
X_train_b2 = days_b_2.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))

y_train_b1 = days_b_1.select('has_done_upselling')
y_train_b2 = days_b_2.select('has_done_upselling')


In [10]:
# Testing on hold out test for b1 and b2

X_b1 = test.filter(pl.col("age") < 55).drop('age').select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
y_b1 = test.filter(pl.col("age") < 55).drop('age').select('has_done_upselling')

X_b2 = test.filter(pl.col("age") >= 55).drop('age').select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
y_b2 = test.filter(pl.col("age") >= 55).drop('age').select('has_done_upselling')

In [11]:
X_np_b1 = X_train_b1.to_numpy()
y_np_b1 = y_train_b1.to_numpy().ravel()

X_np_b2 = X_train_b2.to_numpy()
y_np_b2 = y_train_b2.to_numpy().ravel()

# For b1
y_true_np_b1 = y_train_b1.to_numpy().ravel()
# For b2
y_true_np_b2 = y_train_b2.to_numpy().ravel()

# Evaluation

## XGBoost

In [12]:
xgb_study_b1 = optuna.load_study(study_name="xgboost_optimization_days_b_1", storage=db_dir.format('xgb_study'))

b1_xgb_params = xgb_study_b1.best_params
b1_xgb_threshold = xgb_study_b1.best_trial.user_attrs.get('threshold', None)

xgb_model_b1 = xgb.XGBClassifier(**b1_xgb_params,)
xgb_model_b1.fit(X_np_b1, y_np_b1)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,'gbtree'
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.6927263365043989
,device,
,early_stopping_rounds,
,enable_categorical,False


In [14]:
# Classification with best threshold

print(f"Best XGBoost mean threshold: {b1_xgb_threshold}")
y_proba_np = xgb_model_b1.predict_proba(X_b1.to_numpy())[:, 1]
y_pred_np = (y_proba_np > b1_xgb_threshold).astype(int)

y_true_pl = pl.Series("y_true", y_b1.to_numpy().ravel())
y_pred_pl = pl.Series("y_pred", y_pred_np)
y_proba_pl = pl.Series("y_proba", y_proba_np)

# Evaluate a single model
print("Evaluating XGBoost Model (b1):")
results_xgboost_b1 = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=y_pred_pl,
    y_proba=y_proba_pl,
    model_name="XGBoost (b1)",
    plot_results=True,
    print_result=True
)


Best XGBoost mean threshold: 0.17119102478027343
Evaluating XGBoost Model (b1):



EVALUATION SUMMARY: XGBoost (b1)

CORE METRICS:
  Accuracy:      0.9175
  Precision:     0.1228
  Recall:        0.0216
  F1 Score:      0.0367
  Matthews CC:   0.0219

PROBABILITY-BASED METRICS:
  ROC AUC:       0.5745
  PR AUC:        0.0945

CONFUSION MATRIX:
  TN:  16301 | FP:    200
  FN:   1269 | TP:     28


In [36]:
xgb_study_b2 = optuna.load_study(study_name="xgboost_optimization_days_b_2", storage=db_dir.format('xgb_study'))

b2_xgb_params = xgb_study_b2.best_params
b2_xgb_threshold = xgb_study_b2.best_trial.user_attrs.get('threshold', None)

xgb_model_b2 = xgb.XGBClassifier(n_jobs=-1, **b2_xgb_params,)
xgb_model_b2.fit(X_np_b2, y_np_b2)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,'dart'
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.7076785897731896
,device,
,early_stopping_rounds,
,enable_categorical,False


In [37]:
# Classification with best threshold

print(f"Best XGBoost mean threshold: {b2_xgb_threshold}")
y_proba_np = xgb_model_b2.predict_proba(X_b2.to_numpy())[:, 1]
y_pred_np = (y_proba_np > b2_xgb_threshold).astype(int)

y_true_pl = pl.Series("y_true", y_b2.to_numpy().ravel())
y_pred_pl = pl.Series("y_pred", y_pred_np)
y_proba_pl = pl.Series("y_proba", y_proba_np)

# Evaluate a single model
print("Evaluating XGBoost Model (b2):")
results_xgboost_b2 = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=y_pred_pl,
    y_proba=y_proba_pl,
    model_name="XGBoost (b2)",
    plot_results=True,
    print_result=True
)


Best XGBoost mean threshold: 0.45091499090194703
Evaluating XGBoost Model (b2):



EVALUATION SUMMARY: XGBoost (b2)

CORE METRICS:
  Accuracy:      0.8624
  Precision:     0.0833
  Recall:        0.1681
  F1 Score:      0.1114
  Matthews CC:   0.0493

PROBABILITY-BASED METRICS:
  ROC AUC:       0.6224
  PR AUC:        0.0827

CONFUSION MATRIX:
  TN:   1880 | FP:    209
  FN:     94 | TP:     19


---

## Catboost

In [15]:
cat_study_b1 = optuna.load_study(study_name="catboost_optimization_days_b_1", storage=db_dir.format('cat_study'))

b1_cat_params = cat_study_b1.best_params
b1_cat_threshold = cat_study_b1.best_trial.user_attrs.get('threshold', None)

cat_model_b1 = CatBoostClassifier(**b1_cat_params, verbose=0)
cat_model_b1.fit(X_np_b1, y_np_b1)


<catboost.core.CatBoostClassifier at 0x3161331d0>

In [16]:
# Classification with best threshold for CatBoost (b1)

print(f"Best CatBoost mean threshold: {b1_cat_threshold}")

y_proba_np = cat_model_b1.predict_proba(X_b1.to_numpy())[:, 1]
y_pred_np = (y_proba_np > b1_cat_threshold).astype(int)

y_true_pl = pl.Series("y_true", y_b1.to_numpy().ravel())
y_pred_pl = pl.Series("y_pred", y_pred_np)
y_proba_pl = pl.Series("y_proba", y_proba_np)

# Evaluate a single model
print("Evaluating CatBoost Model (b1):")
results_catboost_b1 = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=y_pred_pl,
    y_proba=y_proba_pl,
    model_name="CatBoost (b1)",
    plot_results=True,
    print_result=True
)


Best CatBoost mean threshold: 0.4902076305746991
Evaluating CatBoost Model (b1):



EVALUATION SUMMARY: CatBoost (b1)

CORE METRICS:
  Accuracy:      0.8126
  Precision:     0.0978
  Recall:        0.1912
  F1 Score:      0.1294
  Matthews CC:   0.0391

PROBABILITY-BASED METRICS:
  ROC AUC:       0.5289
  PR AUC:        0.0818

CONFUSION MATRIX:
  TN:  14214 | FP:   2287
  FN:   1049 | TP:    248


In [17]:
cat_study_b2 = optuna.load_study(study_name="catboost_optimization_days_b_2", storage=db_dir.format('cat_study'))

b2_cat_params = cat_study_b2.best_params
b2_cat_threshold = cat_study_b2.best_trial.user_attrs.get('threshold', None)

cat_model_b2 = CatBoostClassifier(**b2_cat_params, verbose=0)
cat_model_b2.fit(X_np_b2, y_np_b2)


<catboost.core.CatBoostClassifier at 0x31ba955e0>

In [18]:
# Classification with best threshold for CatBoost (b2)

print(f"Best CatBoost mean threshold: {b2_cat_threshold}")

y_proba_np = cat_model_b2.predict_proba(X_b2.to_numpy())[:, 1]
y_pred_np = (y_proba_np > b2_cat_threshold).astype(int)

y_true_pl = pl.Series("y_true", y_b2.to_numpy().ravel())
y_pred_pl = pl.Series("y_pred", y_pred_np)
y_proba_pl = pl.Series("y_proba", y_proba_np)

# Evaluate a single model
print("Evaluating CatBoost Model (b2):")
results_catboost_b2 = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=y_pred_pl,
    y_proba=y_proba_pl,
    model_name="CatBoost (b2)",
    plot_results=True,
    print_result=True
)


Best CatBoost mean threshold: 0.5077768000452542
Evaluating CatBoost Model (b2):



EVALUATION SUMMARY: CatBoost (b2)

CORE METRICS:
  Accuracy:      0.9105
  Precision:     0.1182
  Recall:        0.1150
  F1 Score:      0.1166
  Matthews CC:   0.0695

PROBABILITY-BASED METRICS:
  ROC AUC:       0.6204
  PR AUC:        0.0752

CONFUSION MATRIX:
  TN:   1992 | FP:     97
  FN:    100 | TP:     13


---

## LightGBM

In [19]:
lgb_study_b1 = optuna.load_study(study_name="lightgbm_optimization_days_b_1", storage=db_dir.format('lgb_study'))

b1_lgb_params = lgb_study_b1.best_params
b1_lgb_threshold = lgb_study_b1.best_trial.user_attrs.get('threshold', None)

lgb_model_b1 = lgb.LGBMClassifier(**b1_lgb_params)
lgb_model_b1.fit(X_np_b1, y_np_b1)


[LightGBM] [Info] Number of positive: 3869, number of negative: 50694
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004492 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4049
[LightGBM] [Info] Number of data points in the train set: 54563, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.070909 -> initscore=-2.572811
[LightGBM] [Info] Start training from score -2.572811


0,1,2
,boosting_type,'gbdt'
,num_leaves,35
,max_depth,-1
,learning_rate,0.16814794125617394
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [20]:
# Classification with best threshold for LightGBM (b1)

print(f"Best LightGBM mean threshold: {b1_lgb_threshold}")

y_proba_np = lgb_model_b1.predict_proba(X_b1.to_numpy())[:, 1]
y_pred_np = (y_proba_np > b1_lgb_threshold).astype(int)

y_true_pl = pl.Series("y_true", y_b1.to_numpy().ravel())
y_pred_pl = pl.Series("y_pred", y_pred_np)
y_proba_pl = pl.Series("y_proba", y_proba_np)

# Evaluate a single model
print("Evaluating LightGBM Model (b1):")
results_lgb_b1 = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=y_pred_pl,
    y_proba=y_proba_pl,
    model_name="LightGBM (b1)",
    plot_results=True,
    print_result=True
)


Best LightGBM mean threshold: 0.2389732979604365
Evaluating LightGBM Model (b1):



X does not have valid feature names, but LGBMClassifier was fitted with feature names




EVALUATION SUMMARY: LightGBM (b1)

CORE METRICS:
  Accuracy:      0.3726
  Precision:     0.0735
  Recall:        0.6554
  F1 Score:      0.1321
  Matthews CC:   0.0031

PROBABILITY-BASED METRICS:
  ROC AUC:       0.5211
  PR AUC:        0.0788

CONFUSION MATRIX:
  TN:   5782 | FP:  10719
  FN:    447 | TP:    850


In [21]:
lgb_study_b2 = optuna.load_study(study_name="lightgbm_optimization_days_b_2", storage=db_dir.format('lgb_study'))

b2_lgb_params = lgb_study_b2.best_params
b2_lgb_threshold = lgb_study_b2.best_trial.user_attrs.get('threshold', None)

lgb_model_b2 = lgb.LGBMClassifier(**b2_lgb_params)
lgb_model_b2.fit(X_np_b2, y_np_b2)


[LightGBM] [Info] Number of positive: 1770, number of negative: 23667
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002223 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4789
[LightGBM] [Info] Number of data points in the train set: 25437, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.069584 -> initscore=-2.593102
[LightGBM] [Info] Start training from score -2.593102


0,1,2
,boosting_type,'gbdt'
,num_leaves,59
,max_depth,-1
,learning_rate,0.14802308897284228
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [22]:
# Classification with best threshold for LightGBM (b2)

print(f"Best LightGBM mean threshold: {b2_lgb_threshold}")

y_proba_np = lgb_model_b2.predict_proba(X_b2.to_numpy())[:, 1]
y_pred_np = (y_proba_np > b2_lgb_threshold).astype(int)

y_true_pl = pl.Series("y_true", y_b2.to_numpy().ravel())
y_pred_pl = pl.Series("y_pred", y_pred_np)
y_proba_pl = pl.Series("y_proba", y_proba_np)

# Evaluate a single model
print("Evaluating LightGBM Model (b2):")
results_lgb_b2 = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=y_pred_pl,
    y_proba=y_proba_pl,
    model_name="LightGBM (b2)",
    plot_results=True,
    print_result=True
)


Best LightGBM mean threshold: 0.18552032253327894
Evaluating LightGBM Model (b2):



X does not have valid feature names, but LGBMClassifier was fitted with feature names




EVALUATION SUMMARY: LightGBM (b2)

CORE METRICS:
  Accuracy:      0.6807
  Precision:     0.0610
  Recall:        0.3628
  F1 Score:      0.1045
  Matthews CC:   0.0291

PROBABILITY-BASED METRICS:
  ROC AUC:       0.5616
  PR AUC:        0.0602

CONFUSION MATRIX:
  TN:   1458 | FP:    631
  FN:     72 | TP:     41


---

## RandomForest

In [23]:
rf_study_b1 = optuna.load_study(study_name="random_forest_optimization_days_b_1", storage=db_dir.format('rf_study'))

b1_rf_params = rf_study_b1.best_params

rf_model_b1 = RandomForestClassifier(**b1_rf_params)
rf_model_b1.fit(X_np_b1, y_np_b1)


0,1,2
,n_estimators,269
,criterion,'gini'
,max_depth,16
,min_samples_split,14
,min_samples_leaf,6
,min_weight_fraction_leaf,0.3013479079171481
,max_features,'sqrt'
,max_leaf_nodes,695
,min_impurity_decrease,0.0
,bootstrap,True


In [25]:
# Classification with best threshold for RandomForest (b1)

y_proba_np = rf_model_b1.predict_proba(X_b1.to_numpy())[:, 1]
y_pred_np = (y_proba_np > 0.5).astype(int)

y_true_pl = pl.Series("y_true", y_b1.to_numpy().ravel())
y_pred_pl = pl.Series("y_pred", y_pred_np)
y_proba_pl = pl.Series("y_proba", y_proba_np)

# Evaluate a single model
print("Evaluating RandomForest Model (b1):")
results_rf_b1 = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=y_pred_pl,
    y_proba=y_proba_pl,
    model_name="RandomForest (b1)",
    plot_results=True,
    print_result=True
)


Evaluating RandomForest Model (b1):



EVALUATION SUMMARY: RandomForest (b1)

CORE METRICS:
  Accuracy:      0.7513
  Precision:     0.0985
  Recall:        0.2961
  F1 Score:      0.1478
  Matthews CC:   0.0522

PROBABILITY-BASED METRICS:
  ROC AUC:       0.5782
  PR AUC:        0.0936

CONFUSION MATRIX:
  TN:  12987 | FP:   3514
  FN:    913 | TP:    384


In [26]:
rf_study_b2 = optuna.load_study(study_name="random_forest_optimization_days_b_2", storage=db_dir.format('rf_study'))

b2_rf_params = rf_study_b2.best_params

rf_model_b2 = RandomForestClassifier(**b2_rf_params)
rf_model_b2.fit(X_np_b2, y_np_b2)


0,1,2
,n_estimators,972
,criterion,'gini'
,max_depth,3
,min_samples_split,7
,min_samples_leaf,9
,min_weight_fraction_leaf,0.16732343046149278
,max_features,'sqrt'
,max_leaf_nodes,37
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
# Classification with best threshold for RandomForest (b2)

y_proba_np = rf_model_b2.predict_proba(X_b2.to_numpy())[:, 1]
y_pred_np = (y_proba_np > 0.5).astype(int)

y_true_pl = pl.Series("y_true", y_b2.to_numpy().ravel())
y_pred_pl = pl.Series("y_pred", y_pred_np)
y_proba_pl = pl.Series("y_proba", y_proba_np)

# Evaluate a single model
print("Evaluating RandomForest Model (b2):")
results_rf_b2 = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=y_pred_pl,
    y_proba=y_proba_pl,
    model_name="RandomForest (b2)",
    plot_results=True,
    print_result=True
)

Evaluating RandomForest Model (b2):



EVALUATION SUMMARY: RandomForest (b2)

CORE METRICS:
  Accuracy:      0.8601
  Precision:     0.1053
  Recall:        0.2301
  F1 Score:      0.1444
  Matthews CC:   0.0869

PROBABILITY-BASED METRICS:
  ROC AUC:       0.6294
  PR AUC:        0.0881

CONFUSION MATRIX:
  TN:   1868 | FP:    221
  FN:     87 | TP:     26


---

## HistGradientBoost

In [29]:
histgb__study_b1 = optuna.load_study(study_name="histgb_optimization_days_b_1", storage=db_dir.format('histgb_study'))

histgb__xgb_params = histgb__study_b1.best_params
histgb__xgb_threshold = histgb__study_b1.best_trial.user_attrs.get('threshold', None)

histgb__model_b1 = HistGradientBoostingClassifier(**histgb__xgb_params,)
histgb__model_b1.fit(X_np_b1, y_np_b1)


0,1,2
,loss,'log_loss'
,learning_rate,0.2901475514215775
,max_iter,173
,max_leaf_nodes,31
,max_depth,9
,min_samples_leaf,10
,l2_regularization,0.06789886253741798
,max_features,1.0
,max_bins,110
,categorical_features,'from_dtype'


In [33]:
# Classification with best threshold for HistGradientBoost (b1)

y_proba_np = histgb__model_b1.predict_proba(X_b1.to_numpy())[:, 1]
y_pred_np = (y_proba_np > 0.5).astype(int)

y_true_pl = pl.Series("y_true", y_b1.to_numpy().ravel())
y_pred_pl = pl.Series("y_pred", y_pred_np)
y_proba_pl = pl.Series("y_proba", y_proba_np)

# Evaluate a single model
print("Evaluating HistGradientBoost Model (b1):")
results_hgb_b1 = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=y_pred_pl,
    y_proba=y_proba_pl,
    model_name="HistGradientBoost (b1)",
    plot_results=True,
    print_result=True
)


Evaluating HistGradientBoost Model (b1):



EVALUATION SUMMARY: HistGradientBoost (b1)

CORE METRICS:
  Accuracy:      0.9189
  Precision:     0.0651
  Recall:        0.0085
  F1 Score:      0.0150
  Matthews CC:   -0.0029

PROBABILITY-BASED METRICS:
  ROC AUC:       0.5449
  PR AUC:        0.0810

CONFUSION MATRIX:
  TN:  16343 | FP:    158
  FN:   1286 | TP:     11


In [34]:
histgb__study_b2 = optuna.load_study(study_name="histgb_optimization_days_b_2", storage=db_dir.format('histgb_study'))

histgb__xgb_params = histgb__study_b2.best_params
histgb__xgb_threshold = histgb__study_b2.best_trial.user_attrs.get('threshold', None)

histgb__model_b2 = HistGradientBoostingClassifier(**histgb__xgb_params,)
histgb__model_b2.fit(X_np_b2, y_np_b2)


0,1,2
,loss,'log_loss'
,learning_rate,0.22352901585727447
,max_iter,100
,max_leaf_nodes,31
,max_depth,6
,min_samples_leaf,10
,l2_regularization,0.21854538462755402
,max_features,1.0
,max_bins,185
,categorical_features,'from_dtype'


In [35]:
# Classification with best threshold for HistGradientBoost (b2)

y_proba_np = histgb__model_b2.predict_proba(X_b2.to_numpy())[:, 1]
y_pred_np = (y_proba_np > 0.5).astype(int)

y_true_pl = pl.Series("y_true", y_b2.to_numpy().ravel())
y_pred_pl = pl.Series("y_pred", y_pred_np)
y_proba_pl = pl.Series("y_proba", y_proba_np)

# Evaluate a single model
print("Evaluating HistGradientBoost Model (b2):")
results_hgb_b2 = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=y_pred_pl,
    y_proba=y_proba_pl,
    model_name="HistGradientBoost (b2)",
    plot_results=True,
    print_result=True
)


Evaluating HistGradientBoost Model (b2):



EVALUATION SUMMARY: HistGradientBoost (b2)

CORE METRICS:
  Accuracy:      0.9482
  Precision:     0.0000
  Recall:        0.0000
  F1 Score:      0.0000
  Matthews CC:   -0.0050

PROBABILITY-BASED METRICS:
  ROC AUC:       0.5669
  PR AUC:        0.0577

CONFUSION MATRIX:
  TN:   2088 | FP:      1
  FN:    113 | TP:      0


---

# Performance analysis

In [38]:
# Compare only "b1" model results using the compare_models function
b1_results = [
    results_xgboost_b1,
    results_catboost_b1,
    results_lgb_b1,
    results_rf_b1,
    results_hgb_b1
]

comparison_b1_df = compare_models(b1_results)
comparison_b1_df.sort('F1', descending=True)


Model,Accuracy,Precision,Recall,F1,Matthews_CC,ROC_AUC,PR_AUC
str,f64,f64,f64,f64,f64,f64,f64
"""RandomForest (b1)""",0.7513,0.0985,0.2961,0.1478,0.0522,0.5782,0.0936
"""LightGBM (b1)""",0.3726,0.0735,0.6554,0.1321,0.0031,0.5211,0.0788
"""CatBoost (b1)""",0.8126,0.0978,0.1912,0.1294,0.0391,0.5289,0.0818
"""XGBoost (b1)""",0.9175,0.1228,0.0216,0.0367,0.0219,0.5745,0.0945
"""HistGradientBoost (b1)""",0.9189,0.0651,0.0085,0.015,-0.0029,0.5449,0.081


In [39]:
# Compare only "b2" model results using the compare_models function
b2_results = [
    results_xgboost_b2,
    results_catboost_b2,
    results_lgb_b2,
    results_rf_b2,
    results_hgb_b2
]

comparison_b2_df = compare_models(b2_results)
comparison_b2_df.sort('F1', descending=True)


Model,Accuracy,Precision,Recall,F1,Matthews_CC,ROC_AUC,PR_AUC
str,f64,f64,f64,f64,f64,f64,f64
"""RandomForest (b2)""",0.8601,0.1053,0.2301,0.1444,0.0869,0.6294,0.0881
"""CatBoost (b2)""",0.9105,0.1182,0.115,0.1166,0.0695,0.6204,0.0752
"""XGBoost (b2)""",0.8624,0.0833,0.1681,0.1114,0.0493,0.6224,0.0827
"""LightGBM (b2)""",0.6807,0.061,0.3628,0.1045,0.0291,0.5616,0.0602
"""HistGradientBoost (b2)""",0.9482,0.0,0.0,0.0,-0.005,0.5669,0.0577


In [40]:
# Get prediction scores (probabilities) for each model's "b1" configuration on X_b1
lgb_proba_b1 = lgb_model_b1.predict_proba(X_b1.to_numpy())[:, 1]
xgb_proba_b1 = xgb_model_b1.predict_proba(X_b1.to_numpy())[:, 1]
cat_proba_b1 = cat_model_b1.predict_proba(X_b1.to_numpy())[:, 1]
rf_proba_b1 = rf_model_b1.predict_proba(X_b1.to_numpy())[:, 1]
hgb_proba_b1 = histgb__model_b1.predict_proba(X_b1.to_numpy())[:, 1]

# Create a DataFrame with the scores
models_scores_b1 = pl.DataFrame({
    "lgb_score": lgb_proba_b1,
    "xgb_score": xgb_proba_b1,
    "cat_score": cat_proba_b1,
    "rf_score": rf_proba_b1,
    "hgb_score": hgb_proba_b1,
    "label": y_b1.to_numpy().ravel()
})

models_scores_b1 = models_scores_b1.with_columns([
    (pl.col("xgb_score") > b1_xgb_threshold).cast(pl.Int8).alias("xgb_pred"),
    (pl.col("cat_score") > b1_cat_threshold).cast(pl.Int8).alias("cat_pred"),
    (pl.col("lgb_score") > b1_lgb_threshold).cast(pl.Int8).alias("lgb_pred"),
    (pl.col("hgb_score") > (histgb__xgb_threshold if histgb__xgb_threshold is not None else 0.5)).cast(pl.Int8).alias("hgb__pred"),
    (pl.col("rf_score") > 0.5).cast(pl.Int8).alias("rf_pred"),
])




X does not have valid feature names, but LGBMClassifier was fitted with feature names



In [41]:
# Get prediction scores (probabilities) for each model's "b2" configuration on X_b2
lgb_proba_b2 = lgb_model_b2.predict_proba(X_b2.to_numpy())[:, 1]
xgb_proba_b2 = xgb_model_b2.predict_proba(X_b2.to_numpy())[:, 1]
cat_proba_b2 = cat_model_b2.predict_proba(X_b2.to_numpy())[:, 1]
rf_proba_b2 = rf_model_b2.predict_proba(X_b2.to_numpy())[:, 1]
hgb_proba_b2 = histgb__model_b2.predict_proba(X_b2.to_numpy())[:, 1]

# Create a DataFrame with the scores
models_scores_b2 = pl.DataFrame({
    "lgb_score": lgb_proba_b2,
    "xgb_score": xgb_proba_b2,
    "cat_score": cat_proba_b2,
    "rf_score": rf_proba_b2,
    "hgb_score": hgb_proba_b2,
    "label": y_b2.to_numpy().ravel()
})

models_scores_b2 = models_scores_b2.with_columns([
    (pl.col("xgb_score") > b2_xgb_threshold).cast(pl.Int8).alias("xgb_pred"),
    (pl.col("cat_score") > b2_cat_threshold).cast(pl.Int8).alias("cat_pred"),
    (pl.col("lgb_score") > b2_lgb_threshold).cast(pl.Int8).alias("lgb_pred"),
    (pl.col("hgb_score") > (histgb__xgb_threshold if histgb__xgb_threshold is not None else 0.5)).cast(pl.Int8).alias("hgb__pred"),
    (pl.col("rf_score") > 0.5).cast(pl.Int8).alias("rf_pred"),
])


X does not have valid feature names, but LGBMClassifier was fitted with feature names





### B1

In [42]:
sampled = models_scores_b1.sample(300)

In [43]:
plot_model_scores(sampled)

### B2

In [44]:
sampled = models_scores_b2.sample(300)

In [45]:
plot_model_scores(sampled)

In [46]:
# Compute the correlation matrix of model_scores using Polars
correlation_matrix_b1 = models_scores_b1.select([col for col in models_scores_b1.columns if "_score" in col]).corr()

In [47]:
correlation_matrix_b1

lgb_score,xgb_score,cat_score,rf_score,hgb_score
f64,f64,f64,f64,f64
1.0,0.390900825757721,0.6912660608799509,0.3164768849912976,-0.0073941081574167
0.3909008257577209,1.0,0.3609792331145965,0.8874928613209423,0.3378596366814413
0.691266060879951,0.3609792331145965,0.9999999999999998,0.3325834984896195,-0.0875026563431784
0.3164768849912976,0.8874928613209423,0.3325834984896195,1.0,0.2969057876750294
-0.0073941081574167,0.3378596366814413,-0.0875026563431784,0.2969057876750294,1.0


In [48]:
fig = px.imshow(
    correlation_matrix_b1.to_numpy(),
    labels=dict(x='Model', y='Model', color='Correlation'),
    x=correlation_matrix_b1.columns,
    y=correlation_matrix_b1.columns,
    color_continuous_scale='RdBu',
    zmin=-1, zmax=1,
    aspect='auto'
)
fig.update_layout(
    width=800,
    height=600,
    title='Correlation Matrix Heatmap'
)
fig.update_layout(title='Correlation Matrix Heatmap')
fig.show()

# Blue means that variable X and variable Y follow the same behaviour (both increasing or decreasing)
# Red means that variable X has the opposite behaviour of variable Y

In [49]:
# Compute the correlation matrix of model_scores using Polars
correlation_matrix_b2 = models_scores_b2.select([col for col in models_scores_b2.columns if "_score" in col]).corr()

In [50]:
correlation_matrix_b2

lgb_score,xgb_score,cat_score,rf_score,hgb_score
f64,f64,f64,f64,f64
1.0,0.4849847298712137,0.5065997392211541,0.3274980445398141,0.1840526636217543
0.4849847298712137,1.0,0.8077729489370967,0.8367232562999417,0.4308135431178263
0.5065997392211541,0.8077729489370966,1.0,0.6795125159671392,0.3103043723346663
0.3274980445398141,0.8367232562999418,0.6795125159671392,1.0,0.4064809471977929
0.1840526636217543,0.4308135431178264,0.3103043723346664,0.4064809471977929,0.9999999999999998


In [51]:
fig = px.imshow(
    correlation_matrix_b1.to_numpy(),
    labels=dict(x='Model', y='Model', color='Correlation'),
    x=correlation_matrix_b1.columns,
    y=correlation_matrix_b1.columns,
    color_continuous_scale='RdBu',
    zmin=-1, zmax=1,
    aspect='auto'
)
fig.update_layout(
    width=800,
    height=600,
    title='Correlation Matrix Heatmap'
)
fig.update_layout(title='Correlation Matrix Heatmap')
fig.show()

# Blue means that variable X and variable Y follow the same behaviour (both increasing or decreasing)
# Red means that variable X has the opposite behaviour of variable Y

In [56]:
# For each row, check pairwise agreement/disagreement and the general decision (majority vote), including HistGradientBoosting for b2
opposite_decisions_b1 = models_scores_b1.with_columns([
    (pl.col("xgb_pred") != pl.col("cat_pred")).alias("opposite_decision_xgb_vs_cat"),
    (pl.col("xgb_pred") != pl.col("lgb_pred")).alias("opposite_decision_xgb_vs_lgb"),
    (pl.col("xgb_pred") != pl.col("rf_pred")).alias("opposite_decision_xgb_vs_rf"),
    (pl.col("xgb_pred") != pl.col("hgb__pred")).alias("opposite_decision_xgb_vs_hgb"),
    (pl.col("cat_pred") != pl.col("lgb_pred")).alias("opposite_decision_cat_vs_lgb"),
    (pl.col("cat_pred") != pl.col("rf_pred")).alias("opposite_decision_cat_vs_rf"),
    (pl.col("cat_pred") != pl.col("hgb__pred")).alias("opposite_decision_cat_vs_hgb"),
    (pl.col("lgb_pred") != pl.col("rf_pred")).alias("opposite_decision_lgb_vs_rf"),
    (pl.col("lgb_pred") != pl.col("hgb__pred")).alias("opposite_decision_lgb_vs_hgb"),
    (pl.col("rf_pred") != pl.col("hgb__pred")).alias("opposite_decision_rf_vs_hgb"),
    (pl.sum_horizontal(["xgb_pred", "cat_pred", "lgb_pred", "rf_pred", "hgb__pred"]) >= 3).cast(pl.Int64).alias("majority_vote"),
    pl.Series("label", y_b1.to_numpy().ravel())
]).select([
    "label", "xgb_pred", "cat_pred", "lgb_pred", "rf_pred", "hgb__pred",
    "opposite_decision_xgb_vs_cat",
    "opposite_decision_xgb_vs_lgb",
    "opposite_decision_xgb_vs_rf",
    "opposite_decision_xgb_vs_hgb",
    "opposite_decision_cat_vs_lgb",
    "opposite_decision_cat_vs_rf",
    "opposite_decision_cat_vs_hgb",
    "opposite_decision_lgb_vs_rf",
    "opposite_decision_lgb_vs_hgb",
    "opposite_decision_rf_vs_hgb",
    "majority_vote"
])

opposite_decisions_b1

label,xgb_pred,cat_pred,lgb_pred,rf_pred,hgb__pred,opposite_decision_xgb_vs_cat,opposite_decision_xgb_vs_lgb,opposite_decision_xgb_vs_rf,opposite_decision_xgb_vs_hgb,opposite_decision_cat_vs_lgb,opposite_decision_cat_vs_rf,opposite_decision_cat_vs_hgb,opposite_decision_lgb_vs_rf,opposite_decision_lgb_vs_hgb,opposite_decision_rf_vs_hgb,majority_vote
bool,i8,i8,i8,i8,i8,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,i64
true,0,0,1,0,0,false,true,false,false,true,false,false,true,true,false,0
false,0,0,0,0,0,false,false,false,false,false,false,false,false,false,false,0
false,0,0,1,0,0,false,true,false,false,true,false,false,true,true,false,0
false,0,1,1,1,0,true,true,true,false,false,false,true,false,true,true,1
false,0,0,1,0,0,false,true,false,false,true,false,false,true,true,false,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
false,0,0,1,0,0,false,true,false,false,true,false,false,true,true,false,0
false,0,0,1,0,0,false,true,false,false,true,false,false,true,true,false,0
false,0,0,0,0,0,false,false,false,false,false,false,false,false,false,false,0
false,0,0,1,0,0,false,true,false,false,true,false,false,true,true,false,0


In [57]:
# Count how many times each pair of models give opposite decisions and who is correct in those cases, with percentages
def opposite_decision_stats(preds_df, y_true_col="label"):
    pairs = [
        ("xgb_pred", "cat_pred"),
        ("xgb_pred", "lgb_pred"),
        ("xgb_pred", "rf_pred"),
        ("xgb_pred", "hgb__pred"),
        ("cat_pred", "lgb_pred"),
        ("cat_pred", "rf_pred"),
        ("cat_pred", "hgb__pred"),
        ("lgb_pred", "rf_pred"),
        ("lgb_pred", "hgb__pred"),
        ("rf_pred", "hgb__pred"),
    ]
    n_total = preds_df.height
    stats = []
    for a, b in pairs:
        mask = preds_df[a] != preds_df[b]
        n_opposite = mask.sum()
        correct_a = ((preds_df[a] == preds_df[y_true_col]) & mask).sum()
        correct_b = ((preds_df[b] == preds_df[y_true_col]) & mask).sum()
        stats.append({
            "model_a": a,
            "model_b": b,
            "opposite_count": n_opposite,
            "opposite_pct": round(n_opposite / n_total * 100, 2),
            "model_a_correct": correct_a,
            "model_a_correct_pct": round(correct_a / n_opposite * 100, 2) if n_opposite > 0 else 0,
            "model_b_correct": correct_b,
            "model_b_correct_pct": round(correct_b / n_opposite * 100, 2) if n_opposite > 0 else 0,
        })
    return pl.DataFrame(stats)

opposite_stats_b1 = opposite_decision_stats(opposite_decisions_b1)
opposite_stats_b1

model_a,model_b,opposite_count,opposite_pct,model_a_correct,model_a_correct_pct,model_b_correct,model_b_correct_pct
str,str,i64,f64,i64,f64,i64,f64
"""xgb_pred""","""cat_pred""",2611,14.67,2239,85.75,372,14.25
"""xgb_pred""","""lgb_pred""",11435,64.25,10566,92.4,869,7.6
"""xgb_pred""","""rf_pred""",3738,21.0,3348,89.57,390,10.43
"""xgb_pred""","""hgb__pred""",379,2.13,177,46.7,202,53.3
"""cat_pred""","""lgb_pred""",9226,51.84,8528,92.43,698,7.57
"""cat_pred""","""rf_pred""",3655,20.54,2373,64.92,1282,35.08
"""cat_pred""","""hgb__pred""",2670,15.0,389,14.57,2281,85.43
"""lgb_pred""","""rf_pred""",9619,54.05,1440,14.97,8179,85.03
"""lgb_pred""","""hgb__pred""",11602,65.19,940,8.1,10662,91.9
"""rf_pred""","""hgb__pred""",3917,22.01,467,11.92,3450,88.08


In [58]:
# For each row, check pairwise agreement/disagreement and the general decision (majority vote), including HistGradientBoosting for b2
opposite_decisions_b2 = models_scores_b2.with_columns([
    (pl.col("xgb_pred") != pl.col("cat_pred")).alias("opposite_decision_xgb_vs_cat"),
    (pl.col("xgb_pred") != pl.col("lgb_pred")).alias("opposite_decision_xgb_vs_lgb"),
    (pl.col("xgb_pred") != pl.col("rf_pred")).alias("opposite_decision_xgb_vs_rf"),
    (pl.col("xgb_pred") != pl.col("hgb__pred")).alias("opposite_decision_xgb_vs_hgb"),
    (pl.col("cat_pred") != pl.col("lgb_pred")).alias("opposite_decision_cat_vs_lgb"),
    (pl.col("cat_pred") != pl.col("rf_pred")).alias("opposite_decision_cat_vs_rf"),
    (pl.col("cat_pred") != pl.col("hgb__pred")).alias("opposite_decision_cat_vs_hgb"),
    (pl.col("lgb_pred") != pl.col("rf_pred")).alias("opposite_decision_lgb_vs_rf"),
    (pl.col("lgb_pred") != pl.col("hgb__pred")).alias("opposite_decision_lgb_vs_hgb"),
    (pl.col("rf_pred") != pl.col("hgb__pred")).alias("opposite_decision_rf_vs_hgb"),
    (pl.sum_horizontal(["xgb_pred", "cat_pred", "lgb_pred", "rf_pred", "hgb__pred"]) >= 3).cast(pl.Int64).alias("majority_vote"),
    pl.Series("label", y_b2.to_numpy().ravel())
]).select([
    "label", "xgb_pred", "cat_pred", "lgb_pred", "rf_pred", "hgb__pred",
    "opposite_decision_xgb_vs_cat",
    "opposite_decision_xgb_vs_lgb",
    "opposite_decision_xgb_vs_rf",
    "opposite_decision_xgb_vs_hgb",
    "opposite_decision_cat_vs_lgb",
    "opposite_decision_cat_vs_rf",
    "opposite_decision_cat_vs_hgb",
    "opposite_decision_lgb_vs_rf",
    "opposite_decision_lgb_vs_hgb",
    "opposite_decision_rf_vs_hgb",
    "majority_vote"
])

opposite_decisions_b2

label,xgb_pred,cat_pred,lgb_pred,rf_pred,hgb__pred,opposite_decision_xgb_vs_cat,opposite_decision_xgb_vs_lgb,opposite_decision_xgb_vs_rf,opposite_decision_xgb_vs_hgb,opposite_decision_cat_vs_lgb,opposite_decision_cat_vs_rf,opposite_decision_cat_vs_hgb,opposite_decision_lgb_vs_rf,opposite_decision_lgb_vs_hgb,opposite_decision_rf_vs_hgb,majority_vote
bool,i8,i8,i8,i8,i8,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,i64
false,0,0,0,0,0,false,false,false,false,false,false,false,false,false,false,0
false,0,0,0,0,0,false,false,false,false,false,false,false,false,false,false,0
false,0,0,0,0,0,false,false,false,false,false,false,false,false,false,false,0
false,1,1,0,1,0,false,true,false,true,true,false,true,true,false,true,1
false,0,0,1,0,0,false,true,false,false,true,false,false,true,true,false,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
false,0,0,0,0,0,false,false,false,false,false,false,false,false,false,false,0
false,0,0,0,0,0,false,false,false,false,false,false,false,false,false,false,0
false,0,0,0,1,0,false,false,true,false,false,true,false,true,false,true,0
false,0,0,0,0,0,false,false,false,false,false,false,false,false,false,false,0


In [59]:
# Count how many times each pair of models give opposite decisions and who is correct in those cases, with percentages
def opposite_decision_stats(preds_df, y_true_col="label"):
    pairs = [
        ("xgb_pred", "cat_pred"),
        ("xgb_pred", "lgb_pred"),
        ("xgb_pred", "rf_pred"),
        ("xgb_pred", "hgb__pred"),
        ("cat_pred", "lgb_pred"),
        ("cat_pred", "rf_pred"),
        ("cat_pred", "hgb__pred"),
        ("lgb_pred", "rf_pred"),
        ("lgb_pred", "hgb__pred"),
        ("rf_pred", "hgb__pred"),
    ]
    n_total = preds_df.height
    stats = []
    for a, b in pairs:
        mask = preds_df[a] != preds_df[b]
        n_opposite = mask.sum()
        correct_a = ((preds_df[a] == preds_df[y_true_col]) & mask).sum()
        correct_b = ((preds_df[b] == preds_df[y_true_col]) & mask).sum()
        stats.append({
            "model_a": a,
            "model_b": b,
            "opposite_count": n_opposite,
            "opposite_pct": round(n_opposite / n_total * 100, 2),
            "model_a_correct": correct_a,
            "model_a_correct_pct": round(correct_a / n_opposite * 100, 2) if n_opposite > 0 else 0,
            "model_b_correct": correct_b,
            "model_b_correct_pct": round(correct_b / n_opposite * 100, 2) if n_opposite > 0 else 0,
        })
    return pl.DataFrame(stats)

opposite_stats_b2 = opposite_decision_stats(opposite_decisions_b2)
opposite_stats_b2

model_a,model_b,opposite_count,opposite_pct,model_a_correct,model_a_correct_pct,model_b_correct,model_b_correct_pct
str,str,i64,f64,i64,f64,i64,f64
"""xgb_pred""","""cat_pred""",132,5.99,13,9.85,119,90.15
"""xgb_pred""","""lgb_pred""",606,27.52,503,83.0,103,17.0
"""xgb_pred""","""rf_pred""",139,6.31,72,51.8,67,48.2
"""xgb_pred""","""hgb__pred""",229,10.4,20,8.73,209,91.27
"""cat_pred""","""lgb_pred""",630,28.61,568,90.16,62,9.84
"""cat_pred""","""rf_pred""",189,8.58,150,79.37,39,20.63
"""cat_pred""","""hgb__pred""",111,5.04,14,12.61,97,87.39
"""lgb_pred""","""rf_pred""",647,29.38,126,19.47,521,80.53
"""lgb_pred""","""hgb__pred""",673,30.56,42,6.24,631,93.76
"""rf_pred""","""hgb__pred""",248,11.26,27,10.89,221,89.11


looking at this stats catboost and xgboost are a good combination

In [None]:
# TODO: revwrite the message

**Key Insights:**

Random Forest is severely overfitting - Perfect recall (1.0) and precision (1.0) with 0 Matthews Correlation Coefficient suggests it's memorizing the training data rather than learning generalizable patterns.

**CatBoost (base) is actually your best performer:**

- Highest accuracy: 0.8408
- Best balance across metrics
- Reasonable Matthews CC: 0.0294
- Good ROC_AUC: 0.5521


XGBoost (base) is second best with solid balanced performance across metrics.

In [None]:
import pickle

artifacts_dir = data_dir / 'models/artifacts'
artifacts_dir.mkdir(parents=True, exist_ok=True)

# Save models for b1
with open(artifacts_dir / 'pre_xgb_days_b1.pkl', 'wb') as f:
    pickle.dump(xgb_model_b1, f)
with open(artifacts_dir / 'pre_cat_days_b1.pkl', 'wb') as f:
    pickle.dump(cat_model_b1, f)
with open(artifacts_dir / 'pre_lgb_days_b1.pkl', 'wb') as f:
    pickle.dump(lgb_model_b1, f)
with open(artifacts_dir / 'pre_rf_days_b1.pkl', 'wb') as f:
    pickle.dump(rf_model_b1, f)
with open(artifacts_dir / 'pre_histgb_days_b1.pkl', 'wb') as f:
    pickle.dump(histgb__model_b1, f)

# Save models for b2
with open(artifacts_dir / 'pre_xgb_days_b2.pkl', 'wb') as f:
    pickle.dump(xgb_model_b2, f)
with open(artifacts_dir / 'pre_cat_days_b2.pkl', 'wb') as f:
    pickle.dump(cat_model_b2, f)
with open(artifacts_dir / 'pre_lgb_days_b2.pkl', 'wb') as f:
    pickle.dump(lgb_model_b2, f)
with open(artifacts_dir / 'pre_rf_days_b2.pkl', 'wb') as f:
    pickle.dump(rf_model_b2, f)
with open(artifacts_dir / 'pre_histgb_days_b2.pkl', 'wb') as f:
    pickle.dump(histgb__model_b2, f)