# Imports and definitions

In [22]:
import os
from pathlib import Path

import polars as pl
import numpy as np

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, precision_recall_curve, roc_curve, auc,
    confusion_matrix, classification_report, matthews_corrcoef
)

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

import optuna

# Polars display options for this notebook. The results are assigned to `_`
# so the returned Config object is not echoed as cell output.
# NOTE(review): passing None to set_tbl_cols presumably resets the column
# limit to the library default rather than showing all columns (a negative
# value is the documented "show everything" setting) — confirm the intent.
_ = pl.Config.set_tbl_cols(None)
# Allow long string values (up to 500 characters) in rendered tables.
_ = pl.Config.set_fmt_str_lengths(500)
# Use the "full" float format instead of the default mixed format.
_ = pl.Config.set_fmt_float("full")

In [23]:
import warnings
# Ignore RuntimeWarning instances raised from modules whose name starts with "sklearn".
# NOTE(review): the "Precision is ill-defined ..." and "X does not have valid
# feature names ..." messages that still appear in the saved outputs below are
# presumably UserWarning subclasses, so this filter does not hide them — confirm
# whether a broader category was intended.
warnings.filterwarnings('ignore', category=RuntimeWarning, module='sklearn')

In [24]:
# Project layout. The repository root can be overridden with the
# MAGENTA_TASK_DIR environment variable so the notebook is not tied to a
# single machine; the original location remains the default.
base_dir = Path(os.environ.get('MAGENTA_TASK_DIR', '/Users/danlab/code/magenta-task/'))
code_dir = base_dir / 'notebooks'
data_dir = code_dir / "data"
features_dir = data_dir / 'features'
train_dir = data_dir / 'train'
test_dir = data_dir / 'test'

# URL template for the Optuna study databases: db_dir.format('<file stem>').
# NOTE: despite its name this is a string template, not a Path, and the
# location inside the URL is relative to the kernel's working directory
# (it is not derived from data_dir).
db_dir = 'sqlite:///data/models/{}.db'

In [25]:
def evaluate_classification_model(y_true, y_pred, y_proba=None,
                                  model_name="Model", pos_label=1,
                                  plot_results=True, print_result=True):
    """
    Comprehensive evaluation function for binary classification models.

    Parameters:
    -----------
    y_true : Polars Series or array-like
        True labels
    y_pred : Polars Series or array-like
        Predicted labels
    y_proba : Polars Series or array-like, optional
        Predicted probabilities for positive class. When omitted, the
        probability-based metrics (ROC AUC, PR AUC) and plots are skipped.
    model_name : str
        Name of the model for reporting
    pos_label : int or str
        Label of positive class (used for precision, recall and F1)
    plot_results : bool
        Whether to generate plots
    print_result : bool
        Whether to print the formatted text summary

    Returns:
    --------
    dict : Dictionary containing all evaluation metrics: 'model_name',
        'accuracy', 'precision', 'recall', 'f1_score', 'matthews_corr',
        'confusion_matrix', 'tn', 'fp', 'fn', 'tp' and, when ``y_proba`` is
        given, 'roc_auc' and 'pr_auc'.

    Raises:
    -------
    ValueError
        If the confusion matrix does not have exactly four cells (i.e. the
        labels are not binary), because it is unpacked into tn/fp/fn/tp.
    """

    def _as_numpy(values):
        # Series objects expose to_numpy(); lists and ndarrays go through numpy.
        return values.to_numpy() if hasattr(values, "to_numpy") else np.asarray(values)

    y_true_np = _as_numpy(y_true)
    y_pred_np = _as_numpy(y_pred)
    # y_proba is optional: keep None so the probability section is skipped.
    y_proba_np = None if y_proba is None else _as_numpy(y_proba)

    results = {'model_name': model_name}

    results['accuracy'] = accuracy_score(y_true_np, y_pred_np)
    results['precision'] = precision_score(y_true_np, y_pred_np, pos_label=pos_label, average='binary')
    results['recall'] = recall_score(y_true_np, y_pred_np, pos_label=pos_label, average='binary')
    results['f1_score'] = f1_score(y_true_np, y_pred_np, pos_label=pos_label, average='binary')
    results['matthews_corr'] = matthews_corrcoef(y_true_np, y_pred_np)

    cm = confusion_matrix(y_true_np, y_pred_np)
    results['confusion_matrix'] = cm
    # Row-major order of a 2x2 matrix with sorted labels: tn, fp, fn, tp.
    results['tn'], results['fp'], results['fn'], results['tp'] = cm.ravel()

    if y_proba_np is not None:
        results['roc_auc'] = roc_auc_score(y_true_np, y_proba_np)
        # Area under the precision-recall curve (recall on the x axis).
        pr_precision, pr_recall, _ = precision_recall_curve(y_true_np, y_proba_np)
        results['pr_auc'] = auc(pr_recall, pr_precision)

    # Generate plots
    if plot_results:
        plot_evaluation_results(y_true_np, y_pred_np, y_proba_np, model_name, results)

    if print_result:
        print_evaluation_summary(results)

    return results


def plot_evaluation_results(y_true, y_pred, y_proba, model_name, results):
    """Generate, show and return the evaluation figures (Plotly).

    Parameters
    ----------
    y_true, y_pred : numpy arrays of true and predicted labels.
    y_proba : numpy array of positive-class scores, or None to skip the
        ROC, precision-recall and score-distribution figures.
    model_name : str used in every figure title.
    results : dict that must contain 'confusion_matrix' and, when
        ``y_proba`` is given, 'roc_auc' and 'pr_auc'.

    Returns
    -------
    list of plotly Figure objects, in display order.
    """
    figures = [
        _confusion_matrix_figure(results['confusion_matrix'], model_name),
        _classification_report_figure(y_true, y_pred, model_name),
    ]

    if y_proba is not None:
        figures.append(_roc_curve_figure(y_true, y_proba, model_name, results["roc_auc"]))
        figures.append(_pr_curve_figure(y_true, y_proba, model_name, results["pr_auc"]))
        figures.append(_score_distribution_figure(y_true, y_proba, model_name))

    for fig in figures:
        fig.show()

    return figures


def _confusion_matrix_figure(cm, model_name):
    """Annotated heatmap of a 2x2 confusion matrix."""
    fig = go.Figure(data=go.Heatmap(
        z=cm,
        x=['Predicted 0', 'Predicted 1'],
        y=['Actual 0', 'Actual 1'],
        colorscale='Blues',
        text=cm,
        texttemplate="%{text}",
        textfont={"size": 20}
    ))
    fig.update_layout(
        title=f'Confusion Matrix: {model_name}',
        xaxis_title='Predicted',
        yaxis_title='Actual',
        xaxis_showgrid=False,
        yaxis_showgrid=False,
        height=400, width=500
    )
    return fig


def _classification_report_figure(y_true, y_pred, model_name):
    """Heatmap of per-class precision / recall / F1 from sklearn's report."""
    # zero_division=0 yields the same values as the default but without the
    # "Precision is ill-defined" warnings when a class is never predicted.
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    classes = [key for key in report if key not in ['accuracy', 'macro avg', 'weighted avg']]
    metrics = ['precision', 'recall', 'f1-score']

    # Rows are metrics, columns are classes.
    z_data = np.array([[report[cls][metric] for cls in classes] for metric in metrics],
                      dtype=float)

    fig = go.Figure(data=go.Heatmap(
        z=z_data,
        x=classes,
        y=metrics,
        colorscale='RdYlBu_r',
        text=np.around(z_data, decimals=3),
        texttemplate="%{text}",
        textfont={"size": 14}
    ))
    fig.update_layout(
        title=f'Classification Report: {model_name}',
        xaxis_title='Class',
        yaxis_title='Metric',
        xaxis_showgrid=False,
        yaxis_showgrid=False,
        height=400, width=600
    )
    return fig


def _roc_curve_figure(y_true, y_proba, model_name, roc_auc):
    """ROC curve with the diagonal of a random classifier for reference."""
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines',
                             name=f'ROC (AUC = {roc_auc:.3f})',
                             line=dict(width=2)))
    fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines',
                             name='Random Classifier',
                             line=dict(dash='dash', color='grey')))
    fig.update_layout(
        title=f'ROC Curve: {model_name}',
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        hovermode='x unified',
        height=450, width=600
    )
    return fig


def _pr_curve_figure(y_true, y_proba, model_name, pr_auc):
    """Precision-recall curve with the positive-rate baseline."""
    precision, recall, _ = precision_recall_curve(y_true, y_proba)
    # Baseline is the mean of y_true, drawn as a horizontal line.
    baseline = np.mean(y_true)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=recall, y=precision, mode='lines',
                             name=f'PR (AUC = {pr_auc:.3f})',
                             line=dict(width=2)))
    fig.add_trace(go.Scatter(x=[0, 1], y=[baseline, baseline], mode='lines',
                             name='Baseline',
                             line=dict(dash='dash', color='grey')))
    fig.update_layout(
        title=f'Precision-Recall Curve: {model_name}',
        xaxis_title='Recall',
        yaxis_title='Precision',
        hovermode='x unified',
        height=450, width=600
    )
    return fig


def _score_distribution_figure(y_true, y_proba, model_name):
    """Overlaid density histograms of the scores for labels 0 and 1."""
    fig = go.Figure()
    fig.add_trace(go.Histogram(x=y_proba[y_true == 0], name='Negative Class',
                               marker_color='red', opacity=0.6, histnorm='probability density'))
    fig.add_trace(go.Histogram(x=y_proba[y_true == 1], name='Positive Class',
                               marker_color='blue', opacity=0.6, histnorm='probability density'))
    fig.update_layout(
        title=f'Prediction Distribution: {model_name}',
        xaxis_title='Predicted Probability',
        yaxis_title='Density',
        barmode='overlay',
        hovermode='x unified',
        height=450, width=600
    )
    return fig

def print_evaluation_summary(results):
    """Write a human-readable report of an evaluation ``results`` dict to stdout.

    Expects the keys 'model_name', 'accuracy', 'precision', 'recall',
    'f1_score', 'matthews_corr', 'tn', 'fp', 'fn' and 'tp'. The
    probability section is printed only when 'roc_auc' is present, in which
    case 'pr_auc' is read as well.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"EVALUATION SUMMARY: {results['model_name']}")
    print(rule)

    # (line prefix, results key) pairs; every value is printed with 4 decimals.
    core_rows = (
        ("  Accuracy:      ", 'accuracy'),
        ("  Precision:     ", 'precision'),
        ("  Recall:        ", 'recall'),
        ("  F1 Score:      ", 'f1_score'),
        ("  Matthews CC:   ", 'matthews_corr'),
    )
    proba_rows = (
        ("  ROC AUC:       ", 'roc_auc'),
        ("  PR AUC:        ", 'pr_auc'),
    )

    print("\nCORE METRICS:")
    for prefix, key in core_rows:
        print(f"{prefix}{results[key]:.4f}")

    if 'roc_auc' in results:
        print("\nPROBABILITY-BASED METRICS:")
        for prefix, key in proba_rows:
            print(f"{prefix}{results[key]:.4f}")

    # Counts are right-aligned in a field of width 6.
    print("\nCONFUSION MATRIX:")
    tn, fp = results['tn'], results['fp']
    fn, tp = results['fn'], results['tp']
    print(f"  TN: {tn:>6} | FP: {fp:>6}")
    print(f"  FN: {fn:>6} | TP: {tp:>6}")
    

def compare_models(models_results):
    """Build a Polars comparison table, one row per evaluated model.

    ``models_results`` is an iterable of result dicts as produced by
    ``evaluate_classification_model``. The ROC_AUC / PR_AUC columns are
    filled only for results that carry a 'roc_auc' entry. All Float64
    columns are rounded to four decimals.
    """
    # (output column, results key) for the metrics every result provides.
    always = (
        ('Model', 'model_name'),
        ('Accuracy', 'accuracy'),
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1', 'f1_score'),
        ('Matthews_CC', 'matthews_corr'),
    )
    # Present only when probabilities were supplied at evaluation time.
    with_proba = (
        ('ROC_AUC', 'roc_auc'),
        ('PR_AUC', 'pr_auc'),
    )

    records = []
    for entry in models_results:
        record = {column: entry[key] for column, key in always}
        if 'roc_auc' in entry:
            for column, key in with_proba:
                record[column] = entry[key]
        records.append(record)

    table = pl.DataFrame(records)
    rounded = [
        pl.col(name).round(4)
        for name, dtype in table.schema.items()
        if dtype == pl.Float64
    ]
    return table.with_columns(rounded)

In [56]:
def plot_model_scores(data):
    """
    Plot each model's score as a line with per-row correctness markers.

    One subplot row is drawn per model (XGBoost, CatBoost, RandomForest,
    LightGBM, HistGradientBoosting). Markers are green where the model's
    prediction equals ``label`` and red where it does not; when ``data`` has
    no ``label`` column, correctness is unknown and markers are grey.

    Args:
        data (pl.DataFrame): DataFrame with the score columns
            ['xgb_score', 'cat_score', 'rf_score', 'lgb_score', 'hgb_score']
            and the prediction columns
            ['xgb_pred', 'cat_pred', 'rf_pred', 'lgb_pred', 'hgb__pred'].
            For HistGradientBoosting both the single- and double-underscore
            spellings ('hgb_score'/'hgb__score', 'hgb__pred'/'hgb_pred') are
            accepted. An optional 'label' column holds the true labels.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # (display name, score column candidates, prediction column candidates, line colour)
    panels = [
        ("XGBoost", ("xgb_score",), ("xgb_pred",), 'royalblue'),
        ("CatBoost", ("cat_score",), ("cat_pred",), 'orange'),
        ("RandomForest", ("rf_score",), ("rf_pred",), 'green'),
        ("LightGBM", ("lgb_score",), ("lgb_pred",), 'purple'),
        ("HistGradientBoosting", ("hgb_score", "hgb__score"), ("hgb__pred", "hgb_pred"), '#FFA15A'),
    ]

    def resolve(candidates):
        # First candidate present in the frame; otherwise the first name, so
        # the column lookup itself raises the missing-column error.
        return next((name for name in candidates if name in data.columns), candidates[0])

    x = np.arange(data.height)
    label = data["label"].to_numpy() if "label" in data.columns else None

    def get_colors(pred_col):
        preds = data[pred_col].to_numpy()
        if label is None:
            # Without labels every comparison would be False (all red),
            # which would wrongly read as "all predictions incorrect".
            return np.full(preds.shape, "grey")
        return np.where(preds == label, "green", "red")

    fig = make_subplots(
        rows=len(panels), cols=1, shared_xaxes=True,
        subplot_titles=[f"{name} Score" for name, _, _, _ in panels]
    )

    for row, (name, score_cols, pred_cols, line_color) in enumerate(panels, start=1):
        scores = data[resolve(score_cols)].to_numpy()

        # Score trajectory.
        fig.add_trace(
            go.Scatter(
                x=x,
                y=scores,
                mode='lines',
                name=f'{name} Line',
                line=dict(color=line_color),
                showlegend=True
            ),
            row=row, col=1
        )
        # Same points again as markers, coloured by correctness.
        fig.add_trace(
            go.Scatter(
                x=x,
                y=scores,
                mode='markers',
                marker=dict(color=get_colors(resolve(pred_cols))),
                name=f'{name} Correct/Incorrect',
                hovertemplate="index: %{x}<br>score: %{y:.3f}<br>label: %{customdata}",
                customdata=label,
                showlegend=True
            ),
            row=row, col=1
        )

    fig.update_layout(height=1500, width=1600, title_text="Model Scores: Line + Correctness Scatter")
    fig.show()

# Load data

In [27]:
%%time

# Training split, read from parquet under train_dir
# (file name suggests the 80% partition — TODO confirm).
train = pl.read_parquet(train_dir / 'data-v0-80.parquet')

CPU times: user 13 ms, sys: 7.7 ms, total: 20.7 ms
Wall time: 22.5 ms


In [28]:
%%time

# Test split, read from parquet under test_dir
# (file name suggests the 20% partition — TODO confirm).
test = pl.read_parquet(test_dir / 'data-v0-20.parquet')

CPU times: user 4.83 ms, sys: 4.23 ms, total: 9.06 ms
Wall time: 11.1 ms


# Prepare data

In [29]:
# Training features: every column except the two id columns (presumably
# identifiers — verify) and the target column.
X_train = train.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
# Target kept as a single-column DataFrame; it is flattened to 1-D when converted to numpy.
y_train = train.select('has_done_upselling')

In [30]:
# Hold-out test split: same column selection as for the training features.
# NOTE(review): the generic names X / y refer to the TEST data here; column
# order is assumed to match X_train because both parquet files presumably
# share a schema — confirm.

X = test.select(pl.exclude(['rating_account_id', 'customer_id', 'has_done_upselling']))
y = test.select('has_done_upselling')

In [31]:
# Training data as numpy arrays; every model in this notebook is fitted on these.
X_np = X_train.to_numpy()
# Single-column frame -> 1-D label vector.
y_np = y_train.to_numpy().ravel()

# True test labels as a 1-D vector, reused by every evaluation cell.
y_true_np = y.to_numpy().ravel()

# Evaluation

## XGBoost

In [32]:
# Reload the "xgboost_optimization" study, take its best trial's parameters
# and stored decision threshold, and refit on the training arrays.
xgb_study_best = optuna.load_study(
    storage=db_dir.format('xgb_study'),
    study_name="xgboost_optimization",
)

best_xgb_threshold = xgb_study_best.best_trial.user_attrs.get('threshold')
best_xgb_params = xgb_study_best.best_params

xgb_model_best = xgb.XGBClassifier(**best_xgb_params)
xgb_model_best.fit(X_np, y_np)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,'gbtree'
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9555845404311799
,device,
,early_stopping_rounds,
,enable_categorical,False


In [33]:
# Classification with the threshold stored on the best XGBoost trial

if best_xgb_threshold is None:
    # The study lookup falls back to None when no 'threshold' attribute was
    # stored; comparing probabilities against None would fail obscurely.
    raise ValueError("No 'threshold' user attribute stored on the best XGBoost trial.")

print(f"Best XGBoost mean threshold: {best_xgb_threshold}")
y_proba_np = xgb_model_best.predict_proba(X.to_numpy())[:, 1]
y_pred_np = (y_proba_np > best_xgb_threshold).astype(int)

y_true_pl = pl.Series("y_true", y_true_np)
y_pred_pl = pl.Series("y_pred", y_pred_np)
y_proba_pl = pl.Series("y_proba", y_proba_np)


# Evaluate a single model
print("Evaluating XGBoost Model (best):")
results_xgboost_best = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=y_pred_pl,
    y_proba=y_proba_pl,
    model_name="XGBoost (best)",
    plot_results=True,
    print_result=True
)

Best XGBoost mean threshold: 0.6058635592460633
Evaluating XGBoost Model (best):



EVALUATION SUMMARY: XGBoost (best)

CORE METRICS:
  Accuracy:      0.7401
  Precision:     0.0958
  Recall:        0.3184
  F1 Score:      0.1473
  Matthews CC:   0.0547

PROBABILITY-BASED METRICS:
  ROC AUC:       0.5670
  PR AUC:        0.0874

CONFUSION MATRIX:
  TN:  14354 | FP:   4236
  FN:    961 | TP:    449


In [34]:
# Reload the "xgboost_optimization_basef1" study and refit its best
# parameters on the training arrays (evaluated at a 0.5 threshold below).
xgb_study_base = optuna.load_study(
    storage=db_dir.format('xgb_study'),
    study_name="xgboost_optimization_basef1",
)
best_xgb_params = xgb_study_base.best_params

xgb_model_base = xgb.XGBClassifier(**best_xgb_params)
xgb_model_base.fit(X_np, y_np)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,'gbtree'
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9796706593574434
,device,
,early_stopping_rounds,
,enable_categorical,False


In [35]:
# Baseline XGBoost: binarise the positive-class probability at 0.5.
y_proba_np = xgb_model_base.predict_proba(X.to_numpy())[:, 1]
y_pred_np = (y_proba_np > 0.5).astype(int)

# Wrap the arrays as named Polars Series for the evaluation helper.
y_true_pl, y_pred_pl, y_proba_pl = (
    pl.Series(series_name, values)
    for series_name, values in (
        ("y_true", y_true_np),
        ("y_pred", y_pred_np),
        ("y_proba", y_proba_np),
    )
)

print("Evaluating XGBoost Model (base):")
results_xgboost_base = evaluate_classification_model(
    y_true_pl,
    y_pred_pl,
    y_proba_pl,
    model_name="XGBoost (base)",
    plot_results=True,
    print_result=True,
)

Evaluating XGBoost Model (base):



EVALUATION SUMMARY: XGBoost (base)

CORE METRICS:
  Accuracy:      0.8252
  Precision:     0.0857
  Recall:        0.1532
  F1 Score:      0.1100
  Matthews CC:   0.0226

PROBABILITY-BASED METRICS:
  ROC AUC:       0.5403
  PR AUC:        0.0799

CONFUSION MATRIX:
  TN:  16287 | FP:   2303
  FN:   1194 | TP:    216


---

## CatBoost

In [36]:

# Reload the tuned "catboost_optimization" study. The handle is named
# cat_study_best to match xgb_/lgb_/rf_study_best; the *_base name is
# reserved for the "catboost_optimization_basef1" study loaded further down.
cat_study_best = optuna.load_study(study_name="catboost_optimization", storage=db_dir.format('cat_study'))

best_cat_params = cat_study_best.best_params
best_cat_threshold = cat_study_best.best_trial.user_attrs.get('threshold', None)

# verbose=0 silences CatBoost's per-iteration training log.
cat_model_best = CatBoostClassifier(**best_cat_params, verbose=0)
cat_model_best.fit(X_np, y_np)


<catboost.core.CatBoostClassifier at 0x3123e3740>

In [37]:
# Classification with the threshold stored on the best CatBoost trial

if best_cat_threshold is None:
    # The study lookup falls back to None when no 'threshold' attribute was
    # stored; comparing probabilities against None would fail obscurely.
    raise ValueError("No 'threshold' user attribute stored on the best CatBoost trial.")

print(f"Best CatBoost mean threshold: {best_cat_threshold}")

y_proba_np = cat_model_best.predict_proba(X.to_numpy())[:, 1]
y_pred_np = (y_proba_np > best_cat_threshold).astype(int)

y_true_pl = pl.Series("y_true", y_true_np)
y_pred_pl = pl.Series("y_pred", y_pred_np)
y_proba_pl = pl.Series("y_proba", y_proba_np)

# Evaluate a single model
print("Evaluating CatBoost Model (best):")
results_catboost_best = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=y_pred_pl,
    y_proba=y_proba_pl,
    model_name="CatBoost (best)",
    plot_results=True,
    print_result=True
)


Best CatBoost mean threshold: 0.5180660344888779
Evaluating CatBoost Model (best):



EVALUATION SUMMARY: CatBoost (best)

CORE METRICS:
  Accuracy:      0.7077
  Precision:     0.1008
  Recall:        0.3972
  F1 Score:      0.1608
  Matthews CC:   0.0734

PROBABILITY-BASED METRICS:
  ROC AUC:       0.5997
  PR AUC:        0.0969

CONFUSION MATRIX:
  TN:  13594 | FP:   4996
  FN:    850 | TP:    560


In [38]:
# Reload the "catboost_optimization_basef1" study and refit its best
# parameters on the training arrays (evaluated at a 0.5 threshold below).
cat_study_base = optuna.load_study(
    storage=db_dir.format('cat_study'),
    study_name="catboost_optimization_basef1",
)
best_cat_params = cat_study_base.best_params

# verbose=0 silences CatBoost's per-iteration training log.
cat_model_base = CatBoostClassifier(verbose=0, **best_cat_params)
cat_model_base.fit(X_np, y_np)


<catboost.core.CatBoostClassifier at 0x324c49eb0>

In [39]:
# Baseline CatBoost: binarise the positive-class probability at 0.5.
y_proba_np = cat_model_base.predict_proba(X.to_numpy())[:, 1]
y_pred_np = (y_proba_np > 0.5).astype(int)

# Wrap the arrays as named Polars Series for the evaluation helper.
y_true_pl, y_pred_pl, y_proba_pl = (
    pl.Series(series_name, values)
    for series_name, values in (
        ("y_true", y_true_np),
        ("y_pred", y_pred_np),
        ("y_proba", y_proba_np),
    )
)

print("Evaluating CatBoost Model (base):")
results_catboost_base = evaluate_classification_model(
    y_true_pl,
    y_pred_pl,
    y_proba_pl,
    model_name="CatBoost (base)",
    plot_results=True,
    print_result=True,
)


Evaluating CatBoost Model (base):



EVALUATION SUMMARY: CatBoost (base)

CORE METRICS:
  Accuracy:      0.7679
  Precision:     0.0937
  Recall:        0.2645
  F1 Score:      0.1384
  Matthews CC:   0.0452

PROBABILITY-BASED METRICS:
  ROC AUC:       0.5724
  PR AUC:        0.0889

CONFUSION MATRIX:
  TN:  14984 | FP:   3606
  FN:   1037 | TP:    373


---

## LightGBM

In [40]:
# Reload the "lightgbm_optimization" study, take its best trial's parameters
# and stored decision threshold, and refit on the training arrays.
lgb_study_best = optuna.load_study(
    storage=db_dir.format('lgb_study'),
    study_name="lightgbm_optimization",
)

best_lgb_threshold = lgb_study_best.best_trial.user_attrs.get('threshold')
best_lgb_params = lgb_study_best.best_params

lgb_model_best = lgb.LGBMClassifier(**best_lgb_params)
lgb_model_best.fit(X_np, y_np)


[LightGBM] [Info] Number of positive: 5639, number of negative: 74361
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004978 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5046
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.070487 -> initscore=-2.579225
[LightGBM] [Info] Start training from score -2.579225


0,1,2
,boosting_type,'gbdt'
,num_leaves,26
,max_depth,-1
,learning_rate,0.08528860622379723
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [41]:
# Classification with the threshold stored on the best LightGBM trial
# NOTE(review): in the saved run this threshold (about 0.116) labels almost
# every test row positive (recall 0.998, accuracy 0.093); the stored
# threshold may not transfer to this refit model — verify.

if best_lgb_threshold is None:
    # The study lookup falls back to None when no 'threshold' attribute was
    # stored; comparing probabilities against None would fail obscurely.
    raise ValueError("No 'threshold' user attribute stored on the best LightGBM trial.")

print(f"Best LightGBM mean threshold: {best_lgb_threshold}")

y_proba_np = lgb_model_best.predict_proba(X.to_numpy())[:, 1]
y_pred_np = (y_proba_np > best_lgb_threshold).astype(int)

y_true_pl = pl.Series("y_true", y_true_np)
y_pred_pl = pl.Series("y_pred", y_pred_np)
y_proba_pl = pl.Series("y_proba", y_proba_np)

# Evaluate a single model
print("Evaluating LightGBM Model (best):")
results_lgb_best = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=y_pred_pl,
    y_proba=y_proba_pl,
    model_name="LightGBM (best)",
    plot_results=True,
    print_result=True
)


Best LightGBM mean threshold: 0.1158464662717178
Evaluating LightGBM Model (best):



X does not have valid feature names, but LGBMClassifier was fitted with feature names




EVALUATION SUMMARY: LightGBM (best)

CORE METRICS:
  Accuracy:      0.0927
  Precision:     0.0720
  Recall:        0.9979
  F1 Score:      0.1343
  Matthews CC:   0.0378

PROBABILITY-BASED METRICS:
  ROC AUC:       0.6051
  PR AUC:        0.0964

CONFUSION MATRIX:
  TN:    447 | FP:  18143
  FN:      3 | TP:   1407


In [42]:
# Reload the "lightgbm_optimization_basef1" study and refit its best
# parameters on the training arrays (evaluated at a 0.5 threshold below).
lgb_study_base = optuna.load_study(
    storage=db_dir.format('lgb_study'),
    study_name="lightgbm_optimization_basef1",
)
best_lgb_params = lgb_study_base.best_params

lgb_model_base = lgb.LGBMClassifier(**best_lgb_params)
lgb_model_base.fit(X_np, y_np)


[LightGBM] [Info] Number of positive: 5639, number of negative: 74361
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004697 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5046
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.070487 -> initscore=-2.579225
[LightGBM] [Info] Start training from score -2.579225


0,1,2
,boosting_type,'gbdt'
,num_leaves,159
,max_depth,-1
,learning_rate,0.28054343791849756
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [43]:
# Baseline LightGBM: binarise the positive-class probability at 0.5.
y_proba_np = lgb_model_base.predict_proba(X.to_numpy())[:, 1]
y_pred_np = (y_proba_np > 0.5).astype(int)

# Wrap the arrays as named Polars Series for the evaluation helper.
y_true_pl, y_pred_pl, y_proba_pl = (
    pl.Series(series_name, values)
    for series_name, values in (
        ("y_true", y_true_np),
        ("y_pred", y_pred_np),
        ("y_proba", y_proba_np),
    )
)

print("Evaluating LightGBM Model (base):")
results_lgb_base = evaluate_classification_model(
    y_true_pl,
    y_pred_pl,
    y_proba_pl,
    model_name="LightGBM (base)",
    plot_results=True,
    print_result=True,
)


Evaluating LightGBM Model (base):



X does not have valid feature names, but LGBMClassifier was fitted with feature names




EVALUATION SUMMARY: LightGBM (base)

CORE METRICS:
  Accuracy:      0.8210
  Precision:     0.0899
  Recall:        0.1688
  F1 Score:      0.1174
  Matthews CC:   0.0297

PROBABILITY-BASED METRICS:
  ROC AUC:       0.5571
  PR AUC:        0.0849

CONFUSION MATRIX:
  TN:  16182 | FP:   2408
  FN:   1172 | TP:    238


---

## RandomForest

In [44]:
# Reload the tuned "random_forest_optimization" study and refit its best
# configuration on the training arrays.
rf_study_best = optuna.load_study(study_name="random_forest_optimization", storage=db_dir.format('rf_study'))

best_rf_params = rf_study_best.best_params
best_rf_threshold = rf_study_best.best_trial.user_attrs.get('threshold', None)

# Seed the forest so the refit, and the metrics reported below, are
# reproducible across kernel restarts. A random_state stored in the tuned
# parameters takes precedence over this default.
rf_model_best = RandomForestClassifier(**{'random_state': 42, **best_rf_params})
rf_model_best.fit(X_np, y_np)


0,1,2
,n_estimators,762
,criterion,'gini'
,max_depth,16
,min_samples_split,16
,min_samples_leaf,5
,min_weight_fraction_leaf,0.03592884702326953
,max_features,'sqrt'
,max_leaf_nodes,978
,min_impurity_decrease,0.0
,bootstrap,True


In [47]:
# Classification with the threshold stored on the best RandomForest trial
# NOTE(review): in the saved run this threshold (about 0.17) labels every
# test row positive (TN = 0, FN = 0, Matthews CC = 0); the stored threshold
# may not transfer to this refit model — verify.

if best_rf_threshold is None:
    # The study lookup falls back to None when no 'threshold' attribute was
    # stored; comparing probabilities against None would fail obscurely.
    raise ValueError("No 'threshold' user attribute stored on the best RandomForest trial.")

print(f"Best RandomForest mean threshold: {best_rf_threshold}")

y_proba_np = rf_model_best.predict_proba(X.to_numpy())[:, 1]
y_pred_np = (y_proba_np > best_rf_threshold).astype(int)

y_true_pl = pl.Series("y_true", y_true_np)
y_pred_pl = pl.Series("y_pred", y_pred_np)
y_proba_pl = pl.Series("y_proba", y_proba_np)

# Evaluate a single model
print("Evaluating RandomForest Model (best):")
results_rf_best = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=y_pred_pl,
    y_proba=y_proba_pl,
    model_name="RandomForest (best)",
    plot_results=True,
    print_result=True,
)


Best RandomForest mean threshold: 0.16985384381211258
Evaluating RandomForest Model (best):



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.




EVALUATION SUMMARY: RandomForest (best)

CORE METRICS:
  Accuracy:      0.0705
  Precision:     0.0705
  Recall:        1.0000
  F1 Score:      0.1317
  Matthews CC:   0.0000

PROBABILITY-BASED METRICS:
  ROC AUC:       0.6117
  PR AUC:        0.0979

CONFUSION MATRIX:
  TN:      0 | FP:  18590
  FN:      0 | TP:   1410


In [50]:
# Reload the "random_forest_optimization_basef1" study and refit its best
# parameters on the training arrays (evaluated at a 0.5 threshold below).
rf_study_base = optuna.load_study(study_name="random_forest_optimization_basef1", storage=db_dir.format('rf_study'))

best_rf_params = rf_study_base.best_params

# Seed the forest so the refit, and the metrics reported below, are
# reproducible across kernel restarts. A random_state stored in the tuned
# parameters takes precedence over this default.
rf_model_base = RandomForestClassifier(**{'random_state': 42, **best_rf_params})
rf_model_base.fit(X_np, y_np)

0,1,2
,n_estimators,864
,criterion,'gini'
,max_depth,19
,min_samples_split,19
,min_samples_leaf,1
,min_weight_fraction_leaf,0.015327799848051129
,max_features,'sqrt'
,max_leaf_nodes,154
,min_impurity_decrease,0.0
,bootstrap,True


In [51]:
# Baseline RandomForest: binarise the positive-class probability at 0.5.
y_proba_np = rf_model_base.predict_proba(X.to_numpy())[:, 1]
y_pred_np = (y_proba_np > 0.5).astype(int)

# Wrap the arrays as named Polars Series for the evaluation helper.
y_true_pl, y_pred_pl, y_proba_pl = (
    pl.Series(series_name, values)
    for series_name, values in (
        ("y_true", y_true_np),
        ("y_pred", y_pred_np),
        ("y_proba", y_proba_np),
    )
)

print("Evaluating RandomForest Model (base):")
results_rf_base = evaluate_classification_model(
    y_true_pl,
    y_pred_pl,
    y_proba_pl,
    model_name="RandomForest (base)",
    plot_results=True,
    print_result=True,
)


Evaluating RandomForest Model (base):



EVALUATION SUMMARY: RandomForest (base)

CORE METRICS:
  Accuracy:      0.6443
  Precision:     0.0967
  Recall:        0.4851
  F1 Score:      0.1613
  Matthews CC:   0.0758

PROBABILITY-BASED METRICS:
  ROC AUC:       0.6105
  PR AUC:        0.0979

CONFUSION MATRIX:
  TN:  12202 | FP:   6388
  FN:    726 | TP:    684


---

## HistGradientBoosting

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Load the "histgb_optimization" Optuna study and take the best trial's hyperparameters
# together with the decision threshold stored in that trial's user attributes.
histgb_study = optuna.load_study(study_name="histgb_optimization", storage=db_dir.format('histgb_study'))
best_hgb_params = histgb_study.best_params
best_hgb_threshold = histgb_study.best_trial.user_attrs.get('threshold')

# Fail early with a clear message: later cells compare scores against this value,
# and a missing threshold would otherwise surface as an obscure comparison error
# (or as silently null predictions in the Polars expressions).
if best_hgb_threshold is None:
    raise ValueError(
        "Best trial of study 'histgb_optimization' has no 'threshold' user attribute; "
        "cannot classify with a tuned threshold."
    )

# Refit HistGradientBoosting with the best hyperparameters on X_np / y_np.
hgb_model_best = HistGradientBoostingClassifier(**best_hgb_params)
hgb_model_best.fit(X_np, y_np)


0,1,2
,loss,'log_loss'
,learning_rate,0.011946188388124244
,max_iter,122
,max_leaf_nodes,31
,max_depth,3
,min_samples_leaf,88
,l2_regularization,0.4294702816204709
,max_features,1.0
,max_bins,231
,categorical_features,'from_dtype'


In [None]:
# Classification with best threshold for HistGradientBoosting

print(f"Best HistGradientBoosting mean threshold: {best_hgb_threshold}")

# Positive-class probability (column 1 of predict_proba) for every row of X.
y_proba_np = hgb_model_best.predict_proba(X.to_numpy())[:, 1]
# Hard labels: 1 when the probability is strictly greater than the threshold stored with the best trial.
# Assumes best_hgb_threshold is a number (it is read from the study's user attributes).
y_pred_np = (y_proba_np > best_hgb_threshold).astype(int)

# Wrap labels, predictions and probabilities as Polars Series for the evaluation helper.
y_true_pl = pl.Series("y_true", y_true_np)
y_pred_pl = pl.Series("y_pred", y_pred_np)
y_proba_pl = pl.Series("y_proba", y_proba_np)

# Evaluate a single model
print("Evaluating HistGradientBoosting Model (best):")
results_hgb_best = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=y_pred_pl,
    y_proba=y_proba_pl,
    model_name="HistGradientBoosting (best)",
    plot_results=True,
    print_result=True
)


Best HistGradientBoosting mean threshold: 0.07893814294080972
Evaluating HistGradientBoosting Model (best):



EVALUATION SUMMARY: HistGradientBoosting (best)

CORE METRICS:
  Accuracy:      0.6839
  Precision:     0.0963
  Recall:        0.4156
  F1 Score:      0.1564
  Matthews CC:   0.0667

PROBABILITY-BASED METRICS:
  ROC AUC:       0.6079
  PR AUC:        0.0961

CONFUSION MATRIX:
  TN:  13093 | FP:   5497
  FN:    824 | TP:    586


In [None]:
# Load the "histgb_optimization_basef1" Optuna study from the histgb_study SQLite database
# (presumably the study tuned for F1 at the default 0.5 threshold — confirm against the tuning notebook).
histgb_study_base = optuna.load_study(study_name="histgb_optimization_basef1", storage=db_dir.format('histgb_study'))

# Hyperparameters of the study's best trial.
base_hgb_params = histgb_study_base.best_params

# Refit HistGradientBoosting with those hyperparameters on X_np / y_np.
hgb_model_base = HistGradientBoostingClassifier(**base_hgb_params)
hgb_model_base.fit(X_np, y_np)


0,1,2
,loss,'log_loss'
,learning_rate,0.031198398220887345
,max_iter,243
,max_leaf_nodes,31
,max_depth,4
,min_samples_leaf,100
,l2_regularization,0.6668262373207421
,max_features,1.0
,max_bins,92
,categorical_features,'from_dtype'


In [None]:
# Base classification with threshold 0.5 for HistGradientBoosting

# Positive-class probability (column 1 of predict_proba) for every row of X.
y_proba_np = hgb_model_base.predict_proba(X.to_numpy())[:, 1]
# Hard labels: 1 when the probability is strictly greater than 0.5, else 0.
y_pred_np = (y_proba_np > 0.5).astype(int)

# Wrap labels, predictions and probabilities as Polars Series for the evaluation helper.
y_true_pl = pl.Series("y_true", y_true_np)
y_pred_pl = pl.Series("y_pred", y_pred_np)
y_proba_pl = pl.Series("y_proba", y_proba_np)

# Evaluate a single model
print("Evaluating HistGradientBoosting Model (base):")
results_hgb_base = evaluate_classification_model(
    y_true=y_true_pl,
    y_pred=y_pred_pl,
    y_proba=y_proba_pl,
    model_name="HistGradientBoosting (base)",
    plot_results=True,
    print_result=True
)


Evaluating HistGradientBoosting Model (base):



Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.




EVALUATION SUMMARY: HistGradientBoosting (base)

CORE METRICS:
  Accuracy:      0.9295
  Precision:     0.0000
  Recall:        0.0000
  F1 Score:      0.0000
  Matthews CC:   0.0000

PROBABILITY-BASED METRICS:
  ROC AUC:       0.6127
  PR AUC:        0.0994

CONFUSION MATRIX:
  TN:  18590 | FP:      0
  FN:   1410 | TP:      0


---

# Performance analysis

In [69]:
# Compare all model results using the compare_models function
# (compare_models is defined elsewhere in this notebook).
# Each entry is the result object returned by evaluate_classification_model for one model variant.
all_results = [
    results_xgboost_base,
    results_xgboost_best,
    results_catboost_base,
    results_catboost_best,
    results_lgb_base,
    results_lgb_best,
    results_rf_base,
    results_rf_best,
    results_hgb_best,     
    results_hgb_base
]

comparison_df = compare_models(all_results)
# Display the comparison ordered by the 'F1' column, highest first.
comparison_df.sort('F1', descending=True)

Model,Accuracy,Precision,Recall,F1,Matthews_CC,ROC_AUC,PR_AUC
str,f64,f64,f64,f64,f64,f64,f64
"""RandomForest (base)""",0.6443,0.0967,0.4851,0.1613,0.0758,0.6105,0.0979
"""CatBoost (best)""",0.7077,0.1008,0.3972,0.1608,0.0734,0.5997,0.0969
"""HistGradientBoosting (best)""",0.6839,0.0963,0.4156,0.1564,0.0667,0.6079,0.0961
"""XGBoost (best)""",0.7402,0.0958,0.3184,0.1473,0.0547,0.567,0.0874
"""CatBoost (base)""",0.7678,0.0937,0.2645,0.1384,0.0452,0.5724,0.0889
"""LightGBM (best)""",0.0927,0.072,0.9979,0.1343,0.0378,0.6051,0.0964
"""RandomForest (best)""",0.0705,0.0705,1.0,0.1317,0.0,0.6117,0.0979
"""LightGBM (base)""",0.821,0.0899,0.1688,0.1174,0.0297,0.5571,0.0849
"""XGBoost (base)""",0.8252,0.0857,0.1532,0.11,0.0226,0.5403,0.0799
"""HistGradientBoosting (base)""",0.9295,0.0,0.0,0.0,0.0,0.6127,0.0994


xgboost best is better

catboost best is better

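LightGBM (best) has the higher F1, but it is a degenerate model: its recall is almost 1 (0.9979) while its accuracy is close to 0 (0.0927), i.e. it predicts nearly every sample as positive -> prefer LightGBM (base)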

random forest base is better

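-> Among the selected models, RandomForest (base) has the highest F1 (0.1613), closely followed by CatBoost (best) (0.1608), then HistGradientBoosting (best) (0.1564) and XGBoost (best) (0.1473). Recall follows almost the same order: RandomForest (base) 0.4851, HistGradientBoosting (best) 0.4156, CatBoost (best) 0.3972, XGBoost (best) 0.3184.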

In [53]:
# Get prediction scores (positive-class probabilities) on X from the model kept for each family:
# the "best" model for LightGBM, XGBoost, CatBoost and HistGradientBoosting,
# and the "base" model for RandomForest.
lgb_proba_best = lgb_model_best.predict_proba(X.to_numpy())[:, 1]
xgb_proba_best = xgb_model_best.predict_proba(X.to_numpy())[:, 1]
cat_proba_best = cat_model_best.predict_proba(X.to_numpy())[:, 1]
rf_proba_base = rf_model_base.predict_proba(X.to_numpy())[:, 1]
hgb_proba_best = hgb_model_best.predict_proba(X.to_numpy())[:, 1]

# Create a DataFrame with the scores
# (one *_score column per model plus the true label).
models_scores = pl.DataFrame({
    "lgb_score": lgb_proba_best,
    "xgb_score": xgb_proba_best,
    "cat_score": cat_proba_best,
    "rf_score": rf_proba_base,
    "hgb_score": hgb_proba_best,
    "label": y_true_np
})

# Binarise each score into a 0/1 *_pred column: the "best" models use their best_*_threshold value,
# the base RandomForest uses 0.5.
# NOTE: the HistGradientBoosting column is named "hgb__pred" (double underscore);
# later cells reference that exact name, so it must stay in sync with them.
models_scores = models_scores.with_columns([
    (pl.col("xgb_score") > best_xgb_threshold).cast(pl.Int8).alias("xgb_pred"),
    (pl.col("cat_score") > best_cat_threshold).cast(pl.Int8).alias("cat_pred"),
    (pl.col("lgb_score") > best_lgb_threshold).cast(pl.Int8).alias("lgb_pred"),
    (pl.col("hgb_score") > best_hgb_threshold).cast(pl.Int8).alias("hgb__pred"),
    (pl.col("rf_score") > .5).cast(pl.Int8).alias("rf_pred"),
])


X does not have valid feature names, but LGBMClassifier was fitted with feature names





In [54]:
# Grouped bar chart of the five models' scores on a random subset of 150 rows.
# A fixed seed keeps the sampled rows (and therefore the figure) reproducible across re-runs.
fig = px.bar(
    models_scores.sample(150, seed=42).with_row_index(),
    x="index",
    y=["xgb_score", "cat_score", "rf_score", 'lgb_score', 'hgb_score'],
    title="Model scores comparison",
    barmode='group',
)
fig.show()

In [57]:
# Random subset of 300 rows used for plotting; the fixed seed makes the subset reproducible across re-runs.
sampled = models_scores.sample(300, seed=42)

In [58]:
# Plot the model scores of the 300-row sample (plot_model_scores is defined elsewhere in this notebook).
plot_model_scores(sampled)

In [59]:
# Compute the correlation matrix of models_scores using Polars.
# Only columns whose name contains "_score" are kept, so the *_pred and label columns are excluded.
correlation_matrix = models_scores.select([col for col in models_scores.columns if "_score" in col]).corr()

In [60]:
# Display the pairwise correlation matrix of the model scores.
correlation_matrix

lgb_score,xgb_score,cat_score,rf_score,hgb_score
f64,f64,f64,f64,f64
1.0,0.6962934514910264,0.8684875277797787,0.870714476644969,0.8341415543949054
0.6962934514910264,1.0,0.7634113461825504,0.6121847232855314,0.5687534342530333
0.8684875277797787,0.7634113461825502,1.0,0.8518661915039726,0.8052098549731295
0.8707144766449689,0.6121847232855314,0.8518661915039726,1.0,0.9284443350785097
0.8341415543949053,0.5687534342530332,0.8052098549731295,0.9284443350785094,1.0


In [120]:
# Heatmap of the pairwise correlations between the models' scores.
# The colour range is pinned to [-1, 1] so that the midpoint of the scale is zero correlation.
fig = px.imshow(
    correlation_matrix.to_numpy(),
    labels=dict(x='Model', y='Model', color='Correlation'),
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    color_continuous_scale='RdBu',
    zmin=-1, zmax=1,
    aspect='auto'
)
# Figure size and title are set in a single layout update.
fig.update_layout(
    width=800,
    height=600,
    title='Correlation Matrix Heatmap'
)
fig.show()

# Blue means that variable X and variable Y follow the same behaviour (both increasing or decreasing)
# Red means that variable X has the opposite behaviour of variable Y

In [62]:
# Per-row view of every model's decision: one disagreement flag per pair of models
# plus the majority vote over the five models (HistGradientBoosting included).

# Short model tag -> name of its 0/1 prediction column in models_scores.
_pred_columns = {
    "xgb": "xgb_pred",
    "cat": "cat_pred",
    "lgb": "lgb_pred",
    "rf": "rf_pred",
    "hgb": "hgb__pred",
}
_model_tags = list(_pred_columns)

# Every unordered pair of models, in the order xgb, cat, lgb, rf, hgb.
_model_pairs = [
    (first, second)
    for position, first in enumerate(_model_tags)
    for second in _model_tags[position + 1:]
]
_disagreement_names = [
    f"opposite_decision_{first}_vs_{second}" for first, second in _model_pairs
]
# True where the two models of a pair predict different classes.
_disagreement_exprs = [
    (pl.col(_pred_columns[first]) != pl.col(_pred_columns[second])).alias(name)
    for (first, second), name in zip(_model_pairs, _disagreement_names)
]
# 1 when at least 3 of the 5 models predict the positive class, else 0.
_majority_expr = (
    (pl.sum_horizontal(list(_pred_columns.values())) >= 3)
    .cast(pl.Int64)
    .alias("majority_vote")
)

opposite_decisions = models_scores.with_columns(
    [*_disagreement_exprs, _majority_expr, pl.Series("label", y_true_np)]
).select(
    ["label", *_pred_columns.values(), *_disagreement_names, "majority_vote"]
)

opposite_decisions

label,xgb_pred,cat_pred,lgb_pred,rf_pred,hgb__pred,opposite_decision_xgb_vs_cat,opposite_decision_xgb_vs_lgb,opposite_decision_xgb_vs_rf,opposite_decision_xgb_vs_hgb,opposite_decision_cat_vs_lgb,opposite_decision_cat_vs_rf,opposite_decision_cat_vs_hgb,opposite_decision_lgb_vs_rf,opposite_decision_lgb_vs_hgb,opposite_decision_rf_vs_hgb,majority_vote
bool,i8,i8,i8,i8,i8,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,i64
true,1,0,1,0,0,true,false,true,true,true,false,false,true,true,false,0
false,0,0,1,0,0,false,true,false,false,true,false,false,true,true,false,0
false,1,1,1,1,1,false,false,false,false,false,false,false,false,false,false,1
false,1,1,1,1,1,false,false,false,false,false,false,false,false,false,false,1
false,1,0,1,0,1,true,false,true,false,true,false,true,true,false,true,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
false,0,0,1,0,0,false,true,false,false,true,false,false,true,true,false,0
false,0,0,1,0,0,false,true,false,false,true,false,false,true,true,false,0
false,0,0,1,0,0,false,true,false,false,true,false,false,true,true,false,0
false,0,0,1,0,0,false,true,false,false,true,false,false,true,true,false,0


In [64]:
# Count how many times each pair of models give opposite decisions and who is correct in those cases, with percentages
def opposite_decision_stats(preds_df, y_true_col="label", pairs=None):
    """
    Summarise how often pairs of models disagree and which one is right when they do.

    Parameters:
    -----------
    preds_df : Polars DataFrame
        One 0/1 prediction column per model plus the true-label column
    y_true_col : str
        Name of the true-label column
    pairs : list of (str, str), optional
        Pairs of prediction-column names to compare. Defaults to every pair of
        the xgb / cat / lgb / rf / hgb prediction columns.

    Returns:
    --------
    Polars DataFrame with one row per pair: number and percentage of rows where
    the two models disagree, and for each model the number and percentage of
    those disagreements on which it matches the true label.
    """
    if pairs is None:
        pairs = [
            ("xgb_pred", "cat_pred"),
            ("xgb_pred", "lgb_pred"),
            ("xgb_pred", "rf_pred"),
            ("xgb_pred", "hgb__pred"),
            ("cat_pred", "lgb_pred"),
            ("cat_pred", "rf_pred"),
            ("cat_pred", "hgb__pred"),
            ("lgb_pred", "rf_pred"),
            ("lgb_pred", "hgb__pred"),
            ("rf_pred", "hgb__pred"),
        ]

    def _pct(part, whole):
        # Always return a float (0.0 for an empty denominator) so the percentage
        # columns keep a single dtype and an empty frame does not divide by zero.
        return round(part / whole * 100, 2) if whole > 0 else 0.0

    n_total = preds_df.height
    stats = []
    for a, b in pairs:
        # Rows on which the two models predict different classes.
        mask = preds_df[a] != preds_df[b]
        n_opposite = mask.sum()
        # Among the disagreements, how often each model matches the true label.
        correct_a = ((preds_df[a] == preds_df[y_true_col]) & mask).sum()
        correct_b = ((preds_df[b] == preds_df[y_true_col]) & mask).sum()
        stats.append({
            "model_a": a,
            "model_b": b,
            "opposite_count": n_opposite,
            "opposite_pct": _pct(n_opposite, n_total),
            "model_a_correct": correct_a,
            "model_a_correct_pct": _pct(correct_a, n_opposite),
            "model_b_correct": correct_b,
            "model_b_correct_pct": _pct(correct_b, n_opposite),
        })
    return pl.DataFrame(stats)

# Pairwise disagreement statistics computed from the per-row decisions table, displayed as the cell output.
opposite_stats = opposite_decision_stats(opposite_decisions)
opposite_stats

model_a,model_b,opposite_count,opposite_pct,model_a_correct,model_a_correct_pct,model_b_correct,model_b_correct_pct
str,str,i64,f64,i64,f64,i64,f64
"""xgb_pred""","""cat_pred""",3521,17.61,2085,59.22,1436,40.78
"""xgb_pred""","""lgb_pred""",14865,74.33,13907,93.56,958,6.44
"""xgb_pred""","""rf_pred""",4815,24.07,3366,69.91,1449,30.09
"""xgb_pred""","""hgb__pred""",4554,22.77,2839,62.34,1715,37.66
"""cat_pred""","""lgb_pred""",13994,69.97,13147,93.95,847,6.05
"""cat_pred""","""rf_pred""",2768,13.84,2018,72.9,750,27.1
"""cat_pred""","""hgb__pred""",2845,14.22,1660,58.35,1185,41.65
"""lgb_pred""","""rf_pred""",12478,62.39,723,5.79,11755,94.21
"""lgb_pred""","""hgb__pred""",13467,67.33,821,6.1,12646,93.9
"""rf_pred""","""hgb__pred""",2359,11.79,783,33.19,1576,66.81


Looking at these stats, CatBoost and XGBoost are a good combination.

In [None]:
# TODO: rewrite the message

**Key Insights:**

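RandomForest (best) is a degenerate classifier at its tuned threshold: it predicts the positive class for every sample (TN = 0, FN = 0), which yields perfect recall (1.0) but a precision equal to the positive base rate (0.0705) and a Matthews Correlation Coefficient of 0 — it is not separating the classes rather than learning generalizable patterns.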

**CatBoost (base) is actually your best performer:**

- Highest accuracy: 0.8408
- Best balance across metrics
- Reasonable Matthews CC: 0.0294
- Good ROC_AUC: 0.5521


XGBoost (base) is second best with solid balanced performance across metrics.

# Features importance

In [67]:
# Feature importances and their dense ranks (1 = most important) for
# LightGBM, XGBoost, CatBoost and RandomForest (base), merged into one table.

# Feature names, taken from the training frame.
# NOTE(review): assumes X_train's column order matches the feature order the models were fitted with — confirm.
feature_names = X_train.columns


def _ranked_importance(model, tag):
    """Return a frame with `feature`, `importance_<tag>` and `rank_<tag>` for one fitted model."""
    importance_col = f"importance_{tag}"
    importances = pl.DataFrame({
        "feature": feature_names,
        importance_col: model.feature_importances_
    })
    # Dense rank with the largest importance ranked first.
    rank_expr = pl.col(importance_col).rank("dense", descending=True).cast(pl.Int32)
    return importances.with_columns(rank_expr.alias(f"rank_{tag}"))


lgb_importance = _ranked_importance(lgb_model_best, "lgb")
xgb_importance = _ranked_importance(xgb_model_best, "xgb")
cat_importance = _ranked_importance(cat_model_best, "cat")
rf_importance = _ranked_importance(rf_model_base, "rf")

# One row per feature: LightGBM's frame is the left side, the other models are left-joined on the feature name.
feature_importance_summary_df = lgb_importance.join(
    xgb_importance, on="feature", how="left"
).join(
    cat_importance, on="feature", how="left"
).join(
    rf_importance, on="feature", how="left"
)

In [68]:
# Ten features with the best CatBoost rank (rank 1 = highest importance), shown with the other models' importances and ranks.
feature_importance_summary_df.sort('rank_cat', descending=False).head(10)

feature,importance_lgb,rank_lgb,importance_xgb,rank_xgb,importance_cat,rank_cat,importance_rf,rank_rf
str,i32,i32,f32,i32,f64,i32,f64,i32
"""age""",119,5,0.031073410063982,3,11.036463520325936,1,0.1649816272411278,1
"""contract_lifetime_days""",185,1,0.0241195261478424,6,8.982108453707756,2,0.0875570786623917,3
"""available_gb""",71,16,0.0625249594449997,1,6.4468253686284775,3,0.132182017723793,2
"""remaining_binding_days""",162,2,0.0216117240488529,14,5.482194138364775,4,0.0511815133856083,5
"""completion_rate""",154,3,0.0190078485757112,22,4.777247833892161,5,0.0439888403963733,6
"""gross_mrc""",146,4,0.0216159466654062,13,4.726602786979138,6,0.0550948084334256,4
"""last_3_delta_1mo""",118,6,0.0189065244048833,24,3.9273807061096258,7,0.0251250148245523,12
"""max_monthly_usage_gb""",99,11,0.0190498307347297,21,3.9071301300965273,8,0.0242252596537197,17
"""last_1_delta_3mo""",116,7,0.018916817381978,23,3.5280609223111576,9,0.0249876845833029,13
"""max_delta_2mo_decrease""",79,14,0.0177404824644327,31,3.4289557248617246,10,0.0223105722275814,20


Looking at the most important features for the best model, it may be a good experiment to train separate models on segments defined by these features.