# Model Evaluation

**Purpose**: Comprehensive model evaluation and validation

This notebook provides:
- Multiple evaluation metrics
- ROC and PR curves
- Confusion matrices
- Calibration analysis
- Score distributions
- Performance by score decile (lift chart), threshold analysis, and cross-validation metrics

## Setup

In [None]:
import sys
sys.path.insert(0, '../')

# The wildcard import stays ahead of the explicit imports below so that it can
# never shadow one of them.
from packages.training import FeatureExtractor, FeatureBuilder, ModelTrainer, ModelStorage
from packages.storage import ClientFactory, get_connection_params
from notebook_utils import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from loguru import logger
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, roc_auc_score, average_precision_score,
    precision_recall_fscore_support, log_loss, brier_score_loss,
    roc_curve, precision_recall_curve, confusion_matrix
)
from sklearn.calibration import calibration_curve

# Shared plotting setup; presumably provided by the `notebook_utils` wildcard
# import above — TODO confirm where it is defined and what it configures.
setup_plotting()

## Configuration

In [None]:
# Evaluation configuration: everything a reader might tune lives here.
NETWORK = 'ethereum'        # passed to get_connection_params() when loading data
START_DATE = '2024-01-01'   # start_date forwarded to extract_training_data()
END_DATE = '2024-02-29'     # end_date forwarded to extract_training_data()
WINDOW_DAYS = 7             # window_days forwarded to extract_training_data()
TEST_SIZE = 0.2             # fraction of rows held out by train_test_split
RANDOM_STATE = 42           # seed for train_test_split so the split is repeatable

print(f"Network: {NETWORK}")
print(f"Evaluation Period: {START_DATE} to {END_DATE}")

## Load Data and Train Model

In [None]:
# Resolve connection settings for the configured network and build a client factory.
connection_params = get_connection_params(NETWORK)
client_factory = ClientFactory(connection_params)

# The client is only needed while extracting; the context manager scopes its use
# to this block (presumably closing the connection on exit — TODO confirm).
with client_factory.client_context() as client:
    extractor = FeatureExtractor(client)
    data = extractor.extract_training_data(
        start_date=START_DATE,
        end_date=END_DATE,
        window_days=WINDOW_DAYS
    )

# Turn the extracted data into a feature matrix X and a target y.
builder = FeatureBuilder()
X, y = builder.build_training_features(data)

# Stratify on y so both splits keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

print(f"Training: {X_train.shape}, Test: {X_test.shape}")

In [None]:
# Fit the model on the training split only. `metrics` is read later in this
# notebook for its cross-validation entries (keys such as 'cv_auc_mean').
trainer = ModelTrainer(model_type='alert_scorer')
model, metrics = trainer.train(X_train, y_train, cv_folds=5)

print("Model trained successfully")

## Generate Predictions

In [None]:
# NOTE(review): the rest of the notebook treats these as positive-class
# probabilities (AUC, log loss, calibration). That assumes `model.predict`
# returns scores in [0, 1] rather than hard class labels — confirm against the
# model type returned by ModelTrainer.
y_train_pred_proba = model.predict(X_train)
y_test_pred_proba = model.predict(X_test)

# Hard labels at a fixed 0.5 cut-off; scores exactly equal to 0.5 map to class 0.
y_train_pred = (y_train_pred_proba > 0.5).astype(int)
y_test_pred = (y_test_pred_proba > 0.5).astype(int)

print("Predictions generated")

## Core Metrics

In [None]:
# Score-based metrics, each evaluated on the training split first and the
# held-out split second.
split_inputs = ((y_train, y_train_pred_proba), (y_test, y_test_pred_proba))

train_auc, test_auc = [roc_auc_score(truth, scores) for truth, scores in split_inputs]
train_ap, test_ap = [average_precision_score(truth, scores) for truth, scores in split_inputs]
train_logloss, test_logloss = [log_loss(truth, scores) for truth, scores in split_inputs]
train_brier, test_brier = [brier_score_loss(truth, scores) for truth, scores in split_inputs]

# Fixed-width table: metric name, train value, test value, absolute gap.
heavy_rule = "=" * 50
metric_rows = (
    ('AUC', train_auc, test_auc),
    ('Average Precision', train_ap, test_ap),
    ('Log Loss', train_logloss, test_logloss),
    ('Brier Score', train_brier, test_brier),
)

print(heavy_rule)
print("CORE METRICS")
print(heavy_rule)
print(f"{'Metric':<20} {'Train':>12} {'Test':>12} {'Diff':>12}")
print("-" * 50)
for metric_name, train_value, test_value in metric_rows:
    gap = abs(train_value - test_value)
    print(f"{metric_name:<20} {train_value:>12.4f} {test_value:>12.4f} {gap:>12.4f}")
print(heavy_rule)

## Classification Report

In [None]:
# Per-class precision/recall/F1 on the held-out split at the 0.5 cut-off.
# target_names are applied in label order, so this assumes class 0 is
# "Low Risk" and class 1 is "High Risk" — verify against the label encoding.
print("\nTest Set Classification Report:")
print(classification_report(y_test, y_test_pred, target_names=['Low Risk', 'High Risk']))

## ROC and Precision-Recall Curves

In [None]:
# roc_curve and precision_recall_curve come from the Setup cell's
# sklearn.metrics import, so this cell no longer imports anything mid-way.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: ROC curves for both splits, with the chance diagonal for reference.
fpr_train, tpr_train, _ = roc_curve(y_train, y_train_pred_proba)
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_pred_proba)

ax1.plot(fpr_train, tpr_train, label=f'Train (AUC={train_auc:.3f})', linewidth=2)
ax1.plot(fpr_test, tpr_test, label=f'Test (AUC={test_auc:.3f})', linewidth=2)
ax1.plot([0, 1], [0, 1], 'k--', label='Random')
ax1.set_xlabel('False Positive Rate')
ax1.set_ylabel('True Positive Rate')
ax1.set_title('ROC Curve')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Right panel: precision-recall curves for both splits.
precision_train, recall_train, _ = precision_recall_curve(y_train, y_train_pred_proba)
precision_test, recall_test, _ = precision_recall_curve(y_test, y_test_pred_proba)

ax2.plot(recall_train, precision_train, label=f'Train (AP={train_ap:.3f})', linewidth=2)
ax2.plot(recall_test, precision_test, label=f'Test (AP={test_ap:.3f})', linewidth=2)
ax2.set_xlabel('Recall')
ax2.set_ylabel('Precision')
ax2.set_title('Precision-Recall Curve')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Confusion Matrix

In [None]:
# confusion_matrix comes from the Setup cell's sklearn.metrics import.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Counts of actual vs. predicted class at the 0.5 cut-off, one matrix per split.
cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)

# Tick labels follow label order: class 0 first, class 1 second.
class_names = ['Low Risk', 'High Risk']

# Both panels are drawn the same way; only the matrix and title differ.
for panel_ax, matrix, panel_title in (
    (ax1, cm_train, 'Training Set Confusion Matrix'),
    (ax2, cm_test, 'Test Set Confusion Matrix'),
):
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=panel_ax)
    panel_ax.set_xlabel('Predicted')
    panel_ax.set_ylabel('Actual')
    panel_ax.set_title(panel_title)
    panel_ax.set_xticklabels(class_names)
    panel_ax.set_yticklabels(class_names)

plt.tight_layout()
plt.show()

## Score Distributions

In [None]:
# 2x2 grid: one column per split; top row shows overlaid histograms of the
# predicted scores by actual class, bottom row shows the same data as box plots.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

split_panels = (
    ('Training Set', y_train, y_train_pred_proba),
    ('Test Set', y_test, y_test_pred_proba),
)

for column, (split_name, actual, scores) in enumerate(split_panels):
    low_scores = scores[actual == 0]
    high_scores = scores[actual == 1]

    hist_ax = axes[0, column]
    hist_ax.hist(low_scores, bins=50, alpha=0.6, label='Actual: Low', edgecolor='black')
    hist_ax.hist(high_scores, bins=50, alpha=0.6, label='Actual: High', edgecolor='black')
    hist_ax.set_xlabel('Predicted Probability')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title(f'{split_name}: Score Distribution')
    hist_ax.legend()
    hist_ax.grid(True, alpha=0.3)

    box_ax = axes[1, column]
    box_ax.boxplot([low_scores, high_scores], labels=['Low Risk', 'High Risk'])
    box_ax.set_ylabel('Predicted Probability')
    box_ax.set_title(f'{split_name}: Box Plot')
    box_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## Calibration Curve

In [None]:
# Reliability diagrams: observed positive fraction against mean predicted
# score in 10 bins, one panel per split. The dashed diagonal marks perfect
# calibration.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

prob_true_train, prob_pred_train = calibration_curve(y_train, y_train_pred_proba, n_bins=10)
prob_true_test, prob_pred_test = calibration_curve(y_test, y_test_pred_proba, n_bins=10)

calibration_panels = (
    (ax1, prob_pred_train, prob_true_train, 'Training Set', train_brier),
    (ax2, prob_pred_test, prob_true_test, 'Test Set', test_brier),
)

for panel_ax, mean_predicted, observed_fraction, split_name, brier in calibration_panels:
    panel_ax.plot(mean_predicted, observed_fraction, marker='o', linewidth=2, label='Model')
    panel_ax.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')
    panel_ax.set_xlabel('Mean Predicted Probability')
    panel_ax.set_ylabel('Fraction of Positives')
    panel_ax.set_title(f'{split_name} Calibration (Brier={brier:.4f})')
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Threshold Analysis

In [None]:
# Sweep the decision cut-off over the test scores and record precision,
# recall and F1 for the positive class at each value.
thresholds = np.arange(0.1, 0.9, 0.05)

# One (precision, recall, f1) triple per cut-off; the support entry is dropped.
per_threshold = [
    precision_recall_fscore_support(
        y_test,
        (y_test_pred_proba > cutoff).astype(int),
        average='binary',
        zero_division=0,
    )[:3]
    for cutoff in thresholds
]
precision_scores, recall_scores, f1_scores = (list(series) for series in zip(*per_threshold))

plt.figure(figsize=(10, 6))
for series, marker_style, series_label in (
    (precision_scores, 'o', 'Precision'),
    (recall_scores, 's', 'Recall'),
    (f1_scores, '^', 'F1 Score'),
):
    plt.plot(thresholds, series, marker=marker_style, label=series_label, linewidth=2)
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Metrics vs Classification Threshold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Report the cut-off with the highest F1 (first one wins on ties).
best_f1_idx = np.argmax(f1_scores)
print(f"\nBest threshold for F1: {thresholds[best_f1_idx]:.2f}")
print(f"  Precision: {precision_scores[best_f1_idx]:.4f}")
print(f"  Recall: {recall_scores[best_f1_idx]:.4f}")
print(f"  F1: {f1_scores[best_f1_idx]:.4f}")

## Lift Chart

In [None]:
deciles = 10

# Rank test rows from highest to lowest score, then cut the rank positions
# into roughly equal-sized buckets; decile 1 holds the highest scores.
test_df = (
    pd.DataFrame({'actual': y_test, 'predicted': y_test_pred_proba})
    .sort_values('predicted', ascending=False)
    .reset_index(drop=True)
)
test_df['decile'] = pd.qcut(test_df.index, deciles, labels=False, duplicates='drop') + 1

# Overall positive rate of the test split is the reference for lift.
baseline_rate = y_test.mean()

# Positives and row counts per decile, then rate and lift over the baseline.
lift_data = (
    test_df.groupby('decile')['actual']
    .agg(positives='sum', total='count')
    .reset_index()
)
lift_data['positive_rate'] = lift_data['positives'] / lift_data['total']
lift_data['lift'] = lift_data['positive_rate'] / baseline_rate

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

lift_panels = (
    (ax1, 'positive_rate', baseline_rate, f'Baseline: {baseline_rate:.3f}',
     'Positive Rate', 'Positive Rate by Score Decile'),
    (ax2, 'lift', 1.0, 'Baseline',
     'Lift', 'Lift by Score Decile'),
)
for panel_ax, value_column, reference, reference_label, y_label, panel_title in lift_panels:
    panel_ax.bar(lift_data['decile'], lift_data[value_column], alpha=0.7)
    panel_ax.axhline(y=reference, color='r', linestyle='--', label=reference_label)
    panel_ax.set_xlabel('Decile')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print("\nLift Analysis:")
print(lift_data)

## Cross-Validation Metrics

In [None]:
# Collect the cross-validation mean/std pairs reported by the trainer,
# keyed by display name.
cv_metrics = {
    display_name: [metrics[f'cv_{key}_mean'], metrics[f'cv_{key}_std']]
    for display_name, key in (
        ('AUC', 'auc'),
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1', 'f1'),
    )
}

# One row per metric, with 'Mean' and 'Std' columns.
cv_df = pd.DataFrame(cv_metrics, index=['Mean', 'Std']).T
print("\nCross-Validation Metrics:")
print(cv_df.round(4))

# Bar per metric with the fold-to-fold standard deviation as the error bar.
fig, ax = plt.subplots(figsize=(10, 6))
bar_positions = np.arange(len(cv_df))
ax.bar(bar_positions, cv_df['Mean'], yerr=cv_df['Std'], capsize=5, alpha=0.7)
ax.set_xticks(bar_positions)
ax.set_xticklabels(cv_df.index)
ax.set_ylabel('Score')
ax.set_title('Cross-Validation Metrics (Mean ± Std)')
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

## Performance Summary

In [None]:
# Side-by-side train/test summary of the headline metrics.
# The last row holds accuracy at the 0.5 cut-off for BOTH splits, so it is
# labelled plain 'Accuracy' rather than 'Test Accuracy'.
summary = {
    'Metric': ['AUC', 'Average Precision', 'Log Loss', 'Brier Score', 'Accuracy'],
    'Train': [
        train_auc,
        train_ap,
        train_logloss,
        train_brier,
        (y_train_pred == y_train).mean(),
    ],
    'Test': [
        test_auc,
        test_ap,
        test_logloss,
        test_brier,
        (y_test_pred == y_test).mean(),
    ],
}

summary_df = pd.DataFrame(summary)

# Log Loss and Brier Score improve as they DECREASE, so a raw Train - Test gap
# has the opposite sign from AUC/AP/Accuracy. Flip it for those rows so that a
# positive 'Overfitting' value always means "better on train than on test".
lower_is_better = summary_df['Metric'].isin(['Log Loss', 'Brier Score'])
train_minus_test = summary_df['Train'] - summary_df['Test']
summary_df['Overfitting'] = train_minus_test.where(~lower_is_better, -train_minus_test)

print("\n" + "="*70)
print("MODEL EVALUATION SUMMARY")
print("="*70)
print(summary_df.to_string(index=False))
print("="*70)

## Conclusions

**Model Performance**:
- Review AUC, precision, recall, and F1 scores
- Check train/test gap for overfitting
- Analyze calibration quality

**Key Observations**:
- Calibration curve shows prediction reliability
- Lift chart demonstrates model value
- Threshold analysis helps optimize for business needs

**Next Steps**:
- Compare with other models in Model Comparison notebook
- Analyze errors in Error Analysis notebook
- Review feature importance for insights
- Consider deployment if performance is satisfactory