# Model Comparison

**Purpose**: Compare multiple trained models to select the best one

This notebook provides:
- Side-by-side metric comparison
- Statistical significance tests
- Model ensemble experiments
- Performance visualization
- Best model selection

## Setup

In [None]:
# Put the parent directory on the import path. This must run before the
# project-local imports below (presumably so `packages` and `notebook_utils`
# resolve when the notebook is run from its own folder -- TODO confirm layout).
import sys
sys.path.insert(0, '../')

# Project-local training and storage helpers.
from packages.training import FeatureExtractor, FeatureBuilder, ModelTrainer, ModelStorage
from packages.storage import ClientFactory, get_connection_params
# NOTE(review): wildcard import. `setup_plotting` and `plot_metric_comparison`
# are used in this notebook but not imported by name anywhere, so they
# presumably come from here; consider importing them explicitly.
from notebook_utils import *

# Third-party analysis, plotting and evaluation libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from loguru import logger
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
from scipy import stats

# Called once before any figure is drawn (presumably applies shared plot
# styling -- confirm in notebook_utils).
setup_plotting()

## Configuration

In [None]:
# --- Data selection: passed to the feature extractor in the Load Data cell ---
NETWORK = "ethereum"
START_DATE, END_DATE = "2024-01-01", "2024-02-29"
WINDOW_DAYS = 7

# --- Hold-out split: fraction reserved for testing and the split seed ---
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Echo the run parameters so the rendered notebook records what was compared.
print(
    f"Network: {NETWORK}",
    f"Comparison Period: {START_DATE} to {END_DATE}",
    sep="\n",
)

## Load Data

In [None]:
# Build a client factory for the configured network.
connection_params = get_connection_params(NETWORK)
client_factory = ClientFactory(connection_params)

# Extraction arguments, all taken from the Configuration cell.
extraction_window = {
    'start_date': START_DATE,
    'end_date': END_DATE,
    'window_days': WINDOW_DAYS,
}

# The client is only used inside the context manager; extraction happens here.
with client_factory.client_context() as client:
    extractor = FeatureExtractor(client)
    data = extractor.extract_training_data(**extraction_window)

# Turn the extracted data into a feature matrix X and label vector y.
builder = FeatureBuilder()
X, y = builder.build_training_features(data)

# Stratified, seeded hold-out split so class balance is kept in both parts.
split_options = dict(test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Data loaded: Train={X_train.shape}, Test={X_test.shape}")

## Train Multiple Model Variants

In [None]:
# Hyperparameter sets to compare; every variant uses the same model type.
model_configs = {
    'Baseline': dict(learning_rate=0.1, max_depth=6, n_estimators=100),
    'Deep Trees': dict(learning_rate=0.05, max_depth=10, n_estimators=100),
    'More Trees': dict(learning_rate=0.1, max_depth=6, n_estimators=300),
    'Conservative': dict(learning_rate=0.01, max_depth=3, n_estimators=200),
    'Aggressive': dict(learning_rate=0.2, max_depth=8, n_estimators=150),
}

# Per-variant results, all keyed by the variant name.
models, predictions, all_metrics = {}, {}, {}

print("Training models...\n")
for name in model_configs:
    print(f"Training {name}...")
    trainer = ModelTrainer(model_type='alert_scorer', **model_configs[name])
    model, metrics = trainer.train(X_train, y_train, cv_folds=5)

    models[name] = model
    # NOTE(review): later cells feed these values to roc_curve /
    # precision_recall_curve as scores -- confirm that predict() returns
    # probabilities or scores rather than hard class labels.
    predictions[name] = model.predict(X_test)
    all_metrics[name] = metrics

    # NOTE(review): the trainer is only given the training split, so its
    # 'test_auc' cannot be measured on this notebook's X_test -- presumably an
    # internal hold-out; verify in ModelTrainer.
    cv_line = f"  CV AUC: {metrics['cv_auc_mean']:.4f} ± {metrics['cv_auc_std']:.4f}"
    test_line = f"  Test AUC: {metrics['test_auc']:.4f}\n"
    print(cv_line)
    print(test_line)

## Metric Comparison Table

In [None]:
# Display column name -> key in the metrics dict returned by the trainer.
metric_columns = {
    'Test AUC': 'test_auc',
    'CV AUC': 'cv_auc_mean',
    'CV AUC Std': 'cv_auc_std',
    'CV Precision': 'cv_precision_mean',
    'CV Recall': 'cv_recall_mean',
    'CV F1': 'cv_f1_mean',
}

# One record per model, then rank by test AUC (best first).
comparison_data = [
    {'Model': model_name, **{col: model_metrics[key] for col, key in metric_columns.items()}}
    for model_name, model_metrics in all_metrics.items()
]
comparison_df = pd.DataFrame(comparison_data).sort_values('Test AUC', ascending=False)

# Plain-text table framed by 80-character rules.
rule = "=" * 80
print(
    "\n" + rule,
    "MODEL COMPARISON",
    rule,
    comparison_df.to_string(index=False),
    rule,
    sep="\n",
)

## Visual Comparison - Bar Charts

In [None]:
# 2x2 grid of horizontal bar charts, one panel per headline metric.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

metrics_to_plot = ['Test AUC', 'CV Precision', 'CV Recall', 'CV F1']
# axes.flat walks the grid row by row, matching the order of metrics_to_plot.
for ax, metric in zip(axes.flat, metrics_to_plot):
    # Sort ascending so the best model ends up at the top of the barh chart.
    # Deliberately NOT named `data`: that global holds the extracted training
    # data from the Load Data cell, and overwriting it here would break any
    # later re-run of the feature-building step.
    ranked = comparison_df.sort_values(metric, ascending=True)
    ax.barh(ranked['Model'], ranked[metric])
    ax.set_xlabel('Score')
    ax.set_title(metric)
    ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

## ROC Curve Comparison

In [None]:
from sklearn.metrics import roc_curve, auc

# One ROC curve per model, all evaluated against the same hold-out labels.
roc_fig, roc_ax = plt.subplots(figsize=(10, 8))

for model_name, scores in predictions.items():
    fpr, tpr, _thresholds = roc_curve(y_test, scores)
    roc_auc = auc(fpr, tpr)
    roc_ax.plot(fpr, tpr, linewidth=2, label=f'{model_name} (AUC={roc_auc:.3f})')

# Diagonal reference line for a random classifier.
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random', linewidth=1)

roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves - Model Comparison')
roc_ax.legend(loc='lower right')
roc_ax.grid(True, alpha=0.3)
roc_fig.tight_layout()
plt.show()

## Precision-Recall Curve Comparison

In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score

# One precision-recall curve per model on the shared hold-out labels; the
# legend carries each model's average precision.
pr_fig, pr_ax = plt.subplots(figsize=(10, 8))

for model_name, scores in predictions.items():
    precision, recall, _thresholds = precision_recall_curve(y_test, scores)
    ap = average_precision_score(y_test, scores)
    pr_ax.plot(recall, precision, linewidth=2, label=f'{model_name} (AP={ap:.3f})')

pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curves - Model Comparison')
pr_ax.legend(loc='best')
pr_ax.grid(True, alpha=0.3)
pr_fig.tight_layout()
plt.show()

## Radar Plot - Multi-Metric Comparison

In [None]:
from math import pi

metrics_for_radar = ['Test AUC', 'CV Precision', 'CV Recall', 'CV F1']
num_vars = len(metrics_for_radar)

# One evenly spaced spoke per metric; the first angle is repeated at the end
# so each model's polygon closes.
angles = [n / float(num_vars) * 2 * pi for n in range(num_vars)]
angles.append(angles[0])

fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={'projection': 'polar'})

# Draw one outlined, lightly filled polygon per model, in table order.
for record in comparison_df.to_dict('records'):
    values = [record[m] for m in metrics_for_radar]
    values.append(values[0])
    ax.plot(angles, values, 'o-', linewidth=2, label=record['Model'])
    ax.fill(angles, values, alpha=0.15)

# Label the spokes (skipping the duplicated closing angle) and fix the radial
# axis to [0, 1].
ax.set_xticks(angles[:-1])
ax.set_xticklabels(metrics_for_radar)
ax.set_ylim(0, 1)
ax.set_title('Multi-Metric Model Comparison', size=14, y=1.08)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
ax.grid(True)

plt.tight_layout()
plt.show()

## Statistical Significance Testing

In [None]:
from scipy.stats import wilcoxon


def significance_label(p_value):
    """Map a p-value to the star notation explained in the printed legend.

    Returns "***" for p < 0.001, "**" for p < 0.01, "*" for p < 0.05 and
    "ns" otherwise (including NaN, which fails every comparison).
    """
    if p_value < 0.001:
        return "***"
    if p_value < 0.01:
        return "**"
    if p_value < 0.05:
        return "*"
    return "ns"


# The reference model is the top row of the comparison table (highest Test AUC).
best_model = comparison_df.iloc[0]['Model']
best_predictions = predictions[best_model]

print(f"\nTesting {best_model} against other models (Wilcoxon signed-rank test):")
print("="*60)

# Note: this is a paired test on the per-sample prediction values of two
# models. It asks whether the predictions differ systematically, not whether
# the AUCs differ.
for name, y_pred_proba in predictions.items():
    if name == best_model:
        continue

    # With identical predictions every paired difference is zero and the
    # signed-rank test is undefined (SciPy raises or returns NaN depending on
    # version), so report that case explicitly instead of calling the test.
    if np.array_equal(best_predictions, y_pred_proba):
        print(f"{name:20s}: identical predictions (test not applicable)")
        continue

    statistic, p_value = wilcoxon(best_predictions, y_pred_proba)

    significance = significance_label(p_value)

    print(f"{name:20s}: p={p_value:.4f} {significance}")

print("\nSignificance: *** p<0.001, ** p<0.01, * p<0.05, ns not significant")

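## Cross-Validation Score Distribution (Simulated)

*Note: each violin is drawn from samples of a normal distribution parameterised by that model's CV AUC mean and standard deviation, not from the individual fold scores.*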

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# Only the CV mean and std are available per model, not the fold scores, so
# each violin is an approximation: 100 draws from Normal(mean, std).
# A generator seeded from RANDOM_STATE keeps the figure identical across
# Restart & Run All; the previous unseeded global RNG changed it on every run.
rng = np.random.default_rng(RANDOM_STATE)

cv_data = []
labels = []
for name, metrics in all_metrics.items():
    cv_mean = metrics['cv_auc_mean']
    cv_std = metrics['cv_auc_std']
    cv_scores = rng.normal(cv_mean, cv_std, 100)
    cv_data.append(cv_scores)
    labels.append(name)

violin_parts = ax.violinplot(cv_data, positions=range(len(labels)),
                              showmeans=True, showmedians=True)
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels, rotation=45, ha='right')
ax.set_ylabel('CV AUC Score')
# The title says the data are simulated so readers do not take the violins
# for the real fold-level spread.
ax.set_title('Cross-Validation Score Distribution (simulated from CV mean ± std)')
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

## Model Ensemble - Simple Average

In [None]:
# Simple ensemble: unweighted mean of every model's hold-out predictions.
model_names = list(predictions.keys())
ensemble_pred = np.mean([predictions[name] for name in model_names], axis=0)
ensemble_auc = roc_auc_score(y_test, ensemble_pred)
ensemble_ap = average_precision_score(y_test, ensemble_pred)

print("\n=== Ensemble Model (Simple Average) ===")
print(f"Test AUC: {ensemble_auc:.4f}")
print(f"Average Precision: {ensemble_ap:.4f}")

# Score the individual models on the same hold-out set (y_test) as the
# ensemble. The 'Test AUC' reported by the trainer cannot be used here: the
# trainer was only given the training split, so its figure comes from a
# different sample and is not comparable with ensemble_auc.
individual_rows = [
    {'Model': name, 'Test AUC': roc_auc_score(y_test, predictions[name])}
    for name in model_names
]
ensemble_comparison = pd.DataFrame(
    individual_rows + [{'Model': 'Ensemble (Avg)', 'Test AUC': ensemble_auc}]
).sort_values('Test AUC', ascending=False)

print("\nEnsemble vs Individual Models:")
print(ensemble_comparison.to_string(index=False))

## Model Ensemble - Weighted Average

In [None]:
# Weighted ensemble: each model contributes in proportion to its mean CV AUC.
# Freeze the model order once so weights and predictions stay aligned.
model_order = list(predictions.keys())

weights = np.array([all_metrics[m]['cv_auc_mean'] for m in model_order])
weights = weights / weights.sum()  # normalise so the weights sum to 1

stacked_predictions = [predictions[m] for m in model_order]
weighted_ensemble_pred = np.average(stacked_predictions, axis=0, weights=weights)

# Evaluate the blended scores on the shared hold-out labels.
weighted_ensemble_auc = roc_auc_score(y_test, weighted_ensemble_pred)
weighted_ensemble_ap = average_precision_score(y_test, weighted_ensemble_pred)

print("\n=== Ensemble Model (Weighted by CV AUC) ===")
print(f"Test AUC: {weighted_ensemble_auc:.4f}")
print(f"Average Precision: {weighted_ensemble_ap:.4f}")

print("\nWeights:")
for m, w in zip(model_order, weights):
    print(f"  {m:20s}: {w:.4f}")

## Final Comparison with Ensembles

In [None]:
# Rank individual models and ensembles on ONE evaluation set. The ensemble
# AUCs are computed on y_test, so the individual models are scored on y_test
# as well. The trainer's own 'test_auc' is not used here because the trainer
# never saw X_test, which would mix two different samples in a single ranking.
final_comparison = {
    **{name: roc_auc_score(y_test, predictions[name]) for name in predictions.keys()},
    'Ensemble (Avg)': ensemble_auc,
    'Ensemble (Weighted)': weighted_ensemble_auc
}

plot_metric_comparison(final_comparison, 'Final Model Comparison (Test AUC)')
plt.ylabel('AUC Score')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Text ranking, best first.
ranked_models = sorted(final_comparison.items(), key=lambda item: item[1], reverse=True)
print("\nRanking:")
for idx, (name, score) in enumerate(ranked_models, 1):
    print(f"{idx}. {name:25s}: {score:.4f}")

## Best Model Selection

In [None]:
# Pick the entry with the highest AUC; on ties the earliest entry in
# final_comparison wins.
best_model_name = max(final_comparison, key=final_comparison.get)
best_score = final_comparison[best_model_name]

banner = "=" * 70
print("\n" + banner)
print("BEST MODEL SELECTION")
print(banner)
print(f"Selected Model: {best_model_name}")
print(f"Test AUC: {best_score:.4f}")

# The ensembles have no entry in all_metrics, so the CV breakdown is printed
# only when an individual model wins.
if best_model_name in all_metrics:
    metrics = all_metrics[best_model_name]
    detail_lines = [
        "\nDetailed Metrics:",
        f"  CV AUC: {metrics['cv_auc_mean']:.4f} ± {metrics['cv_auc_std']:.4f}",
        f"  CV Precision: {metrics['cv_precision_mean']:.4f}",
        f"  CV Recall: {metrics['cv_recall_mean']:.4f}",
        f"  CV F1: {metrics['cv_f1_mean']:.4f}",
    ]
    print(*detail_lines, sep="\n")

print(banner)

## Load Models from ClickHouse (Optional)

In [None]:
# Optional step, disabled by default: as shipped, this cell only prints the
# notice at the bottom. The commented block opens a client from the existing
# client_factory, asks ModelStorage for up to 10 'alert_scorer' models for
# NETWORK, and prints each one's id, training time and metrics.
# Uncomment to load saved models from ClickHouse
# with client_factory.client_context() as client:
#     storage = ModelStorage(client)
#     saved_models = storage.list_models(
#         network=NETWORK,
#         model_type='alert_scorer',
#         limit=10
#     )
#     
#     print("\nSaved Models in ClickHouse:")
#     for model_info in saved_models:
#         print(f"  ID: {model_info['model_id']}")
#         print(f"  Trained: {model_info['trained_at']}")
#         print(f"  Metrics: {model_info['metrics']}")
#         print()

print("Model loading from ClickHouse disabled (uncomment to enable)")

## Conclusions

**Model Selection Results**:

1. **Best Individual Model**: Review top performer
2. **Ensemble Performance**: Compare ensemble vs individual models
3. **Statistical Significance**: Confirm that the differences between models are statistically significant, not just numerically different
4. **Trade-offs**: Consider complexity vs performance

**Key Insights**:
- Model configuration impact on performance
- Ensemble benefits vs added complexity
- Cross-validation stability

**Next Steps**:
- Deploy selected model to production
- Review Feature Importance for best model
- Analyze errors in Error Analysis notebook
- Monitor performance in production