# 05_metrics_dashboard — Model Comparison Dashboard

This notebook provides a comprehensive comparison between different models:
- **CNN**: 1D Convolutional Neural Network with global/local views
- **XGBoost**: Gradient Boosting with engineered features
- **Other models**: Random Forest, SVM, etc. (if available)

Metrics compared:
- PR-AUC / ROC-AUC
- Calibration (ECE, Brier Score)
- Inference Latency
- Model Size and Complexity (listed for completeness; not computed by the cells in this notebook)

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: dark-grid matplotlib style first, then the seaborn 'husl'
# palette on top of it (the palette call must come second to take effect).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Plotly is optional: it is only needed for the interactive dashboard.
# `plotly_available` records whether the import succeeded.
try:
    import plotly.graph_objects as go
    import plotly.express as px
    from plotly.subplots import make_subplots
except ImportError:
    plotly_available = False
    print("⚠ Plotly not available; interactive plots disabled")
else:
    plotly_available = True

interactive_status = '✓ Available' if plotly_available else '✗ Not available'
print("Dashboard initialized")
print(f"Interactive plots: {interactive_status}")

## 1. Load Model Metrics

In [None]:
def load_metrics(reports_dir='../reports'):
    """Load metrics from all available models.

    Scans ``reports_dir`` for the known metrics file names and parses each
    match as JSON.

    Args:
        reports_dir: Directory containing the ``metrics_*.json`` /
            ``inference_metrics_*.json`` report files.

    Returns:
        dict mapping model name -> parsed JSON object. The model name is taken
        from the ``model`` key, then the ``model_type`` key, and otherwise
        derived from the file name (prefix stripped, upper-cased). When two
        files resolve to the same name, the one loaded later replaces the
        earlier one.

    Files that cannot be read or parsed are reported on stdout and skipped;
    this function does not raise for a bad report file.
    """
    reports_path = Path(reports_dir)
    metrics = {}

    # Patterns to search for
    patterns = [
        'metrics_cnn.json',
        'metrics_xgb.json',
        'metrics_xgboost.json',
        'metrics_rf.json',
        'metrics_svm.json',
        'inference_metrics_*.json'
    ]

    # Longest prefix first: 'inference_metrics_' contains 'metrics_', so
    # stripping the short one first would leave 'inference_' in the name.
    filename_prefixes = ('inference_metrics_', 'metrics_')

    for pattern in patterns:
        # sorted() makes the load (and overwrite) order independent of the
        # order in which the filesystem happens to list the files.
        for file_path in sorted(reports_path.glob(pattern)):
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                if not isinstance(data, dict):
                    raise ValueError(
                        f"expected a JSON object, got {type(data).__name__}"
                    )

                # Extract model name
                if 'model' in data:
                    model_name = data['model']
                elif 'model_type' in data:
                    model_name = data['model_type']
                else:
                    # Infer from filename
                    stem = file_path.stem
                    for prefix in filename_prefixes:
                        if stem.startswith(prefix):
                            stem = stem[len(prefix):]
                            break
                    model_name = stem.upper()

                metrics[model_name] = data
                print(f"✓ Loaded metrics for {model_name} from {file_path.name}")

            # OSError: unreadable file; ValueError: invalid JSON / bad encoding /
            # non-object payload; TypeError: unhashable model name in the file.
            except (OSError, ValueError, TypeError) as e:
                print(f"✗ Failed to load {file_path}: {e}")

    return metrics

# Read every metrics report that exists on disk.
all_metrics = load_metrics()

# Nothing on disk: fall back to hand-written demo numbers so the rest of the
# notebook can still be executed end to end.
if len(all_metrics) == 0:
    print("\n⚠ No metrics found. Generating synthetic data for demonstration...")
    all_metrics = {
        'CNN': dict(
            model='CNN1D',
            performance=dict(
                test_accuracy=0.945,
                test_precision=0.932,
                test_recall=0.958,
                test_f1=0.945,
                test_roc_auc=0.982,
                test_pr_auc=0.978,
            ),
            calibration=dict(
                ece_before=0.082,
                ece_after=0.021,
                brier_before=0.095,
                brier_after=0.052,
            ),
            # Keyed by batch size (as a string, like JSON object keys).
            inference_speed={
                '1': dict(latency_ms=2.3, throughput_samples_per_sec=435),
                '32': dict(latency_ms=8.7, throughput_samples_per_sec=3678),
            },
        ),
        'XGBoost': dict(
            model='XGBoost',
            performance=dict(
                test_accuracy=0.912,
                test_precision=0.895,
                test_recall=0.931,
                test_f1=0.913,
                test_roc_auc=0.958,
                test_pr_auc=0.952,
            ),
            calibration=dict(
                ece_before=0.125,
                ece_after=0.038,
                brier_before=0.132,
                brier_after=0.071,
            ),
            inference_speed={
                '1': dict(latency_ms=0.8, throughput_samples_per_sec=1250),
                '32': dict(latency_ms=3.2, throughput_samples_per_sec=10000),
            },
        ),
    }

print(f"\nLoaded metrics for {len(all_metrics)} models: {list(all_metrics)}")

## 2. Performance Comparison Table

In [None]:
# Create comparison dataframe: one row per model, one column per metric.
#
# A metric that a model's report does not contain is left out of that model's
# row, so it shows up as NaN in the DataFrame. It is deliberately NOT defaulted
# to 0: a 0 latency or 0 ECE would otherwise win every "lower is better"
# comparison further down the notebook.

# (output column, key in the metrics JSON, scale factor).
# Accuracy/Precision/Recall/F1 are stored as percentages (0-100).
perf_fields = [
    ('Accuracy', 'test_accuracy', 100),
    ('Precision', 'test_precision', 100),
    ('Recall', 'test_recall', 100),
    ('F1', 'test_f1', 100),
    ('ROC-AUC', 'test_roc_auc', 1),
    ('PR-AUC', 'test_pr_auc', 1),
]
calibration_fields = [
    ('ECE (before)', 'ece_before', 1),
    ('ECE (after)', 'ece_after', 1),
    ('Brier (after)', 'brier_after', 1),
]
speed_fields = [
    ('Latency (ms)', 'latency_ms', 1),
    ('Throughput', 'throughput_samples_per_sec', 1),
]


def _copy_metrics(row, section, fields):
    """Copy the metrics present in ``section`` into ``row`` (absent/null keys are skipped)."""
    for column, key, scale in fields:
        value = section.get(key)
        if value is not None:
            row[column] = value * scale


comparison_data = []

for model_name, metrics in all_metrics.items():
    row = {'Model': model_name}

    # Performance metrics
    if 'performance' in metrics:
        _copy_metrics(row, metrics['performance'], perf_fields)

    # Calibration metrics
    if 'calibration' in metrics:
        _copy_metrics(row, metrics['calibration'], calibration_fields)

    # Inference speed (batch size 32; batch sizes are string keys)
    if 'inference_speed' in metrics:
        if '32' in metrics['inference_speed']:
            _copy_metrics(row, metrics['inference_speed']['32'], speed_fields)

    comparison_data.append(row)

# Create DataFrame
comparison_df = pd.DataFrame(comparison_data)

# Sort by ROC-AUC (models without a ROC-AUC value are placed last)
if 'ROC-AUC' in comparison_df.columns:
    comparison_df = comparison_df.sort_values('ROC-AUC', ascending=False)

# Display with formatting
print("\n" + "="*80)
print("MODEL PERFORMANCE COMPARISON")
print("="*80)


def _formatter(template):
    """Return a cell formatter that renders missing values as 'n/a'."""
    return lambda x: template.format(x) if pd.notna(x) else "n/a"


# Format for display (comparison_df itself keeps the numeric values)
display_df = comparison_df.copy()

# Format percentage columns
pct_cols = ['Accuracy', 'Precision', 'Recall', 'F1']
for col in pct_cols:
    if col in display_df.columns:
        display_df[col] = display_df[col].apply(_formatter("{:.1f}%"))

# Format float columns
float_cols = ['ROC-AUC', 'PR-AUC', 'ECE (before)', 'ECE (after)', 'Brier (after)']
for col in float_cols:
    if col in display_df.columns:
        display_df[col] = display_df[col].apply(_formatter("{:.3f}"))

# Format latency
if 'Latency (ms)' in display_df.columns:
    display_df['Latency (ms)'] = display_df['Latency (ms)'].apply(_formatter("{:.1f}"))

# Format throughput
if 'Throughput' in display_df.columns:
    display_df['Throughput'] = display_df['Throughput'].apply(_formatter("{:.0f}/s"))

print(display_df.to_string(index=False))

# Highlight best performers
print("\n" + "-"*80)
if len(comparison_df) > 1:
    print("Best Performers:")
    for col in ['ROC-AUC', 'PR-AUC', 'F1']:
        # idxmax is only meaningful when at least one model has a value
        if col in comparison_df.columns and comparison_df[col].notna().any():
            best_model = comparison_df.loc[comparison_df[col].idxmax(), 'Model']
            best_value = comparison_df[col].max()
            if col == 'F1':
                print(f"  {col:12s}: {best_model} ({best_value:.1f}%)")
            else:
                print(f"  {col:12s}: {best_model} ({best_value:.3f})")

## 3. Visual Comparisons

In [None]:
# Create comparison visualizations: a 2x3 grid of matplotlib panels.
# Each panel is drawn only when the columns it needs exist in comparison_df.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Shared by the grouped-bar panels (1 and 3)
models = comparison_df['Model'].values
x = np.arange(len(models))
width = 0.35

# 1. ROC-AUC and PR-AUC comparison
if 'ROC-AUC' in comparison_df.columns and 'PR-AUC' in comparison_df.columns:
    axes[0,0].bar(x - width/2, comparison_df['ROC-AUC'], width, label='ROC-AUC', color='skyblue')
    axes[0,0].bar(x + width/2, comparison_df['PR-AUC'], width, label='PR-AUC', color='lightcoral')
    axes[0,0].set_xlabel('Model')
    axes[0,0].set_ylabel('AUC Score')
    axes[0,0].set_title('ROC-AUC vs PR-AUC')
    axes[0,0].set_xticks(x)
    axes[0,0].set_xticklabels(models)
    axes[0,0].legend()
    axes[0,0].set_ylim([0, 1])
    axes[0,0].grid(True, alpha=0.3)

# 2. F1 Score comparison
if 'F1' in comparison_df.columns:
    axes[0,1].bar(comparison_df['Model'], comparison_df['F1'], color='mediumseagreen')
    axes[0,1].set_xlabel('Model')
    axes[0,1].set_ylabel('F1 Score (%)')
    axes[0,1].set_title('F1 Score Comparison')
    axes[0,1].set_ylim([0, 100])
    axes[0,1].grid(True, alpha=0.3)

    # Add value labels just above each bar
    for i, v in enumerate(comparison_df['F1']):
        axes[0,1].text(i, v + 1, f'{v:.1f}%', ha='center')

# 3. Calibration comparison (ECE)
if 'ECE (before)' in comparison_df.columns and 'ECE (after)' in comparison_df.columns:
    axes[0,2].bar(x - width/2, comparison_df['ECE (before)'], width, label='Before Cal.', color='salmon')
    axes[0,2].bar(x + width/2, comparison_df['ECE (after)'], width, label='After Cal.', color='lightgreen')
    axes[0,2].set_xlabel('Model')
    axes[0,2].set_ylabel('ECE')
    axes[0,2].set_title('Expected Calibration Error')
    axes[0,2].set_xticks(x)
    axes[0,2].set_xticklabels(models)
    axes[0,2].legend()
    axes[0,2].grid(True, alpha=0.3)

# 4. Precision vs Recall
if 'Precision' in comparison_df.columns and 'Recall' in comparison_df.columns:
    axes[1,0].scatter(comparison_df['Recall'], comparison_df['Precision'], s=200)
    for model, recall, precision in zip(comparison_df['Model'],
                                        comparison_df['Recall'],
                                        comparison_df['Precision']):
        axes[1,0].annotate(model,
                          (recall, precision),
                          textcoords="offset points", xytext=(0,10), ha='center')
    axes[1,0].set_xlabel('Recall (%)')
    axes[1,0].set_ylabel('Precision (%)')
    axes[1,0].set_title('Precision vs Recall')
    axes[1,0].set_xlim([0, 100])
    axes[1,0].set_ylim([0, 100])
    axes[1,0].grid(True, alpha=0.3)

# 5. Latency comparison
if 'Latency (ms)' in comparison_df.columns:
    axes[1,1].bar(comparison_df['Model'], comparison_df['Latency (ms)'], color='plum')
    axes[1,1].set_xlabel('Model')
    axes[1,1].set_ylabel('Latency (ms)')
    axes[1,1].set_title('Inference Latency (Batch=32)')
    axes[1,1].grid(True, alpha=0.3)

    # Add value labels
    for i, v in enumerate(comparison_df['Latency (ms)']):
        axes[1,1].text(i, v + 0.1, f'{v:.1f}', ha='center')

# 6. Throughput comparison
if 'Throughput' in comparison_df.columns:
    axes[1,2].bar(comparison_df['Model'], comparison_df['Throughput'], color='gold')
    axes[1,2].set_xlabel('Model')
    axes[1,2].set_ylabel('Samples/sec')
    axes[1,2].set_title('Inference Throughput (Batch=32)')
    axes[1,2].grid(True, alpha=0.3)

    # Log scale if values vary greatly. Only considered when every known
    # throughput is positive: a zero would make the ratio a division by zero
    # and cannot be drawn on a log axis anyway.
    throughput = comparison_df['Throughput'].dropna()
    if len(throughput) > 0 and throughput.min() > 0:
        if throughput.max() / throughput.min() > 10:
            axes[1,2].set_yscale('log')

plt.suptitle('Model Performance Dashboard', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()

# The reports directory may not exist yet (e.g. when the notebook is running on
# the synthetic demo data), so create it before saving.
dashboard_path = Path('../reports/model_comparison_dashboard.png')
dashboard_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(dashboard_path, dpi=150, bbox_inches='tight')
plt.show()

print("\n✓ Dashboard saved to ../reports/model_comparison_dashboard.png")

## 4. Interactive Dashboard (Plotly)

In [None]:
# Interactive version of the dashboard: the same six panels, built with plotly.
# Skipped when plotly is not installed or there is nothing to plot.
if plotly_available and len(comparison_df) > 0:
    # Create interactive subplot
    fig = make_subplots(
        rows=2, cols=3,
        subplot_titles=(
            'ROC-AUC vs PR-AUC', 'F1 Score', 'Calibration (ECE)',
            'Precision vs Recall', 'Inference Latency', 'Throughput'
        ),
        specs=[
            [{'type': 'bar'}, {'type': 'bar'}, {'type': 'bar'}],
            [{'type': 'scatter'}, {'type': 'bar'}, {'type': 'bar'}]
        ]
    )

    # ROC-AUC vs PR-AUC (each trace is guarded by its own column)
    if 'ROC-AUC' in comparison_df.columns:
        fig.add_trace(
            go.Bar(name='ROC-AUC', x=comparison_df['Model'], y=comparison_df['ROC-AUC'],
                  marker_color='lightblue'),
            row=1, col=1
        )
    if 'PR-AUC' in comparison_df.columns:
        fig.add_trace(
            go.Bar(name='PR-AUC', x=comparison_df['Model'], y=comparison_df['PR-AUC'],
                  marker_color='lightcoral'),
            row=1, col=1
        )

    # F1 Score
    if 'F1' in comparison_df.columns:
        fig.add_trace(
            go.Bar(x=comparison_df['Model'], y=comparison_df['F1'],
                  marker_color='mediumseagreen', showlegend=False),
            row=1, col=2
        )

    # ECE (post-calibration values only)
    if 'ECE (after)' in comparison_df.columns:
        fig.add_trace(
            go.Bar(x=comparison_df['Model'], y=comparison_df['ECE (after)'],
                  marker_color='lightgreen', showlegend=False),
            row=1, col=3
        )

    # Precision vs Recall
    if 'Precision' in comparison_df.columns and 'Recall' in comparison_df.columns:
        fig.add_trace(
            go.Scatter(
                x=comparison_df['Recall'], y=comparison_df['Precision'],
                mode='markers+text',
                text=comparison_df['Model'],
                textposition='top center',
                # One numeric colour index per model; passed as a list rather
                # than a bare range object, which plotly does not treat as an array.
                marker=dict(size=15, color=list(range(len(comparison_df)))),
                showlegend=False
            ),
            row=2, col=1
        )

    # Latency
    if 'Latency (ms)' in comparison_df.columns:
        fig.add_trace(
            go.Bar(x=comparison_df['Model'], y=comparison_df['Latency (ms)'],
                  marker_color='plum', showlegend=False),
            row=2, col=2
        )

    # Throughput
    if 'Throughput' in comparison_df.columns:
        fig.add_trace(
            go.Bar(x=comparison_df['Model'], y=comparison_df['Throughput'],
                  marker_color='gold', showlegend=False),
            row=2, col=3
        )

    # Update layout
    fig.update_layout(
        height=700,
        showlegend=True,
        title_text="<b>Interactive Model Comparison Dashboard</b>",
        title_font_size=18
    )

    # Update axes
    fig.update_xaxes(title_text="Model", row=1, col=1)
    fig.update_yaxes(title_text="Score", row=1, col=1)

    fig.update_xaxes(title_text="Recall (%)", row=2, col=1)
    fig.update_yaxes(title_text="Precision (%)", row=2, col=1)

    # Save and show. The reports directory may not exist yet (e.g. when running
    # on the synthetic demo data), so create it first.
    Path('../reports').mkdir(parents=True, exist_ok=True)
    fig.write_html('../reports/interactive_dashboard.html')
    fig.show()

    print("\n✓ Interactive dashboard saved to ../reports/interactive_dashboard.html")
else:
    if not plotly_available:
        print("\n⚠ Plotly not available. Install with: pip install plotly")
    else:
        print("\n⚠ No data available for interactive dashboard")

## 5. Model Trade-off Analysis

In [None]:
# Per-model trade-off report followed by scenario-based recommendations.
banner = "=" * 80
print("\n" + banner)
print("MODEL TRADE-OFF ANALYSIS")
print(banner)

# (column, Series method that yields the best value, label shown for the winner)
strength_rules = (
    ('ROC-AUC', 'max', "Best ROC-AUC"),
    ('F1', 'max', "Best F1 Score"),
    ('Latency (ms)', 'min', "Lowest latency"),
    ('Throughput', 'max', "Highest throughput"),
    ('ECE (after)', 'min', "Best calibrated"),
)

# The per-model breakdown is only printed when there is something to compare.
if len(comparison_df) > 1:
    for _, record in comparison_df.iterrows():
        print(f"\n{record['Model']}:")
        print("-" * 40)

        # A model earns a strength when its value equals the column's best value.
        strengths = [
            label
            for column, pick_best, label in strength_rules
            if column in record
            and record[column] == getattr(comparison_df[column], pick_best)()
        ]
        if strengths:
            print(f"  Strengths: {', '.join(strengths)}")

        # Key metrics
        if 'ROC-AUC' in record:
            print(f"  ROC-AUC: {record['ROC-AUC']:.3f}")
        if 'F1' in record:
            print(f"  F1 Score: {record['F1']:.1f}%")
        if 'Latency (ms)' in record:
            print(f"  Latency: {record['Latency (ms)']:.1f} ms")
        if 'Throughput' in record:
            print(f"  Throughput: {record['Throughput']:.0f} samples/sec")

# Recommendations
print("\n" + banner)
print("RECOMMENDATIONS")
print(banner)

if len(comparison_df) > 0:
    available_columns = comparison_df.columns

    # Highest ROC-AUC
    if 'ROC-AUC' in available_columns:
        top_auc_label = comparison_df['ROC-AUC'].idxmax()
        best_acc = comparison_df.loc[top_auc_label, 'Model']
        print(f"\n📊 For maximum accuracy: Use {best_acc}")

    # Lowest latency
    if 'Latency (ms)' in available_columns:
        fastest_label = comparison_df['Latency (ms)'].idxmin()
        best_speed = comparison_df.loc[fastest_label, 'Model']
        print(f"⚡ For real-time inference: Use {best_speed}")

    # Highest F1
    if 'F1' in available_columns:
        top_f1_label = comparison_df['F1'].idxmax()
        best_f1 = comparison_df.loc[top_f1_label, 'Model']
        print(f"⚖️  For balanced performance: Use {best_f1}")

    # Fixed notes for the two named model families, shown only if present
    model_names = comparison_df['Model'].values
    if 'CNN' in model_names:
        print("\n🔍 CNN: Best for complex patterns, requires GPU for optimal performance")
    if 'XGBoost' in model_names:
        print("🌲 XGBoost: Good balance of accuracy and speed, works well on CPU")

## 6. Export Summary Report

In [None]:
# Generate summary report: a JSON summary plus a CSV copy of the comparison table.

# The reports directory may not exist yet (e.g. when the notebook is running on
# the synthetic demo data), so create it before writing anything.
reports_dir = Path('../reports')
reports_dir.mkdir(parents=True, exist_ok=True)

# Missing metrics are NaN in the DataFrame; json.dump would write them as the
# non-standard token `NaN`, so they are exported as null instead.
comparison_records = [
    {key: (None if pd.isna(value) else value) for key, value in record.items()}
    for record in comparison_df.to_dict('records')
]

summary = {
    'n_models': len(comparison_df),
    'models': list(comparison_df['Model'].values),
    'best_performers': {},
    'comparison_table': comparison_records
}

# Find best performers: (column, whether higher or lower is better)
metrics_to_check = [
    ('ROC-AUC', 'max'),
    ('PR-AUC', 'max'),
    ('F1', 'max'),
    ('Accuracy', 'max'),
    ('Latency (ms)', 'min'),
    ('Throughput', 'max'),
    ('ECE (after)', 'min')
]

for metric, op in metrics_to_check:
    if metric not in comparison_df.columns:
        continue

    column = comparison_df[metric]
    # No model reported this metric: there is no best performer to record
    # (idxmax/idxmin are not meaningful on an all-NaN column).
    if not column.notna().any():
        continue

    if op == 'max':
        best_idx = column.idxmax()
        best_value = column.max()
    else:
        best_idx = column.idxmin()
        best_value = column.min()

    summary['best_performers'][metric] = {
        'model': comparison_df.loc[best_idx, 'Model'],
        'value': float(best_value)
    }

# Save summary
summary_file = Path('../reports/model_comparison_summary.json')
with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print(f"\n✓ Summary report saved to {summary_file}")

# Also save comparison table as CSV
csv_file = Path('../reports/model_comparison_table.csv')
comparison_df.to_csv(csv_file, index=False)
print(f"✓ Comparison table saved to {csv_file}")

print("\n" + "="*80)
print("✅ Dashboard generation complete!")
print("="*80)
print("\nGenerated files:")
print("  📊 model_comparison_dashboard.png - Visual comparisons")
print("  📈 interactive_dashboard.html - Interactive plots (if plotly available)")
print("  📄 model_comparison_summary.json - Detailed metrics")
print("  📋 model_comparison_table.csv - Comparison table")

## Summary

This dashboard provides comprehensive model comparison including:

1. **Performance Metrics**: Accuracy, Precision, Recall, F1, ROC-AUC, PR-AUC
2. **Calibration Analysis**: ECE before/after calibration and the post-calibration Brier score
3. **Efficiency Metrics**: Inference latency and throughput
4. **Trade-off Analysis**: Strengths and key metrics of each model
5. **Recommendations**: Which model to use for different scenarios

The dashboard supports multiple models and automatically adapts to available data.