# Model Comparison and Analysis

## Setup and Configuration

In [None]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
from datetime import datetime

# Suppress all warnings for the rest of the session, then reset matplotlib to
# its default style and select the seaborn colour palette.
warnings.filterwarnings('ignore')
plt.style.use('default')
sns.set_palette("husl")

# Configuration: where training results are read from, where analysis
# artifacts are written, and figure rendering options.
CONFIG = dict(
    results_dir="model_training_results",
    output_dir="model_comparison_analysis",
    plot_style=dict(
        figsize=(15, 8),
        dpi=300,
        style="whitegrid",
    ),
)

# Make sure both artifact folders exist before anything is written to them.
for subdir in ('plots', 'reports'):
    os.makedirs(f"{CONFIG['output_dir']}/{subdir}", exist_ok=True)

# Banner: one line per item, emitted with a single print call.
print(
    "📊 Model Comparison and Analysis",
    f"Results directory: {CONFIG['results_dir']}",
    f"Output directory: {CONFIG['output_dir']}",
    sep="\n",
)

## Load Model Results

In [None]:
def load_all_model_metrics(results_dir):
    """Load the metrics of every successful run from ``<results_dir>/metrics``.

    Each ``*.json`` file in the metrics directory is expected to hold one JSON
    object. A file is skipped when:

    * its name ends with ``_error.json``;
    * it cannot be read or parsed (a warning is printed);
    * its top-level value is not a JSON object (a warning is printed);
    * the object has an ``error`` key, or has no non-null ``accuracy`` value.

    Files are processed in sorted filename order so the resulting row order
    (and therefore tie-breaking in later sorts) is reproducible across
    platforms; ``os.listdir`` itself returns entries in arbitrary order.

    Args:
        results_dir: Root directory of the training results.

    Returns:
        A ``pandas.DataFrame`` with one row per accepted metrics file, or an
        empty DataFrame when the directory is missing or nothing is usable.
    """
    metrics_dir = f"{results_dir}/metrics"

    # isdir (not exists): a regular file at this path would make listdir fail.
    if not os.path.isdir(metrics_dir):
        print(f"❌ Metrics directory not found: {metrics_dir}")
        return pd.DataFrame()

    json_files = sorted(
        name for name in os.listdir(metrics_dir)
        if name.endswith('.json') and not name.endswith('_error.json')
    )

    print(f"📁 Found {len(json_files)} metric files")

    all_metrics = []
    for filename in json_files:
        filepath = os.path.join(metrics_dir, filename)
        try:
            # JSON is UTF-8 by specification; do not rely on the locale default.
            with open(filepath, 'r', encoding='utf-8') as f:
                metrics = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"⚠️ Could not load {filename}: {e}")
            continue

        if not isinstance(metrics, dict):
            print(
                f"⚠️ Could not load {filename}: "
                f"expected a JSON object, got {type(metrics).__name__}"
            )
            continue

        # Skip error files or incomplete metrics
        if 'error' in metrics or metrics.get('accuracy') is None:
            continue

        all_metrics.append(metrics)

    if not all_metrics:
        print("❌ No valid metrics found!")
        return pd.DataFrame()

    df = pd.DataFrame(all_metrics)
    print(f"✅ Loaded {len(df)} successful model runs")

    return df

def load_label_mapping(results_dir):
    """Load the label-encoder mapping saved alongside the training results.

    Reads ``<results_dir>/label_encoders/label_mapping.json`` and returns the
    decoded JSON value unchanged.

    Args:
        results_dir: Root directory of the training results.

    Returns:
        The parsed mapping, or an empty dict (after printing a warning) when
        the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    mapping_path = f"{results_dir}/label_encoders/label_mapping.json"

    try:
        # Explicit UTF-8: label names may be non-ASCII and the platform
        # default encoding is not UTF-8 everywhere.
        with open(mapping_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        # Opening directly (instead of exists-then-open) avoids a race with
        # the file disappearing between the check and the read.
        print("⚠️ Label mapping not found")
        return {}

# Read the per-run metrics table and the label mapping from the configured
# results directory (metrics first, then labels).
df_metrics, label_mapping = (
    load_all_model_metrics(CONFIG["results_dir"]),
    load_label_mapping(CONFIG["results_dir"]),
)

# Report whether there is anything to analyze.
if not df_metrics.empty:
    print(f"📊 Ready to analyze {len(df_metrics)} model configurations")
else:
    print("❌ No results found! Run training script first.")

## Process and Clean Data

In [None]:
def add_derived_metrics(df):
    """Return a copy of *df* with display columns added, best F1-macro first.

    Added columns:

    * ``model_short_name`` - last ``/``-separated component of ``model_name``.
    * ``display_name`` - ``<model_short_name>_config_<config_id>``.
    * ``learning_rate``, ``batch_size``, ``max_length``, ``epochs`` - pulled
      out of the ``hyperparameters`` dict when that column exists; the string
      ``'N/A'`` is used when the entry is not a dict or lacks the key.

    The caller's DataFrame is left untouched: the derived columns are written
    to a copy, so re-running the analysis on the raw table is safe.

    Args:
        df: Raw metrics table with ``model_name``, ``config_id`` and
            ``f1_macro`` columns.

    Returns:
        The augmented table sorted by ``f1_macro`` descending. Ties keep
        their input order. An empty input is returned as-is.
    """
    if df.empty:
        return df

    # Work on a copy so the derived columns do not leak into the caller's frame.
    df = df.copy()

    # Extract model short names
    df['model_short_name'] = df['model_name'].str.split('/').str[-1]

    # Create display names
    df['display_name'] = df['model_short_name'] + '_config_' + df['config_id'].astype(str)

    # Add hyperparameter info if available
    if 'hyperparameters' in df.columns:
        for key in ('learning_rate', 'batch_size', 'max_length', 'epochs'):
            # key=key binds the current name; a plain closure would see only
            # the last loop value if the lambda were ever called later.
            df[key] = df['hyperparameters'].apply(
                lambda hp, key=key: hp.get(key, 'N/A') if isinstance(hp, dict) else 'N/A'
            )

    # Sort by performance; mergesort is stable, so equally-scored runs keep a
    # reproducible order instead of depending on the sort implementation.
    return df.sort_values('f1_macro', ascending=False, kind='mergesort')

def analyze_model_performance(df):
    """Summarize overall performance across all model runs.

    Args:
        df: Run table with ``model_name``, ``model_short_name``, ``run_name``,
            ``f1_macro`` and ``accuracy`` columns. Rows may be in any order.

    Returns:
        An empty dict for an empty table, otherwise a dict with:

        * ``total_models`` - number of rows;
        * ``unique_model_types`` - distinct ``model_short_name`` values;
        * ``best_model`` - ``name``, ``run_name``, ``f1_macro`` and
          ``accuracy`` of the row with the highest ``f1_macro``;
        * ``performance_stats`` - mean/std/min/max of ``f1_macro`` and
          ``accuracy``.
    """
    if df.empty:
        return {}

    # Select the best run by score instead of trusting the caller's row order.
    # The stable sort keeps the first of several tied rows first, so input
    # that is already sorted best-first yields the same row as df.iloc[0].
    best = df.sort_values('f1_macro', ascending=False, kind='mergesort').iloc[0]

    def _describe(column):
        """Return mean/std/min/max of one metric column."""
        values = df[column]
        return {
            'mean': values.mean(),
            'std': values.std(),
            'min': values.min(),
            'max': values.max(),
        }

    return {
        'total_models': len(df),
        'unique_model_types': df['model_short_name'].nunique(),
        'best_model': {
            'name': best['model_name'],
            'run_name': best['run_name'],
            'f1_macro': best['f1_macro'],
            'accuracy': best['accuracy'],
        },
        'performance_stats': {
            'f1_macro': _describe('f1_macro'),
            'accuracy': _describe('accuracy'),
        },
    }

# Derive display columns / ranking, then compute the summary statistics.
df_clean = add_derived_metrics(df_metrics)
analysis = analyze_model_performance(df_clean)

# An empty analysis dict means there were no runs, so nothing is printed.
if analysis:
    print("\n".join([
        "📊 PERFORMANCE SUMMARY",
        "=" * 50,
        f"Total configurations tested: {analysis['total_models']}",
        f"Unique model types: {analysis['unique_model_types']}",
        f"Best model: {analysis['best_model']['name'].split('/')[-1]}",
        f"Best F1-Macro: {analysis['best_model']['f1_macro']:.4f}",
        f"Best Accuracy: {analysis['best_model']['accuracy']:.4f}",
    ]))

## Create Comprehensive Comparison Plot

In [None]:
def create_comprehensive_comparison_plot(df, output_dir=None):
    """Render, save and show a six-panel figure comparing all model runs.

    Panels:

    1. Accuracy vs. F1-macro scatter; the first three rows are annotated.
    2. Horizontal bars of F1-macro for the first ten rows.
    3. Mean F1-macro per ``model_short_name``.
    4. Mean F1-macro per ``learning_rate`` (only if that column exists).
    5. Mean F1-macro per ``max_length`` (only if that column exists).
    6. Histogram of F1-macro with mean and median markers.

    Args:
        df: Run table with ``accuracy``, ``f1_macro``, ``model_short_name``
            and ``display_name`` columns. The "first N rows" are labelled as
            the top models, so the table should be sorted best-first.
        output_dir: Directory under which ``plots/`` is written. Defaults to
            ``CONFIG['output_dir']``.

    Returns:
        Path of the saved PNG, or ``None`` when *df* is empty.
    """
    if df.empty:
        print("❌ No data to plot")
        return None

    if output_dir is None:
        output_dir = CONFIG['output_dir']

    fig = plt.figure(figsize=(20, 12))

    # Plot 1: F1-Macro vs Accuracy Scatter
    ax1 = plt.subplot(2, 3, 1)
    ax1.scatter(df['accuracy'], df['f1_macro'],
                c=df.index, cmap='viridis', alpha=0.7, s=100)
    ax1.set_xlabel('Accuracy')
    ax1.set_ylabel('F1-Macro Score')
    ax1.set_title('F1-Macro vs Accuracy')

    # Add labels for best models
    for _, row in df.head(3).iterrows():
        ax1.annotate(row['model_short_name'],
                     (row['accuracy'], row['f1_macro']),
                     xytext=(5, 5), textcoords='offset points',
                     fontsize=8, alpha=0.8)

    # Plot 2: Top 10 Models Bar Chart
    ax2 = plt.subplot(2, 3, 2)
    top_models = df.head(10)
    bars = ax2.barh(range(len(top_models)), top_models['f1_macro'],
                    color=plt.cm.viridis(np.linspace(0, 1, len(top_models))))
    ax2.set_xlabel('F1-Macro Score')
    ax2.set_title('Top 10 Models by F1-Macro')
    ax2.set_yticks(range(len(top_models)))
    ax2.set_yticklabels([name[:20] + '...' if len(name) > 20 else name
                         for name in top_models['display_name']], fontsize=8)

    # Add value labels just right of each bar's end
    for bar in bars:
        width = bar.get_width()
        ax2.text(width + 0.005, bar.get_y() + bar.get_height() / 2,
                 f'{width:.3f}', ha='left', va='center', fontsize=8)

    # Plot 3: Model Type Performance
    ax3 = plt.subplot(2, 3, 3)
    model_performance = df.groupby('model_short_name')['f1_macro'].agg(['mean', 'max', 'count'])
    model_performance = model_performance.sort_values('mean', ascending=True)

    ax3.barh(range(len(model_performance)), model_performance['mean'])
    ax3.set_xlabel('Average F1-Macro Score')
    ax3.set_title('Average Performance by Model Type')
    ax3.set_yticks(range(len(model_performance)))
    ax3.set_yticklabels(model_performance.index, fontsize=8)

    # Plot 4: Hyperparameter Analysis - Learning Rate
    if 'learning_rate' in df.columns:
        ax4 = plt.subplot(2, 3, 4)
        lr_performance = df.groupby('learning_rate')['f1_macro'].mean().sort_values()
        ax4.bar(range(len(lr_performance)), lr_performance.values)
        ax4.set_xlabel('Learning Rate')
        ax4.set_ylabel('Average F1-Macro')
        ax4.set_title('Performance by Learning Rate')
        ax4.set_xticks(range(len(lr_performance)))
        ax4.set_xticklabels([f'{lr:.0e}' if isinstance(lr, float) else str(lr)
                             for lr in lr_performance.index], rotation=45)

    # Plot 5: Hyperparameter Analysis - Max Length
    if 'max_length' in df.columns:
        ax5 = plt.subplot(2, 3, 5)
        ml_performance = df.groupby('max_length')['f1_macro'].mean()
        # The column can mix numbers with the 'N/A' placeholder, and sorting
        # such an index directly raises TypeError. Order numbers ascending
        # first, then any string labels.
        ordered_lengths = sorted(
            ml_performance.index,
            key=lambda v: (isinstance(v, str), 0 if isinstance(v, str) else v, str(v)),
        )
        ml_performance = ml_performance.loc[ordered_lengths]
        ax5.bar(range(len(ml_performance)), ml_performance.values)
        ax5.set_xlabel('Max Length')
        ax5.set_ylabel('Average F1-Macro')
        ax5.set_title('Performance by Max Length')
        ax5.set_xticks(range(len(ml_performance)))
        ax5.set_xticklabels(ml_performance.index, rotation=45)

    # Plot 6: Performance Distribution
    ax6 = plt.subplot(2, 3, 6)
    ax6.hist(df['f1_macro'], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
    ax6.axvline(df['f1_macro'].mean(), color='red', linestyle='--',
                label=f'Mean: {df["f1_macro"].mean():.3f}')
    ax6.axvline(df['f1_macro'].median(), color='green', linestyle='--',
                label=f'Median: {df["f1_macro"].median():.3f}')
    ax6.set_xlabel('F1-Macro Score')
    ax6.set_ylabel('Frequency')
    ax6.set_title('F1-Macro Score Distribution')
    ax6.legend()

    plt.tight_layout()

    plot_path = f"{output_dir}/plots/comprehensive_model_comparison.png"
    # Do not depend on a setup cell having created the folder already.
    os.makedirs(os.path.dirname(plot_path), exist_ok=True)
    plt.savefig(plot_path, dpi=CONFIG['plot_style']['dpi'], bbox_inches='tight')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    return plot_path

# Build, save and display the six-panel overview figure.
plot_path = create_comprehensive_comparison_plot(df_clean)

# No path is returned when there was nothing to plot; stay quiet then.
if plot_path:
    print("📊 Comprehensive comparison plot saved: {}".format(plot_path))

## Generate Performance Report

In [None]:
def create_detailed_performance_report(df, analysis, label_mapping, output_dir=None):
    """Write a Markdown performance report and return its path.

    Sections: executive summary, performance statistics, top 10 runs,
    per-model-type analysis, hyperparameter analysis (when the
    ``learning_rate`` / ``max_length`` columns exist), label information
    (when *label_mapping* is non-empty) and recommendations.

    Args:
        df: Run table with ``model_name``, ``model_short_name``,
            ``config_id``, ``run_name``, ``f1_macro`` and ``accuracy``
            columns, optionally ``learning_rate``, ``batch_size`` and
            ``max_length``. It is re-sorted best-first internally.
        analysis: Dict with ``total_models``, ``unique_model_types``,
            ``best_model`` and ``performance_stats`` entries, as read below.
        label_mapping: Mapping of label id to label name; may be empty.
        output_dir: Directory under which ``reports/`` is written. Defaults
            to ``CONFIG['output_dir']``.

    Returns:
        Path of the written report, or ``None`` when *df* is empty.
    """
    if df.empty:
        return None

    if output_dir is None:
        output_dir = CONFIG['output_dir']

    # "Top 10" and the primary recommendation read the leading rows, so make
    # the ranking explicit. The stable sort leaves best-first input unchanged.
    df = df.sort_values('f1_macro', ascending=False, kind='mergesort')

    def _mixed_key(value):
        """Sort key placing numbers (ascending) before strings like 'N/A'."""
        is_text = isinstance(value, str)
        return (is_text, 0 if is_text else value, str(value))

    report = []
    report.append("# 📊 COMPREHENSIVE MODEL PERFORMANCE REPORT")
    report.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append("")

    # Executive Summary
    best_summary = analysis['best_model']
    report.append("## 🎯 Executive Summary")
    report.append(f"- **Total Models Evaluated**: {analysis['total_models']}")
    report.append(f"- **Unique Model Types**: {analysis['unique_model_types']}")
    report.append(f"- **Best Model**: {best_summary['name'].split('/')[-1]}")
    report.append(f"- **Best F1-Macro Score**: {best_summary['f1_macro']:.4f}")
    report.append(f"- **Best Accuracy**: {best_summary['accuracy']:.4f}")
    report.append("")

    # Performance Statistics
    report.append("## 📈 Performance Statistics")
    stats = analysis['performance_stats']
    for heading, metric in (("### F1-Macro Scores", 'f1_macro'),
                            ("### Accuracy Scores", 'accuracy')):
        report.append(heading)
        report.append(f"- Mean: {stats[metric]['mean']:.4f} ± {stats[metric]['std']:.4f}")
        report.append(f"- Range: {stats[metric]['min']:.4f} - {stats[metric]['max']:.4f}")
        report.append("")

    # Top 10 Models
    report.append("## 🏆 Top 10 Models")
    for i, (_, row) in enumerate(df.head(10).iterrows(), 1):
        model_name = row['model_name'].split('/')[-1]
        report.append(f"{i:2d}. **{model_name}** (Config {row['config_id']})")
        report.append(f"    - F1-Macro: {row['f1_macro']:.4f}")
        report.append(f"    - Accuracy: {row['accuracy']:.4f}")
        if 'learning_rate' in row:
            report.append(f"    - Learning Rate: {row['learning_rate']}")
            report.append(f"    - Batch Size: {row['batch_size']}")
            report.append(f"    - Max Length: {row['max_length']}")
        report.append("")

    # Model Type Analysis
    report.append("## 🔬 Model Type Analysis")
    model_analysis = df.groupby('model_short_name').agg({
        'f1_macro': ['count', 'mean', 'std', 'max'],
        'accuracy': ['mean', 'max']
    }).round(4)

    for model_type in model_analysis.index:
        report.append(f"### {model_type}")
        row = model_analysis.loc[model_type]
        # Selecting a row upcasts the count to float; print it as an integer.
        report.append(f"- Configurations tested: {int(row[('f1_macro', 'count')])}")
        report.append(f"- Average F1-Macro: {row[('f1_macro', 'mean')]:.4f} ± {row[('f1_macro', 'std')]:.4f}")
        report.append(f"- Best F1-Macro: {row[('f1_macro', 'max')]:.4f}")
        report.append(f"- Best Accuracy: {row[('accuracy', 'max')]:.4f}")
        report.append("")

    # Hyperparameter Analysis
    if 'learning_rate' in df.columns:
        report.append("## ⚙️ Hyperparameter Analysis")

        # Learning Rate
        lr_analysis = df.groupby('learning_rate')['f1_macro'].agg(['count', 'mean', 'std']).round(4)
        report.append("### Learning Rate Impact")
        for lr in lr_analysis.index:
            row = lr_analysis.loc[lr]
            report.append(f"- **{lr}**: {row['mean']:.4f} ± {row['std']:.4f} (n={int(row['count'])})")
        report.append("")

        # Max Length
        if 'max_length' in df.columns:
            ml_analysis = df.groupby('max_length')['f1_macro'].agg(['count', 'mean', 'std']).round(4)
            report.append("### Max Length Impact")
            # Plain sorted() raises TypeError when numbers and 'N/A' are mixed.
            for ml in sorted(ml_analysis.index, key=_mixed_key):
                row = ml_analysis.loc[ml]
                report.append(f"- **{ml}**: {row['mean']:.4f} ± {row['std']:.4f} (n={int(row['count'])})")
            report.append("")

    # Label Information
    if label_mapping:
        report.append("## 🏷️ Label Information")
        report.append("### Class Labels")
        for label_id, label_name in label_mapping.items():
            report.append(f"- {label_id}: {label_name}")
        report.append("")

    # Recommendations (numbered consecutively, whichever items apply)
    report.append("## 💡 Recommendations")
    item = 1

    # Best model recommendation
    best_model = df.iloc[0]
    report.append(f"{item}. **Use {best_model['model_short_name']}** as primary model")
    report.append(f"   - Run name: {best_model['run_name']}")

    # Hyperparameter recommendations
    if 'learning_rate' in df.columns:
        # Learning rate is chosen by its average score across runs.
        best_avg_lr = df.groupby('learning_rate')['f1_macro'].mean().idxmax()
        item += 1
        report.append(f"{item}. **Optimal Learning Rate**: {best_avg_lr} (average performance)")

        if 'max_length' in df.columns:
            # Max length is taken from the single best run, not an average.
            item += 1
            report.append(f"{item}. **Optimal Max Length**: {best_model['max_length']}")

    # Model diversity recommendation
    top_3_models = df.head(3)['model_short_name'].unique()
    if len(top_3_models) > 1:
        item += 1
        report.append(f"{item}. **Consider Ensemble**: Top performing model types are {', '.join(top_3_models)}")

    report.append("")
    report.append("---")
    report.append("*Report generated by Model Comparison Analysis*")

    # Save report
    report_path = f"{output_dir}/reports/performance_report.md"
    os.makedirs(os.path.dirname(report_path), exist_ok=True)
    # The report contains emoji; the platform default encoding may not be
    # able to represent them, so always write UTF-8.
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report))

    return report_path

# Write the Markdown performance report for the processed runs.
report_path = create_detailed_performance_report(df_clean, analysis, label_mapping)

# No path is returned when there were no runs to report on.
if report_path:
    print("📝 Performance report saved: {}".format(report_path))

## Save Complete Results Table

In [None]:
# Export the full processed table (all runs, all columns) as CSV; skipped
# entirely when there are no runs.
if not df_clean.empty:
    results_path = "{}/complete_model_comparison.csv".format(CONFIG['output_dir'])
    df_clean.to_csv(results_path, index=False)
    print("💾 Complete results saved: {}".format(results_path))

## Final Summary

In [None]:
def print_final_summary(df, analysis):
    """Print the closing banner: headline results and artifact locations.

    Prints a single "no results" line instead when *df* is empty or
    *analysis* is falsy. Artifact paths are derived from
    ``CONFIG['output_dir']``.
    """
    if df.empty or not analysis:
        print("❌ No analysis results available")
        return

    rule = '=' * 80
    best = analysis['best_model']
    out_dir = CONFIG['output_dir']

    # Empty entries reproduce the blank lines that precede each section.
    lines = [
        "",
        rule,
        "✅ ANALYSIS COMPLETE!",
        rule,
        "📊 Results Summary:",
        f"   - Total configurations analyzed: {analysis['total_models']}",
        f"   - Best model: {best['name'].split('/')[-1]}",
        f"   - Best F1-Macro: {best['f1_macro']:.4f}",
        f"   - Best Accuracy: {best['accuracy']:.4f}",
        "",
        "📁 Files Generated:",
        f"   - Comprehensive plot: {out_dir}/plots/comprehensive_model_comparison.png",
        f"   - Performance report: {out_dir}/reports/performance_report.md",
        f"   - Complete results: {out_dir}/complete_model_comparison.csv",
        "",
        f"🎯 Analysis saved to: {out_dir}",
    ]
    print("\n".join(lines))

# Print the final summary: headline metrics of the best run plus the paths
# of the plot, report and CSV artifacts under CONFIG['output_dir'].
print_final_summary(df_clean, analysis)