# Model Confidence and Calibration Analysis

## Configuration

In [1]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
from datetime import datetime
from scipy import stats
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss

# Suppress every warning for the rest of the session, then reset matplotlib
# to its default style and select the seaborn "husl" colour palette.
warnings.filterwarnings('ignore')
plt.style.use('default')
sns.set_palette("husl")

# Shared settings for the analysis: where results are read from, where output
# is written, the number of calibration bins, the confidence cut-offs used to
# bucket predictions, and plot options.
CONFIG = dict(
    results_dir="model_training_results",
    output_dir="confidence_analysis",
    confidence_bins=10,
    high_confidence_threshold=0.8,
    low_confidence_threshold=0.6,
    plot_style=dict(
        figsize=(15, 10),
        dpi=300,
        style="whitegrid",
    ),
)

# Make sure the output directory and its plots/ and reports/ subfolders exist.
for _subdir in ("", "/plots", "/reports"):
    os.makedirs(f"{CONFIG['output_dir']}{_subdir}", exist_ok=True)
del _subdir  # keep the loop variable out of the module namespace


## Data Loading

In [2]:
def load_all_predictions(results_dir):
    """Load every ``*_predictions.csv`` under ``<results_dir>/predictions``.

    Each file name is expected to look like ``<run_name>_predictions.csv``,
    where ``<run_name>`` may itself contain ``_config_<id>``. Three columns
    are added to the rows of each file:

    * ``model_name`` - the part of the run name before ``_config_``
    * ``run_name``   - the file name without the ``_predictions.csv`` suffix
    * ``config_id``  - the part after ``_config_``, or ``'0'`` when absent

    Files are processed in sorted name order so the combined frame is
    reproducible across runs and platforms. A file that cannot be read is
    reported on stdout and skipped.

    Args:
        results_dir: Directory that contains the ``predictions`` subfolder.

    Returns:
        A single concatenated DataFrame, or an empty DataFrame when the
        predictions directory is missing or no file could be loaded.
    """
    suffix = '_predictions.csv'
    predictions_dir = f"{results_dir}/predictions"
    # isdir (rather than exists) also rejects a regular file at that path,
    # which would otherwise make os.listdir raise.
    if not os.path.isdir(predictions_dir):
        print(f"Error: Predictions directory not found at {predictions_dir}")
        return pd.DataFrame()

    # os.listdir returns entries in arbitrary order; sort for determinism.
    prediction_files = sorted(
        name for name in os.listdir(predictions_dir) if name.endswith(suffix)
    )

    combined = []
    for file_name in prediction_files:
        try:
            frame = pd.read_csv(os.path.join(predictions_dir, file_name))
        except Exception as e:  # best effort: one bad file must not abort the load
            print(f"Could not load {file_name}: {e}")
            continue

        # Strip only the trailing suffix, not every occurrence of the text.
        run_name = file_name[:-len(suffix)]
        parts = run_name.split('_config_')
        frame['model_name'] = parts[0]
        frame['run_name'] = run_name
        frame['config_id'] = parts[1] if len(parts) > 1 else '0'
        combined.append(frame)

    return pd.concat(combined, ignore_index=True) if combined else pd.DataFrame()


def load_model_metrics(results_dir):
    """Load per-run metric JSON files from ``<results_dir>/metrics``.

    Files ending in ``_error.json`` are ignored. A file is kept only when it
    holds a JSON object that has no ``error`` key and a non-null
    ``accuracy``. The result is keyed by the object's ``run_name`` field,
    falling back to the file name without its ``.json`` suffix.

    Files are processed in sorted name order, so when two files declare the
    same ``run_name`` the later file name wins deterministically. Files that
    cannot be read or parsed are reported on stdout and skipped.

    Args:
        results_dir: Directory that contains the ``metrics`` subfolder.

    Returns:
        dict mapping run name to the parsed metrics object; empty when the
        metrics directory does not exist.
    """
    suffix = '.json'
    metrics_dir = f"{results_dir}/metrics"
    if not os.path.isdir(metrics_dir):
        return {}

    # os.listdir order is arbitrary; sort so duplicate run names resolve
    # the same way on every run.
    metric_files = sorted(
        name for name in os.listdir(metrics_dir)
        if name.endswith(suffix) and not name.endswith('_error.json')
    )

    metrics = {}
    for file_name in metric_files:
        try:
            with open(os.path.join(metrics_dir, file_name), 'r', encoding='utf-8') as handle:
                data = json.load(handle)
        except (OSError, ValueError) as e:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            print(f"Could not parse {file_name}: {e}")
            continue

        if not isinstance(data, dict):
            print(f"Could not parse {file_name}: expected a JSON object")
            continue

        if 'error' not in data and data.get('accuracy') is not None:
            # Strip only the trailing suffix for the fallback name.
            run_name = data.get('run_name', file_name[:-len(suffix)])
            metrics[run_name] = data

    return metrics


## Confidence-Accuracy Relationship


In [3]:
def analyze_confidence_accuracy_relationship(df):
    """Summarise how prediction confidence relates to correctness.

    Args:
        df: DataFrame with a numeric ``confidence`` column and a ``correct``
            column. ``correct`` is coerced to boolean, so a 0/1 integer
            column is handled the same way as a True/False one.

    Returns:
        dict with the keys:

        * ``overall_correlation`` - Pearson correlation between confidence
          and correctness (as 0/1).
        * ``confidence_stats`` - mean/median/std/count of confidence for
          correct and for incorrect predictions.
        * ``statistical_test`` - one-sided Mann-Whitney U test that correct
          predictions have higher confidence; ``statistic`` and ``p_value``
          are NaN and ``significant`` is False when either group is empty.
        * ``confidence_thresholds`` - count, accuracy and share of total for
          the high / medium / low confidence bands defined by
          ``CONFIG['high_confidence_threshold']`` and
          ``CONFIG['low_confidence_threshold']``. ``threshold`` is the high
          cut-off for the high band and the low cut-off for the other two.
    """
    confidence = df['confidence']
    # Explicit boolean mask: `~` on an integer 0/1 column would be a bitwise
    # NOT rather than a logical negation.
    is_correct = df['correct'].astype(bool)

    results = {}
    results['overall_correlation'] = confidence.corr(is_correct.astype(int))

    correct = confidence[is_correct]
    incorrect = confidence[~is_correct]

    results['confidence_stats'] = {
        label: {
            'mean': values.mean(),
            'median': values.median(),
            'std': values.std(),
            'count': len(values),
        }
        for label, values in (
            ('correct_predictions', correct),
            ('incorrect_predictions', incorrect),
        )
    }

    # The test is undefined when one group is empty (all predictions right,
    # or all wrong); depending on the SciPy version it raises or returns NaN.
    # Report NaN explicitly instead of relying on either behaviour.
    if len(correct) > 0 and len(incorrect) > 0:
        stat, p_val = stats.mannwhitneyu(correct, incorrect, alternative='greater')
    else:
        stat, p_val = float('nan'), float('nan')
    results['statistical_test'] = {
        'statistic': stat,
        'p_value': p_val,
        'significant': bool(p_val < 0.05),  # NaN compares False
    }

    high_cut = CONFIG['high_confidence_threshold']
    low_cut = CONFIG['low_confidence_threshold']
    band_masks = {
        "high_confidence": confidence >= high_cut,
        "medium_confidence": (confidence >= low_cut) & (confidence < high_cut),
        "low_confidence": confidence < low_cut,
    }

    total = len(df)
    band_summary = {}
    for name, mask in band_masks.items():
        count = int(mask.sum())
        band_summary[name] = {
            'count': count,
            'accuracy': is_correct[mask].mean() if count > 0 else 0,
            # Guard the share calculation so an empty frame does not divide by zero.
            'percentage_of_total': count / total * 100 if total else 0.0,
            'threshold': high_cut if name == 'high_confidence' else low_cut,
        }
    results['confidence_thresholds'] = band_summary

    return results


## Model Calibration Metrics


In [4]:
def _expected_calibration_error(y_true, y_prob, bin_edges):
    """Return the expected calibration error over equal-width confidence bins.

    Each bin contributes ``|mean confidence - accuracy|`` weighted by the
    fraction of samples that fall into it. Bins are ``(low, high]`` except
    the first, which also includes its lower edge so that a confidence of
    exactly ``bin_edges[0]`` is not dropped from the sum.
    """
    ece = 0.0
    for position, (low, high) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
        above_low = (y_prob >= low) if position == 0 else (y_prob > low)
        mask = above_low & (y_prob <= high)
        if mask.any():
            acc_bin = y_true[mask].mean()
            conf_bin = y_prob[mask].mean()
            # mask.mean() is the share of this run's samples in the bin.
            ece += np.abs(conf_bin - acc_bin) * mask.mean()
    return ece


def calculate_model_calibration(df):
    """Compute calibration metrics for every run in ``df``.

    Args:
        df: DataFrame with ``run_name``, ``confidence`` and ``correct``
            columns.

    Returns:
        dict mapping each run name to a dict with ``prob_true`` and
        ``prob_pred`` (from ``calibration_curve``), ``ece``,
        ``brier_score``, ``accuracy``, ``avg_confidence`` and
        ``total_predictions``. Runs whose calibration raises are reported on
        stdout and left out of the result.
    """
    n_bins = CONFIG['confidence_bins']
    # Bin edges over [0, 1] are the same for every run; build them once.
    bin_edges = np.linspace(0, 1, n_bins + 1)

    output = {}
    # sort=False keeps runs in order of first appearance.
    for run_name, data in df.groupby('run_name', sort=False):
        y_true = data['correct'].astype(int)
        y_prob = data['confidence']

        try:
            prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins)
            ece = _expected_calibration_error(y_true, y_prob, bin_edges)
            brier = brier_score_loss(y_true, y_prob)

            output[run_name] = {
                'prob_true': prob_true,
                'prob_pred': prob_pred,
                'ece': ece,
                'brier_score': brier,
                'accuracy': y_true.mean(),
                'avg_confidence': y_prob.mean(),
                'total_predictions': len(data)
            }
        except Exception as e:  # best effort: skip runs that cannot be calibrated
            print(f"Could not calibrate {run_name}: {e}")
    return output


## Problematic Prediction Examples


In [5]:
def find_problematic_predictions(df, n=10):
    """Pick out the most extreme confidence/correctness mismatches.

    Args:
        df: DataFrame with ``confidence`` and boolean ``correct`` columns.
        n: Maximum number of rows to return in each category.

    Returns:
        dict with two DataFrames:

        * ``overconfident_errors`` - wrong predictions whose confidence is at
          or above ``CONFIG['high_confidence_threshold']``, highest first.
        * ``underconfident_successes`` - correct predictions whose confidence
          is at or below ``CONFIG['low_confidence_threshold']``, lowest first.
    """
    is_confident = df['confidence'] >= CONFIG['high_confidence_threshold']
    is_wrong = ~df['correct']
    overconfident = df[is_confident & is_wrong].nlargest(n, 'confidence')

    is_hesitant = df['confidence'] <= CONFIG['low_confidence_threshold']
    is_right = df['correct']
    underconfident = df[is_hesitant & is_right].nsmallest(n, 'confidence')

    problematic = {}
    problematic['overconfident_errors'] = overconfident
    problematic['underconfident_successes'] = underconfident
    return problematic

## Confidence Analysis Visualizations

In [6]:
def create_confidence_distribution_plot(df):
    """Render a 2x2 overview of confidence versus correctness and save it.

    Panels: confidence histograms split by correctness, accuracy per
    confidence bin, a reliability diagram, and accuracy / sample count as a
    function of a minimum-confidence threshold.

    The input frame is not modified; the bin assignment is kept in a local
    Series rather than written back as a column.

    Args:
        df: DataFrame with ``confidence`` and boolean ``correct`` columns.

    Returns:
        Path of the saved PNG under ``CONFIG['output_dir']/plots``.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    confidence = df['confidence']
    correct_conf = df[df['correct']]['confidence']
    incorrect_conf = df[~df['correct']]['confidence']

    # Histogram
    axes[0, 0].hist(correct_conf, bins=30, alpha=0.7, label='Correct', color='green', density=True)
    axes[0, 0].hist(incorrect_conf, bins=30, alpha=0.7, label='Incorrect', color='red', density=True)
    axes[0, 0].set_title("Confidence Distribution by Correctness")
    axes[0, 0].set_xlabel("Confidence")
    axes[0, 0].set_ylabel("Density")
    axes[0, 0].legend()

    # Accuracy per bin. Group by a local Series so the caller's DataFrame does
    # not gain a 'confidence_bin' column; observed=False keeps empty bins.
    confidence_bin = pd.cut(confidence, bins=CONFIG['confidence_bins'])
    bin_data = df['correct'].groupby(confidence_bin, observed=False).agg(['mean', 'count'])
    bin_centers = [interval.mid for interval in bin_data.index]

    axes[0, 1].bar(range(len(bin_centers)), bin_data['mean'], color='skyblue', edgecolor='black')
    axes[0, 1].set_title("Accuracy by Confidence Bin")
    axes[0, 1].set_xlabel("Confidence Bin")
    axes[0, 1].set_ylabel("Accuracy")
    axes[0, 1].set_xticks(range(len(bin_centers)))
    axes[0, 1].set_xticklabels([f"{c:.2f}" for c in bin_centers], rotation=45)

    # Calibration curve (bin midpoint on x, observed accuracy on y)
    axes[1, 0].plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')
    axes[1, 0].scatter(bin_centers, bin_data['mean'], color='red')
    axes[1, 0].set_title("Reliability Diagram")
    axes[1, 0].set_xlabel("Predicted Confidence")
    axes[1, 0].set_ylabel("Observed Accuracy")
    axes[1, 0].legend()
    axes[1, 0].set_xlim(0, 1)
    axes[1, 0].set_ylim(0, 1)

    # Threshold curve: filter once per threshold and reuse the subset for
    # both the accuracy and the count.
    thresholds = np.arange(0.5, 1.0, 0.05)
    accs, counts = [], []
    for t in thresholds:
        kept = df.loc[confidence >= t, 'correct']
        counts.append(len(kept))
        accs.append(kept.mean() if len(kept) > 0 else 0)

    ax = axes[1, 1]
    ax2 = ax.twinx()
    acc_line, = ax.plot(thresholds, accs, 'b-o', label='Accuracy')
    count_line, = ax2.plot(thresholds, counts, 'r-s', label='Count')

    ax.set_xlabel("Confidence Threshold")
    ax.set_ylabel("Accuracy", color='b')
    ax2.set_ylabel("Count", color='r')
    ax.set_title("Accuracy and Count by Threshold")
    # The two lines live on different axes; one combined legend (on the
    # top-most axis) makes the labels passed to plot() visible.
    ax2.legend(handles=[acc_line, count_line], loc='best')

    fig.tight_layout()
    output_path = f"{CONFIG['output_dir']}/plots/confidence_distribution_analysis.png"
    fig.savefig(output_path, dpi=CONFIG['plot_style']['dpi'], bbox_inches='tight')
    plt.close(fig)
    return output_path


## Model Calibration Visualizations

In [7]:
def create_model_calibration_comparison(calibration_results, model_metrics=None):
    """Plot per-run reliability curves and a calibration metric summary.

    Two figures are written under ``CONFIG['output_dir']/plots``:

    * ``model_calibration_comparison.png`` - one reliability diagram per
      run, ordered by ascending ECE.
    * ``calibration_metrics_summary.png`` - horizontal bar charts of ECE and
      Brier score, one bar per run.

    Args:
        calibration_results: dict mapping run name to a dict with at least
            ``prob_pred``, ``prob_true``, ``ece`` and ``brier_score``.
        model_metrics: Accepted for call compatibility; not used here.

    Returns:
        ``(comparison_path, summary_path)``, or ``None`` when
        ``calibration_results`` is empty.
    """
    if not calibration_results:
        return None

    models = list(calibration_results.keys())
    sorted_models = sorted(calibration_results.items(), key=lambda x: x[1]['ece'])
    n = len(models)
    cols = min(3, n)
    rows = (n + cols - 1) // cols
    # squeeze=False always yields a 2-D array, so one flatten() covers the
    # single-plot, single-row and grid cases alike.
    fig, axes = plt.subplots(rows, cols, figsize=(5 * cols, 4 * rows), squeeze=False)
    axes = axes.flatten()

    for ax, (model, data) in zip(axes, sorted_models):
        ax.plot([0, 1], [0, 1], 'k--', label='Perfect')
        ax.plot(data['prob_pred'], data['prob_true'], 'o-', label=model.split('_config_')[0])
        ax.set_title(f"{model}\nECE: {data['ece']:.3f}, Brier: {data['brier_score']:.3f}")
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.legend()

    # Hide the unused cells of the grid.
    for spare in axes[n:]:
        spare.set_visible(False)

    fig.tight_layout()
    path = f"{CONFIG['output_dir']}/plots/model_calibration_comparison.png"
    fig.savefig(path, dpi=CONFIG['plot_style']['dpi'], bbox_inches='tight')
    plt.close(fig)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    names = [m.split('_config_')[0] for m in models]
    eces = [calibration_results[m]['ece'] for m in models]
    briers = [calibration_results[m]['brier_score'] for m in models]

    # Plot against numeric positions and label the ticks afterwards. Passing
    # the shortened names directly as y values would place runs that share a
    # model name (different configs) on the same categorical row.
    positions = np.arange(len(models))
    for axis, values, colour, title in (
        (ax1, eces, 'lightblue', "Expected Calibration Error (Lower is Better)"),
        (ax2, briers, 'salmon', "Brier Score (Lower is Better)"),
    ):
        axis.barh(positions, values, color=colour)
        axis.set_yticks(positions)
        axis.set_yticklabels(names)
        axis.set_title(title)

    fig.tight_layout()
    summary_path = f"{CONFIG['output_dir']}/plots/calibration_metrics_summary.png"
    fig.savefig(summary_path, dpi=CONFIG['plot_style']['dpi'], bbox_inches='tight')
    plt.close(fig)

    return path, summary_path


## Per-Model Confidence Analysis

In [8]:
def create_per_model_confidence_analysis(df):
    """Draw a 2x2 per-run confidence overview and save it as a PNG.

    Panels: mean confidence per run (with standard deviation as error bars),
    mean confidence against accuracy, accuracy among predictions at or above
    ``CONFIG['high_confidence_threshold']``, and confidence histograms for
    the five runs with the highest accuracy.

    Args:
        df: DataFrame with ``run_name``, ``confidence`` and ``correct``
            columns.

    Returns:
        Path of the saved figure under ``CONFIG['output_dir']/plots``.
    """
    run_order = df['run_name'].unique()
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    (ax_avg, ax_scatter), (ax_high, ax_dist) = axes

    # Panel 1: average confidence per run
    per_run = df.groupby('run_name').agg({'confidence': ['mean', 'std'], 'correct': 'mean'})
    mean_conf = per_run[('confidence', 'mean')]
    std_conf = per_run[('confidence', 'std')]
    mean_acc = per_run[('correct', 'mean')]
    short_names = [run.split('_config_')[0] for run in per_run.index]
    slots = np.arange(len(short_names))

    ax_avg.barh(slots, mean_conf, xerr=std_conf, color='skyblue')
    ax_avg.set_title("Average Confidence")
    ax_avg.set_yticks(slots)
    ax_avg.set_yticklabels(short_names)

    # Panel 2: average confidence against accuracy, one labelled point per run
    ax_scatter.scatter(mean_conf, mean_acc)
    ax_scatter.set_title("Confidence vs Accuracy")
    ax_scatter.set_xlabel("Avg Confidence")
    ax_scatter.set_ylabel("Accuracy")
    for short, x, y in zip(short_names, mean_conf, mean_acc):
        ax_scatter.annotate(short, (x, y), fontsize=8)

    # Panel 3: accuracy restricted to high-confidence predictions
    # (runs without any such prediction are drawn as 0)
    is_high = df['confidence'] >= CONFIG['high_confidence_threshold']
    high_acc = []
    for run in run_order:
        hits = df.loc[(df['run_name'] == run) & is_high, 'correct']
        high_acc.append(hits.mean() if len(hits) > 0 else 0)

    run_slots = np.arange(len(run_order))
    ax_high.barh(run_slots, high_acc, color='lightgreen')
    ax_high.set_yticks(run_slots)
    ax_high.set_yticklabels([run.split('_config_')[0] for run in run_order])
    ax_high.set_title("High Confidence Accuracy")

    # Panel 4: confidence distribution of the five most accurate runs
    accuracy_by_run = df.groupby('run_name')['correct'].mean()
    for run in accuracy_by_run.nlargest(5).index:
        ax_dist.hist(
            df[df['run_name'] == run]['confidence'],
            bins=20,
            alpha=0.5,
            label=run.split('_config_')[0],
            density=True,
        )
    ax_dist.set_title("Confidence Distribution (Top 5)")
    ax_dist.legend(fontsize=8)

    plt.tight_layout()
    path = f"{CONFIG['output_dir']}/plots/per_model_confidence_analysis.png"
    plt.savefig(path, dpi=CONFIG['plot_style']['dpi'], bbox_inches='tight')
    plt.close()
    return path


## Report Generation

In [None]:
def create_confidence_analysis_report(df, confidence_analysis, calibration_results, problematic_examples):
    """Write a Markdown summary of the confidence and calibration analysis.

    Args:
        df: DataFrame with ``run_name``, ``confidence`` and ``correct``
            columns.
        confidence_analysis: dict with ``overall_correlation`` and
            ``confidence_thresholds`` (each entry providing ``accuracy``,
            ``count`` and ``percentage_of_total``).
        calibration_results: dict mapping run name to a dict with ``ece``,
            ``brier_score``, ``accuracy`` and ``avg_confidence``; may be
            empty.
        problematic_examples: dict that may contain
            ``overconfident_errors`` and ``underconfident_successes``
            collections; the number of rows in each is reported. May be
            empty or ``None``.

    Returns:
        Path of the written report under ``CONFIG['output_dir']/reports``.
    """
    report = []
    report.append("# MODEL CONFIDENCE AND CALIBRATION ANALYSIS REPORT")

    # Executive Summary
    report.append("## Executive Summary")
    report.append(f"- Total Predictions: {len(df):,}")
    report.append(f"- Unique Models: {df['run_name'].nunique()}")
    report.append(f"- Overall Accuracy: {df['correct'].mean():.4f}")
    report.append(f"- Average Confidence: {df['confidence'].mean():.4f}")
    report.append(f"- Confidence-Accuracy Correlation: {confidence_analysis['overall_correlation']:.4f}\n")

    # Threshold Analysis
    report.append("## Confidence Threshold Analysis")
    for key, section in confidence_analysis['confidence_thresholds'].items():
        name = key.replace('_', ' ').title()
        report.append(f"### {name}")
        report.append(f"- Accuracy: {section['accuracy']:.4f}")
        report.append(f"- Predictions: {section['count']:,} ({section['percentage_of_total']:.1f}%)\n")

    # Calibration Rankings
    if calibration_results:
        report.append("## Model Calibration Summary (Sorted by ECE)")
        sorted_models = sorted(calibration_results.items(), key=lambda x: x[1]['ece'])
        for i, (model, res) in enumerate(sorted_models, 1):
            short = model.split('_config_')[0]
            report.append(f"{i}. **{short}**")
            report.append(f"   - ECE: {res['ece']:.4f}")
            report.append(f"   - Brier: {res['brier_score']:.4f}")
            report.append(f"   - Accuracy: {res['accuracy']:.4f}")
            report.append(f"   - Avg Confidence: {res['avg_confidence']:.4f}\n")

    # Problematic Predictions: this argument was previously accepted but
    # never reflected in the report; list how many examples were flagged.
    if isinstance(problematic_examples, dict) and problematic_examples:
        flagged_lines = []
        for key, label in (
            ('overconfident_errors', "Overconfident Errors Flagged"),
            ('underconfident_successes', "Underconfident Successes Flagged"),
        ):
            examples = problematic_examples.get(key)
            if examples is not None:
                flagged_lines.append(f"- {label}: {len(examples):,}")
        if flagged_lines:
            report.append("## Problematic Predictions")
            report.extend(flagged_lines)

    # Save report; explicit UTF-8 so the output does not depend on the
    # platform's default encoding (run names may contain non-ASCII text).
    report_path = f"{CONFIG['output_dir']}/reports/confidence_analysis_report.md"
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("\n".join(report))

    return report_path


## Main Analysis Pipeline

In [10]:
def main():
    """Run the whole analysis: load data, compute metrics, plot, report, export.

    Reads predictions and metrics from ``CONFIG["results_dir"]`` and writes
    plots, the Markdown report and two CSV files under
    ``CONFIG['output_dir']``.

    Returns:
        ``(predictions, confidence_stats, calibration)``, or
        ``(None, None, None)`` when no prediction data could be loaded.
    """
    print("=== CONFIDENCE AND CALIBRATION ANALYSIS ===")

    results_dir = CONFIG["results_dir"]
    predictions = load_all_predictions(results_dir)
    if predictions.empty:
        print("No prediction data found.")
        return None, None, None

    run_metrics = load_model_metrics(results_dir)
    relationship = analyze_confidence_accuracy_relationship(predictions)
    calibration_by_run = calculate_model_calibration(predictions)
    flagged = find_problematic_predictions(predictions)
    has_calibration = bool(calibration_by_run)

    # Figures
    create_confidence_distribution_plot(predictions)
    create_per_model_confidence_analysis(predictions)
    if has_calibration:
        create_model_calibration_comparison(calibration_by_run, run_metrics)

    # Markdown report
    create_confidence_analysis_report(predictions, relationship, calibration_by_run, flagged)

    # CSV exports: the prediction frame, then one summary row per run
    predictions.to_csv(f"{CONFIG['output_dir']}/confidence_analysis_results.csv", index=False)
    if has_calibration:
        summary_fields = ("ece", "brier_score", "accuracy", "avg_confidence", "total_predictions")
        summary_rows = []
        for run_name, res in calibration_by_run.items():
            row = {"model": run_name}
            for field in summary_fields:
                row[field] = res[field]
            summary_rows.append(row)
        summary = pd.DataFrame(summary_rows)
        summary.to_csv(f"{CONFIG['output_dir']}/model_calibration_summary.csv", index=False)

    return predictions, relationship, calibration_by_run


## Run Analysis

In [11]:
# Run the pipeline only when this module is executed directly. The three
# values returned by main() are bound at module level; all three are None
# when no prediction data was found.
if __name__ == "__main__":
    df_results, analysis, calibration = main()


=== CONFIDENCE AND CALIBRATION ANALYSIS ===
