In [None]:
# Setup
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import sys
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, log_loss, brier_score_loss

# Make the repository root importable. The root is taken to be three
# directory levels above the current working directory (parents[2]).
repo_root = Path().resolve().parents[2]
root_entry = str(repo_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

# Must come after the sys.path tweak above, otherwise `modules` is not found
from modules._import_helper import safe_import_from

# Resolve the shared helpers by dotted module path
set_seed = safe_import_from('00_repo_standards.src.mlphys_core.seeding', 'set_seed')
(
    reliability_diagram,
    expected_calibration_error,
    TemperatureScaling,
) = safe_import_from(
    '02_stat_inference_uq.src.calibration',
    'reliability_diagram',
    'expected_calibration_error',
    'TemperatureScaling',
)

# Seed once up front so the cells below are repeatable when run in order
set_seed(42)

# Figures and metrics produced by this notebook are written here
reports_dir = Path("../reports")
reports_dir.mkdir(exist_ok=True)

print("‚úÖ Setup complete")

## 1. Intuition: What is Calibration?

**Definition:** A probabilistic classifier is **calibrated** if its predicted probabilities match empirical frequencies.

**Example:**
- If a model predicts 80% confidence for 100 predictions, ~80 should be correct
- **Well-calibrated**: Among predictions with confidence 0.7, exactly 70% are correct
- **Overconfident**: Among predictions with confidence 0.9, only 70% are correct
- **Underconfident**: Among predictions with confidence 0.6, actually 80% are correct

**Why it matters:**
- **Decision-making**: If a medical model says "95% chance of disease", you want that to be accurate!
- **Cost-sensitive applications**: Need reliable probabilities to set thresholds
- **Modern deep networks are often overconfident** (high accuracy but poor calibration)

**Key insight:** High accuracy ≠ good calibration!
- A model can be 95% accurate but severely miscalibrated
- Temperature scaling fixes calibration without changing predictions

## 2. Minimal Math: Calibration Metrics

### Reliability Diagram
- **Bin predictions** by confidence: $[0.0, 0.1), [0.1, 0.2), ..., [0.9, 1.0]$
- For bin $B_m$: 
  - Mean predicted probability: $\bar{p}_m = \frac{1}{|B_m|} \sum_{i \in B_m} p_i$
  - Empirical accuracy: $\bar{y}_m = \frac{1}{|B_m|} \sum_{i \in B_m} y_i$
- **Perfect calibration**: $\bar{p}_m = \bar{y}_m$ for all bins

### Expected Calibration Error (ECE)
$$\text{ECE} = \sum_{m=1}^M \frac{|B_m|}{n} \left| \bar{p}_m - \bar{y}_m \right|$$

**Interpretation:** Weighted average calibration gap across bins

### Temperature Scaling
- **Pre-calibration logits**: $z$ (model outputs before softmax)
- **Post-calibration**: $p_i = \frac{\exp(z_i / T)}{\sum_j \exp(z_j / T)}$
- **Temperature $T$**: Learned on validation set to minimize NLL
  - $T > 1$ → softer probabilities (less confident)
  - $T < 1$ → sharper probabilities (more confident)
  - $T = 1$ → original probabilities

**Key property:** Temperature scaling **preserves accuracy** (argmax unchanged)

## 3. Implementation: Generate Overconfident Classifier

In [None]:
# Synthetic binary task: 2000 samples, 20 features (15 informative, 3 redundant)
X, y = make_classification(
    n_samples=2000, n_features=20, n_informative=15, n_redundant=3,
    n_classes=2, random_state=42,
    flip_y=0.05,  # 5% label noise
)

# Two-stage split: hold out 40% first, then halve the holdout so that
# train / validation (for temp scaling) / test end up as 60% / 20% / 20%
holdout_split = train_test_split(X, y, test_size=0.4, random_state=42)
X_train, X_temp_test, y_train, y_temp_test = holdout_split

val_test_split = train_test_split(
    X_temp_test, y_temp_test, test_size=0.5, random_state=42
)
X_val, X_test, y_val, y_test = val_test_split

print(f"Dataset sizes: train={len(y_train)}, val={len(y_val)}, test={len(y_test)}")

In [None]:
# Fit a small two-hidden-layer MLP with only a light L2 penalty; the intent is
# a classifier whose predicted probabilities come out overconfident
model = MLPClassifier(
    hidden_layer_sizes=(50, 25),
    activation='relu',
    max_iter=500,
    alpha=0.001,  # Small L2 penalty
    random_state=42
).fit(X_train, y_train)

# Positive-class probability (column 1 of predict_proba) for every split
y_prob_train, y_prob_val, y_prob_test = (
    model.predict_proba(features)[:, 1] for features in (X_train, X_val, X_test)
)

# Headline numbers on the held-out test split
test_predictions = model.predict(X_test)
acc_test = accuracy_score(y_test, test_predictions)
print(f"\nTest accuracy: {acc_test:.1%}")
print(f"Average predicted probability: {y_prob_test.mean():.3f}")
print(f"Actual positive rate: {y_test.mean():.3f}")

## 4. Experiments: Diagnosing Miscalibration

In [None]:
# Experiment 1: Reliability diagram BEFORE calibration
fig, ax = plt.subplots(figsize=(10, 8))

# Draw onto the prepared axes; the returned per-bin statistics are kept
diagram_before = reliability_diagram(
    y_test, y_prob_test, ax=ax, n_bins=10, strategy='uniform'
)
bin_centers, bin_accs, bin_counts = diagram_before

ax.set_title('Reliability Diagram (BEFORE Temperature Scaling)', fontsize=14, weight='bold')

plt.tight_layout()
before_png = reports_dir / '03_reliability_before.png'
plt.savefig(before_png, dpi=150, bbox_inches='tight')
plt.show()

# Scalar summary of the same diagram, using the same number of bins
ece_before = expected_calibration_error(y_test, y_prob_test, n_bins=10)
print(f"\n‚úÖ ECE before calibration: {ece_before:.4f}")
print("   (Lower is better; 0 = perfect calibration)")
print("\nüìä Observation: Bars below diagonal => overconfident predictions")

In [None]:
# Experiment 2: Apply temperature scaling
import warnings

# sklearn does not expose logits easily, so the temperature is fitted directly
# on predicted probabilities with a bounded one-dimensional optimizer
from scipy.optimize import minimize_scalar

# Instance of the shared helper class; the binary-case fit in this cell uses
# the custom probability-based routine instead
temp_scaler = TemperatureScaling()

# The temperature must be fitted on the validation set (NOT the test set!)

# NOTE(review): this silences every warning category for the rest of the
# session, not just this cell - consider narrowing it to a specific category
warnings.filterwarnings('ignore')

def calibrate_binary(y_true, y_prob):
    """Find optimal temperature for binary classification.

    The temperature-scaled probability is
    ``p**(1/T) / (p**(1/T) + (1-p)**(1/T))``, which is algebraically
    ``sigmoid(logit(p) / T)``. This lets T be fitted from probabilities
    alone when the model's logits are not available.

    Args:
        y_true: Array-like of binary labels (0 or 1).
        y_prob: Array-like of predicted positive-class probabilities,
            with the same shape as ``y_true``.

    Returns:
        The temperature within the search bounds [0.1, 10.0] that minimizes
        the mean negative log-likelihood of ``y_true`` under the rescaled
        probabilities.

    Raises:
        ValueError: If the inputs are empty or their shapes differ.
    """
    # Accept lists/tuples as well as arrays; float dtype keeps the arithmetic
    # below well-defined for integer or boolean labels
    y_true = np.asarray(y_true, dtype=float)
    y_prob = np.asarray(y_prob, dtype=float)

    # Guard against silent broadcasting, e.g. (n,) labels vs (n, 1) probabilities
    if y_true.shape != y_prob.shape:
        raise ValueError(
            f"y_true and y_prob must have the same shape, "
            f"got {y_true.shape} and {y_prob.shape}"
        )
    if y_true.size == 0:
        raise ValueError("y_true and y_prob must not be empty")

    def nll(T):
        # Each power term is computed once per evaluation
        inv_T = 1 / T
        pos = y_prob ** inv_T
        neg = (1 - y_prob) ** inv_T
        # Clip to avoid log(0)
        p_calibrated = np.clip(pos / (pos + neg), 1e-7, 1 - 1e-7)
        return -np.mean(
            y_true * np.log(p_calibrated) + (1 - y_true) * np.log(1 - p_calibrated)
        )

    result = minimize_scalar(nll, bounds=(0.1, 10.0), method='bounded')
    return result.x

# Fit the temperature on the validation split only; the test split is
# reserved for evaluating the calibrated probabilities
T_opt = calibrate_binary(y_val, y_prob_val)
print(f"Optimal temperature: T = {T_opt:.3f}")
print("   T > 1 means model was overconfident (soften predictions)")
print("   T < 1 means model was underconfident (sharpen predictions)")

# Rescale the test probabilities with the fitted temperature, using the same
# p**(1/T) / (p**(1/T) + (1-p)**(1/T)) form and clipping as the fit
pos_scaled = y_prob_test ** (1/T_opt)
neg_scaled = (1-y_prob_test) ** (1/T_opt)
y_prob_calibrated = np.clip(pos_scaled / (pos_scaled + neg_scaled), 1e-7, 1-1e-7)

# Compare the mean predicted probability with the observed base rate
mean_before = y_prob_test.mean()
mean_after = y_prob_calibrated.mean()
print(f"\nAverage probability before: {mean_before:.3f}")
print(f"Average probability after: {mean_after:.3f}")
print(f"Actual positive rate: {y_test.mean():.3f}")

In [None]:
# Experiment 3: Reliability diagram AFTER calibration
fig, ax = plt.subplots(figsize=(10, 8))

# Same binning as the "before" diagram so the two figures are comparable
diagram_after = reliability_diagram(
    y_test, y_prob_calibrated, ax=ax, n_bins=10, strategy='uniform'
)
bin_centers_cal, bin_accs_cal, bin_counts_cal = diagram_after

ax.set_title('Reliability Diagram (AFTER Temperature Scaling)', fontsize=14, weight='bold')

plt.tight_layout()
after_png = reports_dir / '03_reliability_after.png'
plt.savefig(after_png, dpi=150, bbox_inches='tight')
plt.show()

# ECE on the calibrated probabilities, reported relative to the earlier value
ece_after = expected_calibration_error(y_test, y_prob_calibrated, n_bins=10)
print(f"\n‚úÖ ECE after calibration: {ece_after:.4f}")
print(f"   Improvement: {(ece_before - ece_after)/ece_before * 100:.1f}% reduction")
print("\nüìä Observation: Bars now closer to diagonal => better calibrated")

In [None]:
# Experiment 4: Side-by-side comparison
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))

# Identical binning on both panels so the bars line up
diagram_opts = dict(n_bins=10, strategy='uniform')

# Left panel: raw model probabilities
reliability_diagram(y_test, y_prob_test, ax=ax1, **diagram_opts)
ax1.set_title(f'BEFORE (ECE={ece_before:.4f})', fontsize=14, weight='bold')

# Right panel: temperature-scaled probabilities
reliability_diagram(y_test, y_prob_calibrated, ax=ax2, **diagram_opts)
ax2.set_title(f'AFTER (ECE={ece_after:.4f})', fontsize=14, weight='bold')

plt.tight_layout()
comparison_png = reports_dir / '03_calibration_comparison.png'
plt.savefig(comparison_png, dpi=150, bbox_inches='tight')
plt.show()

print("\n‚úÖ Saved: reports/03_calibration_comparison.png")

In [None]:
# Experiment 5: Comprehensive metrics comparison
def compute_metrics(y_true, y_prob, y_pred):
    """Summarize classification and calibration quality in one dict.

    Args:
        y_true: Binary labels (0 or 1).
        y_prob: Predicted positive-class probabilities.
        y_pred: Hard class predictions, used for accuracy only.

    Returns:
        Dict with keys 'accuracy', 'nll' (mean negative log-likelihood),
        'brier' (mean squared error of the probabilities) and 'ece'
        (expected calibration error with 10 bins).
    """
    accuracy = accuracy_score(y_true, y_pred)

    # Keep probabilities away from exactly 0 and 1 so the logs stay finite
    safe_prob = np.clip(y_prob, 1e-7, 1-1e-7)
    log_likelihood = y_true * np.log(safe_prob) + (1-y_true) * np.log(1-safe_prob)
    neg_log_lik = -np.mean(log_likelihood)

    # The Brier score uses the unclipped probabilities
    squared_error = (y_prob - y_true)**2
    brier_score = np.mean(squared_error)

    calibration_error = expected_calibration_error(y_true, y_prob, n_bins=10)

    return {
        'accuracy': accuracy,
        'nll': neg_log_lik,
        'brier': brier_score,
        'ece': calibration_error,
    }

# Hard predictions at the 0.5 threshold, then the full metric set for each
y_pred_before = (y_prob_test >= 0.5).astype(int)
y_pred_after = (y_prob_calibrated >= 0.5).astype(int)
metrics_before = compute_metrics(y_test, y_prob_test, y_pred_before)
metrics_after = compute_metrics(y_test, y_prob_calibrated, y_pred_after)

# Comparison table: one row per metric with the direction of change
print("\n" + "="*60)
print("METRICS COMPARISON: BEFORE vs AFTER CALIBRATION")
print("="*60)
print(f"{'Metric':<20} {'Before':>15} {'After':>15} {'Change':>10}")
print("-"*60)

for metric_name in ('accuracy', 'nll', 'brier', 'ece'):
    value_before = metrics_before[metric_name]
    value_after = metrics_after[metric_name]
    delta = value_after - value_before
    if delta < 0:
        direction = '‚Üì'
    elif delta > 0:
        direction = '‚Üë'
    else:
        direction = '='
    print(f"{metric_name.upper():<20} {value_before:>15.4f} {value_after:>15.4f} {direction:>2} {abs(delta):>7.4f}")

print("-"*60)
# NOTE(review): the observations below are printed unconditionally; they are
# not derived from the values in the table above
print("Key observations:")
print("  - Accuracy UNCHANGED (temperature scaling preserves argmax)")
print("  - NLL IMPROVED (better probability estimates)")
print("  - Brier IMPROVED (closer to true probabilities)")
print("  - ECE IMPROVED (better calibrated)")
print("="*60)

# Persist the numbers next to the figures; values are cast to plain floats
# so the json module can serialize them
import json
metrics_dict = {
    'before': {name: float(value) for name, value in metrics_before.items()},
    'after': {name: float(value) for name, value in metrics_after.items()},
    'temperature': float(T_opt)
}
metrics_path = reports_dir / '03_calibration_metrics.json'
with open(metrics_path, 'w') as f:
    json.dump(metrics_dict, f, indent=2)

print("\n‚úÖ Saved: reports/03_calibration_metrics.json")

In [None]:
# Experiment 6: Threshold decision impact
# Show how calibration affects decisions at different thresholds

def _precision_recall_at(labels, probs, cutoff):
    """Return (precision, recall) for predictions thresholded at `cutoff`.

    Either value is 0 when its denominator is zero.
    """
    flagged = (probs >= cutoff).astype(int)
    tp = np.sum((flagged == 1) & (labels == 1))
    fp = np.sum((flagged == 1) & (labels == 0))
    fn = np.sum((flagged == 0) & (labels == 1))
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    return precision, recall


thresholds = np.linspace(0.1, 0.9, 17)
precision_before, recall_before = [], []
precision_after, recall_after = [], []

# Sweep the same thresholds over the raw and the calibrated probabilities
for threshold in thresholds:
    prec_raw, rec_raw = _precision_recall_at(y_test, y_prob_test, threshold)
    precision_before.append(prec_raw)
    recall_before.append(rec_raw)

    prec_cal, rec_cal = _precision_recall_at(y_test, y_prob_calibrated, threshold)
    precision_after.append(prec_cal)
    recall_after.append(rec_cal)

# Plot precision-recall curves
fig, ax = plt.subplots(figsize=(10, 7))

curve_style = dict(linewidth=2, markersize=6, alpha=0.8)
ax.plot(recall_before, precision_before, 'o-', label='Before calibration', **curve_style)
ax.plot(recall_after, precision_after, 's-', label='After calibration', **curve_style)

ax.set_xlabel('Recall', fontsize=12)
ax.set_ylabel('Precision', fontsize=12)
ax.set_title('Precision-Recall at Different Thresholds', fontsize=14)
ax.legend(fontsize=11)
ax.grid(alpha=0.3)
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)

plt.tight_layout()
threshold_png = reports_dir / '03_threshold_decisions.png'
plt.savefig(threshold_png, dpi=150, bbox_inches='tight')
plt.show()

print("\n‚úÖ Saved: reports/03_threshold_decisions.png")
print("\nüìä Insight: Calibrated probabilities lead to more reliable threshold tuning")
print("\nüìä Insight: Calibrated probabilities lead to more reliable threshold tuning")

## 5. Sanity Checks

In [None]:
# Sanity check 1: Temperature > 1 for overconfident models
print("Sanity Check 1: Temperature value")
print(f"   T = {T_opt:.3f}")
# NOTE(review): the last branch is only reached when T_opt is exactly 1.0
# (or NaN, which fails both comparisons) - a tolerance may be intended
if T_opt > 1.0:
    temperature_verdict = "   ‚úÖ T > 1: Model was overconfident (as expected for neural networks)"
elif T_opt < 1.0:
    temperature_verdict = "   ‚ö†Ô∏è T < 1: Model was underconfident (unusual)"
else:
    temperature_verdict = "   T = 1: Model was already well-calibrated"
print(temperature_verdict)

# Sanity check 2: ECE should decrease
print("\nSanity Check 2: ECE reduction")
print(f"   ECE before: {ece_before:.4f}")
print(f"   ECE after: {ece_after:.4f}")
if ece_after < ece_before:
    print("   ‚úÖ PASSED")
else:
    print("   ‚ùå FAILED")

# Sanity check 3: Accuracy should be approximately preserved
print("\nSanity Check 3: Accuracy preservation")
accuracy_before = metrics_before['accuracy']
accuracy_after = metrics_after['accuracy']
print(f"   Accuracy before: {accuracy_before:.4f}")
print(f"   Accuracy after: {accuracy_after:.4f}")
print(f"   Difference: {abs(accuracy_after - accuracy_before):.4f}")
# Allow up to one percentage point of drift before flagging
acc_preserved = np.isclose(accuracy_before, accuracy_after, atol=0.01)
if acc_preserved:
    print("   ‚úÖ PASSED (accuracy preserved)")
else:
    print("   ‚ö†Ô∏è Small change")

# Sanity check 4: Perfect calibration should have ECE ≈ 0
print("\nSanity Check 4: Perfect calibration test")
# Draw probabilities uniformly, then sample each label as Bernoulli(p) so the
# labels match the stated probabilities by construction.
# NOTE(review): uses the global NumPy RNG state, so the printed value depends
# on which cells ran before this one
y_perfect = np.random.rand(1000)
uniform_draws = np.random.rand(1000)
y_labels = (uniform_draws < y_perfect).astype(int)
ece_perfect = expected_calibration_error(y_labels, y_perfect, n_bins=10)
print(f"   ECE for perfectly calibrated data: {ece_perfect:.4f}")
if ece_perfect < 0.05:
    print("   ‚úÖ PASSED (ECE ‚âà 0)")
else:
    print("   ‚ö†Ô∏è Check implementation")

## 6. Key Takeaways

✅ **Calibration**: Predicted probabilities should match empirical frequencies

✅ **Modern neural networks are often overconfident** → need post-hoc calibration

✅ **Reliability diagram**: Visual tool to diagnose miscalibration
   - Perfect calibration: bars align with diagonal
   - Below diagonal: overconfident
   - Above diagonal: underconfident

✅ **ECE quantifies miscalibration**: Lower is better (0 = perfect)

✅ **Temperature scaling**: Simple, effective post-hoc calibration
   - $T > 1$ → soften overconfident predictions
   - Preserves accuracy (argmax unchanged)
   - Only requires validation set

✅ **Metrics improve**: NLL, Brier score, ECE all benefit from calibration

**Common pitfalls:**
- ❌ Confusing accuracy with calibration (can have high accuracy but poor calibration)
- ❌ Using test set to fit temperature (causes overfitting! use separate validation set)
- ❌ Applying calibration when not needed (simple models like logistic regression are often well-calibrated)
- ❌ Forgetting that calibration doesn't improve discrimination (ROC-AUC unchanged)

## 7. Exercises

**Exercise 1:** Train a very overconfident model (e.g., increase hidden layer sizes, reduce regularization). What is the optimal temperature?

**Exercise 2:** Implement Maximum Calibration Error (MCE): $\max_m |\bar{p}_m - \bar{y}_m|$. How does it differ from ECE?

**Exercise 3:** Create artificially underconfident predictions by adding noise to probabilities. Verify that $T < 1$ after calibration.

**Exercise 4:** Compare temperature scaling with Platt scaling (logistic regression on top of model scores). Which is simpler? Which works better?

**Exercise 5:** For a multiclass problem (3+ classes), extend temperature scaling. What changes?

**Exercise 6:** Plot Brier score decomposition: $\text{Brier} = \text{Reliability} - \text{Resolution} + \text{Uncertainty}$. How does calibration affect each term?

In [None]:
# Your solutions here

---

## Solutions

In [None]:
# Solution 1: Very overconfident model
set_seed(42)

# Larger network with a 10x smaller L2 penalty than the main model
overconfident_model = MLPClassifier(
    hidden_layer_sizes=(200, 100, 50),  # Larger network
    activation='relu',
    max_iter=500,
    alpha=0.0001,  # Very small regularization
    random_state=42
).fit(X_train, y_train)

# Fit the temperature on the validation split, as for the main model
y_prob_overconf_val = overconfident_model.predict_proba(X_val)[:, 1]
T_overconf = calibrate_binary(y_val, y_prob_overconf_val)
print(f"Solution 1: Overconfident model optimal T = {T_overconf:.3f}")
print("   Even higher T than before (model is more overconfident)")

# Solution 2: Maximum Calibration Error (MCE)
def maximum_calibration_error(y_true, y_prob, n_bins=10):
    """Compute MCE: the largest per-bin calibration gap.

    Args:
        y_true: NumPy array of binary labels.
        y_prob: NumPy array of predicted probabilities.
        n_bins: Number of equal-width bins over [0, 1].

    Returns:
        The maximum over non-empty bins of |mean probability - mean label|,
        or 0.0 when no bin contains any sample.
    """
    # Only the interior edges are passed to digitize, so bin ids run from
    # 0 to n_bins - 1 and a probability of exactly 1.0 lands in the last bin
    edges = np.linspace(0, 1, n_bins + 1)
    bin_of = np.digitize(y_prob, edges[1:-1])

    worst_gap = 0.0
    for bin_id in range(n_bins):
        members = bin_of == bin_id
        if not members.any():
            continue  # empty bins contribute nothing
        mean_confidence = np.mean(y_prob[members])
        mean_label = np.mean(y_true[members])
        bin_gap = abs(mean_confidence - mean_label)
        if bin_gap > worst_gap:
            worst_gap = bin_gap
    return worst_gap

# Solution 2 (continued): worst-bin gap before and after temperature scaling
mce_before = maximum_calibration_error(y_test, y_prob_test, n_bins=10)
mce_after = maximum_calibration_error(y_test, y_prob_calibrated, n_bins=10)
print("\nSolution 2: MCE comparison")
print(f"   MCE before: {mce_before:.4f}")
print(f"   MCE after: {mce_after:.4f}")
print("   Difference: MCE focuses on worst bin, ECE averages all bins")

# Solution 3: Underconfident predictions
# Deterministically shrink every probability toward 0.5, keeping 30% of its
# distance from 0.5 (no random noise is involved)
y_prob_underconf = 0.5 + 0.3 * (y_prob_val - 0.5)
T_underconf = calibrate_binary(y_val, y_prob_underconf)
print(f"\nSolution 3: Underconfident T = {T_underconf:.3f}")
print("   T < 1 as expected (need to sharpen predictions)")

# Solution 5: Multiclass temperature scaling (explanatory text only)
multiclass_notes = (
    "\nSolution 5: Multiclass extension",
    "   For K classes with logits z_i:",
    "   p_i = exp(z_i/T) / sum_j exp(z_j/T)",
    "   Single scalar T applied to all logits",
    "   Fit T by minimizing negative log-likelihood on validation set",
    "   Preserves class rankings (argmax unchanged)",
)
for note_line in multiclass_notes:
    print(note_line)

---

**Next:** [04_mcmc_metropolis_hastings_diagnostics.ipynb](04_mcmc_metropolis_hastings_diagnostics.ipynb) - Learn how to sample from complex distributions using MCMC