# NB01: Study Overview

**Question:** What data do we have? What's broken? What's the rough performance landscape?

This notebook provides a high-level inventory of the `smart_retrieval_slm` study:
- Experiment health and completeness
- Coverage across models, datasets, agent types, and RAG components
- Overall performance landscape
- Metric correlation analysis (justifying F1 as primary metric)

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats as scipy_stats

from analysis_utils import (
    load_all_results, setup_plotting, get_experiment_health_summary,
    print_search_space_summary, weighted_mean_with_ci,
    METRICS, PRIMARY_METRIC, MODEL_PARAMS, MODEL_TIER, BROKEN_MODELS,
)

# Location of the study outputs, relative to the notebook's working directory.
STUDY_PATH = Path("../outputs/smart_retrieval_slm")

# Apply the shared plotting configuration before any figure is created.
setup_plotting()

# Load every result found under the study directory; the row count is
# reported below as the number of experiments.
df = load_all_results(STUDY_PATH)
print(f"Loaded {len(df)} experiments")

## 1. Study Health

In [None]:
# Health summary for the study directory: a mapping that is printed one
# key/value pair per line.
health = get_experiment_health_summary(STUDY_PATH)
print("Study Health")
print("=" * 50)
for k, v in health.items():
    print(f"  {k:<22s}: {v}")

# Check for missing metrics. A metric whose column is absent altogether is
# reported explicitly instead of being skipped, since that is the most severe
# form of "missing".
n_total = len(df)
print("\nMetric availability:")
for m in METRICS:
    if m not in df.columns:
        print(f"  {m:<15s}: column missing")
        continue
    n_valid = int(df[m].notna().sum())
    # Guard the percentage so an empty load does not raise ZeroDivisionError.
    pct_valid = n_valid / n_total * 100 if n_total else 0.0
    print(f"  {m:<15s}: {n_valid:>4d}/{n_total} ({pct_valid:.0f}%)")

# Flag broken models. "No experiments" and "no primary-metric column" are
# distinct situations and get distinct messages.
print("\nBroken models (near-zero F1):")
has_primary_metric = PRIMARY_METRIC in df.columns
for model in sorted(BROKEN_MODELS):
    subset = df[df['model_short'] == model]
    if subset.empty:
        print(f"  {model}: no experiments loaded")
    elif not has_primary_metric:
        print(f"  {model}: {len(subset)} experiments, but no '{PRIMARY_METRIC}' column")
    else:
        mean_f1 = subset[PRIMARY_METRIC].mean()
        print(f"  {model}: mean F1 = {mean_f1:.4f} ({len(subset)} experiments)")

## 2. Coverage

In [None]:
print_search_space_summary(df)

# Two side-by-side heatmaps of experiment counts.
fig, axes = plt.subplots(1, 2, figsize=(16, 5))
ax_model, ax_agent = axes

# Count experiments for each (row, dataset) combination.
ct_model_ds = pd.crosstab(df['model_short'], df['dataset'])
ct_agent_ds = pd.crosstab(df['agent_type'], df['dataset'])

# Options shared by both panels: integer annotations in every cell.
heatmap_style = dict(annot=True, fmt='d')

# Left panel: Model x Dataset
sns.heatmap(ct_model_ds, cmap='YlGn', ax=ax_model, **heatmap_style)
ax_model.set_title('Experiments: Model x Dataset')

# Right panel: Agent type x Dataset
sns.heatmap(ct_agent_ds, cmap='YlOrRd', ax=ax_agent, **heatmap_style)
ax_agent.set_title('Experiments: Agent Type x Dataset')

plt.tight_layout()
plt.show()

## 3. Performance Landscape

In [None]:
# Filter out broken models for performance analysis. The copy keeps df_clean
# independent of df for the cells that follow.
df_clean = df[~df['model_short'].isin(BROKEN_MODELS)].copy()
print(f"After filtering broken models: {len(df_clean)} experiments")

# Model leaderboard: one row per model; the columns used below are
# 'model_short', 'mean', 'ci_low' and 'ci_high'.
leaderboard = weighted_mean_with_ci(df_clean, 'model_short', PRIMARY_METRIC)
print("\nModel Leaderboard (F1):")
display(leaderboard)

# Bar chart with CI
fig, ax = plt.subplots(figsize=(12, 5))
x = range(len(leaderboard))
# Error-bar lengths are distances from the mean to each CI bound. They are
# clipped at zero because matplotlib rejects negative yerr values, which can
# otherwise appear through floating-point rounding when a bound equals the mean.
yerr_low = (leaderboard['mean'] - leaderboard['ci_low']).clip(lower=0)
yerr_high = (leaderboard['ci_high'] - leaderboard['mean']).clip(lower=0)
# Build the palette once (one colour per bar) instead of once per bar.
colors = list(sns.color_palette('husl', len(leaderboard)))
ax.bar(x, leaderboard['mean'], yerr=[yerr_low, yerr_high], capsize=5,
       color=colors, alpha=0.8, edgecolor='black', linewidth=0.5)
ax.set_xticks(x)
ax.set_xticklabels(leaderboard['model_short'], rotation=30, ha='right')
ax.set_ylabel('Mean F1')
ax.set_title('Model Leaderboard — Mean F1 with 95% CI')
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Dataset violin plots of F1 distribution.
# Collect the non-missing F1 values of each dataset, in sorted dataset order.
f1_by_dataset = {
    ds: df_clean.loc[df_clean['dataset'] == ds, PRIMARY_METRIC].dropna().values
    for ds in sorted(df_clean['dataset'].unique())
}

# Datasets without a single valid value are left out of the plot: an empty
# sample makes Axes.violinplot fail, which would abort the whole cell.
empty_datasets = [ds for ds, vals in f1_by_dataset.items() if len(vals) == 0]
if empty_datasets:
    print(f"Skipping datasets with no valid F1 values: {empty_datasets}")

datasets = [ds for ds, vals in f1_by_dataset.items() if len(vals) > 0]
data_for_violin = [f1_by_dataset[ds] for ds in datasets]

if data_for_violin:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.violinplot(data_for_violin, showmeans=True, showmedians=True)
    # Violin positions default to 1..N, so the tick positions are 1-based too.
    ax.set_xticks(range(1, len(datasets) + 1))
    ax.set_xticklabels(datasets)
    ax.set_ylabel('F1')
    ax.set_title('F1 Distribution by Dataset')
    ax.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()
else:
    print("No F1 values available for the violin plot.")

## 4. Metric Correlation

Justification for using F1 as the primary metric: if F1, exact_match, bertscore, and bleurt are highly rank-correlated (Spearman) across experiments, conclusions drawn from F1 should carry over to the other metrics.

In [None]:
# Compute Spearman correlations across all available metrics.
# A metric counts as available when its column exists and has at least 10
# non-missing values.
available = [m for m in METRICS if m in df_clean.columns and df_clean[m].notna().sum() >= 10]
print(f"Metrics with sufficient data: {available}")

if len(available) < 2:
    print("Not enough metrics with data for correlation analysis.")
else:
    # Listwise deletion: keep only experiments where every available metric is
    # present, so that all pairwise correlations use the same rows.
    metric_df = df_clean[available].dropna()
    print(f"Experiments with all metrics present: {len(metric_df)}/{len(df_clean)}")

    if len(metric_df) < 3:
        # Listwise deletion can remove far more rows than the per-metric
        # threshold suggests; a rank correlation on 0-2 rows is not informative.
        print("Too few experiments with all metrics present for correlation analysis.")
    else:
        corr_matrix = metric_df.corr(method='spearman')

        fig, ax = plt.subplots(figsize=(8, 6))
        # Hide the strict upper triangle: the matrix is symmetric, so only the
        # lower triangle and the diagonal are drawn.
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
        sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='RdYlGn',
                    vmin=-1, vmax=1, mask=mask, ax=ax, square=True)
        ax.set_title('Spearman Correlation Between Metrics')
        plt.tight_layout()
        plt.show()

        # Print pairwise correlations with F1
        print("\nCorrelation with F1:")
        if PRIMARY_METRIC not in available:
            # Without this guard, metric_df[PRIMARY_METRIC] raises KeyError when
            # the primary metric did not pass the availability filter.
            print(f"  {PRIMARY_METRIC} is not among the metrics with sufficient data; skipping.")
        else:
            for m in available:
                if m != PRIMARY_METRIC:
                    rho, pval = scipy_stats.spearmanr(
                        metric_df[PRIMARY_METRIC], metric_df[m]
                    )
                    print(f"  {m:<15s}: rho={rho:.3f}, p={pval:.2e}")

## 5. Summary

**TODO:** Fill in the key takeaways from this overview after running the notebook on real data.

In [None]:
# Headline counts and the values seen along each study dimension, taken from
# the unfiltered frame (except the second line, which uses the filtered one).
summary_lines = (
    "STUDY OVERVIEW SUMMARY",
    "=" * 60,
    f"Total experiments (loaded):   {len(df)}",
    f"After broken-model filter:    {len(df_clean)}",
    f"Models:                       {sorted(df['model_short'].unique())}",
    f"Datasets:                     {sorted(df['dataset'].unique())}",
    f"Agent types:                  {sorted(df['agent_type'].unique())}",
    f"Experiment types:             {df['exp_type'].value_counts().to_dict()}",
)
print(*summary_lines, sep="\n")

# Overall primary-metric statistics, only when that column is present in the
# filtered frame.
if PRIMARY_METRIC in df_clean.columns:
    print(f"\nOverall F1 (clean):  mean={df_clean[PRIMARY_METRIC].mean():.4f}, "
          f"std={df_clean[PRIMARY_METRIC].std():.4f}")