# NB02: RAG vs Direct

**Question:** Does retrieval help? When? By how much?

This notebook compares RAG-augmented configurations against direct (no-retrieval) baselines, covering:
- Overall effect size
- Distribution of RAG benefit (helps / hurts / neutral)
- RAG uplift by model tier and dataset
- Success factors for RAG

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from analysis_utils import (
    load_all_results, setup_plotting, effect_size,
    compare_best_rag_vs_direct, analyze_rag_benefit_distribution,
    identify_rag_success_factors, plot_rag_benefit_distribution,
    PRIMARY_METRIC, MODEL_PARAMS, MODEL_TIER, BROKEN_MODELS,
)

# Apply the shared plotting configuration and point at the study outputs.
setup_plotting()
STUDY_PATH = Path("../outputs/smart_retrieval_slm")

# Load every result row, then keep a working copy that excludes the models
# listed in BROKEN_MODELS.
df_all = load_all_results(STUDY_PATH)
is_broken = df_all['model_short'].isin(BROKEN_MODELS)
df = df_all.loc[~is_broken].copy()
print(f"Loaded {len(df_all)} total, {len(df)} after removing broken models")

## 1. Overall RAG vs Direct Effect Size

In [None]:
# Effect size of RAG vs direct inside each dataset, followed by an
# unweighted mean of the per-dataset figures.
print("Overall RAG vs Direct (dataset-stratified)")
print("=" * 60)

datasets = sorted(df['dataset'].unique())
per_ds_results = []

for ds in datasets:
    in_ds = df['dataset'] == ds
    d_vals = df.loc[(df['exp_type'] == 'direct') & in_ds, PRIMARY_METRIC].dropna().values
    r_vals = df.loc[(df['exp_type'] == 'rag') & in_ds, PRIMARY_METRIC].dropna().values

    # Skip datasets with fewer than two observations in either arm.
    if len(d_vals) < 2 or len(r_vals) < 2:
        continue

    d_es, p, interp_ds = effect_size(d_vals, r_vals)
    ds_row = {
        'dataset': ds,
        'direct_mean': np.mean(d_vals),
        'rag_mean': np.mean(r_vals),
        'n_direct': len(d_vals),
        'n_rag': len(r_vals),
        'cohens_d': d_es,
        'p_value': p,
        'interpretation': interp_ds,
    }
    per_ds_results.append(ds_row)

    print(f"  {ds}:")
    print(f"    Direct: mean={ds_row['direct_mean']:.4f}, n={ds_row['n_direct']}")
    print(f"    RAG:    mean={ds_row['rag_mean']:.4f}, n={ds_row['n_rag']}")
    print(f"    Cohen's d = {d_es:.3f} ({interp_ds}), p = {p:.2e}")

# Combined view: each dataset contributes equally, regardless of run count.
if per_ds_results:
    combined_direct, combined_rag, combined_d = (
        np.mean([res[key] for res in per_ds_results])
        for key in ('direct_mean', 'rag_mean', 'cohens_d')
    )
    print(f"\n  Combined (stratified mean across {len(per_ds_results)} datasets):")
    print(f"    Direct: mean={combined_direct:.4f}")
    print(f"    RAG:    mean={combined_rag:.4f}")
    print(f"    Mean Cohen's d = {combined_d:.3f}")

    # Kept under the names `d` / `interp` because the summary cell reads them.
    d = combined_d
    magnitude = abs(d)
    if magnitude >= 0.8:
        interp = 'large'
    elif magnitude >= 0.5:
        interp = 'medium'
    elif magnitude >= 0.2:
        interp = 'small'
    else:
        interp = 'negligible'

In [None]:
# Direct baseline vs best RAG configs (top_k=3) for every model x dataset pair
comparison = compare_best_rag_vs_direct(df, PRIMARY_METRIC, top_k=3)
print("Best-3 RAG vs Direct Baseline (per model x dataset):")
display(comparison.round(4))

# One panel per dataset with paired bars per model. Skipped entirely when
# the comparison table lacks the columns needed to group it.
if all(col in comparison.columns for col in ('model_short', 'dataset')):
    models = sorted(comparison['model_short'].unique())
    datasets = sorted(comparison['dataset'].unique())
    n_panels = len(datasets)

    # squeeze=False always yields a 2-D array of Axes, so the single-dataset
    # case needs no special handling.
    fig, axes_grid = plt.subplots(
        1, n_panels, figsize=(6 * n_panels, 5), sharey=True, squeeze=False,
    )
    axes = axes_grid[0]

    w = 0.35
    for panel_idx, (ax, ds) in enumerate(zip(axes, datasets)):
        sub = comparison[comparison['dataset'] == ds].sort_values('model_short')
        x = np.arange(len(sub))

        ax.bar(x - w/2, sub['direct_mean'], w, label='Direct', color='steelblue', alpha=0.8)
        ax.bar(x + w/2, sub['top_rag_mean'], w, label='Best RAG', color='coral', alpha=0.8)

        ax.set_xticks(x)
        ax.set_xticklabels(sub['model_short'], rotation=30, ha='right')
        ax.set_title(ds)
        # The y axis is shared, so only the left-most panel carries a label.
        ax.set_ylabel('F1' if panel_idx == 0 else '')
        ax.legend()
        ax.grid(axis='y', alpha=0.3)

    plt.suptitle('Direct vs Best-3 RAG by Model and Dataset', y=1.02)
    plt.tight_layout()
    plt.show()

## 2. RAG Benefit Distribution

In [None]:
# Helps / hurts / neutral breakdown of RAG configurations.
benefit = analyze_rag_benefit_distribution(df, PRIMARY_METRIC)

if benefit:
    n_helps, n_hurts, n_neutral = (
        benefit[key] for key in ('n_helps', 'n_hurts', 'n_neutral')
    )

    print("RAG Benefit Distribution")
    print("=" * 50)
    print(f"  Helps:   {n_helps:>4d} ({benefit['pct_helps']:.1f}%)")
    print(f"  Hurts:   {n_hurts:>4d} ({benefit['pct_hurts']:.1f}%)")
    print(f"  Neutral: {n_neutral:>4d}")
    print(f"  Mean uplift when helps:  {benefit['mean_benefit_when_helps']:.4f}")
    print(f"  Mean loss when hurts:    {benefit['mean_hurt_when_hurts']:.4f}")

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    hist_ax, pie_ax = axes

    # Left panel: histogram drawn by the shared plotting helper.
    plot_rag_benefit_distribution(benefit, ax=hist_ax)

    # Right panel: share of configurations per outcome.
    outcome_style = {
        'Helps': (n_helps, '#66bb6a'),
        'Hurts': (n_hurts, '#ef5350'),
        'Neutral': (n_neutral, '#bdbdbd'),
    }
    labels = list(outcome_style)
    sizes = [count for count, _ in outcome_style.values()]
    colors_pie = [color for _, color in outcome_style.values()]
    pie_ax.pie(sizes, labels=labels, colors=colors_pie, autopct='%1.0f%%',
               startangle=90, textprops={'fontsize': 12})
    pie_ax.set_title('RAG Outcome Distribution')

    plt.tight_layout()
    plt.show()

In [None]:
# Box plots of RAG benefit by model
if benefit:
    rag_df = benefit['rag_df']
    models = sorted(rag_df['model_short'].unique())

    # One array of benefit values per model, with NaNs removed.
    data_by_model = [
        rag_df.loc[rag_df['model_short'] == m, 'rag_benefit'].dropna().values
        for m in models
    ]

    if data_by_model:
        fig, ax = plt.subplots(figsize=(12, 5))

        # Tick labels are set explicitly rather than through boxplot(labels=...):
        # that keyword was renamed to `tick_labels` in Matplotlib 3.9 and the old
        # spelling is deprecated, while set_xticks/set_xticklabels work on both
        # old and new versions. Boxes sit at positions 1..N by default.
        bp = ax.boxplot(data_by_model, patch_artist=True)
        ax.set_xticks(range(1, len(models) + 1))
        ax.set_xticklabels(models, rotation=30, ha='right')

        for patch in bp['boxes']:
            patch.set_facecolor('lightblue')

        # Zero line separates configurations that beat the baseline from those that do not.
        ax.axhline(y=0, color='red', linestyle='--', linewidth=1, label='Break-even')
        ax.set_ylabel('RAG Benefit (F1 delta)')
        ax.set_title('RAG Benefit Distribution by Model')
        ax.legend()
        ax.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        plt.show()
    else:
        # Nothing to draw; avoid handing an empty dataset list to boxplot.
        print("No RAG benefit data available for per-model box plots.")

## 3. RAG Uplift by Model Tier

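Does RAG help weaker models more? The scatter plots each model's direct-baseline F1 (x) against its best-RAG uplift (y), i.e. best RAG score minus the direct baseline, with both averaged across datasets.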

In [None]:
# Compute per-model direct baseline and best-RAG (dataset-stratified)
direct_df = df[df['exp_type'] == 'direct']
rag_df = df[df['exp_type'] == 'rag']

# Stratified: per-(model, dataset) aggregate first, then average across datasets
# so every dataset carries equal weight for a given model.
model_direct_strat = (
    direct_df.groupby(['model_short', 'dataset'])[PRIMARY_METRIC].mean()
    .groupby('model_short').mean()
)
# "Best RAG" is the maximum score among a model's RAG runs on each dataset.
model_best_rag_strat = (
    rag_df.groupby(['model_short', 'dataset'])[PRIMARY_METRIC].max()
    .groupby('model_short').mean()
)

# Models missing either a direct or a RAG figure are dropped by dropna().
scatter_data = pd.DataFrame({
    'direct_f1': model_direct_strat,
    'best_rag_f1': model_best_rag_strat,
}).dropna()
scatter_data['rag_uplift'] = scatter_data['best_rag_f1'] - scatter_data['direct_f1']
scatter_data['tier'] = scatter_data.index.map(lambda m: MODEL_TIER.get(m, 'unknown'))
scatter_data['params_b'] = scatter_data.index.map(lambda m: MODEL_PARAMS.get(m, np.nan))

tier_colors = {'tiny': '#ef5350', 'small': '#ffa726', 'medium': '#66bb6a'}
fallback_color = '#9e9e9e'

# Plot the known tiers first (fixed legend order), then any tier present in the
# data without a dedicated colour -- including the 'unknown' fallback assigned
# above. Without this, such models appear in the table but not on the chart.
extra_tiers = [
    tier for tier in scatter_data['tier'].dropna().unique()
    if tier not in tier_colors
]
tiers_to_plot = list(tier_colors) + extra_tiers

fig, ax = plt.subplots(figsize=(10, 6))
for tier in tiers_to_plot:
    color = tier_colors.get(tier, fallback_color)
    sub = scatter_data[scatter_data['tier'] == tier]
    ax.scatter(sub['direct_f1'], sub['rag_uplift'], c=color, s=120,
               label=str(tier).capitalize(), edgecolors='black', zorder=3)
    # Label every point with its model name, offset slightly from the marker.
    for model, row in sub.iterrows():
        ax.annotate(model, (row['direct_f1'], row['rag_uplift']),
                    textcoords='offset points', xytext=(8, 4), fontsize=9)

# Points above the zero line are models whose best RAG beats the direct baseline.
ax.axhline(y=0, color='grey', linestyle='--', alpha=0.5)
ax.set_xlabel('Direct Baseline F1 (dataset-stratified)')
ax.set_ylabel('Best-RAG Uplift (F1 delta, dataset-stratified)')
ax.set_title('RAG Uplift vs Direct Baseline by Model Tier (Dataset-Stratified)')
ax.legend(title='Tier')
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

display(scatter_data.round(4))

## 4. RAG Benefit by Dataset

In [None]:
# Per-dataset effect sizes (direct vs all RAG)
datasets = sorted(df['dataset'].unique())
ds_effects = []
for ds in datasets:
    d_vals = df.loc[(df['exp_type'] == 'direct') & (df['dataset'] == ds), PRIMARY_METRIC].dropna().values
    r_vals = df.loc[(df['exp_type'] == 'rag') & (df['dataset'] == ds), PRIMARY_METRIC].dropna().values
    # Skip datasets with fewer than two observations in either arm.
    if len(d_vals) >= 2 and len(r_vals) >= 2:
        # Loop-specific names on purpose: notebook cells share one namespace, and
        # the summary cell reads the globals `d` and `interp` set by the overall
        # effect-size cell. Unpacking into those names here would overwrite them
        # with the last dataset's values.
        ds_d, pval, ds_interp = effect_size(d_vals, r_vals)
        ds_effects.append({
            'dataset': ds, 'direct_mean': np.mean(d_vals), 'rag_mean': np.mean(r_vals),
            'cohens_d': ds_d, 'p_value': pval, 'interpretation': ds_interp,
        })

ds_effect_df = pd.DataFrame(ds_effects)
display(ds_effect_df.round(4))

if ds_effect_df.empty:
    # An empty frame has no columns, so indexing it for the bars would raise KeyError.
    print("No dataset has enough direct and RAG runs to compare.")
else:
    # Grouped bars: direct mean next to the mean over all RAG runs, per dataset
    fig, ax = plt.subplots(figsize=(8, 5))
    x = np.arange(len(ds_effect_df))
    w = 0.35
    ax.bar(x - w/2, ds_effect_df['direct_mean'], w, label='Direct', color='steelblue', alpha=0.8)
    ax.bar(x + w/2, ds_effect_df['rag_mean'], w, label='RAG (mean)', color='coral', alpha=0.8)
    ax.set_xticks(x)
    ax.set_xticklabels(ds_effect_df['dataset'])
    ax.set_ylabel('F1')
    ax.set_title('Direct vs RAG by Dataset')
    ax.legend()
    ax.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()

## 5. Success Factors for RAG

In [None]:
# Break RAG outcomes down by factor and show each non-empty table.
success_factors = identify_rag_success_factors(df, PRIMARY_METRIC)

# A falsy result (nothing to report) yields no iterations.
for factor, table in (success_factors or {}).items():
    if not table.empty:
        print(f"\n{factor}:")
        display(table.round(3))

## 6. Summary

Key takeaways:
- Overall RAG vs Direct effect size and significance
- What fraction of RAG configs help vs hurt
- Which model tiers benefit most from RAG
- Dataset-specific patterns

In [None]:
# Headline numbers: overall effect size plus the spread of RAG outcomes.
if benefit:
    print("RAG vs DIRECT SUMMARY")
    print("=" * 60)

    # Derive the overall effect size from `per_ds_results` instead of relying on
    # the short global names `d` / `interp`: notebook cells share one namespace,
    # so those names can be rebound by other cells, and they are never set when
    # no dataset had enough runs.
    if per_ds_results:
        overall_d = np.mean([res['cohens_d'] for res in per_ds_results])
        magnitude = abs(overall_d)
        if magnitude >= 0.8:
            overall_interp = 'large'
        elif magnitude >= 0.5:
            overall_interp = 'medium'
        elif magnitude >= 0.2:
            overall_interp = 'small'
        else:
            overall_interp = 'negligible'
        print(f"Overall Cohen's d:    {overall_d:.3f} ({overall_interp})")
    else:
        print("Overall Cohen's d:    n/a (no dataset had enough runs)")

    print(f"RAG helps in:         {benefit['pct_helps']:.0f}% of configurations")
    print(f"Best RAG uplift:      {benefit['best_rag_benefit']:.4f}")
    print(f"Worst RAG penalty:    {benefit['worst_rag_benefit']:.4f}")