# NB05: Model Scaling

**Question:** Can small models + premium retrieval match larger models without retrieval?

This notebook explores the scaling relationship:
- Direct performance vs model size
- RAG uplift vs model size
- Whether small+RAG can compensate for model size
- What matters most for small vs medium models

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats as scipy_stats

from analysis_utils import (
    load_all_results, setup_plotting, identify_bottlenecks,
    weighted_mean_with_ci, multi_metric_bottlenecks_df,
    PRIMARY_METRIC, MODEL_PARAMS, MODEL_TIER, BROKEN_MODELS,
    MULTI_METRIC_SET, GROUNDEDNESS_METRICS, CONTEXT_METRICS,
)

# Apply the project's shared plotting configuration and locate the study outputs.
setup_plotting()
STUDY_PATH = Path("../outputs/smart_retrieval_slm")

# Load every experiment, then drop the models listed in BROKEN_MODELS.
# `.copy()` so the column assignments below do not write into a slice of df_all.
df_all = load_all_results(STUDY_PATH)
df = df_all[~df_all['model_short'].isin(BROKEN_MODELS)].copy()

# Add parameter count and tier columns (NaN for models missing from the mappings)
df['params_b'] = df['model_short'].map(MODEL_PARAMS)
df['tier'] = df['model_short'].map(MODEL_TIER)

has_params = df['params_b'].notna()
print(f"Loaded {len(df)} experiments")
# Report distinct models, not experiment rows: each model contributes many rows.
print(f"Models with params: {df.loc[has_params, 'model_short'].nunique()} "
      f"({int(has_params.sum())} experiments)")
print(f"Tiers: {df['tier'].value_counts().to_dict()}")

# Later cells drop rows without params_b and filter by tier, so surface any
# model that would silently disappear from the scaling analysis.
unmapped_models = sorted(
    str(m) for m in df.loc[~has_params | df['tier'].isna(), 'model_short'].dropna().unique()
)
if unmapped_models:
    print(f"WARNING: no params/tier mapping for: {unmapped_models}")

# Check available metrics: keep those present as a column with at least 10 non-null values
available_metrics = [m for m in MULTI_METRIC_SET if m in df.columns and df[m].notna().sum() >= 10]
print(f"\nAvailable metrics: {available_metrics}")

## 1. Direct Scaling Curve

F1 vs model parameters (log scale) for direct (no-retrieval) experiments.

In [None]:
# Keep only no-retrieval runs that have both a parameter count and a primary-metric score.
is_direct = df['exp_type'] == 'direct'
direct = df[is_direct].dropna(subset=['params_b', PRIMARY_METRIC])

# Collapse to one row per model: mean/std/count of the primary metric plus its size and tier.
model_direct = (
    direct
    .groupby('model_short')
    .agg(
        mean_f1=(PRIMARY_METRIC, 'mean'),
        std_f1=(PRIMARY_METRIC, 'std'),
        n=(PRIMARY_METRIC, 'count'),
        params_b=('params_b', 'first'),
        tier=('tier', 'first'),
    )
    .reset_index()
)

# Tier -> marker colour; reused by later cells.
tier_colors = dict(tiny='#ef5350', small='#ffa726', medium='#66bb6a')

fig, ax = plt.subplots(figsize=(10, 6))

# One scatter series per tier so each gets its own legend entry, with model-name labels.
for tier_name, tier_color in tier_colors.items():
    tier_points = model_direct.loc[model_direct['tier'] == tier_name]
    ax.scatter(tier_points['params_b'], tier_points['mean_f1'], c=tier_color, s=120,
               label=tier_name.capitalize(), edgecolors='black', zorder=3)
    for model_name, x_val, y_val in zip(tier_points['model_short'],
                                        tier_points['params_b'],
                                        tier_points['mean_f1']):
        ax.annotate(model_name, (x_val, y_val),
                    textcoords='offset points', xytext=(8, 4), fontsize=9)

# Log-linear regression: mean score against log10(parameters), only with 3+ models.
n_models = len(model_direct)
if n_models >= 3:
    log_params = np.log10(model_direct['params_b'])
    fit = scipy_stats.linregress(log_params, model_direct['mean_f1'])
    # Extend the fitted line slightly beyond the observed range on both sides.
    x_fit = np.linspace(log_params.min() - 0.1, log_params.max() + 0.1, 50)
    y_fit = fit.slope * x_fit + fit.intercept
    ax.plot(10**x_fit, y_fit, '--', color='grey', alpha=0.6,
            label=f'Log-linear fit (R²={fit.rvalue**2:.2f})')

ax.set_xscale('log')
ax.set_xlabel('Parameters (Billions, log scale)')
ax.set_ylabel('Mean F1 (Direct)')
ax.set_title('Direct Performance Scaling Curve')
ax.legend(title='Tier')
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

## 2. RAG Uplift by Model Size

Overlay best-RAG curve on direct curve — the key thesis figure.

In [None]:
# Per-model RAG summary: best and mean primary-metric score over all RAG experiments.
rag_df = df[df['exp_type'] == 'rag'].dropna(subset=['params_b', PRIMARY_METRIC])
model_best_rag = rag_df.groupby('model_short').agg(
    best_rag_f1=(PRIMARY_METRIC, 'max'),
    mean_rag_f1=(PRIMARY_METRIC, 'mean'),
    params_b=('params_b', 'first'),
    tier=('tier', 'first'),
).reset_index()

# Merge direct and RAG data.
# The merge is outer, so a model may exist on only one side. params_b/tier are
# carried from BOTH sides and coalesced; otherwise a model with RAG runs but no
# direct runs would get NaN params_b/tier and drop out of the plot and of every
# per-tier summary built from `scaling`.
scaling = model_direct[['model_short', 'mean_f1', 'params_b', 'tier']].rename(
    columns={'mean_f1': 'direct_f1'}
).merge(
    model_best_rag[['model_short', 'best_rag_f1', 'mean_rag_f1', 'params_b', 'tier']],
    on='model_short', how='outer', suffixes=('', '_rag'),
)
for col in ('params_b', 'tier'):
    scaling[col] = scaling[col].where(scaling[col].notna(), scaling[f'{col}_rag'])
scaling = scaling.drop(columns=['params_b_rag', 'tier_rag'])
scaling['rag_uplift'] = scaling['best_rag_f1'] - scaling['direct_f1']

fig, ax = plt.subplots(figsize=(11, 6))

# Sort by params for line plot
scaling_sorted = scaling.dropna(subset=['params_b']).sort_values('params_b')

ax.plot(scaling_sorted['params_b'], scaling_sorted['direct_f1'],
        'o-', color='steelblue', markersize=8, label='Direct (no retrieval)', linewidth=2)
ax.plot(scaling_sorted['params_b'], scaling_sorted['best_rag_f1'],
        's-', color='coral', markersize=8, label='Best RAG', linewidth=2)
ax.plot(scaling_sorted['params_b'], scaling_sorted['mean_rag_f1'],
        '^--', color='#ffa726', markersize=7, label='Mean RAG', linewidth=1.5, alpha=0.7)

# Annotate models. Anchor the label at the best-RAG point; for a model without
# RAG data fall back to its direct point so the label is still drawn.
for _, row in scaling_sorted.iterrows():
    label_y = row['best_rag_f1'] if pd.notna(row['best_rag_f1']) else row['direct_f1']
    if pd.isna(label_y):
        continue
    ax.annotate(row['model_short'], (row['params_b'], label_y),
                textcoords='offset points', xytext=(8, 6), fontsize=8)

# Shade the RAG uplift region
ax.fill_between(scaling_sorted['params_b'],
                scaling_sorted['direct_f1'], scaling_sorted['best_rag_f1'],
                alpha=0.1, color='coral', label='RAG uplift zone')

ax.set_xscale('log')
ax.set_xlabel('Parameters (Billions, log scale)')
ax.set_ylabel('F1')
ax.set_title('Model Scaling: Direct vs Best RAG')
ax.legend()
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

display(scaling.round(4))

### Multi-Metric Scaling

Do all quality dimensions scale the same way with model size?

In [None]:
# Multi-metric scaling: per-model best RAG score and RAG uplift for every available metric.
# 'hallucination' is left out because it is inverted relative to the other metrics.
scaling_metrics = [name for name in available_metrics if name != 'hallucination']
rag_df = df[df['exp_type'] == 'rag'].dropna(subset=['params_b'])

if len(scaling_metrics) < 2 or rag_df.empty:
    print("Not enough metrics with data for multi-metric scaling analysis.")
else:
    fig, (ax_best, ax_uplift) = plt.subplots(1, 2, figsize=(16, 6))

    # Left panel: best RAG score per model, one line per metric (absolute values).
    for metric in scaling_metrics:
        scored_rows = rag_df.dropna(subset=[metric])
        if scored_rows.empty:
            continue
        best_by_model = scored_rows.groupby('model_short').agg(
            best=(metric, 'max'),
            params_b=('params_b', 'first'),
        )
        best_by_model = best_by_model.sort_values('params_b')
        ax_best.plot(best_by_model['params_b'], best_by_model['best'],
                     'o-', label=metric, markersize=6)

    ax_best.set_xscale('log')
    ax_best.set_xlabel('Parameters (Billions, log scale)')
    ax_best.set_ylabel('Best Score')
    ax_best.set_title('Best RAG Performance by Model Size — Multiple Metrics')
    ax_best.legend(fontsize=9)
    ax_best.grid(alpha=0.3)

    # Right panel: uplift = best RAG score minus mean direct score, per model and metric.
    direct_df = df[df['exp_type'] == 'direct'].dropna(subset=['params_b'])

    for metric in scaling_metrics:
        direct_by_model = (
            direct_df.dropna(subset=[metric])
            .groupby('model_short')
            .agg(direct_mean=(metric, 'mean'), params_b=('params_b', 'first'))
        )
        rag_by_model = (
            rag_df.dropna(subset=[metric])
            .groupby('model_short')
            .agg(best_rag=(metric, 'max'), params_b=('params_b', 'first'))
        )

        # Inner join on the model index: only models that have both direct and RAG scores.
        uplift_by_model = direct_by_model.join(rag_by_model[['best_rag']], how='inner')
        uplift_by_model['uplift'] = uplift_by_model['best_rag'] - uplift_by_model['direct_mean']
        uplift_by_model = uplift_by_model.sort_values('params_b')

        if uplift_by_model.empty:
            continue
        ax_uplift.plot(uplift_by_model['params_b'], uplift_by_model['uplift'],
                       'o-', label=metric, markersize=6)

    ax_uplift.set_xscale('log')
    # Zero line separates metrics/models where RAG helps from those where it hurts.
    ax_uplift.axhline(y=0, color='grey', ls='--', alpha=0.4)
    ax_uplift.set_xlabel('Parameters (Billions, log scale)')
    ax_uplift.set_ylabel('RAG Uplift (best_RAG - direct)')
    ax_uplift.set_title('RAG Uplift by Model Size — Multiple Metrics')
    ax_uplift.legend(fontsize=9)
    ax_uplift.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

## 3. Can Small+RAG Match Medium+Direct?

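For each tier, compare its best-RAG performance (the maximum F1 over all models and RAG configurations in the tier) against the direct performance of the next tier up (the mean of that tier's per-model direct F1). Note the asymmetry: a best case is compared against an average.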

In [None]:
# Tiers from smallest to largest; each tier is compared against the one after it.
tier_order = ['tiny', 'small', 'medium']
tier_direct = {}    # tier -> mean of per-model direct F1
tier_best_rag = {}  # tier -> max of per-model best RAG F1

for tier in tier_order:
    tier_models = scaling[scaling['tier'] == tier]
    if not tier_models.empty:
        tier_direct[tier] = tier_models['direct_f1'].mean()
        tier_best_rag[tier] = tier_models['best_rag_f1'].max()

# One row per (tier, next tier) pair. Column names are fixed so every column
# carries a single meaning across rows; the tier names live in 'tier'/'next_tier'.
comparison_columns = ['tier', 'next_tier', 'tier_direct', 'tier_best_rag',
                      'next_tier_direct', 'compensation_ratio', 'can_match']
comparison_rows = []
for tier, next_tier in zip(tier_order[:-1], tier_order[1:]):
    if tier not in tier_best_rag or next_tier not in tier_direct:
        continue
    rag_perf = tier_best_rag[tier]
    next_direct = tier_direct[next_tier]
    # Either side can be NaN when a tier has models but no scores of that kind.
    comparable = pd.notna(rag_perf) and pd.notna(next_direct)
    comparison_rows.append({
        'tier': tier,
        'next_tier': next_tier,
        'tier_direct': tier_direct.get(tier, np.nan),
        'tier_best_rag': rag_perf,
        'next_tier_direct': next_direct,
        'compensation_ratio': rag_perf / next_direct if comparable and next_direct > 0 else np.nan,
        # None (not False) when data is missing, so "no data" is not read as "cannot match".
        'can_match': bool(rag_perf >= next_direct) if comparable else None,
    })

comp_df = pd.DataFrame(comparison_rows, columns=comparison_columns)
display(comp_df.round(4))

# Visualization
fig, ax = plt.subplots(figsize=(10, 5))
x = np.arange(len(tier_order))
w = 0.3

# Tiers with no models at all are drawn as zero-height bars.
direct_vals = [tier_direct.get(t, 0) for t in tier_order]
rag_vals = [tier_best_rag.get(t, 0) for t in tier_order]

ax.bar(x - w/2, direct_vals, w, label='Direct', color='steelblue', alpha=0.8)
ax.bar(x + w/2, rag_vals, w, label='Best RAG', color='coral', alpha=0.8)

# Draw arrows showing compensation: from a tier's best-RAG bar to the next tier's direct bar.
# The `> 0` tests are False for NaN, so pairs with missing data get no arrow.
for i in range(len(tier_order) - 1):
    if rag_vals[i] > 0 and direct_vals[i+1] > 0:
        ax.annotate('', xy=(x[i+1] - w/2, direct_vals[i+1]),
                    xytext=(x[i] + w/2, rag_vals[i]),
                    arrowprops=dict(arrowstyle='->', color='grey', lw=1.5, ls='--'))

ax.set_xticks(x)
ax.set_xticklabels([f"{t.capitalize()}\n({', '.join(scaling[scaling['tier']==t]['model_short'].tolist())})" for t in tier_order],
                    fontsize=9)
ax.set_ylabel('F1')
ax.set_title('Can Small+RAG Match Medium+Direct?')
ax.legend()
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

## 4. What Matters for Small vs Medium Models?

Side-by-side variance decomposition for small and medium models. The tiny tier, when it has enough RAG experiments, is reported as text only.

In [None]:
def _print_variance_shares(tier_name, shares):
    """Print one tier's factor -> variance-explained (%) mapping as an aligned list."""
    print(f"\n{tier_name.capitalize()} models — variance explained:")
    for factor, pct in shares.items():
        print(f"  {factor:<20s}: {pct:5.1f}%")


def _rag_rows_for_tier(tier_name):
    """Return the RAG experiment rows of `df` that belong to the given tier."""
    return df[(df['tier'] == tier_name) & (df['exp_type'] == 'rag')]


# Variance decomposition for the small and medium tiers.
# A tier is skipped when it has fewer than 10 RAG experiment rows.
tier_bottlenecks = {}
for tier in ['small', 'medium']:
    tier_df = _rag_rows_for_tier(tier)
    if len(tier_df) < 10:
        continue
    bn = identify_bottlenecks(tier_df, PRIMARY_METRIC)
    tier_bottlenecks[tier] = bn
    _print_variance_shares(tier, bn)

# Side-by-side bar chart, drawn only when both tiers produced a decomposition.
if len(tier_bottlenecks) >= 2:
    # Union of factor names across tiers; a factor missing from one tier is plotted as 0.
    all_factors = sorted({factor
                          for shares in tier_bottlenecks.values()
                          for factor in shares.keys()})
    bar_colors = {'small': '#ffa726', 'medium': '#66bb6a'}

    fig, ax = plt.subplots(figsize=(10, 5))
    x = np.arange(len(all_factors))
    w = 0.35

    for i, (tier, bn) in enumerate(tier_bottlenecks.items()):
        heights = [bn.get(factor, 0) for factor in all_factors]
        # Offsets of -w/2 and +w/2 place the two tiers' bars next to each other.
        ax.bar(x + (i - 0.5) * w, heights, w,
               label=tier.capitalize(), color=bar_colors[tier], alpha=0.8)

    ax.set_xticks(x)
    ax.set_xticklabels(all_factors, rotation=30, ha='right')
    ax.set_ylabel('Variance Explained (%)')
    ax.set_title('What Matters: Small vs Medium Models')
    ax.legend()
    ax.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()

# Also do tiny if available (text only, same minimum of 10 rows).
tiny_df = _rag_rows_for_tier('tiny')
if len(tiny_df) >= 10:
    bn = identify_bottlenecks(tiny_df, PRIMARY_METRIC)
    _print_variance_shares('tiny', bn)

## 5. Thesis Verdict

In [None]:
# Summary table: tier, direct_f1, best_rag_f1, uplift, can_match_next_tier
# Relies on `scaling`, `tier_order` and `tier_direct` from the earlier cells.
verdict_rows = []
for tier_idx, tier in enumerate(tier_order):
    tier_models = scaling[scaling['tier'] == tier]
    if tier_models.empty:
        continue

    direct_mean = tier_models['direct_f1'].mean()
    best_rag = tier_models['best_rag_f1'].max()
    uplift = best_rag - direct_mean

    # Can this tier's best RAG match the next tier's direct?
    if tier_idx < len(tier_order) - 1:
        next_tier = tier_order[tier_idx + 1]
        next_direct = tier_direct.get(next_tier, np.nan)
        # Both sides must be present: a NaN comparison evaluates to False and
        # would report "cannot match" for a tier that simply has no data.
        if pd.isna(best_rag) or pd.isna(next_direct):
            can_match = None
        else:
            can_match = bool(best_rag >= next_direct)
    else:
        can_match = 'N/A (largest tier)'

    verdict_rows.append({
        'tier': tier,
        'models': ', '.join(tier_models['model_short'].tolist()),
        'direct_f1': direct_mean,
        'best_rag_f1': best_rag,
        'rag_uplift': uplift,
        'can_match_next_tier': can_match,
    })

verdict_df = pd.DataFrame(verdict_rows)
print("THESIS VERDICT: Model Scaling Summary")
print("=" * 70)
display(verdict_df.round(4))

In [None]:
# Groundedness metrics vs model size, over RAG experiments with a known parameter count.
# Relies on `tier_colors` defined in the direct-scaling cell.
rag_with_params = df[(df['exp_type'] == 'rag')].dropna(subset=['params_b']).copy()

# Keep only metrics that exist as a column and have at least 10 non-null values.
groundedness_available = [m for m in ['faithfulness', 'hallucination', 'answer_in_context', 'context_recall']
                          if m in rag_with_params.columns and rag_with_params[m].notna().sum() >= 10]

if groundedness_available:
    n_metrics = len(groundedness_available)
    fig, axes = plt.subplots(1, n_metrics, figsize=(6 * n_metrics, 5))
    if n_metrics == 1:
        # plt.subplots returns a bare Axes for a single panel; wrap it so zip() below works.
        axes = [axes]

    for ax, metric in zip(axes, groundedness_available):
        metric_data = rag_with_params.dropna(subset=[metric])
        # One row per model (indexed by model_short), ordered by size.
        model_stats = metric_data.groupby('model_short').agg(
            mean=(metric, 'mean'),
            std=(metric, 'std'),
            n=(metric, 'count'),
            params_b=('params_b', 'first'),
            tier=('tier', 'first'),
        ).sort_values('params_b')

        for tier, color in tier_colors.items():
            sub = model_stats[model_stats['tier'] == tier]
            # Normal-approximation 95% interval half-width for the per-model mean.
            ci = 1.96 * sub['std'] / np.sqrt(sub['n'])
            ax.errorbar(sub['params_b'], sub['mean'], yerr=ci,
                       fmt='o', color=color, markersize=8, capsize=4,
                       label=tier.capitalize())
            for name, row in sub.iterrows():
                ax.annotate(name, (row['params_b'], row['mean']),
                           textcoords='offset points', xytext=(6, 4), fontsize=8)

        # Trend line: per-model mean against log10(parameters), only with 3+ models.
        if len(model_stats) >= 3:
            log_p = np.log10(model_stats['params_b'])
            slope, intercept, r, p, _ = scipy_stats.linregress(log_p, model_stats['mean'])
            x_fit = np.linspace(log_p.min() - 0.1, log_p.max() + 0.1, 50)
            ax.plot(10**x_fit, slope * x_fit + intercept, '--', color='grey', alpha=0.5)
            ax.set_title(f'{metric}\n(slope={slope:.3f}, R²={r**2:.2f})')
        else:
            ax.set_title(metric)

        ax.set_xscale('log')
        ax.set_xlabel('Parameters (B)')
        ax.set_ylabel(f'Mean {metric}')
        ax.legend(fontsize=8)
        ax.grid(alpha=0.3)

    plt.suptitle('Groundedness Metrics by Model Size', y=1.02, fontsize=14)
    plt.tight_layout()
    plt.show()

    # Summary table ('count' is the number of non-null values of the metric in the tier)
    print("\nGroundedness by Model Tier (mean across RAG experiments):")
    print("=" * 70)
    for metric in groundedness_available:
        tier_means = rag_with_params.groupby('tier')[metric].agg(['mean', 'count'])
        print(f"\n  {metric}:")
        for tier in ['tiny', 'small', 'medium']:
            if tier in tier_means.index:
                print(f"    {tier:<8s}: {tier_means.loc[tier, 'mean']:.3f} "
                      f"(n={int(tier_means.loc[tier, 'count'])})")

    # Key insight: does faithfulness scale differently from F1?
    if 'faithfulness' in groundedness_available and PRIMARY_METRIC in rag_with_params.columns:
        print("\n  F1 vs Faithfulness scaling gap:")
        for tier in ['tiny', 'small', 'medium']:
            # Use only experiments that have BOTH scores. Separate .mean() calls
            # each skip their own NaNs, so the two means would otherwise come
            # from different sets of experiments and the gap would be skewed.
            t_data = rag_with_params[rag_with_params['tier'] == tier].dropna(
                subset=[PRIMARY_METRIC, 'faithfulness'])
            if len(t_data) >= 5:
                f1_mean = t_data[PRIMARY_METRIC].mean()
                faith_mean = t_data['faithfulness'].mean()
                print(f"    {tier:<8s}: F1={f1_mean:.3f}, faith={faith_mean:.3f}, "
                      f"gap={faith_mean - f1_mean:+.3f} (n={len(t_data)})")
else:
    print("No groundedness metrics available. Run:")
    print("  uv run ragicamp compute-metrics outputs/smart_retrieval_slm "
          "-m faithfulness,hallucination,answer_in_context,context_recall")

## 5b. Faithfulness & Hallucination by Model Size

**Key safety question:** Do bigger models hallucinate less? Are smaller models less faithful to context?

This matters for deployment: if small models are equally faithful when given good context,
they may be safer than their F1 gap suggests.

## 6. Summary

Key takeaways:
- Direct performance scales with model size (log-linear relationship)
- How the magnitude of RAG uplift (best RAG minus direct) changes with model size
- Whether small+premium RAG can match medium+direct
- Different components matter for small vs medium models