# NB09: Efficiency-Adjusted Performance

**Question:** Is the best RAG config worth the extra latency/cost?

This notebook computes cost-adjusted metrics:
1. **Pareto frontier** — configs not dominated in (F1, latency) space
2. **Cost per correct answer** — F1-adjusted throughput (F1 × questions per second, i.e. "correct answers per second")
3. **Agent complexity vs improvement** — do extra steps pay off?
4. **Practical recommendations** — best configs at different latency budgets

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from analysis_utils import (
    load_all_results, setup_plotting, weighted_mean_with_ci,
    PRIMARY_METRIC, BROKEN_MODELS, MODEL_TIER, MODEL_PARAMS,
)

# Project plotting helper from analysis_utils (presumably configures shared
# figure styling — confirm there); called before any figure is created.
setup_plotting()
STUDY_PATH = Path("../outputs/smart_retrieval_slm")

# Load every experiment, then keep only rows whose model is not in BROKEN_MODELS.
df_all = load_all_results(STUDY_PATH)
from_usable_model = ~df_all['model_short'].isin(BROKEN_MODELS)
df = df_all[from_usable_model].copy()

# Attach per-model lookups keyed on the short model name.
model_names = df['model_short']
df['tier'] = model_names.map(MODEL_TIER)
df['params_b'] = model_names.map(MODEL_PARAMS)

# Report how many rows carry the timing columns used by the later cells.
print(f"Loaded {len(df)} experiments")
print(f"Duration available: {df['duration'].notna().sum()}")
print(f"Throughput available: {df['throughput'].notna().sum()}")

In [None]:
# Derived efficiency metrics.
# A row "has timing" only when its duration is present and strictly positive;
# `eff` additionally requires a non-null primary metric.
all_durations = df['duration']
df['has_timing'] = all_durations.notna() & (all_durations > 0)
timed_and_scored = df['has_timing'] & df[PRIMARY_METRIC].notna()
eff = df[timed_and_scored].copy()

# Questions per second: prefer the recorded throughput when it is positive,
# otherwise fall back to n_samples / duration (NaN if duration is not positive).
recorded_qps = eff['throughput']
run_seconds = eff['duration']
fallback_qps = np.where(run_seconds > 0, eff['n_samples'] / run_seconds, np.nan)
eff['qps'] = np.where(recorded_qps > 0, recorded_qps, fallback_qps)

# Seconds per question: reciprocal of qps, NaN where qps is not positive.
qps_values = eff['qps']
eff['sec_per_q'] = np.where(qps_values > 0, 1.0 / qps_values, np.nan)

# "Correct answers per second": primary metric scaled by throughput.
eff['correct_per_sec'] = eff[PRIMARY_METRIC] * eff['qps']

print(f"Experiments with timing data: {len(eff)}")
if not eff.empty:
    print(f"Throughput range: {eff['qps'].min():.2f} - {eff['qps'].max():.2f} q/s")
    print(f"Latency range: {eff['sec_per_q'].min():.2f} - {eff['sec_per_q'].max():.2f} s/q")

## 1. Pareto Frontier

Which configurations are not dominated in (F1, latency) space? A config is Pareto-optimal if no other config has both higher F1 and lower latency.

In [None]:
def compute_pareto_frontier(df, x_col, y_col, minimize_x=True):
    """Return the rows of ``df`` that lie on the Pareto frontier of (x, y).

    A row is kept when no other row is at least as good on x and has a
    y that is greater than or equal to its own; among exact duplicates of
    an (x, y) point only one row is kept. Rows with a missing value in
    either column are ignored.

    Args:
        df: DataFrame holding the candidate points.
        x_col: cost-like column (e.g. latency).
        y_col: column to maximize (e.g. F1).
        minimize_x: if True, lower x is better; if False, higher x is better.

    Returns:
        DataFrame with the frontier rows (all original columns and index
        labels), ordered from best x to worst x.
    """
    # Work on positional labels so that a non-unique index in ``df`` cannot
    # make the final selection return extra rows.
    points = df[[x_col, y_col]].reset_index(drop=True).dropna()

    # Secondary sort on y (descending) makes x-ties deterministic: the best y
    # at a given x is visited first, so same-x points with a lower y are
    # correctly treated as dominated instead of slipping onto the frontier.
    points = points.sort_values(
        [x_col, y_col], ascending=[bool(minimize_x), False]
    )

    frontier_positions = []
    best_y = -np.inf
    for position, y_value in zip(points.index, points[y_col]):
        # Walking from best x to worst x, a point is on the frontier only if
        # it strictly improves on every y seen so far.
        if y_value > best_y:
            frontier_positions.append(position)
            best_y = y_value

    return df.iloc[frontier_positions]


# The Pareto analysis needs at least five experiments with a usable latency.
if eff.empty or eff['sec_per_q'].notna().sum() < 5:
    print("Insufficient timing data for Pareto analysis.")
else:
    pareto = compute_pareto_frontier(eff, 'sec_per_q', PRIMARY_METRIC, minimize_x=True)
    print(f"Pareto-optimal configs: {len(pareto)} / {len(eff)}")

    fig, ax = plt.subplots(figsize=(12, 7))

    # Background layer: every experiment, faint, one colour per experiment type.
    type_colors = {'direct': 'steelblue', 'rag': 'lightcoral'}
    for exp_type, point_color in type_colors.items():
        of_this_type = eff[eff['exp_type'] == exp_type]
        ax.scatter(
            of_this_type['sec_per_q'],
            of_this_type[PRIMARY_METRIC],
            s=20,
            alpha=0.3,
            color=point_color,
            label=f'{exp_type} (all)',
        )

    # Foreground layer: the frontier, drawn left to right by latency.
    pareto_sorted = pareto.sort_values('sec_per_q')
    ax.plot(
        pareto_sorted['sec_per_q'],
        pareto_sorted[PRIMARY_METRIC],
        'k-o',
        markersize=6,
        linewidth=2,
        label='Pareto frontier',
        zorder=5,
    )

    # Label each frontier point with a truncated model name and its agent type.
    for _, frontier_row in pareto_sorted.iterrows():
        label = f"{frontier_row.get('model_short', '')[:8]}\n{frontier_row.get('agent_type', '')}"
        point_xy = (frontier_row['sec_per_q'], frontier_row[PRIMARY_METRIC])
        ax.annotate(label, point_xy,
                    textcoords='offset points', xytext=(8, 4), fontsize=7)

    ax.set(
        xlabel='Latency (seconds per question)',
        ylabel='F1',
        title='Pareto Frontier: F1 vs Latency',
    )
    ax.legend()
    ax.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Tabulate the frontier, showing only the columns that are present.
    wanted_cols = ['model_short', 'dataset', 'agent_type', 'retriever_type',
                   'reranker', PRIMARY_METRIC, 'sec_per_q', 'qps']
    display_cols = [col for col in wanted_cols if col in pareto.columns]
    display(pareto_sorted[display_cols].round(4))

## 2. Cost Per Correct Answer

In [None]:
if not eff.empty:
    # Mean quality / efficiency per experiment type and per agent type.
    # A grouping column that is absent from `eff` is skipped rather than raising.
    for group_col in ['exp_type', 'agent_type']:
        if group_col not in eff.columns:
            continue
        stats = eff.groupby(group_col).agg(
            mean_f1=(PRIMARY_METRIC, 'mean'),
            mean_qps=('qps', 'mean'),
            mean_sec_per_q=('sec_per_q', 'mean'),
            mean_correct_per_sec=('correct_per_sec', 'mean'),
            n=('qps', 'count'),
        ).round(4)
        print(f"\nEfficiency by {group_col}:")
        display(stats)

    # Scatter: F1 vs correct_per_sec colored by agent_type.
    # Guarded the same way as the tables above so a missing column does not
    # abort the cell with a KeyError.
    if 'agent_type' in eff.columns:
        fig, ax = plt.subplots(figsize=(10, 6))
        agent_colors = {'direct_llm': 'steelblue', 'fixed_rag': 'coral',
                        'iterative_rag': '#66bb6a', 'self_rag': '#ffa726'}
        # dropna(): sorted() raises TypeError on a mix of str and NaN, and a
        # NaN agent could never match the equality filter below anyway.
        for agent in sorted(eff['agent_type'].dropna().unique()):
            sub = eff[eff['agent_type'] == agent]
            ax.scatter(sub[PRIMARY_METRIC], sub['correct_per_sec'],
                       s=30, alpha=0.5, color=agent_colors.get(agent, 'grey'),
                       label=agent)
        ax.set_xlabel('F1')
        ax.set_ylabel('Correct Answers per Second (F1 * QPS)')
        ax.set_title('Quality vs Efficiency by Agent Type')
        ax.legend()
        ax.grid(alpha=0.3)
        plt.tight_layout()
        plt.show()
    else:
        print("No 'agent_type' column available; skipping quality-vs-efficiency scatter.")

## 3. Agent Complexity vs Improvement

Do iterative_rag and self_rag justify their extra latency with proportional F1 gains?

In [None]:
# Cost-benefit of the advanced RAG agents (iterative_rag, self_rag) relative to
# the fixed_rag baseline, computed separately for every (model, dataset) pair.
if not eff.empty:
    # Only RAG experiments take part in this comparison.
    rag_eff = eff[eff['exp_type'] == 'rag'].copy()

    # A comparison needs at least two distinct agent types among the RAG rows.
    if not rag_eff.empty and rag_eff['agent_type'].nunique() > 1:
        # Per model+dataset: compare fixed_rag baseline with advanced agents
        group_cols = ['model_short', 'dataset']
        rows = []

        for group_vals, group_df in rag_eff.groupby(group_cols):
            model, dataset = group_vals
            fixed = group_df[group_df['agent_type'] == 'fixed_rag']
            # Without a scored fixed_rag run there is no baseline for this pair.
            if fixed.empty or fixed[PRIMARY_METRIC].notna().sum() == 0:
                continue

            # NOTE(review): the baseline F1 is the best (max) over all fixed_rag
            # configs, while the baseline latency is the mean over those same
            # configs, so the two numbers need not describe the same config —
            # confirm this pairing is intended.
            fixed_best_f1 = fixed[PRIMARY_METRIC].max()
            fixed_mean_latency = fixed['sec_per_q'].mean()

            for agent in ['iterative_rag', 'self_rag']:
                advanced = group_df[group_df['agent_type'] == agent]
                # Skip agents that have no scored run for this model+dataset.
                if advanced.empty or advanced[PRIMARY_METRIC].notna().sum() == 0:
                    continue

                # Same aggregation as the baseline: best F1, mean latency.
                adv_best_f1 = advanced[PRIMARY_METRIC].max()
                adv_mean_latency = advanced['sec_per_q'].mean()

                # latency_ratio is NaN whenever the baseline latency is not a
                # positive number (a NaN baseline also fails the `> 0` test).
                rows.append({
                    'model': model, 'dataset': dataset,
                    'agent': agent,
                    'fixed_f1': fixed_best_f1,
                    'advanced_f1': adv_best_f1,
                    'f1_delta': adv_best_f1 - fixed_best_f1,
                    'fixed_latency': fixed_mean_latency,
                    'advanced_latency': adv_mean_latency,
                    'latency_ratio': adv_mean_latency / fixed_mean_latency if fixed_mean_latency > 0 else np.nan,
                })

        # One row per (model, dataset, advanced agent) that had a baseline.
        cost_df = pd.DataFrame(rows)
        if not cost_df.empty:
            print("Agent Complexity Cost-Benefit:")
            display(cost_df.round(4))

            # Scatter: F1 delta vs latency ratio
            fig, ax = plt.subplots(figsize=(10, 6))
            for agent in cost_df['agent'].unique():
                sub = cost_df[cost_df['agent'] == agent]
                # Green for iterative_rag, orange for any other agent
                # (self_rag is the only other one the loop above can produce).
                color = '#66bb6a' if agent == 'iterative_rag' else '#ffa726'
                ax.scatter(sub['latency_ratio'], sub['f1_delta'],
                           s=80, color=color, label=agent, edgecolors='black', alpha=0.7)
                # Tag every point with a truncated model name and its dataset.
                for _, row in sub.iterrows():
                    ax.annotate(f"{row['model'][:6]}\n{row['dataset']}",
                                (row['latency_ratio'], row['f1_delta']),
                                textcoords='offset points', xytext=(5, 3), fontsize=7)

            # Reference lines: points above y=0 improved on the fixed_rag F1;
            # points right of x=1 have a higher mean latency than fixed_rag.
            ax.axhline(y=0, color='red', linestyle='--', alpha=0.5, label='Break-even (F1)')
            ax.axvline(x=1, color='grey', linestyle='--', alpha=0.5, label='Same latency')
            ax.set_xlabel('Latency Ratio (advanced / fixed_rag)')
            ax.set_ylabel('F1 Delta (advanced - fixed_rag)')
            ax.set_title('Agent Complexity: Is Extra Latency Worth It?')
            ax.legend()
            ax.grid(alpha=0.3)
            plt.tight_layout()
            plt.show()

            # Summary
            # Per agent: number of (model, dataset) scenarios with a strictly
            # positive F1 delta, the mean latency ratio, and the mean F1 delta.
            for agent in cost_df['agent'].unique():
                sub = cost_df[cost_df['agent'] == agent]
                n_better = (sub['f1_delta'] > 0).sum()
                avg_ratio = sub['latency_ratio'].mean()
                print(f"\n{agent}:")
                print(f"  Better than fixed_rag in {n_better}/{len(sub)} scenarios")
                print(f"  Average latency ratio: {avg_ratio:.2f}x")
                print(f"  Mean F1 delta: {sub['f1_delta'].mean():+.4f}")

## 4. Practical Recommendations at Latency Budgets

In [None]:
if not eff.empty:
    # Latency ceilings to evaluate, in seconds per question.
    latency_budgets = [0.5, 1.0, 2.0, 5.0, 10.0]

    print("Best Configurations at Different Latency Budgets:")
    print("=" * 70)

    # Configuration columns that exist in `eff` (not referenced again in this cell).
    config_cols = [
        col
        for col in ['model_short', 'agent_type', 'retriever_type', 'reranker',
                    'prompt', PRIMARY_METRIC, 'sec_per_q']
        if col in eff.columns
    ]

    # For every budget, pick the experiment with the highest primary metric
    # among those whose latency does not exceed the budget.
    budget_rows = []
    for budget in latency_budgets:
        within_budget = eff[eff['sec_per_q'] <= budget]
        has_scored_option = (
            not within_budget.empty
            and within_budget[PRIMARY_METRIC].notna().any()
        )
        if has_scored_option:
            best_idx = within_budget[PRIMARY_METRIC].idxmax()
            best = within_budget.loc[best_idx]

            budget_rows.append({
                'budget_sec': budget,
                'best_f1': best[PRIMARY_METRIC],
                'model': best.get('model_short', 'N/A'),
                'agent': best.get('agent_type', 'N/A'),
                'retriever': best.get('retriever_type', 'N/A'),
                'actual_latency': best.get('sec_per_q', np.nan),
                'n_options': len(within_budget),
            })

    budget_df = pd.DataFrame(budget_rows)
    if budget_df.empty:
        print("No configs within specified latency budgets.")
    else:
        display(budget_df.round(4))

        # Line plot of the best achievable score as the budget loosens,
        # each point tagged with the winning model and agent.
        fig, ax = plt.subplots(figsize=(8, 5))
        ax.plot(
            budget_df['budget_sec'],
            budget_df['best_f1'],
            'o-',
            color='steelblue',
            markersize=8,
            linewidth=2,
        )
        for _, winner in budget_df.iterrows():
            ax.annotate(f"{winner['model']}\n{winner['agent']}",
                        (winner['budget_sec'], winner['best_f1']),
                        textcoords='offset points', xytext=(8, -8), fontsize=8)
        ax.set(
            xlabel='Latency Budget (seconds per question)',
            ylabel='Best Achievable F1',
            title='F1 vs Latency Budget: Practical Frontier',
        )
        ax.grid(alpha=0.3)
        plt.tight_layout()
        plt.show()

## 5. Summary

Key findings:
- Pareto-optimal configurations in (F1, latency) space
- Whether direct_llm offers better efficiency-adjusted performance
- Whether advanced agents (iterative/self RAG) justify their latency overhead
- Practical recommendations at different latency budgets