# Smart Retrieval Analysis: What Improves RAG Performance?

This notebook analyzes the Smart Retrieval SLM study results.

## Experiment Groups
- **Baseline**: Direct LLM (no retrieval), the reference point for the RAG comparison
- **Group A**: Embedding Model Comparison (BGE-large vs BGE-M3)
- **Group D**: Reranking (overfetch + rerank)
- **Group E**: Query Transformation (HyDE, MultiQuery)
- **Group F**: Advanced Agents (Iterative RAG, Self-RAG)

## Key Questions
1. Does RAG improve over Direct LLM?
2. Which embedding model performs better?
3. Does reranking help? At what fetch_k?
4. Do query transformations (HyDE, MultiQuery) improve retrieval?
5. Do advanced agents outperform simple RAG?

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from typing import Dict, List, Any

# Plot appearance shared by every figure in this notebook
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Directory that is scanned for per-experiment result folders
STUDY_PATH = Path("../outputs/smart_retrieval_slm")

## 1. Load Results

In [None]:
def parse_experiment_name(name: str) -> Dict[str, Any]:
    """Parse an experiment name into a flat configuration dictionary.

    Naming convention: {group}{num}_{description}_{dataset}
    Examples:
        - direct_vllmmetallama_Llama3.23BInstruct_concise_nq
        - a1_bge_large_baseline_nq
        - d1_rerank_bge_top3_hotpotqa
        - f1_iterative_1round_triviaqa

    Args:
        name: Experiment name following the convention above.

    Returns:
        A dict with the keys ``name``, ``group``, ``group_name``,
        ``exp_type``, ``hypothesis``, ``dataset``, ``embedding_model``,
        ``reranker``, ``query_transform``, ``agent_type``, ``top_k`` and
        ``fetch_k``. Fields that cannot be derived from the name stay
        ``None``; ``exp_type`` defaults to ``'rag'`` and ``agent_type`` to
        ``'fixed_rag'`` (the latter also for ``direct_`` runs).
    """
    config = {
        'name': name,
        'group': None,
        'group_name': None,
        'exp_type': 'rag',
        'hypothesis': None,
        'dataset': None,
        'embedding_model': None,
        'reranker': None,
        'query_transform': None,
        'agent_type': 'fixed_rag',
        'top_k': None,
        'fetch_k': None,
    }

    # The prefixes are mutually exclusive, so a single chain is sufficient.
    if name.startswith('direct_'):
        # Direct LLM baseline (no retrieval)
        config['exp_type'] = 'direct'
        config['group'] = 'baseline'
        config['group_name'] = 'Direct LLM'

    elif name.startswith(('a1_', 'a2_')):
        config['group'] = 'A'
        config['group_name'] = 'Embedding Model'
        if 'bge_large' in name:
            config['embedding_model'] = 'BGE-large'
        elif 'bge_m3' in name:
            config['embedding_model'] = 'BGE-M3'

    elif name.startswith(('d1_', 'd2_')):
        config['group'] = 'D'
        config['group_name'] = 'Reranking'
        config['reranker'] = 'BGE-reranker'
        # top_k / fetch_k pairs are fixed per variant, not read from the name
        if 'top3' in name:
            config['top_k'] = 3
            config['fetch_k'] = 20
        elif 'top5' in name:
            config['top_k'] = 5
            config['fetch_k'] = 25

    elif name.startswith(('e1_', 'e2_', 'e3_')):
        config['group'] = 'E'
        config['group_name'] = 'Query Transform'
        if 'hyde' in name:
            config['query_transform'] = 'HyDE'
        elif 'multiquery' in name:
            config['query_transform'] = 'MultiQuery'
        if 'rerank' in name:
            config['reranker'] = 'BGE-reranker'

    elif name.startswith(('f1_', 'f2_', 'f3_', 'f4_')):
        config['group'] = 'F'
        config['group_name'] = 'Advanced Agents'
        if 'iterative' in name:
            config['agent_type'] = 'iterative_rag'
            if '1round' in name:
                config['hypothesis'] = 'Iterative (1 round)'
            elif '2round' in name:
                config['hypothesis'] = 'Iterative (2 rounds)'
        elif 'selfrag' in name:
            config['agent_type'] = 'self_rag'
            if 'verified' in name:
                config['hypothesis'] = 'Self-RAG (verified)'
            else:
                config['hypothesis'] = 'Self-RAG (balanced)'

    # Extract dataset. The convention puts the dataset last, so a suffix
    # match is authoritative; a plain substring test would mislabel a name
    # such as "..._nq_prompt_triviaqa" as NQ.
    dataset_tokens = (
        ('_nq', 'NQ'),
        ('_hotpotqa', 'HotpotQA'),
        ('_triviaqa', 'TriviaQA'),
    )
    for token, label in dataset_tokens:
        if name.endswith(token):
            config['dataset'] = label
            break
    else:
        # Fallback for names that carry something after the dataset token;
        # keeps the original substring behaviour and precedence.
        for token, label in dataset_tokens:
            if token in name:
                config['dataset'] = label
                break

    return config


def load_all_results(study_path: Path) -> pd.DataFrame:
    """Load all experiment results under ``study_path`` into a DataFrame.

    Each sub-directory is treated as one experiment. ``results.json`` is
    preferred; ``predictions.json`` is used when it is the only file present.
    Directories with neither file are skipped. A directory whose file cannot
    be read or parsed is reported on stdout and skipped, so one broken
    experiment does not abort the whole load.

    Args:
        study_path: Directory containing one sub-directory per experiment.

    Returns:
        One row per successfully loaded experiment: the fields produced by
        ``parse_experiment_name`` plus ``f1``, ``exact_match``,
        ``bertscore_f1``, ``bleurt`` (numeric, NaN when missing) and
        ``num_predictions``. The frame is empty, with no columns, when
        nothing could be loaded.

    Raises:
        OSError: If ``study_path`` itself cannot be listed (for example, it
            does not exist).
    """
    metric_names = ('f1', 'exact_match', 'bertscore_f1', 'bleurt')
    results = []

    # Sorted so the row order does not depend on filesystem listing order.
    for exp_dir in sorted(study_path.iterdir()):
        if not exp_dir.is_dir():
            continue

        # Try results.json first, then predictions.json
        results_file = exp_dir / "results.json"
        predictions_file = exp_dir / "predictions.json"
        if results_file.exists():
            source = results_file
        elif predictions_file.exists():
            source = predictions_file
        else:
            continue

        try:
            # Reading and parsing happen inside the try so that a truncated
            # or malformed file is handled like any other per-experiment error.
            with open(source, encoding="utf-8") as f:
                data = json.load(f)

            # Parse experiment name (fall back to the directory name)
            exp_name = data.get('name', exp_dir.name)
            config = parse_experiment_name(exp_name)

            # `or` also covers an explicit null / empty "metrics" entry
            metrics = data.get('metrics') or data.get('aggregate_metrics') or {}
            config.update({metric: metrics.get(metric) for metric in metric_names})
            config['num_predictions'] = len(data.get('predictions', []))

            results.append(config)
        except Exception as e:
            # Deliberate best-effort boundary: report and move on.
            print(f"Error loading {exp_dir.name}: {e}")

    frame = pd.DataFrame(results)

    # A metric that is missing from every experiment would otherwise be an
    # object column of None; coerce so every metric column is numeric.
    for metric in metric_names:
        if metric in frame.columns:
            frame[metric] = pd.to_numeric(frame[metric], errors='coerce')

    return frame

In [None]:
# Read every experiment found under STUDY_PATH into one table
df = load_all_results(STUDY_PATH)
print(f"Loaded {len(df)} experiments")

# How the loaded runs are distributed over groups and datasets
group_counts = df.groupby('group_name').size()
dataset_counts = df.groupby('dataset').size()

print(f"\nExperiments by group:")
print(group_counts)
print(f"\nExperiments by dataset:")
print(dataset_counts)

# Last expression of the cell: preview of the first rows
df.head(10)

## 2. Overall Performance Summary

In [None]:
# Mean / spread / best value of each metric within every experiment group
metrics = ['f1', 'exact_match', 'bertscore_f1', 'bleurt']
metrics_by_group = df.groupby('group_name')[metrics]
summary = metrics_by_group.agg(['mean', 'std', 'max']).round(3)
summary

In [None]:
# Side-by-side box plots of F1: per experiment group (left), per dataset (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
group_ax, dataset_ax = axes

# Only experiments whose group could be parsed are plotted
df_plot = df[df['group_name'].notna()].copy()

# Fixed presentation order, restricted to the groups that are actually present
present_groups = set(df_plot['group_name'].unique())
order = [
    g
    for g in ['Direct LLM', 'Embedding Model', 'Reranking', 'Query Transform', 'Advanced Agents']
    if g in present_groups
]

# Left panel: F1 per group
sns.boxplot(data=df_plot, x='group_name', y='f1', order=order, ax=group_ax)
group_ax.set_title('F1 Score by Experiment Group')
group_ax.set_xlabel('')
group_ax.tick_params(axis='x', rotation=45)

# Right panel: F1 per dataset
sns.boxplot(data=df_plot, x='dataset', y='f1', ax=dataset_ax)
dataset_ax.set_title('F1 Score by Dataset')
dataset_ax.set_xlabel('')

plt.tight_layout()
plt.show()

## 3. RAG vs Direct LLM Comparison

In [None]:
# Split the runs into the Direct LLM baselines and the RAG experiments
is_direct = df['exp_type'] == 'direct'
is_rag = df['exp_type'] == 'rag'
direct_df = df[is_direct].copy()
rag_df = df[is_rag].copy()

baseline_columns = ['name', 'dataset', 'f1', 'exact_match', 'bertscore_f1']
print("Direct LLM Baselines:")
print(direct_df[baseline_columns].to_string())

# dataset -> mean F1 of the direct runs on that dataset
baseline_lookup = direct_df.groupby('dataset')['f1'].mean().to_dict()
print(f"\nBaseline F1 by dataset: {baseline_lookup}")

In [None]:
# Calculate improvement of every RAG run over the Direct LLM baseline of its dataset
rag_df = rag_df.copy()
rag_df['baseline_f1'] = rag_df['dataset'].map(baseline_lookup)
rag_df['f1_improvement'] = rag_df['f1'] - rag_df['baseline_f1']

# A baseline F1 of exactly 0 would turn the relative change into +/-inf and
# poison the mean below, so such rows get NaN for the percentage instead.
nonzero_baseline = rag_df['baseline_f1'].replace(0, np.nan)
rag_df['f1_improvement_pct'] = (rag_df['f1_improvement'] / nonzero_baseline * 100).round(1)

# Summary: only runs that have both an F1 score and a baseline can be judged;
# counting the others in the denominator would report them as "not improved".
comparable = rag_df['f1_improvement'].dropna()
print(f"RAG experiments that improve over baseline: {(comparable > 0).sum()} / {len(comparable)}")
if len(comparable) < len(rag_df):
    print(f"  ({len(rag_df) - len(comparable)} experiments excluded: missing F1 or no Direct LLM baseline)")
print(f"Average F1 improvement: {comparable.mean():.3f} ({rag_df['f1_improvement_pct'].mean():.1f}%)")

In [None]:
# Two views of the F1 gain over Direct LLM: overall spread and per-group average
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, group_ax = axes

# Left panel: histogram of the per-experiment improvement (missing values dropped)
improvements = rag_df['f1_improvement'].dropna()
hist_ax.hist(improvements, bins=20, edgecolor='black', alpha=0.7)
hist_ax.axvline(x=0, color='red', linestyle='--', label='Baseline')
hist_ax.set_xlabel('F1 Improvement over Direct LLM')
hist_ax.set_ylabel('Count')
hist_ax.set_title('Distribution of RAG Improvement')
hist_ax.legend()

# Right panel: mean improvement per group, largest first
improvement_by_group = (
    rag_df.groupby('group_name')['f1_improvement']
    .mean()
    .sort_values(ascending=False)
)
improvement_by_group.plot(kind='bar', ax=group_ax, color='steelblue', edgecolor='black')
group_ax.axhline(y=0, color='red', linestyle='--')
group_ax.set_xlabel('')
group_ax.set_ylabel('Average F1 Improvement')
group_ax.set_title('F1 Improvement by Experiment Group')
group_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 4. Component Analysis

### 4.1 Embedding Model Comparison (Group A)

In [None]:
# Group A: RAG runs that differ in the embedding model
group_a = rag_df[rag_df['group'] == 'A'].copy()

if group_a.empty:
    print("No Group A experiments found")
else:
    print("Embedding Model Comparison:")
    # Mean of every metric per (embedding model, dataset) pair
    by_model_and_dataset = group_a.groupby(['embedding_model', 'dataset'])
    comparison = by_model_and_dataset[metrics].mean().round(3)
    print(comparison)

    # Grouped bars: one cluster per dataset, one bar per embedding model
    fig, ax = plt.subplots(figsize=(10, 5))
    f1_table = group_a.pivot_table(values='f1', index='dataset', columns='embedding_model')
    f1_table.plot(kind='bar', ax=ax)
    ax.set_title('F1 by Embedding Model and Dataset')
    ax.set_ylabel('F1 Score')
    ax.legend(title='Embedding Model')
    plt.tight_layout()
    plt.show()

### 4.2 Reranking Impact (Group D)

In [None]:
# Group D: RAG runs with overfetch + rerank
group_d = rag_df[rag_df['group'] == 'D'].copy()

if group_d.empty:
    print("No Group D experiments found")
else:
    print("Reranking Impact:")
    # Mean of every metric per (top_k, fetch_k, dataset) combination
    by_setting = group_d.groupby(['top_k', 'fetch_k', 'dataset'])
    comparison = by_setting[metrics].mean().round(3)
    print(comparison)

    # Group A (no reranking) serves as the reference; relies on `group_a`
    # from the embedding-model cell
    if not group_a.empty:
        baseline_f1 = group_a.groupby('dataset')['f1'].mean()
        rerank_f1 = group_d.groupby('dataset')['f1'].mean()

        print("\nReranking improvement over baseline:")
        for ds, reranked in rerank_f1.items():
            # Datasets without a Group A reference cannot be compared
            if ds not in baseline_f1.index:
                continue
            diff = reranked - baseline_f1[ds]
            print(f"  {ds}: {diff:+.3f} ({diff/baseline_f1[ds]*100:+.1f}%)")

### 4.3 Query Transformation Impact (Group E)

In [None]:
# Group E: RAG runs that rewrite the query before retrieval
group_e = rag_df[rag_df['group'] == 'E'].copy()

if group_e.empty:
    print("No Group E experiments found")
else:
    print("Query Transformation Impact:")
    # Mean of every metric per (query transform, dataset) pair
    by_transform_and_dataset = group_e.groupby(['query_transform', 'dataset'])
    comparison = by_transform_and_dataset[metrics].mean().round(3)
    print(comparison)

    # Grouped bars: one cluster per dataset, one bar per query transform
    fig, ax = plt.subplots(figsize=(10, 5))
    f1_table = group_e.pivot_table(values='f1', index='dataset', columns='query_transform')
    f1_table.plot(kind='bar', ax=ax)
    ax.set_title('F1 by Query Transformation and Dataset')
    ax.set_ylabel('F1 Score')
    ax.legend(title='Query Transform')
    plt.tight_layout()
    plt.show()

### 4.4 Advanced Agents (Group F)

In [None]:
# Group F: iterative RAG and Self-RAG agents
group_f = rag_df[rag_df['group'] == 'F'].copy()

if group_f.empty:
    print("No Group F experiments found")
else:
    print("Advanced Agents Impact:")
    # Mean of every metric per (agent variant, dataset) pair
    by_variant_and_dataset = group_f.groupby(['hypothesis', 'dataset'])
    comparison = by_variant_and_dataset[metrics].mean().round(3)
    print(comparison)

    # Grouped bars: one cluster per dataset, one bar per agent variant;
    # the legend is anchored outside the axes
    fig, ax = plt.subplots(figsize=(12, 5))
    f1_table = group_f.pivot_table(values='f1', index='dataset', columns='hypothesis')
    f1_table.plot(kind='bar', ax=ax)
    ax.set_title('F1 by Agent Type and Dataset')
    ax.set_ylabel('F1 Score')
    ax.legend(title='Agent Type', bbox_to_anchor=(1.02, 1))
    plt.tight_layout()
    plt.show()

## 5. Best and Worst Configurations

In [None]:
# The ten RAG runs with the highest F1
print("Top 10 Best RAG Configurations by F1:")
ranking_columns = ['name', 'group_name', 'dataset', 'f1', 'f1_improvement_pct']
highest_f1 = rag_df.nlargest(10, 'f1')
top_10 = highest_f1[ranking_columns]
print(top_10.to_string())

In [None]:
# The ten RAG runs with the lowest F1
print("Bottom 10 Worst RAG Configurations by F1:")
ranking_columns = ['name', 'group_name', 'dataset', 'f1', 'f1_improvement_pct']
lowest_f1 = rag_df.nsmallest(10, 'f1')
bottom_10 = lowest_f1[ranking_columns]
print(bottom_10.to_string())

In [None]:
# Best configuration per dataset (highest F1 among the RAG runs)
print("Best RAG Configuration per Dataset:")
# Only scored runs can win: without this, a dataset whose runs all lack an F1
# makes idxmax yield NaN (or raise), which breaks the .loc lookup below.
scored = rag_df.dropna(subset=['f1'])
best_per_dataset = scored.loc[scored.groupby('dataset')['f1'].idxmax()]
print(best_per_dataset[['name', 'dataset', 'group_name', 'f1', 'f1_improvement_pct']].to_string())

## 6. Cross-Metric Correlation

In [None]:
# Pairwise correlation of the evaluation metrics across all experiments
metric_cols = ['f1', 'exact_match', 'bertscore_f1', 'bleurt']
# Keep only metrics that exist as a column and have at least one value
available_metrics = [m for m in metric_cols if m in df.columns and df[m].notna().any()]

if len(available_metrics) < 2:
    print(f"Only {len(available_metrics)} metrics available, need at least 2 for correlation")
else:
    metric_values = df[available_metrics]
    corr = metric_values.corr().round(2)

    # Annotated heatmap with the colour scale centred on zero correlation
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, ax=ax)
    ax.set_title('Metric Correlations')
    plt.tight_layout()
    plt.show()

## 7. Summary & Conclusions

In [None]:
# Text summary of the main findings. Each section is skipped when the data it
# needs is not available.
print("="*60)
print("SUMMARY: What Improves RAG Performance?")
print("="*60)

# RAG vs Direct
if len(rag_df) > 0 and 'f1_improvement' in rag_df.columns:
    # Runs without a baseline or F1 have NaN here; leaving them in would count
    # them as "did not improve" and understate the percentage.
    comparable = rag_df['f1_improvement'].dropna()
    print(f"\n1. RAG vs Direct LLM:")
    if len(comparable) > 0:
        rag_helps_pct = (comparable > 0).mean() * 100
        avg_improvement = comparable.mean()
        print(f"   - RAG improves over baseline in {rag_helps_pct:.0f}% of experiments")
        print(f"   - Average improvement: {avg_improvement:+.3f} F1")
    else:
        print("   - No RAG experiment has a Direct LLM baseline to compare against")

# Best group
if len(rag_df) > 0:
    group_means = rag_df.groupby('group_name')['f1'].mean().sort_values(ascending=False)
    print(f"\n2. Best Performing Groups (by avg F1):")
    for group, f1 in group_means.items():
        print(f"   - {group}: {f1:.3f}")

# Best overall config: restricted to scored runs, because idxmax on an
# all-NaN column does not return a usable row label.
scored = rag_df.dropna(subset=['f1'])
if len(scored) > 0:
    best = scored.loc[scored['f1'].idxmax()]
    print(f"\n3. Best Overall Configuration:")
    print(f"   - {best['name']}")
    print(f"   - F1: {best['f1']:.3f}")

print("\n" + "="*60)