# Smart Retrieval SLM Analysis

## Hypothesis
**"High-quality retrieval can compensate for smaller, faster LLMs."**

This notebook analyzes experiment results with proper handling of:
- **Stratified sampling** (non-uniform experiment counts)
- **Component-wise effect estimation** with confidence intervals
- **Bottleneck identification** for maximizing QA performance

### Analysis Dimensions
| Dimension | Values | Description |
|-----------|--------|-------------|
| Model | Llama-3.2-3B, Phi-3-mini, Qwen-2.5-3B | Generator LLM |
| Retriever Type | dense, hybrid, hierarchical | Retrieval strategy |
| Embedding Model | bge-large, bge-m3, gte-qwen2, e5-mistral | Embedding model |
| Query Transform | none, hyde, multiquery | Query preprocessing |
| Reranker | none, bge, bge-v2 | Cross-encoder reranking |
| Prompt | concise, structured, cot, fewshot_3 | Prompt template |
| Top-K | 3, 5, 10 | Retrieved documents |
| Dataset | nq, triviaqa, hotpotqa | Evaluation benchmark |

In [None]:
import json
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple
from scipy import stats
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

# Plot appearance shared by every figure in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
})

# Directory scanned for experiment sub-directories.
STUDY_PATH = Path("../outputs/smart_retrieval_slm")

# Metric names looked up in each result file; PRIMARY_METRIC is the default
# metric for the analysis functions below.
METRICS = ['f1', 'exact_match', 'bertscore', 'bleurt', 'llm_judge']
PRIMARY_METRIC = 'f1'

print(f"Study path: {STUDY_PATH}\nExists: {STUDY_PATH.exists()}")

## 1. Data Loading & Parsing

Parse experiment names following the naming convention:
- Direct: `direct_{model}_{prompt}_{dataset}`
- RAG: `rag_{model}_{retriever}_k{top_k}_{query_transform?}_{reranker?}_{prompt}_{dataset}`

In [None]:
# Model name mappings: substring found in an experiment name -> display name.
# Keys are matched case-insensitively, in insertion order (first hit wins).
MODEL_MAP = {
    'llama': 'Llama-3.2-3B',
    'Llama3.23BInstruct': 'Llama-3.2-3B',
    'phi': 'Phi-3-mini',
    'Phi3mini4kinstruct': 'Phi-3-mini',
    'qwen': 'Qwen-2.5-3B',
    'Qwen2.53BInstruct': 'Qwen-2.5-3B',
}

# Retriever type detection: a type applies when any of its substrings occurs
# in the lowercased experiment name.
RETRIEVER_TYPES = {
    'dense': ['dense_bge', 'dense_gte', 'dense_e5', 'en_bge', 'en_gte', 'en_e5'],
    'hybrid': ['hybrid_'],
    'hierarchical': ['hier_', 'hierarchical_'],
}

# Embedding model detection: lowercase substring -> display name.
EMBEDDING_MAP = {
    'bge_large': 'BGE-large',
    'bge_m3': 'BGE-M3',
    'gte_qwen2': 'GTE-Qwen2-1.5B',
    'e5_mistral': 'E5-Mistral-7B',
}

# Dataset suffixes recognised at the end of an experiment name.
_DATASETS = ('nq', 'triviaqa', 'hotpotqa')

# Prompt identifiers. 'fewshot_3' must come before 'fewshot' because
# '_fewshot_' is also a substring of '_fewshot_3_'.
_PROMPTS = ('concise', 'structured', 'cot', 'fewshot_3', 'fewshot', 'extractive', 'cited')

# Name prefixes of the one-off ("singleton") experiment families.
_SINGLETON_PREFIXES = ('iterative_', 'selfrag_', 'premium_')


def _detect_model(lowered_name: str) -> str:
    """Return the display name of the first MODEL_MAP key found, else 'unknown'."""
    for key, display in MODEL_MAP.items():
        if key.lower() in lowered_name:
            return display
    return 'unknown'


def _detect_prompt(name: str) -> str:
    """Return the first prompt identifier that appears as '_{prompt}_', else 'unknown'."""
    for prompt in _PROMPTS:
        if f'_{prompt}_' in name:
            return prompt
    return 'unknown'


def parse_experiment_name(name: str) -> Dict[str, Any]:
    """Parse an experiment name into structured components.

    Handles formats:
    - direct_vllm_metallamaLlama3.23BInstruct_concise_nq
    - rag_vllm_metallamaLlama3.23BInstruct_dense_bge_large_512_k5_hyde_bge_concise_nq
    - Singleton experiments (iterative_*, selfrag_*, premium_*)

    Args:
        name: Experiment name (typically the result directory name).

    Returns:
        Dict with keys 'name', 'exp_type', 'model', 'model_short', 'dataset',
        'prompt', 'retriever', 'retriever_type', 'embedding_model', 'top_k',
        'query_transform', 'reranker' and 'is_singleton'. Fields that cannot
        be detected keep their defaults ('unknown', 'none' or None). The
        'model' field is never filled in by this parser.
    """
    config = {
        'name': name,
        'exp_type': 'unknown',
        'model': 'unknown',
        'model_short': 'unknown',
        'dataset': 'unknown',
        'prompt': 'unknown',
        'retriever': None,
        'retriever_type': None,
        'embedding_model': None,
        'top_k': None,
        'query_transform': 'none',
        'reranker': 'none',
        'is_singleton': False,
    }
    lowered = name.lower()

    # Detect dataset (always at end)
    for ds in _DATASETS:
        if name.endswith(f'_{ds}'):
            config['dataset'] = ds
            break

    # Singleton experiments: the pipeline is implied by the name prefix.
    if name.startswith(_SINGLETON_PREFIXES):
        config['is_singleton'] = True
        config['exp_type'] = 'rag'
        config['model_short'] = _detect_model(lowered)

        if name.startswith('iterative_'):
            config['retriever_type'] = 'iterative'
            # An optional '{n}iter' token carries the iteration count.
            iter_match = re.search(r'(\d+)iter', name)
            config['query_transform'] = (
                f"iterative_{iter_match.group(1)}" if iter_match else 'iterative'
            )
        elif name.startswith('selfrag_'):
            config['retriever_type'] = 'self_rag'
            config['query_transform'] = 'self_rag'
        else:  # premium_
            config['retriever_type'] = 'hybrid'
            config['query_transform'] = 'hyde'
            config['reranker'] = 'bge-v2'
        return config

    # Direct experiments: only model and prompt are encoded.
    if name.startswith('direct_'):
        config['exp_type'] = 'direct'
        config['model_short'] = _detect_model(lowered)
        config['prompt'] = _detect_prompt(name)
        return config

    # RAG experiments
    if name.startswith('rag_'):
        config['exp_type'] = 'rag'
        config['model_short'] = _detect_model(lowered)

        # Parse top_k
        k_match = re.search(r'_k(\d+)_', name)
        if k_match:
            config['top_k'] = int(k_match.group(1))

        # Retriever type: deliberately no early exit, so a later (more
        # specific) type such as 'hybrid' overrides an earlier 'dense' hit.
        for rtype, patterns in RETRIEVER_TYPES.items():
            if any(pattern in lowered for pattern in patterns):
                config['retriever_type'] = rtype

        for key, display in EMBEDDING_MAP.items():
            if key in lowered:
                config['embedding_model'] = display
                break

        # Parse query transform
        if '_hyde_' in lowered:
            config['query_transform'] = 'hyde'
        elif '_multiquery_' in lowered:
            config['query_transform'] = 'multiquery'

        # Parse reranker. The reranker token sits after '_k{n}_', so '_bge_'
        # is looked up in that tail only; otherwise the '_bge_' of an
        # embedding such as 'dense_bge_large' would be indistinguishable
        # from a reranker token. The tail keeps the '_' that closes '_k{n}_'
        # so a reranker directly after top_k still matches '_bge_'.
        pipeline_tail = name[k_match.end() - 1:].lower() if k_match else None
        if '_bgev2_' in lowered or '_bge-v2_' in lowered:
            config['reranker'] = 'bge-v2'
        elif pipeline_tail is not None and '_bge_' in pipeline_tail:
            config['reranker'] = 'bge'
        elif '_bge_' in lowered and config['embedding_model'] is None:
            # bge in name but not as embedding = reranker
            config['reranker'] = 'bge'

        config['prompt'] = _detect_prompt(name)

        # Full retriever name: the text between the model's '...Instruct_'
        # suffix and '_k{n}'. The suffix is matched case-insensitively
        # because some model tokens spell it in lowercase
        # (e.g. 'Phi3mini4kinstruct').
        if k_match:
            retriever_match = re.search(r'(?i:instruct)_(.+?)_k\d+', name)
            if retriever_match:
                config['retriever'] = retriever_match.group(1)

    return config


def _load_json(path: Path) -> Any:
    """Read and decode one UTF-8 JSON file."""
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def _read_experiment_payload(exp_dir: Path) -> Any:
    """Return the decoded result payload for one experiment directory.

    Prefers results.json. Falls back to metadata.json, whose 'metrics' entry
    is filled from the first (alphabetically) '*_summary.json' file when one
    exists. Returns None when neither file is present.
    """
    results_file = exp_dir / "results.json"
    if results_file.exists():
        return _load_json(results_file)

    metadata_file = exp_dir / "metadata.json"
    if not metadata_file.exists():
        return None

    data = _load_json(metadata_file)
    # Sorted so the chosen summary does not depend on filesystem order.
    summary_files = sorted(exp_dir.glob("*_summary.json"))
    if summary_files:
        summary = _load_json(summary_files[0])
        data['metrics'] = summary.get('overall_metrics', summary)
    return data


def load_all_results(study_path: Path) -> pd.DataFrame:
    """Load all experiment results into a DataFrame.

    Each sub-directory of ``study_path`` is treated as one experiment. A
    directory whose files cannot be read or parsed is reported on stdout and
    skipped, so a single corrupt file no longer aborts the whole load.

    Args:
        study_path: Directory containing one sub-directory per experiment.

    Returns:
        One row per successfully loaded experiment: the fields produced by
        ``parse_experiment_name`` plus any available METRICS, 'n_samples',
        'duration' and 'throughput'. Sorted by exp_type, model_short and
        dataset. Empty when the path is missing or nothing could be loaded.
    """
    if not study_path.exists():
        print(f"Warning: Study path does not exist: {study_path}")
        return pd.DataFrame()

    results = []

    # Sorted for a reproducible row order, independent of the filesystem.
    for exp_dir in sorted(study_path.iterdir()):
        if not exp_dir.is_dir():
            continue

        try:
            data = _read_experiment_payload(exp_dir)
            if data is None:
                continue

            # Parse experiment name
            exp_name = data.get('name', exp_dir.name)
            row = parse_experiment_name(exp_name).copy()

            # Metrics may be nested under 'metrics' or stored at top level.
            metrics = data.get('metrics', data)
            for metric in METRICS:
                if metric in metrics:
                    row[metric] = metrics[metric]
                elif metric in data:
                    row[metric] = data[metric]

            # Add sample count and timing
            row['n_samples'] = data.get('n_samples', data.get('num_questions', None))
            row['duration'] = data.get('duration', 0)
            row['throughput'] = data.get('throughput_qps', 0)

            results.append(row)
        except Exception as e:
            # Best-effort loader: report the broken experiment and move on.
            print(f"Error loading {exp_dir.name}: {e}")

    df = pd.DataFrame(results)
    if not df.empty:
        df = df.sort_values(['exp_type', 'model_short', 'dataset']).reset_index(drop=True)

    return df


# Read every experiment under STUDY_PATH and print a quick inventory.
df = load_all_results(STUDY_PATH)
print(f"Loaded {len(df)} experiments")

if len(df):
    _exp_type_counts = df['exp_type'].value_counts().to_dict()
    _model_names = sorted(df['model_short'].dropna().unique())
    _dataset_names = sorted(df['dataset'].dropna().unique())
    _retriever_kinds = df['retriever_type'].dropna().unique().tolist()
    _metrics_present = [m for m in METRICS if m in df.columns]

    print(f"\nExperiment types: {_exp_type_counts}")
    print(f"Models: {_model_names}")
    print(f"Datasets: {_dataset_names}")
    print(f"Retriever types: {_retriever_kinds}")
    print(f"\nMetrics available: {_metrics_present}")

In [None]:
# Print how many experiments fall under each level of the key dimensions;
# the counts are non-uniform, which is why the later analysis is weighted.
if len(df):
    print("Experiment Distribution (crucial for weighted analysis)")
    print("=" * 60)

    _dimensions = ('model_short', 'retriever_type', 'query_transform',
                   'reranker', 'prompt', 'dataset')
    for _dim in (d for d in _dimensions if d in df.columns):
        print(f"\n{_dim}:")
        for _level, _n_runs in df[_dim].value_counts().items():
            print(f"  {_level}: {_n_runs}")

## 2. Weighted Analysis Functions

With stratified sampling, we have non-uniform experiment counts.
We use **inverse frequency weighting** and **bootstrap confidence intervals**.

In [None]:
def weighted_mean_with_ci(
    df: pd.DataFrame,
    group_col: str,
    metric: str = PRIMARY_METRIC,
    weight_by: Optional[str] = None,
    confidence: float = 0.95,
    n_bootstrap: int = 1000,
    random_state: Optional[int] = None,
) -> pd.DataFrame:
    """
    Compute a (optionally weighted) mean per group with a bootstrap CI.

    The interval is bootstrapped on the same weighted estimator as the
    reported mean, using only rows that have a value for ``metric``, so the
    mean and its interval describe the same quantity.

    Args:
        df: DataFrame with experiment results
        group_col: Column to group by
        metric: Metric to analyze
        weight_by: Column to use for inverse frequency weighting (e.g., 'dataset').
            Within each group a row's weight is 1 / (number of rows in the
            group sharing its ``weight_by`` value), so every level
            contributes equally to the group mean.
        confidence: Confidence level for CI
        n_bootstrap: Number of bootstrap samples
        random_state: Optional seed for the bootstrap resampling; pass an
            int to make the intervals reproducible.

    Returns:
        One row per group with columns ``group_col``, 'mean', 'std',
        'ci_low', 'ci_high', 'n', 'min', 'max', sorted by 'mean' descending.
        Empty when ``metric`` is not a column or no group has any value.
        Groups with fewer than 3 values get a degenerate CI equal to the mean.
    """
    if metric not in df.columns:
        return pd.DataFrame()

    rng = np.random.default_rng(random_state)
    alpha = (1 - confidence) / 2
    results = []

    for group_val, group_df in df.groupby(group_col):
        # Drop missing scores first so weights are normalised over exactly
        # the rows that enter the mean.
        valid = group_df.dropna(subset=[metric])
        values = valid[metric].to_numpy(dtype=float)
        n = len(values)
        if n == 0:
            continue

        if weight_by and weight_by in valid.columns:
            # Inverse frequency weighting
            level_counts = valid[weight_by].value_counts()
            weights = valid[weight_by].map(
                lambda x: 1.0 / level_counts.get(x, 1)
            ).to_numpy(dtype=float)
        else:
            weights = np.ones(n)

        weighted_mean = float(np.average(values, weights=weights))

        # Bootstrap CI: resample rows (value and weight together) and
        # recompute the weighted mean for every replicate.
        if n >= 3:
            idx = rng.integers(0, n, size=(n_bootstrap, n))
            boot_weights = weights[idx]
            boot_means = (values[idx] * boot_weights).sum(axis=1) / boot_weights.sum(axis=1)
            ci_low, ci_high = np.percentile(boot_means, [alpha * 100, (1 - alpha) * 100])
            # Keep the interval around the point estimate so callers can
            # derive non-negative error-bar lengths from it.
            ci_low = min(float(ci_low), weighted_mean)
            ci_high = max(float(ci_high), weighted_mean)
        else:
            ci_low = ci_high = weighted_mean

        results.append({
            group_col: group_val,
            'mean': weighted_mean,
            'std': np.std(values) if n > 1 else 0,
            'ci_low': ci_low,
            'ci_high': ci_high,
            'n': n,
            'min': np.min(values),
            'max': np.max(values),
        })

    if not results:
        # sort_values('mean') would raise KeyError on a column-less frame.
        return pd.DataFrame()

    return pd.DataFrame(results).sort_values('mean', ascending=False).reset_index(drop=True)


def effect_size(baseline_values: np.ndarray, treatment_values: np.ndarray) -> Tuple[float, float, str]:
    """
    Compute Cohen's d effect size (treatment vs. baseline) and interpret it.

    Missing values (NaN) are removed from both samples before any statistic
    is computed.

    Args:
        baseline_values: Scores of the baseline group.
        treatment_values: Scores of the treatment group.

    Returns: (effect_size, p_value, interpretation)
        effect_size is Cohen's d with a pooled standard deviation; positive
        means the treatment mean is higher. p_value comes from an
        independent two-sample t-test. interpretation is one of
        'negligible' (|d| < 0.2), 'small' (< 0.5), 'medium' (< 0.8),
        'large', or 'insufficient data' / 'no variance' (both returned with
        d = 0 and p = 1) when the statistic cannot be computed.
    """
    # Imported locally so the function keeps working even if the notebook
    # namespace rebinds the global name `stats` to something else.
    from scipy import stats as scipy_stats

    baseline = np.asarray(baseline_values, dtype=float)
    treatment = np.asarray(treatment_values, dtype=float)
    baseline = baseline[~np.isnan(baseline)]
    treatment = treatment[~np.isnan(treatment)]

    n_base, n_treat = baseline.size, treatment.size
    if n_base < 2 or n_treat < 2:
        return 0.0, 1.0, 'insufficient data'

    # Cohen's d with the pooled (sample-size weighted) standard deviation.
    pooled_var = (
        (n_base - 1) * np.var(baseline, ddof=1)
        + (n_treat - 1) * np.var(treatment, ddof=1)
    ) / (n_base + n_treat - 2)
    pooled_std = float(np.sqrt(pooled_var))

    if pooled_std == 0:
        return 0.0, 1.0, 'no variance'

    d = float((treatment.mean() - baseline.mean()) / pooled_std)

    # t-test
    _, p_value = scipy_stats.ttest_ind(treatment, baseline)

    # Interpret using the conventional |d| thresholds.
    magnitude = abs(d)
    if magnitude < 0.2:
        interpretation = 'negligible'
    elif magnitude < 0.5:
        interpretation = 'small'
    elif magnitude < 0.8:
        interpretation = 'medium'
    else:
        interpretation = 'large'

    return d, float(p_value), interpretation

## 3. RAG vs Direct LLM Analysis

In [None]:
# Overall comparison of RAG runs against direct (no-retrieval) runs on the
# primary metric: means, absolute/relative improvement, Cohen's d and p-value.
if len(df) > 0 and PRIMARY_METRIC in df.columns:
    direct_df = df[df['exp_type'] == 'direct']
    rag_df = df[df['exp_type'] == 'rag']

    print("RAG vs Direct LLM Comparison")
    print("="*60)

    # Use only runs that actually have a score, so the means, the reported
    # sample sizes and the effect size all describe the same samples.
    direct_scores = direct_df[PRIMARY_METRIC].dropna()
    rag_scores = rag_df[PRIMARY_METRIC].dropna()

    if len(direct_scores) > 0 and len(rag_scores) > 0:
        direct_mean = direct_scores.mean()
        rag_mean = rag_scores.mean()

        d, p, interp = effect_size(direct_scores.values, rag_scores.values)

        print(f"Direct LLM: {direct_mean:.4f} (n={len(direct_scores)})")
        print(f"RAG:        {rag_mean:.4f} (n={len(rag_scores)})")
        if direct_mean != 0:
            print(f"\nImprovement: {rag_mean - direct_mean:+.4f} ({(rag_mean/direct_mean - 1)*100:+.1f}%)")
        else:
            # A relative change against a zero baseline is undefined.
            print(f"\nImprovement: {rag_mean - direct_mean:+.4f} (relative change undefined: direct mean is 0)")
        print(f"Effect size (Cohen's d): {d:.3f} ({interp})")
        print(f"P-value: {p:.4f} {'âœ“ significant' if p < 0.05 else 'âœ— not significant'}")
    else:
        print("Need both direct and RAG experiments for comparison")

In [None]:
# RAG vs Direct by model and dataset
# For every (model, dataset) pair that has at least one direct score and one
# RAG score, tabulate the direct mean, the RAG mean and best score, and the
# mean improvement; then draw a per-model bar chart and an improvement heatmap.
if len(df) > 0 and PRIMARY_METRIC in df.columns:
    direct_df = df[df['exp_type'] == 'direct']
    rag_df = df[df['exp_type'] == 'rag']
    
    if len(direct_df) > 0 and len(rag_df) > 0:
        comparisons = []
        for model in df['model_short'].dropna().unique():
            for dataset in df['dataset'].dropna().unique():
                # Scores for this pair, with missing metric values removed.
                direct_vals = direct_df[(direct_df['model_short'] == model) & 
                                        (direct_df['dataset'] == dataset)][PRIMARY_METRIC].dropna()
                rag_vals = rag_df[(rag_df['model_short'] == model) & 
                                  (rag_df['dataset'] == dataset)][PRIMARY_METRIC].dropna()
                
                # Pairs lacking either side are left out of the table.
                if len(direct_vals) > 0 and len(rag_vals) > 0:
                    comparisons.append({
                        'model': model,
                        'dataset': dataset,
                        'direct': direct_vals.mean(),
                        'rag_mean': rag_vals.mean(),
                        'rag_best': rag_vals.max(),
                        'improvement': rag_vals.mean() - direct_vals.mean(),
                        'n_rag': len(rag_vals),
                    })
        
        if comparisons:
            comp_df = pd.DataFrame(comparisons)
            print("\nRAG vs Direct by Model Ã— Dataset")
            display(comp_df.round(4))
            
            # Visualize
            fig, axes = plt.subplots(1, 2, figsize=(14, 5))
            
            # Grouped bar by model (each value averaged over that model's datasets)
            model_comp = comp_df.groupby('model').agg({
                'direct': 'mean',
                'rag_mean': 'mean',
                'rag_best': 'mean',
            })
            model_comp.plot(kind='bar', ax=axes[0], width=0.7)
            axes[0].set_title(f'RAG vs Direct by Model ({PRIMARY_METRIC})')
            axes[0].set_ylabel(PRIMARY_METRIC.upper())
            axes[0].legend(['Direct', 'RAG Mean', 'RAG Best'])
            axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=0)
            
            # Improvement heatmap: diverging colormap centred on 0, so gains
            # and losses relative to direct get different colours.
            pivot = comp_df.pivot(index='model', columns='dataset', values='improvement')
            sns.heatmap(pivot, annot=True, fmt='.3f', cmap='RdYlGn', center=0, ax=axes[1])
            axes[1].set_title(f'{PRIMARY_METRIC} Improvement (RAG - Direct)')
            
            plt.tight_layout()
            plt.show()

## 4. Component Effect Analysis

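Estimate the marginal effect of each component value relative to a baseline value. These are unadjusted marginal comparisons: the other components are not held fixed, so differences can be confounded by the non-uniform experiment mix.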

In [None]:
def analyze_component_effects(
    df: pd.DataFrame,
    metric: str = PRIMARY_METRIC,
    components: Optional[List[str]] = None,
) -> pd.DataFrame:
    """
    Analyze the effect of each component value on the metric.

    Only RAG experiments are used. For every component, each non-baseline
    value is compared against the baseline value with ``effect_size``. The
    baseline is 'none' when that value occurs, otherwise the most frequent
    value. Treatments with fewer than 2 scores are skipped.

    Args:
        df: DataFrame with experiment results.
        metric: Metric column to compare.
        components: Columns to analyze; defaults to the standard pipeline
            components.

    Returns:
        One row per (component, treatment) with baseline/treatment means,
        improvement, Cohen's d, its interpretation, p-value, a p < 0.05
        flag and both sample sizes, sorted by effect size descending. When
        nothing can be compared, an empty frame carrying the same columns
        is returned.
    """
    result_columns = [
        'component', 'baseline', 'treatment', 'baseline_mean', 'treatment_mean',
        'improvement', 'effect_size', 'effect_interp', 'p_value', 'significant',
        'n_baseline', 'n_treatment',
    ]

    if components is None:
        components = ['model_short', 'retriever_type', 'embedding_model',
                      'query_transform', 'reranker', 'prompt', 'top_k']

    rag_df = df[df['exp_type'] == 'rag'].copy()
    if metric not in rag_df.columns or len(rag_df) == 0:
        return pd.DataFrame(columns=result_columns)

    all_effects = []

    for comp in components:
        if comp not in rag_df.columns:
            continue

        # A component needs at least two observed values to compare.
        value_counts = rag_df[comp].value_counts()
        if len(value_counts) < 2:
            continue

        # Get baseline ('none' if present, else the most common value).
        baseline_val = 'none' if 'none' in value_counts.index else value_counts.index[0]
        baseline_scores = rag_df[rag_df[comp] == baseline_val][metric].dropna().values

        for val in value_counts.index:
            if val == baseline_val:
                continue

            treatment_scores = rag_df[rag_df[comp] == val][metric].dropna().values
            if len(treatment_scores) < 2:
                continue

            d, p, interp = effect_size(baseline_scores, treatment_scores)
            baseline_mean = np.mean(baseline_scores)
            treatment_mean = np.mean(treatment_scores)

            all_effects.append({
                'component': comp,
                'baseline': baseline_val,
                'treatment': val,
                'baseline_mean': baseline_mean,
                'treatment_mean': treatment_mean,
                'improvement': treatment_mean - baseline_mean,
                'effect_size': d,
                'effect_interp': interp,
                'p_value': p,
                'significant': p < 0.05,
                'n_baseline': len(baseline_scores),
                'n_treatment': len(treatment_scores),
            })

    if not all_effects:
        # Sorting a column-less frame by 'effect_size' would raise KeyError.
        return pd.DataFrame(columns=result_columns)

    effects = pd.DataFrame(all_effects, columns=result_columns)
    return effects.sort_values('effect_size', ascending=False).reset_index(drop=True)


# Tabulate the component effects on the primary metric.
if PRIMARY_METRIC in df.columns and len(df) > 0:
    effects_df = analyze_component_effects(df)
    if len(effects_df):
        _shown_columns = [
            'component', 'baseline', 'treatment', 'improvement',
            'effect_size', 'effect_interp', 'p_value', 'significant',
            'n_baseline', 'n_treatment',
        ]
        print(f"Component Effects on {PRIMARY_METRIC}")
        print("=" * 80)
        print("Positive effect_size = treatment better than baseline\n")
        display(effects_df.loc[:, _shown_columns].round(4))

In [None]:
# Visualize component effects
# Horizontal bar chart of Cohen's d for the comparisons flagged significant
# (p < 0.05); green bars are positive effects, red bars are non-positive.
if len(df) > 0 and PRIMARY_METRIC in df.columns:
    effects_df = analyze_component_effects(df)
    if len(effects_df) > 0:
        # Filter to significant effects (copy so a column can be added safely)
        sig_effects = effects_df[effects_df['significant']].copy()
        
        if len(sig_effects) > 0:
            fig, ax = plt.subplots(figsize=(12, 6))
            
            # Create labels: component name on one line, baseline/treatment below
            sig_effects['label'] = sig_effects.apply(
                lambda r: f"{r['component']}:\n{r['baseline']}â†’{r['treatment']}", axis=1
            )
            
            colors = ['green' if x > 0 else 'red' for x in sig_effects['effect_size']]
            
            bars = ax.barh(sig_effects['label'], sig_effects['effect_size'], color=colors, alpha=0.7)
            # Dashed reference line at zero effect.
            ax.axvline(x=0, color='black', linestyle='--', alpha=0.5)
            ax.set_xlabel("Cohen's d (Effect Size)")
            ax.set_title(f"Significant Component Effects on {PRIMARY_METRIC} (p<0.05)")
            
            # Add effect size labels (the interpretation text) at the end of each bar
            for bar, interp in zip(bars, sig_effects['effect_interp']):
                width = bar.get_width()
                ax.annotate(f'{interp}',
                           xy=(width, bar.get_y() + bar.get_height()/2),
                           xytext=(3, 0), textcoords='offset points',
                           ha='left' if width > 0 else 'right', va='center', fontsize=9)
            
            plt.tight_layout()
            plt.show()
        else:
            print("No statistically significant effects found (p<0.05)")

## 5. Model Comparison (with non-uniform weighting)

In [None]:
# Per-model mean of the primary metric over RAG runs, weighted so each
# dataset counts equally, shown as a table and a bar chart with CI bars.
if len(df) > 0 and PRIMARY_METRIC in df.columns:
    rag_df = df[df['exp_type'] == 'rag']

    if len(rag_df) > 0:
        # Weighted by dataset to account for non-uniform sampling
        model_stats = weighted_mean_with_ci(rag_df, 'model_short', PRIMARY_METRIC, weight_by='dataset')

        print(f"Model Comparison (weighted by dataset)")
        print("="*60)
        display(model_stats.round(4))

        if len(model_stats) > 0:
            # Visualize with error bars
            fig, ax = plt.subplots(figsize=(10, 5))

            # matplotlib rejects negative error-bar lengths; clip in case an
            # interval bound falls on the wrong side of the reported mean.
            lower_err = (model_stats['mean'] - model_stats['ci_low']).clip(lower=0)
            upper_err = (model_stats['ci_high'] - model_stats['mean']).clip(lower=0)

            x = range(len(model_stats))
            ax.bar(x, model_stats['mean'],
                   yerr=[lower_err, upper_err],
                   capsize=5, alpha=0.7)
            ax.set_xticks(x)
            ax.set_xticklabels(model_stats['model_short'])
            ax.set_ylabel(PRIMARY_METRIC.upper())
            ax.set_title(f'Model Performance with 95% CI ({PRIMARY_METRIC})')

            # Label each bar with its mean and sample size.
            for i, row in model_stats.iterrows():
                ax.annotate(f"{row['mean']:.3f}\n(n={row['n']})",
                           xy=(i, row['mean']), ha='center', va='bottom')

            plt.tight_layout()
            plt.show()

## 6. Retrieval Strategy Comparison

In [None]:
# Dataset-weighted summary table for each retrieval-side dimension that has
# at least two distinct values among the RAG runs.
if len(df) > 0 and PRIMARY_METRIC in df.columns:
    rag_df = df[df['exp_type'] == 'rag']

    for dimension in ['retriever_type', 'embedding_model', 'query_transform', 'reranker', 'top_k']:
        if dimension not in rag_df.columns:
            continue
        if rag_df[dimension].dropna().nunique() < 2:
            continue

        # Not named `stats`: that would rebind the `scipy.stats` module
        # imported at the top of the notebook and break later t-tests.
        dimension_stats = weighted_mean_with_ci(rag_df, dimension, PRIMARY_METRIC, weight_by='dataset')

        if len(dimension_stats) > 0:
            print(f"\n{dimension.replace('_', ' ').title()}")
            print("-"*50)
            display(dimension_stats.round(4))

In [None]:
# Heatmaps for key interactions
# One heatmap per pair of dimensions, showing the mean primary metric of RAG
# runs for every combination of the two. Drawn only with more than 10 RAG runs.
if len(df) > 0 and PRIMARY_METRIC in df.columns:
    rag_df = df[df['exp_type'] == 'rag']
    
    if len(rag_df) > 10:
        fig, axes = plt.subplots(2, 2, figsize=(14, 12))
        
        # (row dimension, column dimension) for each of the four panels
        interactions = [
            ('model_short', 'retriever_type'),
            ('model_short', 'query_transform'),
            ('retriever_type', 'embedding_model'),
            ('query_transform', 'reranker'),
        ]
        
        for ax, (row_dim, col_dim) in zip(axes.flatten(), interactions):
            # Hide the panel when either dimension is unavailable.
            if row_dim not in rag_df.columns or col_dim not in rag_df.columns:
                ax.set_visible(False)
                continue
            
            pivot = rag_df.pivot_table(
                index=row_dim, columns=col_dim, values=PRIMARY_METRIC, aggfunc='mean'
            )
            
            # Hide the panel when there is nothing to plot.
            if pivot.empty:
                ax.set_visible(False)
                continue
            
            sns.heatmap(pivot, annot=True, fmt='.3f', cmap='YlGnBu', ax=ax)
            ax.set_title(f'{row_dim} Ã— {col_dim}')
        
        plt.suptitle(f'Interaction Effects ({PRIMARY_METRIC})', fontsize=14, y=1.02)
        plt.tight_layout()
        plt.show()

## 7. Prompt Strategy Analysis

In [None]:
# Prompt template comparison over all experiments (direct and RAG), weighted
# so each dataset counts equally, plus a breakdown by experiment type.
if len(df) > 0 and PRIMARY_METRIC in df.columns:
    # Combine direct and RAG for prompt analysis
    prompt_stats = weighted_mean_with_ci(df, 'prompt', PRIMARY_METRIC, weight_by='dataset')

    if len(prompt_stats) > 0:
        print("Prompt Strategy Comparison")
        print("="*60)
        display(prompt_stats.round(4))

        # By experiment type
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        # matplotlib rejects negative error-bar lengths; clip in case an
        # interval bound falls on the wrong side of the reported mean.
        lower_err = (prompt_stats['mean'] - prompt_stats['ci_low']).clip(lower=0)
        upper_err = (prompt_stats['ci_high'] - prompt_stats['mean']).clip(lower=0)

        # Overall
        ax = axes[0]
        ax.bar(prompt_stats['prompt'], prompt_stats['mean'],
               yerr=[lower_err, upper_err],
               capsize=5, alpha=0.7)
        ax.set_ylabel(PRIMARY_METRIC.upper())
        ax.set_title('Prompt Performance (Overall)')
        ax.set_xticklabels(prompt_stats['prompt'], rotation=45, ha='right')

        # By type (Direct vs RAG): one bar group per experiment type, one
        # bar per prompt (unweighted means).
        ax = axes[1]
        type_prompt = df.groupby(['exp_type', 'prompt'])[PRIMARY_METRIC].mean().unstack()
        type_prompt.plot(kind='bar', ax=ax, width=0.7)
        ax.set_ylabel(PRIMARY_METRIC.upper())
        ax.set_title('Prompt Ã— Experiment Type')
        ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
        ax.legend(title='Prompt')

        plt.tight_layout()
        plt.show()

## 8. Dataset-Specific Analysis

In [None]:
# For each dataset, print the RAG configuration with the highest primary metric.
if len(df) > 0 and PRIMARY_METRIC in df.columns:
    rag_df = df[df['exp_type'] == 'rag']

    print("Best Configurations per Dataset")
    print("="*60)

    for dataset in df['dataset'].dropna().unique():
        # Keep only runs with a score: idxmax cannot pick a winner when
        # every value is missing.
        ds_df = rag_df[rag_df['dataset'] == dataset].dropna(subset=[PRIMARY_METRIC])
        if len(ds_df) == 0:
            continue

        best_idx = ds_df[PRIMARY_METRIC].idxmax()
        best = ds_df.loc[best_idx]

        print(f"\n{dataset.upper()}:")
        print(f"  Best {PRIMARY_METRIC}: {best[PRIMARY_METRIC]:.4f}")
        print(f"  Model: {best['model_short']}")
        print(f"  Retriever: {best.get('retriever_type', 'N/A')}")
        print(f"  Embedding: {best.get('embedding_model', 'N/A')}")
        print(f"  Query transform: {best.get('query_transform', 'none')}")
        print(f"  Reranker: {best.get('reranker', 'none')}")
        print(f"  Prompt: {best.get('prompt', 'N/A')}")

In [None]:
# Dataset difficulty comparison
if len(df) > 0 and PRIMARY_METRIC in df.columns:
    dataset_stats = weighted_mean_with_ci(df, 'dataset', PRIMARY_METRIC)
    
    print("\nDataset Difficulty (lower = harder)")
    print("="*60)
    display(dataset_stats.round(4))
    
    # Which strategies work best on hardest dataset?
    hardest = dataset_stats.iloc[-1]['dataset']
    print(f"\nStrategies on hardest dataset ({hardest}):")
    
    rag_df = df[(df['exp_type'] == 'rag') & (df['dataset'] == hardest)]
    if len(rag_df) > 0:
        for comp in ['retriever_type', 'query_transform', 'reranker']:
            if comp in rag_df.columns:
                stats = rag_df.groupby(comp)[PRIMARY_METRIC].mean().sort_values(ascending=False)
                best = stats.index[0]
                print(f"  Best {comp}: {best} ({stats.iloc[0]:.4f})")

## 9. Bottleneck Identification

In [None]:
def identify_bottlenecks(df: pd.DataFrame, metric: str = PRIMARY_METRIC) -> Dict[str, Any]:
    """
    Identify bottlenecks in the RAG pipeline.

    For each pipeline component, computes the share of the metric's variance
    that lies between the component's groups (eta squared = SS_between /
    SS_total). Each component is evaluated on the RAG runs that have both a
    metric value and a value for that component, so numerator and
    denominator always refer to the same rows.

    Args:
        df: DataFrame with experiment results.
        metric: Metric column to analyze.

    Returns:
        {} when there are fewer than 10 RAG experiments or the metric column
        is missing. Otherwise a dict with:
        - 'variance_explained': component -> eta squared, sorted descending
        - 'top_bottleneck': component with the largest share (or None)
        - 'total_experiments': number of RAG experiments considered
    """
    rag_df = df[df['exp_type'] == 'rag'].copy()
    if len(rag_df) < 10 or metric not in rag_df.columns:
        return {}

    scored = rag_df.dropna(subset=[metric])

    variance_explained = {}
    components = ['model_short', 'retriever_type', 'embedding_model',
                  'query_transform', 'reranker', 'prompt', 'top_k', 'dataset']

    for comp in components:
        if comp not in scored.columns:
            continue

        # Restrict to rows where this component is known.
        subset = scored.dropna(subset=[comp])
        if subset[comp].nunique() < 2:
            continue

        scores = subset[metric].astype(float)
        grand_mean = scores.mean()
        ss_total = float(((scores - grand_mean) ** 2).sum())
        if ss_total <= 0:
            variance_explained[comp] = 0.0
            continue

        # Between-group sum of squares: group size times the squared
        # distance of the group mean from the grand mean.
        grouped = scores.groupby(subset[comp])
        ss_between = float((grouped.size() * (grouped.mean() - grand_mean) ** 2).sum())

        variance_explained[comp] = ss_between / ss_total

    # Sort by impact
    sorted_components = sorted(variance_explained.items(), key=lambda x: x[1], reverse=True)

    return {
        'variance_explained': dict(sorted_components),
        'top_bottleneck': sorted_components[0][0] if sorted_components else None,
        'total_experiments': len(rag_df),
    }


# Print the variance share of each component as a text bar chart and name
# the component with the largest share.
if PRIMARY_METRIC in df.columns and len(df) > 0:
    bottlenecks = identify_bottlenecks(df)

    if bottlenecks:
        print("Bottleneck Analysis (Variance Explained)")
        print("=" * 60)
        print("Higher = more impact on performance variance\n")

        for _component, _share in bottlenecks['variance_explained'].items():
            # One block character per 2% of explained variance.
            _blocks = 'â–ˆ' * int(_share * 50)
            print(f"{_component:20s} {_share:6.1%} {_blocks}")

        _top = bottlenecks['top_bottleneck']
        print(f"\nðŸŽ¯ Top bottleneck: {_top}")
        print("   â†’ Focus optimization efforts here for biggest gains")

## 10. Recommendations

In [None]:
def generate_recommendations(df: pd.DataFrame, metric: str = PRIMARY_METRIC):
    """Print actionable recommendations based on the analysis.

    Sections printed: the best RAG configuration, significant positive
    component effects, the top bottleneck, per-dataset RAG-vs-direct gains
    and a list of quick wins. Prints a short notice and returns early when
    there is no RAG run with a value for ``metric``.

    Args:
        df: DataFrame with experiment results.
        metric: Metric column the recommendations are based on.
    """
    print("="*70)
    print("ðŸ“‹ RECOMMENDATIONS")
    print("="*70)

    rag_df = df[df['exp_type'] == 'rag']
    direct_df = df[df['exp_type'] == 'direct']

    # Picking a best run needs the metric column and at least one score.
    if metric not in df.columns or rag_df[metric].dropna().empty:
        print("Insufficient RAG experiments for recommendations.")
        return

    # 1. Best overall
    best_idx = rag_df[metric].idxmax()
    best = rag_df.loc[best_idx]
    print(f"\n1. BEST OVERALL CONFIGURATION ({metric}={best[metric]:.4f})")
    print(f"   Model: {best['model_short']}")
    print(f"   Retriever: {best.get('retriever_type', 'N/A')} / {best.get('embedding_model', 'N/A')}")
    print(f"   Query: {best.get('query_transform', 'none')}, Reranker: {best.get('reranker', 'none')}")
    print(f"   Prompt: {best.get('prompt', 'N/A')}, top_k: {best.get('top_k', 'N/A')}")

    # 2. Component recommendations
    effects_df = analyze_component_effects(df, metric)
    if len(effects_df) > 0:
        sig_positive = effects_df[(effects_df['significant']) & (effects_df['improvement'] > 0)]
        if len(sig_positive) > 0:
            print("\n2. SIGNIFICANT IMPROVEMENTS (p<0.05)")
            for _, row in sig_positive.head(5).iterrows():
                print(f"   âœ“ {row['component']}: {row['baseline']} â†’ {row['treatment']}")
                print(f"     Effect: {row['improvement']:+.4f} ({row['effect_interp']})")

    # 3. Bottleneck
    bottlenecks = identify_bottlenecks(df, metric)
    if bottlenecks and bottlenecks['top_bottleneck']:
        print("\n3. PRIORITY FOR OPTIMIZATION")
        print(f"   Focus on: {bottlenecks['top_bottleneck']}")
        var = bottlenecks['variance_explained'].get(bottlenecks['top_bottleneck'], 0)
        print(f"   Explains {var:.1%} of performance variance")

    # 4. Dataset-specific: best RAG score vs best direct score
    print("\n4. DATASET-SPECIFIC INSIGHTS")
    for dataset in df['dataset'].dropna().unique():
        ds_rag = rag_df[rag_df['dataset'] == dataset]
        ds_direct = direct_df[direct_df['dataset'] == dataset]

        if len(ds_rag) > 0 and len(ds_direct) > 0:
            rag_best = ds_rag[metric].max()
            direct_best = ds_direct[metric].max()
            # max() is NaN when a side has no scores; skip rather than
            # report a misleading "+0.0%".
            if pd.isna(rag_best) or pd.isna(direct_best):
                continue
            improvement = (rag_best - direct_best) / direct_best * 100 if direct_best > 0 else 0
            print(f"   {dataset}: RAG improves by {improvement:+.1f}% over direct")

    # 5. Quick wins: reported only when the component's strongest effect
    # (effects are sorted by effect size) is a positive improvement.
    print("\n5. QUICK WINS (low complexity, positive effect)")
    quick_wins = [
        ('prompt', 'Try structured or cot prompts'),
        ('reranker', 'Add bge-v2 reranker'),
        ('query_transform', 'Enable HyDE'),
    ]
    if 'component' in effects_df.columns:
        for comp, desc in quick_wins:
            comp_effects = effects_df[effects_df['component'] == comp]
            if len(comp_effects) == 0:
                continue
            best_effect = comp_effects.iloc[0]
            if best_effect['improvement'] > 0:
                print(f"   âœ“ {desc}: +{best_effect['improvement']:.4f} {metric}")


if len(df) > 0 and PRIMARY_METRIC in df.columns:
    generate_recommendations(df)

## 11. Export Analysis Results

In [None]:
# Write the loaded results and the derived analyses to STUDY_PATH/analysis.
if len(df) > 0:
    output_dir = STUDY_PATH / "analysis"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Full results
    df.to_csv(output_dir / "full_results.csv", index=False)

    # Both derived analyses read the primary metric column, so they are
    # exported only when that column exists.
    if PRIMARY_METRIC in df.columns:
        # Component effects
        effects_df = analyze_component_effects(df)
        if len(effects_df) > 0:
            effects_df.to_csv(output_dir / "component_effects.csv", index=False)

        # Bottleneck analysis
        bottlenecks = identify_bottlenecks(df)
        if bottlenecks:
            with open(output_dir / "bottleneck_analysis.json", 'w', encoding='utf-8') as f:
                json.dump(bottlenecks, f, indent=2)

    print(f"Results exported to: {output_dir}")