# Experiment Failure Analysis

This notebook analyzes failed experiments to identify patterns and root causes.

Common failure modes:
- **Context length exceeded**: Hierarchical retrieval combined with a high `top_k` on small-context models
- **OOM errors**: GPU memory exhaustion during generation
- **Partial completions**: Experiments that started but didn't finish

In [None]:
import sys
sys.path.insert(0, '..')

from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from analysis_utils import (
    load_failed_experiments,
    analyze_failure_patterns,
    get_experiment_health_summary,
    predict_context_length_issues,
    DEFAULT_STUDY_PATH,
)

# Raise pandas' display limits (column width and row count) to 100 so that
# long error strings and larger tables are shown in full in cell outputs.
for option_name, option_value in (
    ('display.max_colwidth', 100),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

# Echo the imported DEFAULT_STUDY_PATH so the reader can see which study is analyzed.
print(f"Study path: {DEFAULT_STUDY_PATH}")

## 1. Health Summary

Quick overview of experiment status across the study.

In [None]:
# Fetch the study-wide status summary (helper is called with its defaults).
health = get_experiment_health_summary()

banner = "=" * 50
print(banner)
print("EXPERIMENT HEALTH SUMMARY")
print(banner)

# Dump every entry of the summary, keys left-padded to 20 characters.
for metric, count in health.items():
    print(f"{metric:20}: {count}")

# Express each status as a share of all experiments. Skipped when the summary
# reports no experiments, which also avoids dividing by zero.
if health.get('total_experiments', 0) > 0:
    total = health['total_experiments']
    print("\n--- Percentages ---")
    for label, status in (
        ("Complete:    ", 'complete'),
        ("Failed:      ", 'failed'),
        ("In Progress: ", 'in_progress'),
    ):
        # A status missing from the summary counts as zero.
        share = health.get(status, 0) / total * 100
        print(f"{label}{share:.1f}%")

## 2. Failed Experiments

Load all experiments that have `phase: failed` in their state.json.

In [None]:
# Load the failed experiments into `failed_df`; later cells read this frame.
failed_df = load_failed_experiments()

n_failed = len(failed_df)
print(f"Total failed experiments: {n_failed}")

# Preview at most the first 20 rows; nothing is rendered when there are no failures.
has_failures = not failed_df.empty
if has_failures:
    display(failed_df.head(20))

## 3. Failure Pattern Analysis

Categorize failures by type and identify systematic issues.

In [None]:
if failed_df.empty:
    print("No failed experiments found!")
else:
    # Let the shared helper categorize the failures, then report its counts.
    patterns = analyze_failure_patterns(failed_df)

    rule = "=" * 50
    print(rule)
    print("FAILURE PATTERNS")
    print(rule)
    print(f"Total failed: {patterns['total_failed']}")
    print(f"Context length issues: {patterns['context_length_issues']}")
    print(f"OOM issues: {patterns['oom_issues']}")
    # 'partial_completions' is a sized collection; only its length is reported here.
    n_partial = len(patterns['partial_completions'])
    print(f"Partial completions: {n_partial}")

### 3.1 Failures by Model

In [None]:
if not failed_df.empty:
    # Number of failed experiments per model (value_counts lists the most frequent first).
    model_failures = failed_df['model_short'].value_counts()

    fig, ax = plt.subplots(figsize=(10, 5))
    model_failures.plot(ax=ax, kind='bar', color='coral')
    ax.set(title='Failures by Model', xlabel='Model', ylabel='Count')
    # Slant the model names so long labels do not overlap.
    plt.xticks(rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

    # Same counts as plain text, for copy/paste.
    print("\nFailures by model:")
    print(model_failures.to_string())

### 3.2 Failures by Retriever Type

In [None]:
if not failed_df.empty:
    # Number of failed experiments per retriever type (most frequent first).
    retriever_failures = failed_df['retriever_type'].value_counts()

    fig, ax = plt.subplots(figsize=(8, 5))
    retriever_failures.plot(ax=ax, kind='bar', color='steelblue')
    ax.set(title='Failures by Retriever Type', xlabel='Retriever Type', ylabel='Count')
    # Slant the category labels so they stay readable.
    plt.xticks(rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

    # Same counts as plain text, for copy/paste.
    print("\nFailures by retriever:")
    print(retriever_failures.to_string())

### 3.3 Failures by Top-K

In [None]:
if not failed_df.empty:
    # Count failures per top_k value, then order the bars by top_k rather than by count.
    topk_counts = failed_df['top_k'].value_counts()
    topk_failures = topk_counts.sort_index()

    fig, ax = plt.subplots(figsize=(8, 5))
    topk_failures.plot(ax=ax, kind='bar', color='forestgreen')
    ax.set(title='Failures by Top-K', xlabel='Top-K', ylabel='Count')
    fig.tight_layout()
    plt.show()

    # Same counts as plain text, for copy/paste.
    print("\nFailures by top_k:")
    print(topk_failures.to_string())

### 3.4 Cross-tabulation: Model × Retriever

In [None]:
# Only drawn when there are more than 5 failed experiments.
if len(failed_df) > 5 and not failed_df.empty:
    # Rows are models, columns are retriever types, cells are failure counts.
    cross_tab = pd.crosstab(
        index=failed_df['model_short'],
        columns=failed_df['retriever_type'],
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    # fmt='d' renders the annotated counts as integers.
    sns.heatmap(data=cross_tab, annot=True, fmt='d', cmap='Reds', ax=ax)
    ax.set(title='Failure Heatmap: Model × Retriever')
    fig.tight_layout()
    plt.show()

    print("\nCross-tabulation:")
    display(cross_tab)

## 4. Error Messages

Examine the actual error messages to understand root causes.

In [None]:
if not failed_df.empty:
    # Group errors by their first 80 characters so that messages differing only
    # in trailing detail are counted together.
    # The truncated text lives in a standalone Series instead of a new
    # `failed_df` column: mutating the shared frame here would make later cells
    # (including the CSV export) see an extra column only if this cell was run.
    error_short = failed_df['error'].str[:80]
    error_counts = error_short.value_counts()

    print("=" * 50)
    print("ERROR MESSAGE FREQUENCY")
    print("=" * 50)
    # Show the ten most frequent truncated messages with their counts.
    for error, count in error_counts.head(10).items():
        print(f"\n[{count}x] {error}...")

In [None]:
if not failed_df.empty:
    # Print up to five distinct error messages, each capped at 500 characters.
    print("\nUnique error messages:")
    print("=" * 50)
    # Missing errors (NaN/None) are dropped first: they carry no text, and
    # slicing them would raise TypeError and abort the cell.
    unique_errors = failed_df['error'].dropna().unique()
    for i, error in enumerate(unique_errors[:5], start=1):
        print(f"\n[Error {i}]")
        # str() keeps the cell working if an error value is not already a string;
        # slicing alone caps the length (a shorter message is returned whole).
        print(str(error)[:500])
        print("-" * 50)

## 5. Context Length Risk Prediction

Identify experiments that are likely to fail due to context length limits.

**Estimation:**
- Dense/Hybrid: ~512 tokens/doc
- Hierarchical: ~2048 tokens/doc (parent chunks)
- Prompt overhead: ~200 tokens

In [None]:
# Ask the helper (called with its defaults) which experiments it flags as at risk.
risky_df = predict_context_length_issues()

n_risky = len(risky_df)
print(f"Experiments at risk of context length failure: {n_risky}")

# List the flagged experiments in ascending order of 'headroom_pct'.
if not risky_df.empty:
    risky_sorted = risky_df.sort_values(by='headroom_pct')
    display(risky_sorted)

In [None]:
if not risky_df.empty:
    # At-risk experiments per model, largest group first.
    per_model = risky_df.groupby('model').size()
    model_risk = per_model.sort_values(ascending=False)

    print("\nRisky experiments by model:")
    print(model_risk.to_string())

    # At-risk experiments per retriever type, largest group first.
    per_retriever = risky_df.groupby('retriever_type').size()
    retriever_risk = per_retriever.sort_values(ascending=False)

    print("\nRisky experiments by retriever:")
    print(retriever_risk.to_string())

## 6. Partial Completions

Experiments that started but didn't finish - may be recoverable.

In [None]:
if not failed_df.empty:
    # Failed experiments that still produced at least one prediction.
    made_progress = failed_df['predictions_complete'] > 0
    partial = failed_df.loc[made_progress].copy()

    if partial.empty:
        print("No partial completions found.")
    else:
        # Share of questions answered before the failure, furthest along first.
        partial = partial.assign(
            completion_pct=partial['predictions_complete'] / partial['total_questions'] * 100
        ).sort_values('completion_pct', ascending=False)

        print(f"Partial completions: {len(partial)}")
        shown_columns = [
            'name',
            'model_short',
            'predictions_complete',
            'total_questions',
            'completion_pct',
            'error',
        ]
        display(partial[shown_columns].head(20))

## 7. Recommendations

Based on the failure analysis, here are recommendations:

In [None]:
print("=" * 60)
print("RECOMMENDATIONS")
print("=" * 60)

if not failed_df.empty:
    # A model is called out when it accounts for more than this share (in %)
    # of all failed experiments.
    high_failure_share_pct = 30

    # Sections are numbered as they are emitted, so the list always starts at 1
    # and has no gaps, whichever checks fire.
    section_no = 0

    # Check for context length issues.
    # Keyword heuristic on the error text; rows with a missing error never match (na=False).
    context_fails = failed_df[failed_df['error'].str.contains('context|length|token|truncat', case=False, na=False)]
    if len(context_fails) > 0:
        section_no += 1
        print(f"\n{section_no}. CONTEXT LENGTH ISSUES DETECTED")
        print(f"   {len(context_fails)} experiments failed due to context length.")
        print("   Recommended actions:")
        print("   - Exclude hierarchical retriever for Phi-3-mini-4k (4K context)")
        print("   - Reduce top_k for hierarchical retrieval")
        print("   - Use Phi-3-mini-128k variant if available")

    # Check for OOM issues.
    # Same keyword heuristic; note that 'CUDA' matches any error mentioning CUDA.
    oom_fails = failed_df[failed_df['error'].str.contains('OOM|out of memory|CUDA', case=False, na=False)]
    if len(oom_fails) > 0:
        section_no += 1
        print(f"\n{section_no}. GPU MEMORY ISSUES DETECTED")
        print(f"   {len(oom_fails)} experiments failed due to OOM.")
        print("   Recommended actions:")
        print("   - Reduce batch_size in config")
        print("   - Use quantization (AWQ, GPTQ)")
        print("   - Increase GPU memory or use tensor parallelism")

    # Check for model-specific failures: each model's share of all failures.
    model_fail_pct = failed_df['model_short'].value_counts() / len(failed_df) * 100
    high_fail_models = model_fail_pct[model_fail_pct > high_failure_share_pct]
    if len(high_fail_models) > 0:
        section_no += 1
        print(f"\n{section_no}. HIGH-FAILURE MODELS")
        for model, pct in high_fail_models.items():
            print(f"   - {model}: {pct:.1f}% of all failures")
        print("   Consider investigating these models specifically.")

    # Failures exist but none of the checks fired: say so instead of printing
    # an empty recommendations section.
    if section_no == 0:
        print("\nNo known failure pattern matched - review the raw error messages.")
else:
    print("\nNo failed experiments - all experiments completed successfully!")

print("\n" + "=" * 60)

## 8. Export Failed Experiments

Export the list of failed experiments for further investigation or re-running.

In [None]:
# Nothing is written when there are no failed experiments.
if not failed_df.empty:
    # Full table of failed experiments, written without the index column.
    output_path = DEFAULT_STUDY_PATH / 'failed_experiments.csv'
    failed_df.to_csv(output_path, index=False)
    print(f"Exported failed experiments to: {output_path}")

    # Also export just the names, one per line, for easy re-running.
    names_path = DEFAULT_STUDY_PATH / 'failed_experiment_names.txt'
    name_lines = "".join(f"{name}\n" for name in failed_df['name'])
    with open(names_path, 'w') as names_file:
        names_file.write(name_lines)
    print(f"Exported experiment names to: {names_path}")