# Experiment 1 Analysis: Needle in Haystack (Lost in the Middle)

This notebook analyzes the results of Experiment 1, which tests the "Lost in the Middle" phenomenon: LLMs retrieve a fact less reliably when it is placed in the middle of a long context than when it appears near the start or the end.

In [None]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# Notebook-wide plotting defaults: seaborn's white grid theme and a wide
# default canvas for any figure that does not pass its own figsize.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Results

In [None]:
# Load the raw experiment output. The path is relative, so it resolves against
# the kernel's current working directory.
results_path = Path("../results/experiment1/raw_results.json")

# JSON is UTF-8 by specification; pass the encoding explicitly so decoding does
# not depend on the platform's locale default (e.g. cp1252 on Windows).
with results_path.open('r', encoding='utf-8') as f:
    results = json.load(f)

# One row per trial. Later cells read the 'position', 'accuracy', 'latency_ms',
# 'error' and 'trial_id' columns of this frame.
df = pd.DataFrame(results['trials'])

print(f"Total trials: {len(df)}")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

## 2. Summary Statistics

In [None]:
# Headline numbers are read from the 'statistics' section of the results file;
# they are not recomputed from the trial rows here.
overall_stats = results['statistics']
print("=== Overall Statistics ===")
print(f"Mean Accuracy: {overall_stats['mean_accuracy']:.2%}")
print(f"Std Accuracy: {overall_stats['std_accuracy']:.3f}")
print(f"Mean Latency: {overall_stats['mean_latency']:.0f}ms")
print(f"Confidence Interval (95%): {overall_stats['confidence_interval_95']}")

# Per-position breakdown computed from the trial rows.
print("\n=== Accuracy by Position ===")
position_stats = df.groupby('position')['accuracy'].agg(['mean', 'std', 'count'])
# Express mean AND std in percentage points. Scaling only the mean would leave
# the two columns of the same table on different scales (percent vs. fraction).
position_stats[['mean', 'std']] = position_stats[['mean', 'std']] * 100
print(position_stats)

## 3. Visualizations

In [None]:
# Side-by-side box plots: accuracy (left) and latency (right) per fact position.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Use one explicit category order for both panels. Without it seaborn orders
# string categories by first appearance, and the two panels are drawn from
# different row subsets, so their x-axes could disagree and 'middle' would not
# necessarily sit between 'start' and 'end'. Unexpected labels are kept and
# appended after the known ones.
known_order = ['start', 'middle', 'end']
seen_positions = list(df['position'].dropna().unique())
position_order = [p for p in known_order if p in seen_positions]
position_order += sorted((p for p in seen_positions if p not in known_order), key=str)

# Accuracy by position (all trials), shown in percent.
df['accuracy_pct'] = df['accuracy'] * 100
sns.boxplot(data=df, x='position', y='accuracy_pct', order=position_order, ax=axes[0])
axes[0].set_title('Accuracy Distribution by Position', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Accuracy (%)', fontsize=12)
axes[0].set_xlabel('Fact Position', fontsize=12)
# Small padding so boxes sitting exactly at 0% or 100% are not drawn on top of
# the axes border.
axes[0].set_ylim(-5, 105)

# Latency by position, restricted to trials with no recorded error.
sns.boxplot(data=df[df['error'].isna()], x='position', y='latency_ms',
            order=position_order, ax=axes[1])
axes[1].set_title('Latency Distribution by Position', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Latency (ms)', fontsize=12)
axes[1].set_xlabel('Fact Position', fontsize=12)

plt.tight_layout()
plt.show()

## 4. Statistical Significance Testing

In [None]:
# Accuracy samples (one value per trial) for each fact position.
start_acc = df[df['position'] == 'start']['accuracy'].values
middle_acc = df[df['position'] == 'middle']['accuracy'].values
end_acc = df[df['position'] == 'end']['accuracy'].values


def report_ttest(label, sample_a, sample_b, alpha=0.05):
    """Run a two-sample t-test on two accuracy samples and print the outcome.

    Uses Welch's variant (``equal_var=False``), which does not assume the two
    groups share a variance.

    Args:
        label: Heading printed above the result, e.g. "Start vs Middle".
        sample_a: Accuracy values of the first group.
        sample_b: Accuracy values of the second group.
        alpha: Significance threshold applied to the p-value.

    Returns:
        The ``(t_stat, p_value)`` pair returned by ``scipy.stats.ttest_ind``.
    """
    t_stat, p_value = stats.ttest_ind(sample_a, sample_b, equal_var=False)
    print(f"\n{label}:")
    print(f"  t-statistic: {t_stat:.3f}")
    print(f"  p-value: {p_value:.4f}")
    # A NaN p-value means the test could not be evaluated (for instance a group
    # with too few trials, or NaN accuracies). `nan < alpha` is False, so a
    # plain comparison would misreport this case as "No".
    if np.isnan(p_value):
        verdict = 'Undetermined (test could not be evaluated)'
    else:
        verdict = 'Yes' if p_value < alpha else 'No'
    print(f"  Significant at p<{alpha}: {verdict}")
    return t_stat, p_value


# NOTE: the two comparisons are reported without a multiple-comparison correction.
print("=== Statistical Significance Tests ===")
comparisons = [
    ("Start vs Middle", start_acc, middle_acc),
    ("Middle vs End", middle_acc, end_acc),
]
for label, sample_a, sample_b in comparisons:
    t_stat, p_value = report_ttest(label, sample_a, sample_b)

## 5. Error Analysis

In [None]:
# Trials with a populated 'error' field are the ones that recorded an error.
errors = df.loc[df['error'].notna()]
n_errors = len(errors)
print(f"Total errors: {n_errors}")

if n_errors == 0:
    print("No errors occurred during the experiment!")
else:
    # List which trial and position each recorded error belongs to.
    print("\nError details:")
    print(errors[['trial_id', 'position', 'error']])

## 6. Conclusions

Based on the analysis:

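1. **Lost in the Middle Effect**: **TODO** — state whether accuracy at the middle position is lower than at the start and end positions, citing the per-position means from Section 2. [To be filled based on results]
2. **Statistical Significance**: **TODO** — report the t-statistics and p-values from Section 4 and whether each difference is significant at p < 0.05. [To be filled based on results]
3. **Performance Implications**: **TODO** — summarize what the accuracy and latency results imply for where to place key information in long contexts. [To be filled based on results]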