## 📦 1. Import Required Libraries

In [None]:
import os
import sys
import json
import warnings
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, spearmanr

# Silence every warning so the notebook output stays uncluttered.
warnings.filterwarnings('ignore')

# Pandas display: no row limit, no column-width truncation, and floats
# rendered with four decimal places.
display_options = {
    'display.max_rows': None,
    'display.max_colwidth': None,
    'display.float_format': '{:.4f}'.format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Plot styling: seaborn white grid plus a wide default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 5)})

print("‚úÖ Imports complete!")

## 📂 2. Load Sample Evaluation Results

In [None]:
# Locations of the two result files this notebook compares.
# Edit these to point at your own evaluation outputs.
RAGAS_RESULTS_PATH = '../data/open_rag_eval_results_raptor/eval_results_raptor.json'
DEEPEVAL_RESULTS_PATH = '../data/open_rag_eval_results_raptor/deepeval_results.json'

# Record whether each file is present on disk.
ragas_exists, deepeval_exists = (
    Path(candidate).exists()
    for candidate in (RAGAS_RESULTS_PATH, DEEPEVAL_RESULTS_PATH)
)

found_label, missing_label = '‚úÖ Found', '‚ùå Not found'
print(f"RAGAS results: {found_label if ragas_exists else missing_label}")
print(f"DeepEval results: {found_label if deepeval_exists else missing_label}")

# If either file is missing, tell the user which scripts produce them.
if not ragas_exists or not deepeval_exists:
    for hint_line in (
        "\n‚ö†Ô∏è Please run RAGAS and DeepEval evaluation first:",
        "  1. python scripts/evaluate_ragas.py",
        "  2. python scripts/evaluate_deepeval.py",
    ):
        print(hint_line)

In [None]:
def _read_json(path):
    """Parse and return the JSON document stored at *path*.

    The file is decoded as 'utf-8-sig', so a leading byte-order mark is
    tolerated.
    """
    with open(path, 'r', encoding='utf-8-sig') as handle:
        return json.load(handle)


# Read both result files into memory.
ragas_data = _read_json(RAGAS_RESULTS_PATH)
deepeval_data = _read_json(DEEPEVAL_RESULTS_PATH)

print(f"üìä Loaded {len(ragas_data)} RAGAS results")
print(f"üìä Loaded {len(deepeval_data)} DeepEval results")

# Preview element 0 of each dataset, capped at 300 characters.
# NOTE(review): indexing with [0] assumes the top-level JSON value is a list.
sample_sections = (
    ("\nüìã RAGAS sample:", ragas_data),
    ("\nüìã DeepEval sample:", deepeval_data),
)
for section_heading, section_records in sample_sections:
    print(section_heading)
    if section_records:
        print(json.dumps(section_records[0], indent=2, ensure_ascii=False)[:300])

## 🔗 3. Merge & Prepare Data

In [None]:
def _index_by_testcase_id(records):
    """Return ``{str(testcase_id): record}`` for every record that has an ID.

    Records whose 'testcase_id' is missing, None or empty are skipped.
    Keying them all under '' would let an ID-less RAGAS record be paired
    with an unrelated ID-less DeepEval record as if they were the same
    testcase. When an ID occurs more than once, the last record wins.
    """
    indexed = {}
    for record in records:
        raw_id = record.get('testcase_id')
        if raw_id is None or raw_id == '':
            continue
        indexed[str(raw_id)] = record
    return indexed


# Index both result sets by testcase ID (stringified so that e.g. 7 and "7" match).
ragas_dict = _index_by_testcase_id(ragas_data)
deepeval_dict = _index_by_testcase_id(deepeval_data)

# Testcases scored by both frameworks, plus the leftovers on either side.
ragas_ids = set(ragas_dict)
deepeval_ids = set(deepeval_dict)
common_ids = ragas_ids & deepeval_ids
print(f"\nüìä Testcase ID Analysis:")
print(f"  Total RAGAS: {len(ragas_dict)}")
print(f"  Total DeepEval: {len(deepeval_dict)}")
print(f"  Common IDs: {len(common_ids)}")
print(f"  RAGAS only: {len(ragas_ids - deepeval_ids)}")
print(f"  DeepEval only: {len(deepeval_ids - ragas_ids)}")

n_without_id = (len(ragas_data) - len(ragas_dict)) + (len(deepeval_data) - len(deepeval_dict))
if n_without_id:
    print(f"  Dropped (missing or duplicate testcase_id): {n_without_id}")

# Build one row per shared testcase with both frameworks' scores side by side.
score_columns = [
    'ragas_faithfulness',
    'ragas_answer_relevancy',
    'deepeval_faithfulness',
    'deepeval_answer_relevancy',
]
rows = [
    {
        'testcase_id': tc_id,
        'ragas_faithfulness': ragas_dict[tc_id].get('faithfulness'),
        'ragas_answer_relevancy': ragas_dict[tc_id].get('answer_relevancy'),
        'deepeval_faithfulness': deepeval_dict[tc_id].get('faithfulness'),
        'deepeval_answer_relevancy': deepeval_dict[tc_id].get('answer_relevancy'),
    }
    for tc_id in sorted(common_ids)
]

# Passing the columns explicitly keeps them present even when there are no
# shared testcases, so later column lookups do not raise KeyError.
comparison_df = pd.DataFrame(rows, columns=['testcase_id'] + score_columns)

# Coerce scores to numeric so missing or textual values become NaN instead of
# leaving an object-dtype column that breaks the arithmetic further down.
for score_column in score_columns:
    comparison_df[score_column] = pd.to_numeric(comparison_df[score_column], errors='coerce')

print(f"\n‚úÖ Created comparison dataframe with {len(comparison_df)} rows")
print(f"\n{comparison_df.head(10)}")

## 📊 4. Analyze Score Distributions

In [None]:
# Print pandas' describe() summary for the four score columns.
summary_columns = [
    'ragas_faithfulness',
    'ragas_answer_relevancy',
    'deepeval_faithfulness',
    'deepeval_answer_relevancy',
]
print("üìà SUMMARY STATISTICS:\n")
score_summary = comparison_df.loc[:, summary_columns].describe()
print(score_summary)

In [None]:
# One histogram per framework/metric pair on a 2x2 grid:
# faithfulness on the top row, answer relevancy on the bottom row.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Scores with missing values removed, one series per panel.
valid_ragas_faith = comparison_df['ragas_faithfulness'].dropna()
valid_deepeval_faith = comparison_df['deepeval_faithfulness'].dropna()
valid_ragas_rel = comparison_df['ragas_answer_relevancy'].dropna()
valid_deepeval_rel = comparison_df['deepeval_answer_relevancy'].dropna()

# (axes, scores, title, bar colour) for each panel.
panel_specs = [
    (axes[0, 0], valid_ragas_faith, 'RAGAS Faithfulness Distribution', 'blue'),
    (axes[0, 1], valid_deepeval_faith, 'DeepEval Faithfulness Distribution', 'green'),
    (axes[1, 0], valid_ragas_rel, 'RAGAS Answer Relevancy Distribution', 'purple'),
    (axes[1, 1], valid_deepeval_rel, 'DeepEval Answer Relevancy Distribution', 'orange'),
]

for panel_ax, panel_scores, panel_title, panel_color in panel_specs:
    panel_mean = panel_scores.mean()
    panel_ax.hist(panel_scores, bins=20, alpha=0.7, color=panel_color, edgecolor='black')
    panel_ax.set_title(panel_title, fontsize=12, fontweight='bold')
    panel_ax.set_xlabel('Score')
    panel_ax.set_ylabel('Frequency')
    panel_ax.grid(True, alpha=0.3)
    # Dashed red line marks the mean of the plotted scores.
    panel_ax.axvline(panel_mean, color='red', linestyle='--', label=f'Mean: {panel_mean:.3f}')
    panel_ax.legend()

plt.tight_layout()
plt.show()

print("‚úÖ Distributions plotted")

## 🔗 5. Correlation Analysis

In [None]:
def _correlate(pairs):
    """Return Pearson and Spearman statistics for a two-column score frame.

    The first column of *pairs* is correlated against the second. The result
    holds the two coefficients, their p-values, and the number of rows used.
    """
    first_scores, second_scores = (pairs[column] for column in pairs.columns)
    r_value, r_pvalue = pearsonr(first_scores, second_scores)
    rho_value, rho_pvalue = spearmanr(first_scores, second_scores)
    return {
        'pearson': r_value,
        'pearson_p': r_pvalue,
        'spearman': rho_value,
        'spearman_p': rho_pvalue,
        'n_valid': len(pairs),
    }


# Measure how closely the two frameworks' scores track each other, per metric.
print("üìä CORRELATION ANALYSIS:\n")

correlation_results = {}

# Faithfulness: use only testcases where both frameworks produced a score.
valid_faith = comparison_df[['ragas_faithfulness', 'deepeval_faithfulness']].dropna()
if len(valid_faith) >= 2:  # skipped when fewer than two paired scores exist
    faith_stats = _correlate(valid_faith)
    correlation_results['faithfulness'] = faith_stats
    pearson_faith, p_faith = faith_stats['pearson'], faith_stats['pearson_p']
    spearman_faith, sp_faith = faith_stats['spearman'], faith_stats['spearman_p']

    print(
        f"Faithfulness ({len(valid_faith)} valid cases):",
        f"  Pearson r = {pearson_faith:.4f} (p={p_faith:.4f})",
        f"  Spearman œÅ = {spearman_faith:.4f} (p={sp_faith:.4f})",
        "",
        sep="\n",
    )

# Answer relevancy: same procedure on the other metric.
valid_rel = comparison_df[['ragas_answer_relevancy', 'deepeval_answer_relevancy']].dropna()
if len(valid_rel) >= 2:
    rel_stats = _correlate(valid_rel)
    correlation_results['answer_relevancy'] = rel_stats
    pearson_rel, p_rel = rel_stats['pearson'], rel_stats['pearson_p']
    spearman_rel, sp_rel = rel_stats['spearman'], rel_stats['spearman_p']

    print(
        f"Answer Relevancy ({len(valid_rel)} valid cases):",
        f"  Pearson r = {pearson_rel:.4f} (p={p_rel:.4f})",
        f"  Spearman œÅ = {spearman_rel:.4f} (p={sp_rel:.4f})",
        sep="\n",
    )

# Reading guide for the coefficients printed above.
for guide_line in (
    "\nüìù Interpretation:",
    "  r > 0.7: Strong agreement",
    "  r 0.4-0.7: Moderate agreement",
    "  r < 0.4: Weak agreement",
):
    print(guide_line)

In [None]:
# Scatter each metric's RAGAS score against its DeepEval score, with the
# y = x diagonal as the "perfect agreement" reference.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (axes, paired scores, key in correlation_results, RAGAS column,
#  DeepEval column, display label, marker colour)
scatter_specs = [
    (axes[0], valid_faith, 'faithfulness',
     'ragas_faithfulness', 'deepeval_faithfulness', 'Faithfulness', 'blue'),
    (axes[1], valid_rel, 'answer_relevancy',
     'ragas_answer_relevancy', 'deepeval_answer_relevancy', 'Answer Relevancy', 'green'),
]

for ax, pairs, metric_key, ragas_col, deepeval_col, label, color in scatter_specs:
    if pairs.empty:
        continue  # nothing to plot for this metric

    ax.scatter(pairs[ragas_col], pairs[deepeval_col], alpha=0.6, s=100, color=color)
    ax.plot([0, 1], [0, 1], 'r--', label='Perfect agreement', linewidth=2)
    ax.set_xlabel(f'RAGAS {label}', fontsize=12)
    ax.set_ylabel(f'DeepEval {label}', fontsize=12)

    # The correlation is only computed when at least two paired scores exist,
    # so read it from correlation_results instead of the pearson_* globals:
    # with a single pair those globals are undefined (NameError) or hold a
    # stale value from an earlier run.
    metric_stats = correlation_results.get(metric_key)
    r_text = f"{metric_stats['pearson']:.3f}" if metric_stats else 'n/a'
    ax.set_title(f'{label}: r={r_text}', fontsize=14, fontweight='bold')

    ax.legend()
    ax.grid(True, alpha=0.3)
    # Small margin around the [0, 1] range so edge points are not clipped.
    ax.set_xlim(-0.05, 1.05)
    ax.set_ylim(-0.05, 1.05)

plt.tight_layout()
plt.show()

print("‚úÖ Correlation scatter plots generated")

## ⚠️ 6. Identify Disagreement Cases (High Discrepancy)

In [None]:
# Absolute per-metric gap between the two frameworks for every testcase.
comparison_df['faith_diff'] = (
    comparison_df['ragas_faithfulness'] - comparison_df['deepeval_faithfulness']
).abs()
comparison_df['rel_diff'] = (
    comparison_df['ragas_answer_relevancy'] - comparison_df['deepeval_answer_relevancy']
).abs()
# Row-wise maximum of the two gaps. pandas skips NaN here by default, so a
# testcase missing one metric is judged on the other.
comparison_df['max_diff'] = comparison_df[['faith_diff', 'rel_diff']].max(axis=1)

# Testcases whose largest gap exceeds the threshold, worst first.
threshold = 0.3
high_disc = (
    comparison_df[comparison_df['max_diff'] > threshold]
    .copy()
    .sort_values('max_diff', ascending=False)
)

# Guard the percentage: an empty comparison frame (no shared testcases)
# would otherwise raise ZeroDivisionError.
n_compared = len(comparison_df)
high_disc_share = (len(high_disc) / n_compared * 100) if n_compared else 0.0

print(f"‚ö†Ô∏è DISCREPANCY ANALYSIS (threshold > {threshold}):\n")
print(f"  High-discrepancy cases: {len(high_disc)} / {n_compared}")
print(f"  Percentage: {high_disc_share:.1f}%\n")

if len(high_disc) > 0:
    print("üìã Top 10 Most Discrepant Cases:\n")
    display_cols = ['testcase_id', 'ragas_faithfulness', 'deepeval_faithfulness',
                    'faith_diff', 'ragas_answer_relevancy', 'deepeval_answer_relevancy',
                    'rel_diff', 'max_diff']
    print(high_disc[display_cols].head(10).to_string())
else:
    print("‚úÖ No high-discrepancy cases found!")

In [None]:
# Summary numbers for the per-testcase maximum gap.
largest_gap = comparison_df['max_diff']
print("\nüìä DISCREPANCY STATISTICS:\n")
print(f"Max difference observed: {largest_gap.max():.4f}")
print(f"Mean difference: {largest_gap.mean():.4f}")
print(f"Median difference: {largest_gap.median():.4f}")
print(f"Std deviation: {largest_gap.std():.4f}")

# Histogram of the absolute gap for each metric, with the threshold marked.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))

# (axes, difference column, title, bar colour) for each panel.
diff_panels = [
    (axes[0], 'faith_diff', 'Distribution of Faithfulness Differences', 'blue'),
    (axes[1], 'rel_diff', 'Distribution of Answer Relevancy Differences', 'green'),
]
for diff_ax, diff_column, diff_title, diff_color in diff_panels:
    diff_ax.hist(comparison_df[diff_column].dropna(), bins=20, alpha=0.7,
                 color=diff_color, edgecolor='black')
    diff_ax.set_title(diff_title, fontsize=12, fontweight='bold')
    diff_ax.set_xlabel('|RAGAS - DeepEval|')
    diff_ax.set_ylabel('Frequency')
    # Dashed red line shows the high-discrepancy cut-off.
    diff_ax.axvline(threshold, color='red', linestyle='--', linewidth=2,
                    label=f'Threshold: {threshold}')
    diff_ax.grid(True, alpha=0.3)
    diff_ax.legend()

plt.tight_layout()
plt.show()

print("‚úÖ Discrepancy analysis plotted")

## 📝 7. Generate Comprehensive Report

In [None]:
# Assemble the plain-text meta-evaluation report; each list entry is one
# output line (entries starting with "\n" add a blank line before themselves).
n_report_cases = len(comparison_df)
n_report_high_disc = len(high_disc)
# Guard the percentage: with no shared testcases the division would raise
# ZeroDivisionError.
report_high_disc_pct = (
    n_report_high_disc / n_report_cases * 100 if n_report_cases else 0.0
)

report = [
    "=" * 80,
    "META-EVALUATION REPORT: RAGAS vs DeepEval",
    "=" * 80,
    "\nüìä DATASET OVERVIEW:",
    f"  Total testcases analyzed: {n_report_cases}",
    "\nüîó CORRELATION ANALYSIS:",
]

# One sub-section per metric that has a computed correlation.
for metric, stats in correlation_results.items():
    metric_label = metric.replace('_', ' ').title()
    report.append(f"\n  {metric_label}:")
    report.append(f"    Pearson r = {stats['pearson']:.4f} (p={stats['pearson_p']:.4f})")
    report.append(f"    Spearman œÅ = {stats['spearman']:.4f} (p={stats['spearman_p']:.4f})")
    report.append(f"    Valid cases: {stats['n_valid']}")

    # Same cut-offs as the interpretation guide: > 0.7 strong, > 0.4 moderate.
    # A NaN or negative coefficient falls through to "Weak".
    if stats['pearson'] > 0.7:
        strength = "Strong"
    elif stats['pearson'] > 0.4:
        strength = "Moderate"
    else:
        strength = "Weak"
    report.append(f"    ‚Üí {strength} agreement between frameworks")

report.append(f"\n‚ö†Ô∏è DISCREPANCY ANALYSIS (threshold > {threshold}):")
report.append(f"  High-discrepancy cases: {n_report_high_disc} ({report_high_disc_pct:.1f}%)")

if n_report_high_disc > 0:
    report.append("\n  Top 5 Most Discrepant Cases:")
    for idx, (_, row) in enumerate(high_disc.head(5).iterrows(), 1):
        report.append(f"\n    {idx}. Testcase: {row['testcase_id']}")
        report.append(f"       Faithfulness: RAGAS={row['ragas_faithfulness']:.3f} vs DeepEval={row['deepeval_faithfulness']:.3f} (diff={row['faith_diff']:.3f})")
        report.append(f"       Answer Relevancy: RAGAS={row['ragas_answer_relevancy']:.3f} vs DeepEval={row['deepeval_answer_relevancy']:.3f} (diff={row['rel_diff']:.3f})")

# Claim overall strong agreement only when every computed metric is strongly
# correlated. Checking faithfulness alone could contradict the per-metric
# verdicts listed above.
frameworks_agree = bool(correlation_results) and all(
    metric_stats['pearson'] > 0.7 for metric_stats in correlation_results.values()
)

report.extend([
    "\nüí° RECOMMENDATIONS:",
    "  1. Framework Agreement:",
    f"     - {'‚úÖ Both frameworks show strong agreement' if frameworks_agree else '‚ö†Ô∏è Frameworks show weak to moderate agreement - consider ensemble approach'}",
    "\n  2. For Production:",
    "     - Use RAGAS for RAG-specific metrics (Faithfulness, Context Recall)",
    "     - Use DeepEval for general LLM quality (Relevancy, Toxicity, Bias)",
    "     - Ensemble approach: average scores when frameworks disagree",
    "\n  3. Handle Disagreements:",
    f"     - Review {n_report_high_disc} high-discrepancy cases manually",
    "     - Validate with human annotations (A/B testing)",
    "     - Use disagreement cases as training signal for active learning",
    "\n  4. Next Steps:",
    "     - Build ensemble evaluator combining both metrics",
    "     - Create feedback loop for continuous calibration",
    "     - Scale evaluation to full production dataset",
    "\n" + "=" * 80,
])

report_text = "\n".join(report)
print(report_text)

In [None]:
def _json_safe(value):
    """Return *value* in a form that strict JSON can represent.

    NumPy scalars are unwrapped to plain Python scalars, and non-finite
    floats (NaN, +/-inf) become None so they serialize as ``null``.
    json.dump would otherwise emit the bare token ``NaN``, which is not
    valid JSON and is rejected by strict parsers.
    """
    if isinstance(value, np.generic):
        value = value.item()
    if isinstance(value, float) and not np.isfinite(value):
        return None
    return value


# Columns exported for every high-discrepancy testcase.
exported_case_columns = [
    'testcase_id',
    'ragas_faithfulness',
    'deepeval_faithfulness',
    'faith_diff',
    'ragas_answer_relevancy',
    'deepeval_answer_relevancy',
    'rel_diff',
]

# Guard the percentage: an empty comparison frame would otherwise raise
# ZeroDivisionError.
n_saved_cases = len(comparison_df)
saved_high_disc_pct = (
    round(len(high_disc) / n_saved_cases * 100, 2) if n_saved_cases else 0.0
)

# Machine-readable counterpart of the text report.
report_data = {
    'summary': {
        'n_testcases': n_saved_cases,
        'n_high_discrepancy': len(high_disc),
        'discrepancy_threshold': threshold,
        'high_discrepancy_percentage': saved_high_disc_pct,
    },
    'correlations': {
        metric: {
            field: _json_safe(stats.get(field))
            for field in ('pearson', 'pearson_p', 'spearman', 'spearman_p', 'n_valid')
        }
        for metric, stats in correlation_results.items()
    },
    # A case can exceed the threshold on one metric while the other metric is
    # missing, so individual scores and diffs may be NaN and need sanitising.
    'high_discrepancy_cases': [
        {column: _json_safe(cell) for column, cell in record.items()}
        for record in high_disc[exported_case_columns].to_dict('records')
    ],
    'recommendations': [
        "Use RAGAS for RAG-specific metrics",
        "Use DeepEval for general LLM quality",
        "Ensemble approach: average scores when frameworks disagree",
        "Review high-discrepancy cases manually",
        "Validate with human annotations",
    ],
}

# Write the report next to the input result files, creating the directory if needed.
output_path = Path('../data/open_rag_eval_results_raptor/meta_comparison_report.json')
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, 'w', encoding='utf-8') as f:
    # allow_nan=False makes json raise instead of silently writing NaN/Infinity.
    json.dump(report_data, f, indent=2, ensure_ascii=False, allow_nan=False)

print(f"\n‚úÖ Report saved to {output_path}")

## ✅ Summary

**Phần meta-evaluation đã complete:**
- ✅ Loaded RAGAS & DeepEval results
- ✅ Computed Pearson & Spearman correlations
- ✅ Found high-discrepancy cases
- ✅ Generated visualizations (scatter plots, distributions, difference histograms)
- ✅ Created comprehensive report with recommendations

**Kết quả được lưu:**
- JSON report: `data/open_rag_eval_results_raptor/meta_comparison_report.json`

**Bước tiếp theo:**
1. Review high-discrepancy cases manually
2. Validate với human annotations
3. Build ensemble evaluator nếu cần
4. Scale evaluation tới full dataset