# Evaluation Results Analysis

Analyze and visualize LLM evaluation results from the harness.

In [None]:
import sys
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path().absolute().parent / 'src'))

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from llm_eval.metrics.storage import MetricsStorage
from llm_eval.metrics.exporters import CSVExporter

# Plot defaults for this notebook: seaborn's 'whitegrid' style and a 12x6
# default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## Load Recent Runs

In [None]:
# Open the stored evaluation results.
storage = MetricsStorage('../data/metrics/eval_results.db')

# List recent runs (limit=20).
runs = storage.list_runs(limit=20)

# One row per run. The timestamp column is parsed only when it exists: with
# no runs, pd.DataFrame(runs) has no columns at all and indexing 'timestamp'
# would raise KeyError before the "Loaded 0 runs" message could be shown.
runs_df = pd.DataFrame(runs)
if 'timestamp' in runs_df.columns:
    runs_df['timestamp'] = pd.to_datetime(runs_df['timestamp'])

print(f"Loaded {len(runs_df)} runs")
runs_df.head()

## Aggregate Metrics by Model

In [None]:
# Build one flat record per run: the run's aggregate metrics plus the fields
# that identify the run.
#
# Each metrics dict is copied (via ** unpacking) instead of being updated in
# place. The dicts in the 'aggregate_metrics' column are the same objects held
# by `runs_df`, so in-place assignment would add run_id/model/task/timestamp
# keys to the stored aggregate metrics themselves.
# Assumes every 'aggregate_metrics' value is a dict - TODO confirm against
# MetricsStorage.list_runs.
metrics_rows = [
    {
        **row['aggregate_metrics'],
        'run_id': row['run_id'],
        'model': row['model_name'],
        'task': row['task_name'],
        'timestamp': row['timestamp'],
    }
    for _, row in runs_df.iterrows()
]

metrics_df = pd.DataFrame(metrics_rows)
metrics_df.head()

## Visualize Accuracy by Model

In [None]:
# Bar chart of mean accuracy per model, highest first. Skipped with a message
# when the metrics table has no 'accuracy' column.
if 'accuracy' not in metrics_df.columns:
    print("No accuracy metric found")
else:
    plt.figure(figsize=(12, 6))

    # Average over all runs of each model, then rank descending.
    mean_by_model = metrics_df.groupby('model')['accuracy'].mean()
    model_accuracy = mean_by_model.sort_values(ascending=False)
    model_accuracy.plot(kind='bar')

    plt.title('Average Accuracy by Model', fontsize=14)
    plt.xlabel('Model')
    plt.ylabel('Accuracy')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

## Model Comparison (JSON Task)

In [None]:
# Compare models on the 'json' task: mean accuracy and mean parse rate shown
# side by side.
#
# An empty metrics table has no 'task' column at all, so that case is routed
# to the "no results" message instead of raising KeyError on the filter.
if 'task' in metrics_df.columns:
    json_results = metrics_df[metrics_df['task'] == 'json']
else:
    json_results = metrics_df.iloc[0:0]

if len(json_results) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # (metric column, panel title, y-axis label), left panel first.
    panels = [
        ('accuracy', 'JSON Extraction Accuracy', 'Accuracy'),
        ('parse_rate', 'JSON Parse Success Rate', 'Parse Rate'),
    ]
    for ax, (column, title, ylabel) in zip(axes, panels):
        if column not in json_results.columns:
            # Hide the panel rather than leave blank, untitled axes behind.
            ax.set_visible(False)
            continue
        json_results.groupby('model')[column].mean().plot(kind='bar', ax=ax)
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.set_ylim(0, 1)  # fixed 0-1 axis so both panels share a scale

    plt.tight_layout()
    plt.show()
else:
    print("No JSON task results found")

## Export Data for Further Analysis

In [None]:
# Write a CSV summary of the stored runs so it can be used outside this
# notebook.
exporter = CSVExporter(storage)

summary_path = '../data/exports/summary.csv'
num_exported = exporter.export_summary(summary_path, limit=20)
print(f"Exported {num_exported} runs to summary.csv")

# Optional: uncomment to also export the detailed results of the first run
# in `runs`.
# if len(runs) > 0:
#     latest_run = runs[0]['run_id']
#     exporter.export_run(latest_run, f'../data/exports/{latest_run}.csv')
#     print(f"Exported detailed results for {latest_run}")

## Sample-Level Analysis

In [None]:
# Sample-level results for a single run: the first entry of `runs`.
# NOTE(review): treating runs[0] as the latest run presumes list_runs returns
# newest first - confirm against MetricsStorage.
if len(runs) > 0:
    latest_run_id = runs[0]['run_id']
    samples = storage.get_sample_results(latest_run_id)

    samples_df = pd.DataFrame(samples)

    print(f"Latest run: {latest_run_id}")
    print(f"Total samples: {len(samples_df)}")

    # A run with no stored samples yields a DataFrame without columns, so
    # 'scores' is checked before use instead of raising KeyError.
    if 'scores' in samples_df.columns:
        # Expand the per-sample score entries into their own table
        # (presumably one dict of score name -> value per sample; verify).
        scores_df = pd.DataFrame(samples_df['scores'].tolist())

        print("\nScore distributions:")
        print(scores_df.describe())
    else:
        print("\nNo sample scores available for this run")
else:
    print("No runs available for sample analysis")