In [None]:
from pathlib import Path

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Set up paths
# Every location is anchored to the current working directory, resolved to an
# absolute path so later cells do not depend on the process cwd staying put.
ROOT = Path(".").resolve()
OUTPUT_DIR = ROOT / "outputs"            # destination for saved figures
ANNOTATIONS_DIR = ROOT / "annotations"   # source of the annotation CSVs

# Writing a figure into a missing folder raises FileNotFoundError, so make sure
# the output folder exists up front (no-op when it is already there).
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Load latest annotation file
# Matches are ordered by filename, so "latest" means the lexicographically last
# match. NOTE(review): presumably the filename suffix is a sortable timestamp
# or counter; confirm against whatever writes these files.
annotation_files = sorted(ANNOTATIONS_DIR.glob('annotations_lab5_experiment_*.csv'))
if not annotation_files:
    # Fail with an actionable message instead of a bare IndexError from [-1].
    raise FileNotFoundError(
        "No annotation files matching 'annotations_lab5_experiment_*.csv' "
        f"were found in {ANNOTATIONS_DIR}"
    )
latest_anno = annotation_files[-1]
df = pd.read_csv(latest_anno)

# Filter out un-scored rows if any
# .copy() detaches the filtered frame so the column assignments below write to
# an independent DataFrame rather than a slice of the one read from disk.
df = df[df['score'].notna()].copy()
df['score'] = df['score'].astype(float)
# Rescale into a fraction of the top score; assumes a 5-point scale (TODO confirm).
df['normalized_score'] = df['score'] / 5.0

In [None]:
# Aggregate metrics
# One output row per (task_id, task_name, model, strategy) combination, holding
# the mean normalized score and the number of rows in that group.
group_keys = ['task_id', 'task_name', 'model', 'strategy']
agg_metrics = (
    df.groupby(group_keys)
    .agg(
        avg_score=pd.NamedAgg(column='normalized_score', aggfunc='mean'),
        n=pd.NamedAgg(column='score', aggfunc='size'),
    )
    .reset_index()  # turn the group keys back into ordinary columns for plotting
)

In [None]:
# Visualizing the impact of prompting strategies
# Grouped bar chart: strategies along the x-axis, one bar colour per model.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=agg_metrics, x='strategy', y='avg_score', hue='model', ax=ax)
ax.set_title('Impact of Prompting Strategy (Manual Score) across Models')
ax.set_xlabel('Prompting Strategy')
ax.set_ylabel('Average Score (Normalized 0-1)')
# Anchor the legend's upper-left corner just past the right edge of the axes
# so it sits beside the bars instead of on top of them.
ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
fig.savefig(OUTPUT_DIR / "strategy_comparison_manual.png")
plt.show()

In [None]:
# Performance by Task Type
# Grouped bar chart: task names along the x-axis, one bar colour per model.
fig, ax = plt.subplots(figsize=(14, 7))
sns.barplot(data=agg_metrics, x='task_name', y='avg_score', hue='model', ax=ax)
ax.set_title('Model Performance by Task Type (Manual Score)')
# Slant the task names and right-align them so each label ends under its tick.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
# Give the x-axis a readable label instead of the raw column name 'task_name'.
ax.set_xlabel('Task Type')
ax.set_ylabel('Average Score (Normalized 0-1)')
# Anchor the legend's upper-left corner just past the right edge of the axes
# so it sits beside the bars instead of on top of them.
ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
fig.savefig(OUTPUT_DIR / "task_performance_manual.png")
plt.show()