# Lab 5 — Evaluating LLMs with Ollama

In [None]:
import json
from pathlib import Path

import numpy as np
from ollama import Client
import matplotlib.pyplot as plt
import seaborn as sns

from eval_utils import (
    run_evaluation,
    compute_metrics,
    run_judge_evaluation,
)

In [None]:
# Seed NumPy's global (legacy) random number generator with a fixed value.
np.random.seed(seed=42)

In [None]:
# Directory layout: everything is anchored at the resolved current working
# directory, with `outputs/` and `tasks/` as direct children.
ROOT = Path(".").resolve()
LAB_DIR = ROOT
OUTPUT_DIR = LAB_DIR.joinpath("outputs")
TASKS_DIR = LAB_DIR.joinpath("tasks")

# Create both folders up front; existing folders are left untouched.
for _directory in (OUTPUT_DIR, TASKS_DIR):
    _directory.mkdir(parents=True, exist_ok=True)

In [None]:
# Connect to the Ollama server (Client() defaults to http://localhost:11434)
# and ask it which models it currently has.
client = Client()
models_resp = client.list()

# Print one bullet per model reported by the server.
print("Available models:")
for model_entry in models_resp.models:
    print(" -", model_entry.model)



In [None]:
# Discover the per-task JSON files in `tasks/`, in sorted path order.
task_files = sorted(TASKS_DIR.glob('*.json'))

# TASKS holds one metadata dict per file; the two example mappings are keyed
# by the task's `id`.
TASKS, DEV_EXAMPLES, EVAL_EXAMPLES = [], {}, {}

for task_path in task_files:
    spec = json.loads(task_path.read_text(encoding='utf-8'))
    task_id = spec['id']  # required key, as is `name`; the rest are optional

    TASKS.append({
        'id': task_id,
        'name': spec['name'],
        'description': spec.get('description', ''),
        'eval_criteria': spec.get('eval_criteria', ''),
    })
    DEV_EXAMPLES[task_id] = spec.get('dev_examples', [])
    # NOTE(review): this key is singular and defaults to a dict, unlike
    # `dev_examples` -- confirm against the task file schema.
    EVAL_EXAMPLES[task_id] = spec.get('eval_example', {})

print(f"Loaded {len(TASKS)} tasks.")

In [None]:
# Experiment grid: the models and prompting strategies passed to run_evaluation.
strategies = ['zero', 'few', 'cot']
selected_models = [
    'smollm:1.7b',    # Small model
    'magistral:24b',  # Large model with reasoning capabilities
]

# Run the evaluation. `run_evaluation` comes from eval_utils; what it writes
# under `output_dir` / `save_prefix` is not visible from this notebook.
results = run_evaluation(
    client=client,
    models=selected_models,
    strategies=strategies,
    tasks=TASKS,
    tasks_dir=TASKS_DIR,
    examples=EVAL_EXAMPLES,
    dev_examples=DEV_EXAMPLES,
    output_dir=OUTPUT_DIR,
    save_prefix='lab5_experiment',
)

# Turn the raw results into a results frame plus aggregated metrics.
df_results, agg_metrics = compute_metrics(results, EVAL_EXAMPLES)

# Show accuracy as a table: one row per task, one column per (model, strategy).
print("\nEvaluation Summary:")
pivot_rows = ['task_id', 'task_name']
pivot_columns = ['model', 'strategy']
summary_pivot = agg_metrics.pivot_table(
    values='accuracy',
    index=pivot_rows,
    columns=pivot_columns,
)
display(summary_pivot)

# Persist the aggregated metrics (without the index) for the report.
agg_metrics.to_csv(OUTPUT_DIR / "final_experiment_results.csv", index=False)
print(f"\nFinal results saved to {OUTPUT_DIR / 'final_experiment_results.csv'}")

In [None]:
# Make sure `df_results` carries a `normalized_judge_score` column, then fold
# the per-(task, model, strategy) mean judge score into `agg_metrics` so the
# plotting cell can use `avg_judge_score` and `avg_accuracy`.
#
# This cell is safe to re-run: a previously merged `avg_judge_score` column is
# dropped before merging again.
if 'normalized_judge_score' in df_results.columns:
    print("Using existing judge scores")
else:
    print("No judge scores found. Running judge evaluation (may be slow)...")
    # You can change the judge_model or run this on a subset to save time
    df_results = run_judge_evaluation(df_results, TASKS, client, judge_model='deepseek-r1:7b')

# Aggregate judge scores (normalized 0-1) per task/model/strategy.
# `n_judge` uses 'size', i.e. the row count per group including rows whose
# score is NaN.
group_keys = ['task_id', 'task_name', 'model', 'strategy']
agg_judge = (
    df_results.groupby(group_keys)
    .agg(
        avg_judge_score=('normalized_judge_score', 'mean'),
        n_judge=('normalized_judge_score', 'size'),
    )
    .reset_index()
)

# Merge with the existing aggregated metrics. Without dropping a stale
# `avg_judge_score` first, a second run of this cell would make pandas emit
# `avg_judge_score_x` / `avg_judge_score_y` and the lookup below would raise
# KeyError.
merge_keys = ['task_id', 'model', 'strategy']
agg_metrics = agg_metrics.drop(columns=['avg_judge_score'], errors='ignore').merge(
    agg_judge[merge_keys + ['avg_judge_score']],
    on=merge_keys,
    how='left',
)

# Rows with no judge score are plotted as 0.0; report how many were filled so
# the substitution is not silent.
missing_judge = int(agg_metrics['avg_judge_score'].isna().sum())
if missing_judge:
    print(f"Warning: {missing_judge} row(s) have no judge score; filling with 0.0.")
agg_metrics['avg_judge_score'] = agg_metrics['avg_judge_score'].fillna(0.0)

# Ensure the `avg_accuracy` column expected by the plotting cell exists:
# mirror `accuracy` when present, otherwise keep an existing `avg_accuracy`.
if 'accuracy' in agg_metrics.columns:
    agg_metrics['avg_accuracy'] = agg_metrics['accuracy']
elif 'avg_accuracy' not in agg_metrics.columns:
    raise KeyError("agg_metrics has neither an 'accuracy' nor an 'avg_accuracy' column")

print('Aggregated judge scores added to `agg_metrics`.')

In [None]:
# Figure 1: judge score per prompting strategy, one bar colour per model.
# Saved to OUTPUT_DIR as strategy_comparison_judge.png.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=agg_metrics, x='strategy', y='avg_judge_score', hue='model', ax=ax)
ax.set_title('Impact of Prompting Strategy (Judge Score) across Models')
ax.set_xlabel('Prompting Strategy')
ax.set_ylabel('Average Judge Score (Normalized 0-1)')
# Anchor the legend outside the axes, to the right of the plot area.
ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
fig.savefig(OUTPUT_DIR / "strategy_comparison_judge.png")
plt.show()

# Figure 2: judge score per task, one bar colour per model.
# Saved to OUTPUT_DIR as task_performance_judge.png.
fig, ax = plt.subplots(figsize=(14, 7))
sns.barplot(data=agg_metrics, x='task_name', y='avg_judge_score', hue='model', ax=ax)
ax.set_title('Model Performance by Task Type (Judge Score)')
# Rotate the task-name tick labels and right-align them under their ticks.
plt.xticks(rotation=45, ha='right')
ax.set_ylabel('Average Judge Score (Normalized 0-1)')
ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
fig.savefig(OUTPUT_DIR / "task_performance_judge.png")
plt.show()

# Figure 3: box plots comparing the distributions of the two metrics.
# This figure is only shown, not written to disk.
fig, ax = plt.subplots(figsize=(10, 5))
# Reshape to long form: one row per (model, metric) pair with its score.
melted_metrics = agg_metrics.melt(
    id_vars=['model'],
    value_vars=['avg_accuracy', 'avg_judge_score'],
    var_name='Metric',
    value_name='Score',
)
sns.boxplot(data=melted_metrics, x='Metric', y='Score', ax=ax)
ax.set_title('Exact Match Accuracy vs. LLM Judge Score')
plt.show()