# Lab 5 — Evaluating LLMs with Ollama

In [None]:
import time
import json
import re
from pathlib import Path
from datetime import datetime, timezone

import pandas as pd
import numpy as np
from tqdm import trange, tqdm
from ollama import Client
import matplotlib.pyplot as plt
import seaborn as sns

from eval_utils import (
    validate_models,
    should_skip_cot,
    preview_experiments,
    zero_shot_prompt,
    few_shot_prompt,
    cot_prompt,
    exact_match,
    query_chat,
    query_generate,
    save_prompt_run,
    run_evaluation,
    token_overlap,
    compute_metrics,
    get_judge_score,
    run_judge_evaluation,
)

In [None]:
# Fix NumPy's global random seed so any NumPy-based randomness is repeatable between runs.
np.random.seed(seed=42)

In [None]:
# Directory layout: everything lives beneath the (resolved) current working
# directory, which doubles as the lab directory.
ROOT = LAB_DIR = Path(".").resolve()

# Folder that receives the artefacts written by this notebook; created on demand.
OUTPUT_DIR = LAB_DIR.joinpath("outputs")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Folder scanned for per-task JSON definitions; created on demand as well.
TASKS_DIR = LAB_DIR.joinpath('tasks')
TASKS_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Connect to the Ollama server (a bare Client() defaults to http://localhost:11434)
# and ask it which models are available.
client = Client()
models_resp = client.list()

# Print one bullet per model reported by the server.
print("Available models:")
for model_entry in models_resp.models:
    print(" -", model_entry.model)


In [None]:
# Load per-task JSON files from the tasks directory (the notebook itself
# contains no default task definitions).
task_files = sorted(TASKS_DIR.glob('*.json'))

# TASKS_DIR is created automatically when it is missing, so an empty folder
# (for example when the notebook is started from the wrong working directory)
# would otherwise go unnoticed until a later cell fails. Say so explicitly.
if not task_files:
    print(f"WARNING: no task definition files (*.json) found in {TASKS_DIR}")

# TASKS holds the task metadata in file-name order; the two dictionaries map
# a task id to its development examples and to its evaluation example.
TASKS = []
DEV_EXAMPLES = {}
EVAL_EXAMPLES = {}

for task_path in task_files:
    task_def = json.loads(task_path.read_text(encoding='utf-8'))
    task_id = task_def['id']  # 'id' and 'name' are mandatory keys (KeyError if absent)
    TASKS.append({
        'id': task_id,
        'name': task_def['name'],
        'description': task_def.get('description', ''),
        'eval_criteria': task_def.get('eval_criteria', ''),
    })
    # Optional keys fall back to empty containers.
    DEV_EXAMPLES[task_id] = task_def.get('dev_examples', [])
    EVAL_EXAMPLES[task_id] = task_def.get('eval_example', {})

print(f"Loaded {len(TASKS)} tasks.")

In [None]:
# Runtime querying helpers are provided by `lab5/eval_utils.py` and imported into the notebook.
# Functions available: `query_chat`, `query_generate`

In [None]:
# Runtime experiment loop helpers are provided by `lab5/eval_utils.py` and imported into the notebook.
# Use `run_evaluation(models, strategies, tasks, examples, client, output_dir)` to run experiments and persist outputs.

In [None]:
# Metrics helpers are provided by `lab5/eval_utils.py` and imported into the notebook.
# Functions available: `token_overlap`, `compute_metrics`

In [None]:
# Models and prompting strategies that make up the experiment grid.
selected_models = [
    'SpeakLeash/bielik-1.5b-v3.0-instruct:Q8_0',
    'ministral-3:3b',
    'deepseek-r1:7b',
]
strategies = ['zero', 'few', 'cot']

# Execute the evaluation.
# This will iterate through 10 tasks, 3 models, and up to 3 strategies per model.
# Total expected runs: 10 * (2*3 + 1*2) = 10 * 8 = 80 runs.
# NOTE(review): the "one model runs only 2 strategies" part presumably comes
# from `should_skip_cot` inside eval_utils — confirm there.
results = run_evaluation(
    models=selected_models,
    strategies=strategies,
    tasks=TASKS,
    examples=EVAL_EXAMPLES,
    client=client,
    output_dir=OUTPUT_DIR,
    save_prefix='lab5_experiment',
    dev_examples=DEV_EXAMPLES,
    tasks_dir=TASKS_DIR,
)

# Compute metrics: `compute_metrics` returns the per-run results frame and an
# aggregated metrics frame (per the helper's description it adds 'expected',
# 'exact_match' and 'overlap_score' to the per-run results).
df_results, agg_metrics = compute_metrics(results, EVAL_EXAMPLES)

print("\n--- Evaluation Summary ---")
# This cell pivots on 'accuracy' while the plotting cell of this notebook reads
# 'avg_accuracy' from the same frame. Use whichever column the aggregated
# frame actually provides, preferring the original name.
accuracy_col = next(
    (name for name in ('accuracy', 'avg_accuracy') if name in agg_metrics.columns),
    'accuracy',
)
# Pivot the results for a better view: tasks as rows, (model, strategy) as columns.
summary_pivot = agg_metrics.pivot_table(
    index=['task_id', 'task_name'],
    columns=['model', 'strategy'],
    values=accuracy_col,
)
display(summary_pivot)

# Save the final aggregated metrics to a file for the report.
results_csv = OUTPUT_DIR / "final_experiment_results.csv"
agg_metrics.to_csv(results_csv, index=False)
print(f"\nFinal results saved to {results_csv}")

### LLM-as-a-Judge Evaluation
Since `exact_match` is too strict for open-ended tasks, we use a reasoning model (`deepseek-r1:7b`) to evaluate the quality of the responses based on the task criteria. 

The judge will assign a score from 0 to 5. We then normalize this to a 0-1 scale.

In [None]:
# Judge-evaluation helpers are provided by `lab5/eval_utils.py` and imported into the notebook.
# Use `get_judge_score(task, prompt, response, client)` and `run_judge_evaluation(results_df, tasks, client)`

In [None]:
# Judge-score visualisations built from the aggregated metrics frame.
# Column names are checked up front so that a missing judge evaluation fails
# with an actionable message instead of an error raised inside the plotting call.
judge_col = 'avg_judge_score'
if judge_col not in agg_metrics.columns:
    raise KeyError(
        f"agg_metrics has no '{judge_col}' column; judge scores must be "
        "added to the aggregated metrics before this cell is run."
    )
# The summary cell pivots on 'accuracy' while this cell reads 'avg_accuracy';
# use whichever of the two the frame actually provides.
accuracy_col = next(
    (name for name in ('avg_accuracy', 'accuracy') if name in agg_metrics.columns),
    'avg_accuracy',
)

# Visualizing the impact of prompting strategies using Judge Scores
plt.figure(figsize=(12, 6))
sns.barplot(data=agg_metrics, x='strategy', y=judge_col, hue='model')
plt.title('Impact of Prompting Strategy (Judge Score) across Models')
plt.ylabel('Average Judge Score (Normalized 0-1)')
plt.xlabel('Prompting Strategy')
# Legend is placed outside the axes so it does not cover the bars.
plt.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig(OUTPUT_DIR / "strategy_comparison_judge.png")
plt.show()

# Performance by Task Type using Judge Scores
plt.figure(figsize=(14, 7))
sns.barplot(data=agg_metrics, x='task_name', y=judge_col, hue='model')
plt.title('Model Performance by Task Type (Judge Score)')
# Task names are long; rotate the tick labels to keep them readable.
plt.xticks(rotation=45, ha='right')
plt.ylabel('Average Judge Score (Normalized 0-1)')
plt.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig(OUTPUT_DIR / "task_performance_judge.png")
plt.show()

# Distribution Comparison: reshape to long form (one row per model/metric
# pair) so both metrics can be drawn side by side. This figure is only shown,
# not written to disk.
plt.figure(figsize=(10, 5))
melted_metrics = agg_metrics.melt(
    id_vars=['model'],
    value_vars=[accuracy_col, judge_col],
    var_name='Metric',
    value_name='Score',
)
sns.boxplot(data=melted_metrics, x='Metric', y='Score')
plt.title('Exact Match Accuracy vs. LLM Judge Score')
plt.show()