# Lab 5 — Evaluating LLMs with Ollama

In [None]:
import json
from pathlib import Path

import numpy as np
from ollama import Client

from eval_utils import run_evaluation

In [None]:
# Seed NumPy's global random state with a fixed value so that anything drawing
# from `np.random` in this session is repeatable from run to run.
# NOTE(review): whether model sampling is affected by this seed is not visible
# here — confirm in eval_utils.
np.random.seed(42)

In [None]:
# Directory layout: everything is anchored at the resolved current directory.
ROOT = Path(".").resolve()
LAB_DIR = ROOT
OUTPUT_DIR, TASKS_DIR = LAB_DIR / "outputs", LAB_DIR / 'tasks'

# Create both working directories up front (a no-op when they already exist).
for directory in (OUTPUT_DIR, TASKS_DIR):
    directory.mkdir(parents=True, exist_ok=True)

In [None]:
# Create a client
# `Client()` is built with no arguments, so the library's default connection
# settings apply (presumably a locally running Ollama server — confirm for
# your setup).
client = Client()
# Ask the server for its model listing. The response is kept in `models_resp`
# but is not read again by any code visible in this section.
# NOTE(review): presumably a connectivity / model-availability check — confirm.
models_resp = client.list()


In [None]:
# Read every task definition (one JSON file per task) found in `tasks/`.
# Paths are sorted, so the order of TASKS is stable between runs.
task_files = sorted(TASKS_DIR.glob('*.json'))

TASKS = []          # task metadata dicts, in file order
DEV_EXAMPLES = {}   # task id -> value of the file's 'dev_examples' key (default [])
EVAL_EXAMPLES = {}  # task id -> value of the file's 'eval_example' key (default {})

for task_path in task_files:
    spec = json.loads(task_path.read_text(encoding='utf-8'))
    # 'id' and 'name' are required (KeyError if absent); the rest fall back
    # to empty defaults.
    task_id = spec['id']
    TASKS.append({
        'id': task_id,
        'name': spec['name'],
        'description': spec.get('description', ''),
        'eval_criteria': spec.get('eval_criteria', ''),
    })
    DEV_EXAMPLES[task_id] = spec.get('dev_examples', [])
    EVAL_EXAMPLES[task_id] = spec.get('eval_example', {})

In [None]:
# Experiment configuration: the models to query and the strategies to apply.
selected_models = [
    'smollm:1.7b',    # small model
    'magistral:24b',  # large reasoning model
]
# NOTE(review): presumably zero-shot, few-shot and chain-of-thought prompting —
# confirm against eval_utils.
strategies = ['zero', 'few', 'cot']

# Run the evaluation over the loaded tasks and examples; whatever
# run_evaluation returns is kept in `results`.
results = run_evaluation(
    # what to evaluate
    models=selected_models,
    strategies=strategies,
    # task data
    tasks=TASKS,
    examples=EVAL_EXAMPLES,
    dev_examples=DEV_EXAMPLES,
    tasks_dir=TASKS_DIR,
    # execution and output
    client=client,
    output_dir=OUTPUT_DIR,
    save_prefix='lab5_experiment',
)