# Welcome to the Hypnos Agent Model Evaluation Notebook 🔍

<img src="https://raw.githubusercontent.com/dmitrykazhdan/HYPNOS/refs/heads/main/assets/hypnos_icon.png" alt="Icon" width="100"/>  

## Installation 🔧



Mount your Google Drive

In [None]:
# Mount Google Drive at /content/gdrive
# (google.colab is only importable inside a Colab runtime.)
from google.colab import drive
drive.mount('/content/gdrive')

Detect whether a GPU is available (running on a GPU runtime is recommended)

In [None]:
# Install PyTorch; it is used to detect CUDA availability and to clear the
# CUDA cache between model loads.
!pip install torch

In [None]:
import torch
# True when PyTorch can see a CUDA device. This global is read by later cells
# (the CUDA toolchain setup and load_gguf) to decide whether to use the GPU.
gpu_acceleration = torch.cuda.is_available()
print(gpu_acceleration)

In [None]:
# Only prepare the build toolchain when a CUDA device was detected above.
if gpu_acceleration:
  # Required for CMake + CUDA builds
  !apt-get -qq install -y cmake build-essential

  # Confirm GPU is visible
  !nvidia-smi

In [None]:
!pip install llama-cpp-python

In [None]:
# Check GPU-enablement
# `n_gpu_layers` is a parameter of Llama.__init__ in CPU-only builds too, so
# testing for its presence always reports "GPU-enabled". Ask the compiled
# llama.cpp library whether it actually supports GPU offload instead.
from llama_cpp import Llama, llama_supports_gpu_offload
print("✅ GPU-enabled build!" if llama_supports_gpu_offload() else "❌ CPU-only build!")

In [None]:
# Install the packages used for ROUGE scoring (loaded later through
# evaluate.load("rouge")).
!pip install evaluate rouge rouge_score

## Evaluation 🔍

In [None]:
# Specify models to load, and the data file path
# Root folder that the model and data paths below are built from.
# NOTE(review): "..." looks like a placeholder to be replaced with the real
# Drive folder — confirm before running.
drive_root = "..."

# NOTE: despite the "_DICT" suffix this is a list of (label, path) tuples.
# main() iterates over it in order and prints one result line per label.
GGUF_MODEL_NAMES_TO_PATH_DICT = [
    ("Baseline",    f"{drive_root}/...gguf"),
    ("SFT_Quantized",   f"{drive_root}/...gguf"),
    ("DPO_Quantized",   f"{drive_root}/...gguf"),
]

# JSON test set; main() reads the "question" and "answer" keys of each entry.
TEST_JSON = f"{drive_root}/data/sleep-test-enriched-cleaned.json"

# Set to >0 for subsetting a portion of data (main() keeps the first SUBSET items)
# Set to <= 0 for using all data
SUBSET = 0

Define helper functions for model loading

In [None]:
from llama_cpp import Llama
import evaluate, json, numpy as np, torch, gc, time, os


def cleanup():
    """Free memory between model loads.

    Runs a garbage-collection pass, asks PyTorch to release its cached CUDA
    memory, and then pauses briefly before returning.
    """
    settle_seconds = 0.3
    gc.collect()
    torch.cuda.empty_cache()
    time.sleep(settle_seconds)

def prompt(q):
    """Wrap question *q* in the chat-turn markup sent to the model.

    The question becomes a single user turn, suffixed with an instruction to
    answer in one sentence, followed by the opening of the model turn.
    """
    user_turn = f"<bos><start_of_turn>user\n{q} (Respond in one sentence)<end_of_turn>\n"
    model_turn_opening = "<start_of_turn>model\n"
    return user_turn + model_turn_opening

def load_gguf(path):
    """Load the GGUF model stored at *path* and return the Llama instance.

    The model uses a 512-token context and one thread per reported CPU
    (falling back to 4 when the count is unknown). When the module-level
    ``gpu_acceleration`` flag is set, 20 layers are offloaded to the GPU;
    otherwise none are.
    """
    thread_count = os.cpu_count() or 4
    offloaded_layers = 20 if gpu_acceleration else 0
    return Llama(
        model_path=path,
        n_ctx=512,
        n_threads=thread_count,
        verbose=False,
        n_gpu_layers=offloaded_layers,
    )

def generate(llm, qs):
    """Return the model's answer to every question in *qs*, in order.

    Each question is formatted with ``prompt`` and completed with temperature
    0.0, at most 128 new tokens, stopping at ``<end_of_turn>``. The answer is
    stripped of surrounding whitespace, and its first 120 characters are
    printed as a progress trace.
    """
    answers = []
    for number, question in enumerate(qs, start=1):
        completion = llm(
            prompt(question),
            max_tokens=128,
            temperature=0.0,
            stop=["<end_of_turn>"],
        )
        answer = completion["choices"][0]["text"].strip()
        print(f"   sample {number}: {answer[:120]}")
        answers.append(answer)
    return answers

Define helper functions for prediction evaluation using ROUGE-L scoring

In [None]:
# ROUGE metric, loaded once and shared by the scoring helpers.
rouge = evaluate.load("rouge")


def rouge_l(preds, refs):
    """Return the "rougeL" score of predictions *preds* against references *refs*."""
    all_scores = rouge.compute(predictions=preds, references=refs)
    return all_scores["rougeL"]

def rouge_ci(p, r, boot=1000, conf=0.95):
    """Estimate a bootstrap confidence interval for the ROUGE-L score.

    Args:
        p: Predictions, indexable by position.
        r: References aligned with *p*.
        boot: Number of bootstrap resamples to score.
        conf: Confidence level of the interval (0.95 gives the 2.5th and
            97.5th percentiles).

    Returns:
        A ``(lo, hi)`` pair of percentile bounds over the resampled scores.
    """
    n = len(p)

    def resampled_score():
        # Draw n positions with replacement and score that resample.
        picks = np.random.choice(n, n, replace=True)
        return rouge_l([p[k] for k in picks], [r[k] for k in picks])

    scores = [resampled_score() for _ in range(boot)]
    tail = (1 - conf) * 50  # percentage of mass cut from each end
    lo, hi = np.percentile(scores, [tail, 100 - tail])
    return lo, hi

You're all set to run the evaluation... 🚀

Note: the anticipated ROUGE-L score on the enriched SleepQA test set is roughly 0.25–0.30.


In [None]:

def main():
    """Evaluate every configured GGUF model on the test set and print results.

    Reads question/answer pairs from ``TEST_JSON`` (optionally truncated to the
    first ``SUBSET`` items), then for each ``(label, path)`` entry in
    ``GGUF_MODEL_NAMES_TO_PATH_DICT`` loads the model, generates an answer per
    question, and prints the ROUGE-L score, a bootstrap confidence interval
    (500 resamples) and the mean answer length in whitespace-separated words.

    Raises:
        ValueError: If the test set contains no items.
    """
    print("\n🧪 GGUF Model Evaluation\n" + "─"*50)
    # Close the file deterministically and decode it as UTF-8 regardless of
    # the platform's default encoding.
    with open(TEST_JSON, encoding="utf-8") as fh:
        data = json.load(fh)
    items = [(d["question"], d["answer"]) for d in data]
    if SUBSET > 0:
        items = items[:SUBSET]
    if not items:
        # Fail with a clear message instead of an opaque unpacking error.
        raise ValueError(f"No test items found in {TEST_JSON}")
    questions = [question for question, _ in items]
    references = [answer for _, answer in items]
    print(f"{len(questions)} test questions\n")

    for label, path in GGUF_MODEL_NAMES_TO_PATH_DICT:
        cleanup()
        llm = load_gguf(path)
        preds = generate(llm, questions)
        # Drop the reference now; otherwise the previous model stays alive
        # while the next one is being loaded, holding two models at once.
        del llm
        score = rouge_l(preds, references)
        lo, hi = rouge_ci(preds, references, boot=500)
        avg_len = np.mean([len(p.split()) for p in preds])
        print(f"\n{label:<12} ROUGE‑L {score:.4f} "
              f"[{lo:.4f}, {hi:.4f}]  len={avg_len:.1f}\n")

if __name__ == "__main__":
    main()