In [None]:
import copy
import json
import os

import anthropic
import torch
from datasets import Dataset
from openai import OpenAI
from tqdm.notebook import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
)

In [None]:
# Path of the benchmark file. It is read as JSON Lines: load_benchmark() parses
# each line of the file as a separate JSON document.
BENCHMARK_FILE = "benchmark.jsonl"

In [None]:
# Prompt for plain-completion mode (format_prompt() uses it whenever modality is
# not "chat"). The six positional "{}" slots are filled, in order, with the
# situation, the question and the four answer choices. The text ends with the
# "Respuesta:" cue; generate() decodes only the single next token after it.
PROMPT_TEMPLATE = """Eres un abogado colombiano y se te realiza la siguiente pregunta. Si no conoces la respuesta, adivina.

Situación: {}

Pregunta: {}

Opciones:
a. {}

b. {}

c. {}

d. {}

Responde solo con la letra correspondiente: a, b, c o d.

Respuesta:"""

In [None]:
# Text of the single user turn sent in chat mode. It has six positional "{}"
# slots: situation, question, then the four answer choices.
_CHAT_USER_CONTENT = """Eres un abogado colombiano y se te realiza la siguiente pregunta. Si no conoces la respuesta, adivina.

Situación: {}

Pregunta: {}

Opciones:
a. {}

b. {}

c. {}

d. {}

Responde solo con la letra correspondiente: a, b, c o d."""

# One-message conversation template. format_prompt() deep-copies it before
# filling the slots, so this module-level object is never mutated.
PROMPT_TEMPLATE_CHAT = [
    {"role": "user", "content": _CHAT_USER_CONTENT},
]

In [None]:
# Prompting mode. "chat" builds a message list and queries the model through
# chat(); any other value (e.g. None) builds a plain-text prompt and uses
# generate(), which needs the tokenizer/model that are only loaded for the
# "hf" backend.
modality = "chat"  # chat, None
# Which client or local model the loading cell sets up.
backend = "anthropic"  # hf, openai, anthropic
# Alternative model id for the "hf" backend, kept here for quick switching:
# model_id = "meta-llama/Llama-3.2-3B-Instruct"
# Model identifier passed to the selected backend; also used (after dropping
# any "org/" prefix) as the results file name.
model_id = "claude-opus-4-1-20250805"  # gpt-5, gpt-4o, claude-opus-4-1-20250805

## Load the data

In [None]:
def load_benchmark(path):
    """Read a JSON Lines benchmark file into a list of parsed records.

    Args:
        path: Path of the ``.jsonl`` file. Every non-blank line must hold one
            JSON document.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8: the benchmark text is Spanish (accented
    # characters) and the platform default encoding is not always UTF-8.
    with open(path, encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline at the end of the file) carry no
        # record; skipping them avoids a JSONDecodeError on empty input.
        return [json.loads(line) for line in f if line.strip()]

In [None]:
# Parse the benchmark file first, then wrap the parsed records in a `Dataset`.
benchmark_records = load_benchmark(BENCHMARK_FILE)
ds = Dataset.from_list(benchmark_records)

In [None]:
# Display the dataset's repr as the cell output to sanity-check what was loaded.
ds

## Load the model / client

In [None]:
# Set up whatever the selected backend needs. The API backends only create a
# client object; the "hf" backend loads the tokenizer and model locally and
# wraps them in a text-generation pipeline.
if backend == "openai":
    client = OpenAI()
elif backend == "anthropic":
    client = anthropic.Anthropic()
elif backend == "hf":
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda:0")

    # Deterministic decoding limited to one new token: the prompt asks for a
    # single answer letter.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map=model.device,
        max_new_tokens=1,
        do_sample=False,
        temperature=0.0
    )
else:
    # Fail here with a clear message instead of later with a NameError on
    # `client` / `pipe` / `tokenizer` when the evaluation cell runs.
    raise ValueError(
        f"Unknown backend {backend!r}; expected 'hf', 'openai' or 'anthropic'."
    )

## Run the evaluation

In [None]:
def format_prompt(situation, question, choices):
    """Fill the prompt template that matches the current ``modality``.

    Args:
        situation: Text placed in the first template slot.
        question: Text placed in the second template slot.
        choices: Indexable collection; items 0-3 fill the a/b/c/d slots.

    Returns:
        In chat mode, a fresh copy of ``PROMPT_TEMPLATE_CHAT`` whose first
        message content has been filled in. Otherwise, the filled
        ``PROMPT_TEMPLATE`` string.
    """
    slots = (situation, question, choices[0], choices[1], choices[2], choices[3])

    if modality != "chat":
        return PROMPT_TEMPLATE.format(*slots)

    # Work on a deep copy so the module-level template keeps its "{}" slots.
    messages = copy.deepcopy(PROMPT_TEMPLATE_CHAT)
    messages[0]["content"] = messages[0]["content"].format(*slots)
    return messages

In [None]:
@torch.no_grad()
def generate(prompt):
    """Return the highest-scoring next token for ``prompt`` as normalised text.

    The prompt is tokenised, run through ``model`` once, and the argmax of the
    logits at the last position is decoded. No sampling or multi-token
    generation takes place.

    Args:
        prompt: Plain-text prompt (assumed to be a single string, not a batch).

    Returns:
        The decoded token, stripped of surrounding whitespace and lower-cased.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model(**encoded)
    # Keep only the scores for the token that would follow the prompt.
    next_token_logits = outputs.logits[:, -1, :]
    best_token_id = next_token_logits.argmax()
    return tokenizer.decode(best_token_id).strip().lower()

In [None]:
def chat(prompt):
    """Send a chat-style prompt to the configured backend and return its reply text.

    Args:
        prompt: List of ``{"role": ..., "content": ...}`` messages, as built by
            ``format_prompt`` in chat mode.

    Returns:
        The reply text produced by the backend. For the Anthropic backend an
        empty string is returned when the response carries no content block.

    Raises:
        ValueError: If ``backend`` is not one of "openai", "anthropic" or "hf".
    """
    if backend == "openai":
        response = client.responses.create(
            model=model_id,
            input=prompt,
            # Required for non-reasoning models
            max_output_tokens=16,
            temperature=0,
            # Optional for reasoning models
            # reasoning={
            #     "effort": "minimal"
            # },
        )
        return response.output_text
    elif backend == "anthropic":
        response = client.messages.create(
            model=model_id,
            messages=prompt,
            # A single token is enough for a one-letter answer.
            max_tokens=1,
            temperature=0,
        )
        # Guard against a reply without any content block so the caller gets an
        # empty answer (scored as wrong) instead of an IndexError.
        if not response.content:
            return ""
        return response.content[0].text
    elif backend == "hf":
        # With chat input the last entry of "generated_text" is read as the
        # newly generated assistant message.
        return pipe(prompt)[0]["generated_text"][-1]["content"]  # type: ignore

    # Previously an unknown backend fell through and returned None, which only
    # surfaced later as a confusing TypeError in the evaluation loop.
    raise ValueError(
        f"Unknown backend {backend!r}; expected 'hf', 'openai' or 'anthropic'."
    )

In [None]:
# Tallies keyed by competency group, plus an "overall" entry. `accuracy` holds
# the number of correct answers and `n_instances` the number of questions.
accuracy = {"overall": 0}
n_instances = {"overall": len(ds)}


for ex in tqdm(ds):
    prompt = format_prompt(ex["situacion"], ex["enunciado"], ex["opciones"])  # type: ignore

    if modality == "chat":
        # Normalise the reply the same way generate() does before taking its
        # first character: chat models may answer "A", " a" or "a." and those
        # must count as "a". An empty or missing reply becomes "" (scored wrong).
        output = (chat(prompt) or "").strip().lower()[:1]
    else:
        output = generate(prompt)

    group = ex["competencia"]  # type: ignore
    # "correcta" is used as a 0-based index into the answer letters.
    correct = "abcd"[ex["correcta"]]  # type: ignore
    result = int(output == correct)
    accuracy[group] = accuracy.get(group, 0) + result
    n_instances[group] = n_instances.get(group, 0) + 1
    accuracy["overall"] += result

for group in accuracy:
    print(f"Accuracy ({group}): {accuracy[group] / n_instances[group]:.4f}")

## Persist the results

In [None]:
def save_results(results, model_id):
    """Write ``results`` as pretty-printed JSON to ``./results/<model_id>.json``.

    Args:
        results: JSON-serialisable object to store.
        model_id: File name stem. It must not contain path separators.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    out_dir = "./results"
    # On a fresh checkout the output directory may not exist yet; create it so
    # a finished evaluation run is not lost to a FileNotFoundError.
    os.makedirs(out_dir, exist_ok=True)
    with open(f"{out_dir}/{model_id}.json", "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps accented group names readable in the file.
        json.dump(results, f, indent=4, ensure_ascii=False)

In [None]:
# Turn the raw tallies into one accuracy ratio per group (including "overall").
results = {name: accuracy[name] / n_instances[name] for name in accuracy}
# Model ids may look like "org/model"; only the last path component is used as
# the file name.
save_results(results, model_id.rsplit("/", 1)[-1])