In [None]:
import lm_eval
import pandas as pd

from lm_eval.tasks import TaskManager
from lm_eval.models.huggingface import HFLM
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Build a task registry from the local elections directory only
# (include_defaults=False leaves the harness's built-in tasks out).
task_manager = TaskManager(
    include_path="../data/elections",
    include_defaults=False,
)

# Show the individual tasks that were discovered, without groups or tags.
task_listing = task_manager.list_all_tasks(list_groups=False, list_tags=False)
print(task_listing)

In [None]:
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id, use_safetensors=True)

# Keep the raw Hugging Face model under its own name so that `model` is only
# ever bound to the lm_eval wrapper consumed by the evaluation cell. The
# weights are placed on the CUDA device via device_map.
hf_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    use_safetensors=True,
    device_map="cuda",
)

# lm_eval wrapper around the already-loaded model and its tokenizer
model = HFLM(hf_model, tokenizer=tokenizer)

In [None]:
# Run the harness on the custom task. `tasks` is given as a list, and
# per-sample logging is requested explicitly because the following cell reads
# results["samples"] to recover the per-choice scores.
results = lm_eval.simple_evaluate(
    model=model,
    tasks=["us-elections"],
    task_manager=task_manager,
    log_samples=True,
)

In [91]:
# Map each document's state to a Series of per-choice scores, indexed by the
# (whitespace-stripped) choice text. A state that appears in more than one
# sample keeps only the last sample's scores, since the dict entry is replaced.
state_results = {}

for record in results["samples"]["us-elections"]:
    state = record["doc"]["state"]

    # Each entry of "arguments" is a pair; the second element is the candidate
    # continuation (presumably paired with its prompt context — confirm
    # against the task definition).
    continuations = [cont.strip() for _, cont in record["arguments"]]
    # Each entry of "resps" starts with a pair whose first element is the
    # score kept here (a log-likelihood, going by the harness's naming).
    lls = [ll for (ll, _), *_ in record["resps"]]

    state_results[state] = pd.Series(data=lls, index=continuations)

In [None]:
# Stack the per-state Series into a single Series indexed by (state, choice),
# then pivot the choice level into columns: one row per state.
states = list(state_results.keys())
per_state_scores = list(state_results.values())
stacked_scores = pd.concat(objs=per_state_scores, keys=states)
stacked_scores.unstack()