In [None]:
import re
import random
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import time
import pynvml

In [2]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [3]:
openbookqa_test = load_dataset("allenai/openbookqa", "main", split="test")
arc_easy_test = load_dataset("allenai/ai2_arc", "ARC-Easy", split="test")
arc_chal_test = load_dataset("allenai/ai2_arc", "ARC-Challenge", split="test")

len(openbookqa_test), len(arc_easy_test), len(arc_chal_test)



(500, 2376, 1172)

In [4]:
def norm_openbookqa(ex):
    # ex["choices"] has "label" and "text"
    labels = list(ex["choices"]["label"])
    choices = list(ex["choices"]["text"])
    answer_key = ex["answerKey"]
    return {
        "question": ex["question_stem"],
        "labels": labels,
        "choices": choices,
        "answer_key": answer_key
    }

def norm_arc(ex):
    # ex["choices"] has "label" and "text" as lists
    labels = list(ex["choices"]["label"])
    choices = list(ex["choices"]["text"])
    answer_key = ex["answerKey"]
    return {
        "question": ex["question"],
        "labels": labels,
        "choices": choices,
        "answer_key": answer_key
    }

In [5]:
CANON = ["A", "B", "C", "D", "E", "F"]

def canonicalize_labels(question, labels, choices, answer_key):
    # map from original label -> canonical label by position
    mapping = {orig: CANON[i] for i, orig in enumerate(labels)}
    canon_labels = [mapping[l] for l in labels]
    canon_answer = mapping[answer_key]
    return {
        "question": question,
        "labels": canon_labels,
        "choices": choices,
        "answer_key": canon_answer,
    }

def norm_openbookqa_canon(ex):
    labels = list(ex["choices"]["label"])
    choices = list(ex["choices"]["text"])
    answer_key = ex["answerKey"]
    return canonicalize_labels(ex["question_stem"], labels, choices, answer_key)

def norm_arc_canon(ex):
    labels = list(ex["choices"]["label"])
    choices = list(ex["choices"]["text"])
    answer_key = ex["answerKey"]
    return canonicalize_labels(ex["question"], labels, choices, answer_key)

openbookqa_norm = openbookqa_test.map(norm_openbookqa_canon, remove_columns=openbookqa_test.column_names)
arc_easy_norm = arc_easy_test.map(norm_arc_canon, remove_columns=arc_easy_test.column_names)
arc_chal_norm = arc_chal_test.map(norm_arc_canon, remove_columns=arc_chal_test.column_names)

openbookqa_norm[0], arc_easy_norm[0], arc_chal_norm[0]

({'choices': ['make more phone calls',
   'quit eating lunch out',
   'buy less with monopoly money',
   'have lunch with friends'],
  'question': 'A person wants to start saving money so that they can afford a nice vacation at the end of the year. After looking over their budget and expenses, they decide the best way to save money is to',
  'labels': ['A', 'B', 'C', 'D'],
  'answer_key': 'B'},
 {'question': 'Which statement best explains why photosynthesis is the foundation of most food webs?',
  'choices': ['Sunlight is the source of energy for nearly all ecosystems.',
   'Most ecosystems are found on land instead of in water.',
   'Carbon dioxide is more available than other gases.',
   'The producers in all ecosystems are plants.'],
  'labels': ['A', 'B', 'C', 'D'],
  'answer_key': 'A'},
 {'question': 'An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?',
  'choices': ['Planetary density will de

In [3]:
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)


tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
#tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto" if device == "cuda" else None,
    quantization_config=bnb_config,
)
model.eval()
if device == "cpu":
    model = model.to(device)



Loading weights:   0%|          | 0/338 [00:00<?, ?it/s]

In [4]:
torch.cuda.synchronize()
print(f"Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

Allocated: 1112.93 MB


In [7]:
def build_prompt(question, labels, choices):
    # Ensure consistent ordering
    opts = "\n".join([f"{l}. {c}" for l, c in zip(labels, choices)])
    user = (
        "Answer the multiple-choice question.\n"
        "Reply with exactly one line in this format:\n"
        "Final answer: <LETTER>\n\n"
        f"Question: {question}\n"
        f"Choices:\n{opts}\n"
    )
    messages = [
        {"role": "system", "content": "You are a helpful assistant that follows the format exactly."},
        {"role": "user", "content": user},
    ]
    # Qwen2.5-Instruct supports chat template
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)


In [8]:
@torch.no_grad()
def generate_text(prompt, max_new_tokens=10):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)
    out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,          # deterministic
        temperature=1.0,
        top_p=1.0,
        eos_token_id=tokenizer.eos_token_id,
    )
    text = tokenizer.decode(out[0], skip_special_tokens=True)
    # Return only the tail after the prompt where possible
    return text

def sample_and_print(ds, n=10, name="dataset"):
    idxs = random.sample(range(len(ds)), n)
    print(f"=== {name}: {n} samples ===")
    for i, idx in enumerate(idxs, 1):
        ex = ds[idx]
        prompt = build_prompt(ex["question"], ex["labels"], ex["choices"])
        full = generate_text(prompt)
        print(f"\n--- Sample {i} (idx={idx}) ---")
        print("GOLD:", ex["answer_key"])
        print(full[-800:])  # print last chunk (includes model answer)

sample_and_print(openbookqa_norm, n=10, name="openbookqa_test")

=== openbookqa_test: 10 samples ===

--- Sample 1 (idx=327) ---
GOLD: D
system
You are a helpful assistant that follows the format exactly.
user
Answer the multiple-choice question.
Reply with exactly one line in this format:
Final answer: <LETTER>

Question: In the hottest months in the hottest desert, creatures such as birds may find water to drink
Choices:
A. in sticks
B. in pebbles
C. in sand
D. in spiked plants

assistant
Final answer: C. in sand

--- Sample 2 (idx=57) ---
GOLD: D
system
You are a helpful assistant that follows the format exactly.
user
Answer the multiple-choice question.
Reply with exactly one line in this format:
Final answer: <LETTER>

Question: A construction group wants to put a shopping center in town, but the only place available is a small nature park with a trail. Deer and other wildlife frequent the park, since it is the only place in the city where trees and fresh water are available for them. The construction group decides to build the shopping center,

In [9]:
sample_and_print(arc_easy_norm, n=10, name="arc_easy_test")

=== arc_easy_test: 10 samples ===

--- Sample 1 (idx=2233) ---
GOLD: C
system
You are a helpful assistant that follows the format exactly.
user
Answer the multiple-choice question.
Reply with exactly one line in this format:
Final answer: <LETTER>

Question: A student has decided to investigate whether the number of flowers on a plant will increase if the water supply to the plant is increased. She has five pots of geranium plants to use in her experiment. What factor in the experiment should be varied for the five plants in order to answer the student's question?
Choices:
A. age of seedlings
B. temperature of water
C. volume of water
D. number of hours in sunlight

assistant
Final answer: C. Volume of water

--- Sample 2 (idx=356) ---
GOLD: A
system
You are a helpful assistant that follows the format exactly.
user
Answer the multiple-choice question.
Reply with exactly one line in this format:
Final answer: <LETTER>

Question: Eating food that is undercooked can lead to a disease that

In [10]:
sample_and_print(arc_chal_norm, n=10, name="arc_challenge_test")

=== arc_challenge_test: 10 samples ===

--- Sample 1 (idx=1149) ---
GOLD: C
system
You are a helpful assistant that follows the format exactly.
user
Answer the multiple-choice question.
Reply with exactly one line in this format:
Final answer: <LETTER>

Question: A hot rock is dropped into a pail of cool water. Heat energy transferred from the rock to the water by
Choices:
A. boiling.
B. evaporation.
C. conduction.
D. radiation.

assistant
Final answer: C. conduction.

--- Sample 2 (idx=407) ---
GOLD: D
system
You are a helpful assistant that follows the format exactly.
user
Answer the multiple-choice question.
Reply with exactly one line in this format:
Final answer: <LETTER>

Question: A science student notices that the contents of one of the test tubes in an experiment are bubbling and changing colors. Before completing the experiment, the student records that the substance in this particular test tube is undergoing a chemical change. The student's statement is an example of
Choices

In [11]:
ANSWER_RE = re.compile(r"\b([A-D])\b")

def extract_letter(text, valid_labels={"A","B","C","D"}):
    matches = [m.group(1) for m in ANSWER_RE.finditer(text)]
    for label in reversed(matches):
        if label in valid_labels:
            return label
    return None


In [12]:
def compliance_rate(ds, k=200):
    idxs = random.sample(range(len(ds)), min(k, len(ds)))
    ok = 0
    for idx in idxs:
        ex = ds[idx]
        prompt = build_prompt(ex["question"], ex["labels"], ex["choices"])
        out = generate_text(prompt)
        pred = extract_letter(out, set(ex["labels"]))
        ok += int(pred is not None)
        if(pred is None):
            print(out)
    return ok / len(idxs)

print("openbookqa compliance:", compliance_rate(openbookqa_norm, 200))
print("arc_easy compliance:", compliance_rate(arc_easy_norm, 200))
print("arc_chal compliance:", compliance_rate(arc_chal_norm, 200))

openbookqa compliance: 1.0
arc_easy compliance: 1.0
arc_chal compliance: 1.0


In [13]:
pynvml.nvmlInit()
HANDLE = pynvml.nvmlDeviceGetHandleByIndex(0)

def sample_power(handle):
    # returns power in Watts
    return pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0

def evaluate(ds, sample_dt=0.1):
    correct = 0
    total = len(ds)

    power_samples = []
    t_start = time.time()
    last_sample = t_start

    for ex in ds:
        now = time.time()
        if now - last_sample >= sample_dt:
            power_samples.append(sample_power(HANDLE))
            last_sample = now

        prompt = build_prompt(ex["question"], ex["labels"], ex["choices"])
        output = generate_text(prompt)
        pred = extract_letter(output)

        if pred == None:
            print(output)
        if pred == ex["answer_key"]:
            correct += 1

    t_end = time.time()

    # --- metrics ---
    duration = t_end - t_start
    avg_power = sum(power_samples) / len(power_samples)
    energy_joules = avg_power * duration

    return {
        "accuracy": correct / total,
        "time_sec": duration,
        "avg_power_w": avg_power,
        "energy_j": energy_joules,
    }


In [14]:
acc_obqa = evaluate(openbookqa_norm)
acc_obqa

{'accuracy': 0.71,
 'time_sec': 113.80131030082703,
 'avg_power_w': 63.08438476953908,
 'energy_j': 7179.085646295083}

In [15]:
acc_arc_easy = evaluate(arc_easy_norm)
acc_arc_easy

{'accuracy': 0.851010101010101,
 'time_sec': 555.2197864055634,
 'avg_power_w': 64.26668030366933,
 'energy_j': 35682.13251119791}

In [16]:
acc_arc_challenge = evaluate(arc_chal_norm)
acc_arc_challenge

{'accuracy': 0.6766211604095563,
 'time_sec': 273.0134959220886,
 'avg_power_w': 64.8860247863248,
 'energy_j': 17714.760463401824}

In [None]:
!lm_eval \
  --model hf \
  --model_args pretrained=Qwen/Qwen2.5-1.5B-Instruct,dtype=auto,device_map=auto \
  --tasks openbookqa,arc_easy,arc_challenge \
  --batch_size auto \
  --num_fewshot 0


|    Tasks    |Version|Filter|n-shot| Metric |   |Value |   |Stderr|
|-------------|------:|------|-----:|--------|---|-----:|---|-----:|
|arc_challenge|      1|none  |     0|acc     |↑  |0.4352|±  |0.0145|
|             |       |none  |     0|acc_norm|↑  |0.4676|±  |0.0146|
|arc_easy     |      1|none  |     0|acc     |↑  |0.7664|±  |0.0087|
|             |       |none  |     0|acc_norm|↑  |0.7614|±  |0.0087|
|openbookqa   |      1|none  |     0|acc     |↑  |0.3160|±  |0.0208|
|             |       |none  |     0|acc_norm|↑  |0.4080|±  |0.0220|