In [None]:
import re
import random
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainerCallback, BitsAndBytesConfig
import torch
import time
import pynvml
from peft import PeftModel

In [2]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [3]:
openbookqa_test = load_dataset("allenai/openbookqa", "main", split="test")
arc_easy_test = load_dataset("allenai/ai2_arc", "ARC-Easy", split="test")
arc_chal_test = load_dataset("allenai/ai2_arc", "ARC-Challenge", split="test")

len(openbookqa_test), len(arc_easy_test), len(arc_chal_test)



(500, 2376, 1172)

In [4]:
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)


tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
#tokenizer.pad_token = tokenizer.eos_token

base = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto" if device == "cuda" else None,
    quantization_config=bnb_config,
)

model = PeftModel.from_pretrained(base, "qwen25_obqa_lora_1/checkpoint-1550")
if device == "cpu":
    model = model.to(device)

Loading weights:   0%|          | 0/338 [00:00<?, ?it/s]

In [5]:
CANON = ["A", "B", "C", "D", "E", "F"]

def canonicalize_labels(question, labels, choices, answer_key):
    # map from original label -> canonical label by position
    mapping = {orig: CANON[i] for i, orig in enumerate(labels)}
    canon_labels = [mapping[l] for l in labels]
    canon_answer = mapping[answer_key]
    return {
        "question": question,
        "labels": canon_labels,
        "choices": choices,
        "answer_key": canon_answer,
    }

def norm_openbookqa_canon(ex):
    labels = list(ex["choices"]["label"])
    choices = list(ex["choices"]["text"])
    answer_key = ex["answerKey"]
    return canonicalize_labels(ex["question_stem"], labels, choices, answer_key)

def norm_arc_canon(ex):
    labels = list(ex["choices"]["label"])
    choices = list(ex["choices"]["text"])
    answer_key = ex["answerKey"]
    return canonicalize_labels(ex["question"], labels, choices, answer_key)

openbookqa_norm = openbookqa_test.map(norm_openbookqa_canon, remove_columns=openbookqa_test.column_names)
arc_easy_norm = arc_easy_test.map(norm_arc_canon, remove_columns=arc_easy_test.column_names)
arc_chal_norm = arc_chal_test.map(norm_arc_canon, remove_columns=arc_chal_test.column_names)

openbookqa_norm[0], arc_easy_norm[0], arc_chal_norm[0]

({'choices': ['make more phone calls',
   'quit eating lunch out',
   'buy less with monopoly money',
   'have lunch with friends'],
  'question': 'A person wants to start saving money so that they can afford a nice vacation at the end of the year. After looking over their budget and expenses, they decide the best way to save money is to',
  'labels': ['A', 'B', 'C', 'D'],
  'answer_key': 'B'},
 {'question': 'Which statement best explains why photosynthesis is the foundation of most food webs?',
  'choices': ['Sunlight is the source of energy for nearly all ecosystems.',
   'Most ecosystems are found on land instead of in water.',
   'Carbon dioxide is more available than other gases.',
   'The producers in all ecosystems are plants.'],
  'labels': ['A', 'B', 'C', 'D'],
  'answer_key': 'A'},
 {'question': 'An astronomer observes that a planet rotates faster after a meteorite impact. Which is the most likely effect of this increase in rotation?',
  'choices': ['Planetary density will de

In [6]:
def build_prompt(question, labels, choices):
    # Ensure consistent ordering
    opts = "\n".join([f"{l}. {c}" for l, c in zip(labels, choices)])
    user = (
        "Answer the multiple-choice question.\n"
        "Reply with exactly one line in this format:\n"
        "Final answer: <LETTER>\n\n"
        f"Question: {question}\n"
        f"Choices:\n{opts}\n"
    )
    messages = [
        {"role": "system", "content": "You are a helpful assistant that follows the format exactly."},
        {"role": "user", "content": user},
    ]
    # Qwen2.5-Instruct supports chat template
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [7]:
@torch.no_grad()
def generate_text(prompt, max_new_tokens=32):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)
    out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,          # deterministic
        temperature=1.0,
        top_p=1.0,
        eos_token_id=tokenizer.eos_token_id,
    )
    text = tokenizer.decode(out[0], skip_special_tokens=True)
    # Return only the tail after the prompt where possible
    return text

In [8]:
ANSWER_RE = re.compile(r"\b([A-D])\b")

def extract_letter(text, valid_labels={"A","B","C","D"}):
    matches = [m.group(1) for m in ANSWER_RE.finditer(text)]
    for label in reversed(matches):
        if label in valid_labels:
            return label
    return None


In [9]:
pynvml.nvmlInit()
HANDLE = pynvml.nvmlDeviceGetHandleByIndex(0)

def sample_power(handle):
    # returns power in Watts
    return pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0

def evaluate(ds, sample_dt=0.1):
    correct = 0
    total = len(ds)

    power_samples = []
    t_start = time.time()
    last_sample = t_start

    for ex in ds:
        now = time.time()
        if now - last_sample >= sample_dt:
            power_samples.append(sample_power(HANDLE))
            last_sample = now

        prompt = build_prompt(ex["question"], ex["labels"], ex["choices"])
        output = generate_text(prompt)
        pred = extract_letter(output)

        if pred == None:
            print(output)
        if pred == ex["answer_key"]:
            correct += 1

    t_end = time.time()

    # --- metrics ---
    duration = t_end - t_start
    avg_power = sum(power_samples) / len(power_samples)
    energy_joules = avg_power * duration

    return {
        "accuracy": correct / total,
        "time_sec": duration,
        "avg_power_w": avg_power,
        "energy_j": energy_joules,
    }


In [14]:
acc_obqa = evaluate(openbookqa_norm)
acc_obqa

{'accuracy': 0.812,
 'time_sec': 113.47864484786987,
 'avg_power_w': 59.24662324649298,
 'energy_j': 6723.226517824328}

In [15]:
acc_arc_easy = evaluate(arc_easy_norm)
acc_arc_easy

{'accuracy': 0.8413299663299664,
 'time_sec': 558.2292950153351,
 'avg_power_w': 60.29607621052632,
 'energy_j': 33659.03611519303}

In [16]:
acc_arc_challenge = evaluate(arc_chal_norm)
acc_arc_challenge

{'accuracy': 0.7491467576791809,
 'time_sec': 284.16956639289856,
 'avg_power_w': 59.51501878736123,
 'energy_j': 16912.357082669652}