In [1]:
import re
import random
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainerCallback, BitsAndBytesConfig
import torch
import time
import pynvml


  import pynvml  # type: ignore[import]


In [2]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [3]:
openbookqa_train = load_dataset("allenai/openbookqa", "main", split="train")
openbookqa_validation = load_dataset("allenai/openbookqa", "main", split="validation")
openbookqa_test = load_dataset("allenai/openbookqa", "main", split="test")

arc_easy_test = load_dataset("allenai/ai2_arc", "ARC-Easy", split="test")
arc_chal_test = load_dataset("allenai/ai2_arc", "ARC-Challenge", split="test")

len(openbookqa_train), len(openbookqa_validation)

(4957, 500)

In [4]:
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)


tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
#tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto" if device == "cuda" else None,
    quantization_config=bnb_config,
)
if device == "cpu":
    model = model.to(device)

Loading weights:   0%|          | 0/338 [00:00<?, ?it/s]

In [5]:
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

In [6]:
CANON = ["A", "B", "C", "D"]

def obqa_prompt_answer(ex):
    labels = list(ex["choices"]["label"])
    choices = list(ex["choices"]["text"])
    answer = ex["answerKey"]

    mapping = {orig: CANON[i] for i, orig in enumerate(labels)}
    opts = "\n".join([f"{mapping[l]}. {c}" for l, c in zip(labels, choices)])

    prompt = (
        "Answer the multiple-choice question.\n"
        "Reply with exactly one line in this format:\n"
        "Final answer: <LETTER>\n\n"
        f"Question: {ex['question_stem']}\n"
        f"Choices:\n{opts}\n"
        "\n"  # model should answer next
    )
    completion = f"Final answer: {mapping[answer]}"

    return {"prompt": prompt, "completion": completion}

In [7]:
def tokenize_masked(batch, max_length=512):
    prompts = batch["prompt"]
    completions = batch["completion"]

    input_ids_list = []
    attn_list = []
    labels_list = []

    for p, c in zip(prompts, completions):
        # tokenize separately (no special tokens inserted twice)
        p_ids = tokenizer(p, add_special_tokens=False).input_ids
        c_ids = tokenizer(c, add_special_tokens=False).input_ids

        # add EOS to completion so model learns to stop
        c_ids = c_ids + [tokenizer.eos_token_id]

        input_ids = p_ids + c_ids
        if len(input_ids) > max_length:
            # truncate from the left of the prompt (keep completion)
            # ensure we don't truncate completion away
            keep = max_length
            input_ids = input_ids[-keep:]
            # recompute boundary approximately: we will mask everything
            # except last len(c_ids) tokens (safe if completion retained)
            c_len = len(c_ids)
            labels = [-100] * (len(input_ids) - c_len) + input_ids[-c_len:]
        else:
            labels = [-100] * len(p_ids) + c_ids

        attention_mask = [1] * len(input_ids)

        input_ids_list.append(input_ids)
        attn_list.append(attention_mask)
        labels_list.append(labels)

    return {"input_ids": input_ids_list, "attention_mask": attn_list, "labels": labels_list}


In [8]:
train_pa = openbookqa_train.map(obqa_prompt_answer, remove_columns=openbookqa_train.column_names)
val_pa   = openbookqa_validation.map(obqa_prompt_answer, remove_columns=openbookqa_validation.column_names)

train_tok = train_pa.map(lambda b: tokenize_masked(b, max_length=256), batched=True, remove_columns=train_pa.column_names)
val_tok   = val_pa.map(lambda b: tokenize_masked(b, max_length=256), batched=True, remove_columns=val_pa.column_names)

In [9]:
train_tok

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 4957
})

In [10]:
train_tok[0]

{'input_ids': [16141,
  279,
  5248,
  62626,
  3405,
  624,
  20841,
  448,
  6896,
  825,
  1555,
  304,
  419,
  3561,
  510,
  19357,
  4226,
  25,
  366,
  20756,
  4198,
  1339,
  14582,
  25,
  576,
  7015,
  374,
  8480,
  369,
  198,
  89283,
  510,
  32,
  13,
  56625,
  6832,
  501,
  28762,
  198,
  33,
  13,
  2841,
  7826,
  705,
  323,
  3709,
  2310,
  198,
  34,
  13,
  19281,
  30231,
  1280,
  304,
  264,
  92384,
  198,
  35,
  13,
  10779,
  8151,
  10909,
  11,
  14211,
  17765,
  323,
  30231,
  1280,
  271,
  19357,
  4226,
  25,
  422,
  151645],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [-100,
  -100,
  -100,

In [11]:
decoded_input = tokenizer.decode(
    train_tok[0]["input_ids"],
    skip_special_tokens=False
)

print(decoded_input)


Answer the multiple-choice question.
Reply with exactly one line in this format:
Final answer: <LETTER>

Question: The sun is responsible for
Choices:
A. puppies learning new tricks
B. children growing up and getting old
C. flowers wilting in a vase
D. plants sprouting, blooming and wilting

Final answer: D<|im_end|>


In [10]:
from dataclasses import dataclass
from typing import Any, List, Dict
import torch

@dataclass
class CausalLMDataCollator:
    tokenizer: Any
    pad_to_multiple_of: int | None = None

    def __call__(self, features: List[Dict]) -> Dict[str, torch.Tensor]:
        # Pad inputs using tokenizer utilities
        batch_inputs = self.tokenizer.pad(
            [{"input_ids": f["input_ids"], "attention_mask": f["attention_mask"]} for f in features],
            padding=True,
            return_tensors="pt",
            pad_to_multiple_of=self.pad_to_multiple_of,
        )

        max_len = batch_inputs["input_ids"].shape[1]

        # Pad labels with -100
        labels = []
        for f in features:
            lab = f["labels"]
            labels.append(lab + [-100] * (max_len - len(lab)))

        batch_inputs["labels"] = torch.tensor(labels, dtype=torch.long)
        return batch_inputs

data_collator = CausalLMDataCollator(
    tokenizer=tokenizer,
    pad_to_multiple_of=8 if device == "cuda" else None
)

In [11]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=4,
    lora_alpha=8,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 745,472 || all params: 1,544,459,776 || trainable%: 0.0483


In [12]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./qwen25_obqa_lora_1",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=2e-4,
    num_train_epochs=5,
    fp16=(device == "cuda"),
    logging_steps=50,
    eval_strategy="no",
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none",
    optim="paged_adamw_8bit",
    group_by_length=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    processing_class=tokenizer,
    data_collator=data_collator,
)

In [13]:
pynvml.nvmlInit()
HANDLE = pynvml.nvmlDeviceGetHandleByIndex(0)

def sample_power_w():
    return pynvml.nvmlDeviceGetPowerUsage(HANDLE) / 1000.0

class PowerCallback(TrainerCallback):
    def __init__(self, handle, every_steps=10):
        self.handle = handle
        self.every_steps = every_steps
        self.powers = []
        self.times = []
        self.t0 = None
        self.t1 = None

    def on_train_begin(self, args, state, control, **kwargs):
        self.t0 = time.time()

    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % self.every_steps == 0:
            self.powers.append(pynvml.nvmlDeviceGetPowerUsage(self.handle) / 1000.0)
            self.times.append(time.time())

    def on_train_end(self, args, state, control, **kwargs):
        self.t1 = time.time()

def summarize_training_power(cb: PowerCallback):
    duration = (cb.t1 - cb.t0) if (cb.t0 is not None and cb.t1 is not None) else None
    if not cb.powers:
        return {"train_time_sec": duration, "avg_power_w": None, "energy_j": None}

    avg_power = sum(cb.powers) / len(cb.powers)
    # energy approximation across sampled interval
    sampled_duration = (cb.times[-1] - cb.times[0]) if len(cb.times) >= 2 else duration
    energy_j = avg_power * sampled_duration if sampled_duration is not None else None

    return {"train_time_sec": duration, "avg_power_w": avg_power, "energy_j": energy_j}


In [14]:
power_cb = PowerCallback(HANDLE, every_steps=10)
trainer.add_callback(power_cb)

train_out = trainer.train()
train_metrics = summarize_training_power(power_cb)

train_out, train_metrics

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
  return fn(*args, **kwargs)


Step,Training Loss
50,1.083583
100,0.145839
150,0.125434
200,0.114912
250,0.101968
300,0.095368
350,0.080197
400,0.073164
450,0.080665
500,0.079707


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


(TrainOutput(global_step=1550, training_loss=0.08732984756269763, metrics={'train_runtime': 5904.3318, 'train_samples_per_second': 4.198, 'train_steps_per_second': 0.263, 'total_flos': 1.39507949862912e+16, 'train_loss': 0.08732984756269763, 'epoch': 5.0}),
 {'train_time_sec': 5904.330672979355,
  'avg_power_w': 54.62311612903226,
  'energy_j': 320232.88870997267})