In [1]:
!pip install transformers datasets accelerate hf_xet --q

In [42]:
from tqdm.notebook import tqdm

In [2]:
CACHE_DIR = "/content/drive/MyDrive/coding/projects/LLM Finetuning + MMLU/hf_cache"

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-1.7B"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=CACHE_DIR,
    device_map="auto",
    torch_dtype="auto"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
from datasets import load_dataset

mmlu = load_dataset("cais/mmlu", "all", cache_dir=CACHE_DIR)

In [12]:
mmlu.shape

{'test': (14042, 4),
 'validation': (1531, 4),
 'dev': (285, 4),
 'auxiliary_train': (99842, 4)}

In [44]:
mmlu_subset = mmlu["test"].filter(lambda ex: ex["subject"] in ["high_school_mathematics", "us_foreign_policy"])
mmlu_subset.shape

(370, 4)

In [54]:
def format_example(example):
    q = example["question"]
    choices = example["choices"]
    prompt = "Solve the question and answer within <answer> tag.\n"
    prompt += f"Question: {q}\n"
    for i, letter in enumerate("ABCD"):
        prompt += f"{letter}. {choices[i]}\n"
    # prompt += "Answer:"
    return prompt

print(format_example(mmlu_subset[0]))

Think about the question and answer in <answer> tag.
Question: If a pentagon P with vertices at (– 2, – 4), (– 4, 1), (–1, 4), (2, 4), and (3, 0) is reflected across the line y = x to get a new pentagon, P’, then one of the vertices of P’ is
A. (0, – 3)
B. (4, 1)
C. (2, 2)
D. (– 4, –2)



In [60]:
import torch

def predict(prompt, format=True):
    if format:
      prompt = format_example(prompt)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=5,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id
        )
    generated = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    return generated.strip()

In [61]:
predict("tell me a joke", format=False)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


'about the moon\n\nOkay'

# USE A INSTRUCTION MODEL. THANK ME LATER.

In [55]:
test_N = 100
mmlu_subset = mmlu["test"].filter(lambda ex: ex["subject"] in ["high_school_mathematics", "us_foreign_policy"])
mmlu_subset = mmlu_subset.select(range(test_N))

preds = [predict(ex) for ex in tqdm(mmlu_subset)]

  0%|          | 0/100 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p'

In [50]:
def normalize(pred):
    for c in "ABCD":
        if c in pred:
            if c == "A": return 1
            elif c == "B": return 2
            elif c == "C": return 3
            elif c == "D": return 4
    return "?"

answers = [ex["answer"] for ex in mmlu_subset]
norm_preds = [normalize(p) for p in preds]

correct = sum(p == a for p, a in zip(norm_preds, answers))
print(f"Accuracy: {correct / len(answers):.2f}")

Accuracy: 0.00


In [None]:
def normalize(pred):
    pred.split("<answer>")
    for c in "ABCD":
        if c in pred:
            if c == "A": return 1
            elif c == "B": return 2
            elif c == "C": return 3
            elif c == "D": return 4
    return "?"

answers = [ex["answer"] for ex in mmlu_subset]
norm_preds = [normalize(p) for p in preds]

correct = sum(p == a for p, a in zip(norm_preds, answers))
print(f"Accuracy: {correct / len(answers):.2f}")

In [58]:
preds[2]

'E. 32'

In [41]:
for i, j in zip(norm_preds, answers):
  print(i, j, '|')

3 3 |
3 2 |
1 0 |
2 1 |
3 2 |
1 1 |
2 2 |
1 0 |
3 2 |
4 1 |
2 3 |
4 3 |
2 3 |
3 1 |
4 3 |
4 3 |
4 0 |
2 1 |
4 0 |
1 1 |
4 3 |
3 1 |
2 3 |
2 1 |
2 0 |
3 2 |
1 3 |
3 2 |
4 2 |
2 3 |
3 2 |
2 0 |
3 3 |
1 0 |
2 0 |
1 3 |
3 2 |
3 2 |
1 0 |
1 0 |
4 1 |
3 2 |
4 1 |
3 2 |
2 2 |
2 3 |
4 3 |
2 0 |
3 3 |
2 1 |
1 3 |
1 0 |
2 1 |
3 2 |
1 2 |
2 0 |
4 1 |
? 1 |
2 1 |
2 2 |
3 3 |
2 1 |
4 0 |
2 3 |
3 2 |
1 1 |
2 2 |
3 1 |
2 3 |
1 0 |
3 2 |
2 1 |
1 0 |
2 2 |
4 1 |
4 3 |
4 3 |
3 0 |
1 2 |
2 1 |
2 1 |
2 0 |
1 2 |
1 3 |
3 3 |
2 1 |
4 3 |
1 0 |
1 3 |
4 2 |
2 1 |
2 2 |
1 1 |
2 3 |
3 0 |
1 0 |
4 0 |
2 1 |
1 2 |
2 0 |
