# Week 10: LLMs

In [1]:
import torch

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

---

In [2]:
def temperature_scaling(logits, temperature=1.0):
    probs = torch.softmax(logits / temperature, dim=-1)
    return probs

def greedy_decoding(probs):
    return torch.argmax(probs, dim=-1)

----

## Benchmark on MMLU

In this exercise, we will benchmark a LLM on the MMLU dataset. We will use the `tinyBenchmarks` dataset which contains a small subset of the MMLU dataset.


In [3]:
# First we load the dataset
ds = load_dataset("tinyBenchmarks/tinyMMLU")

In [4]:
# Let's look at the first example
dataset = ds["test"]
dataset[0]

{'question': 'The number of days it takes to build a new house has a variance of 386. A sample of 40 new homes shows an average building time of 83 days. With what confidence can we assert that the average building time for a new house is between 80 and 90 days?',
 'subject': 'high_school_statistics',
 'choices': ['15.4%', '17.8%', '20.0%', '82.1%'],
 'answer': 3,

In [5]:
# Make sure each example has 4 choices
for sample in dataset:
    assert len(sample["choices"]) == 4

We need to encode each sample into a multiple choice format. Additionally, we need to create a chat template to format the input for the LLM.

In [6]:
# Modified encode function to create proper chat messages
OPTIONS = ["A", "B", "C", "D"]

def encode(examples):
    inputs = {"messages": [], "label": []}
    for idx, question in enumerate(examples["question"]):
        # Format the question and options
        text = f"Question: {question}\n"
        options = "".join([f"{OPTIONS[i]}: {examples['choices'][idx][i]}\n" for i in range(4)])
        text += options
        text += "Answer:"
        
        # Create chat messages
        messages = [
            {"role": "user", "content": text},
        ]
        inputs["messages"].append(messages)
        inputs["label"].append(OPTIONS[examples["answer"][idx]])
    return inputs

In [7]:
# Use the map function to apply the encode function to the dataset
dataset = dataset.map(encode, batched=True, batch_size=10)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [8]:
# Let's look at the first example again
dataset[0]

{'question': 'The number of days it takes to build a new house has a variance of 386. A sample of 40 new homes shows an average building time of 83 days. With what confidence can we assert that the average building time for a new house is between 80 and 90 days?',
 'subject': 'high_school_statistics',
 'choices': ['15.4%', '17.8%', '20.0%', '82.1%'],
 'answer': 3,
 'messages': [{'content': 'Question: The number of days it takes to build a new house has a variance of 386. A sample of 40 new homes shows an average building time of 83 days. With what confidence can we assert that the average building time for a new house is between 80 and 90 days?\nA: 15.4%\nB: 17.8%\nC: 20.0%\nD: 82.1%\nAnswer:',
   'role': 'user'}],
 'label': 'D'}

We can now load the model and tokenizer.

In [9]:
# Load a model and tokenizer
model_name_or_path = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 2048, padding_idx=2)
    (layers): ModuleList(
      (0-23): 24 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
 

Now we need to apply the chat template to the dataset. We use the tokenizer for that.

In [10]:
# Apply the chat template to the dataset
def apply_chat_template(examples):
    return {
        "text": [tokenizer.apply_chat_template(messages, tokenize=False) for messages in examples["messages"]],
        "input_ids": [tokenizer.apply_chat_template(messages, tokenize=True) for messages in examples["messages"]]
    }

dataset = dataset.map(apply_chat_template, batched=True, batch_size=2)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [11]:
# Let's look at the first example again
dataset[0]

{'question': 'The number of days it takes to build a new house has a variance of 386. A sample of 40 new homes shows an average building time of 83 days. With what confidence can we assert that the average building time for a new house is between 80 and 90 days?',
 'subject': 'high_school_statistics',
 'choices': ['15.4%', '17.8%', '20.0%', '82.1%'],
 'answer': 3,
 'messages': [{'content': 'Question: The number of days it takes to build a new house has a variance of 386. A sample of 40 new homes shows an average building time of 83 days. With what confidence can we assert that the average building time for a new house is between 80 and 90 days?\nA: 15.4%\nB: 17.8%\nC: 20.0%\nD: 82.1%\nAnswer:',
   'role': 'user'}],
 'label': 'D',
 'text': '<|im_start|>system\nYou are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>\n<|im_start|>user\nQuestion: The number of days it takes to build a new house has a variance of 386. A sample of 40 new homes shows an average building

We will now generate predictions for each sample. We will iterate over the dataset one example at a time and generate predictions for each example.


In [12]:
# run forward pass
N = 5 # we stop after N examples
max_tokens = 10 # we generate max_tokens predictions for each example
predictions = {} # collect predictions for each sample
temperature = 1.0   

with torch.no_grad():
    for idx, sample in enumerate(dataset):
        print("Generating predictions for sample", idx)
        predictions[idx] = []
        input_ids = torch.tensor(sample["input_ids"]).unsqueeze(0)

        for _ in range(max_tokens):
            
            outputs = model.forward(input_ids)
            logits = outputs.logits
            # get the logits for the last token
            logits_last_token = logits[:, -1, :]
            logits_last_token.shape

            # sample a token from the distribution
            probs = temperature_scaling(logits_last_token, temperature=temperature)
            preds = greedy_decoding(probs)

            # convert prediction to token
            preds_tokens = tokenizer.convert_ids_to_tokens(preds.tolist())
            predictions[idx].append(preds_tokens)

            # append prediction to input_ids
            input_ids = torch.cat([input_ids, preds.unsqueeze(0)], dim=1)

        if idx >= N:
            break

Generating predictions for sample 0
Generating predictions for sample 1
Generating predictions for sample 2
Generating predictions for sample 3
Generating predictions for sample 4
Generating predictions for sample 5


In [13]:
predictions[0]

[['<|im_start|>'],
 ['ass'],
 ['istant'],
 ['Ċ'],
 ['C'],
 [':'],
 ['Ġ'],
 ['2'],
 ['0'],
 ['.']]

In [16]:
# Compute accuracy
accuracies = []
for idx in predictions:
    preds = predictions[idx]
    print(preds)

    # get the label for the sample
    label = dataset[idx]["label"]
    print(label)

    # did the model predict the correct answer?
    correct = int(label in preds)
    accuracies.append(correct)

[['<|im_start|>'], ['ass'], ['istant'], ['Ċ'], ['C'], [':'], ['Ġ'], ['2'], ['0'], ['.']]
D
[['<|im_start|>'], ['ass'], ['istant'], ['Ċ'], ['C'], [':'], ['ĠLosing'], ['Ġ'], ['5'], ['-']]
C
[['<|im_start|>'], ['ass'], ['istant'], ['Ċ'], ['D'], [':'], ['ĠIt'], ['Ġcauses'], ['Ġa'], ['Ġmore']]
B
[['<|im_start|>'], ['ass'], ['istant'], ['Ċ'], ['D'], [':'], ['ĠIt'], ['Ġis'], ['Ġthe'], ['Ġonly']]
B
[['<|im_start|>'], ['ass'], ['istant'], ['Ċ'], ['To'], ['Ġsolve'], ['Ġthis'], ['Ġproblem'], [','], ['Ġwe']]
C
[['<|im_start|>'], ['ass'], ['istant'], ['Ċ'], ['A'], [':'], ['ĠStates'], ['Ġshould'], ['Ġgive'], ['Ġspecial']]
C


In [17]:
# calculate accuracy
accuracy = sum(accuracies) / len(accuracies)
print(accuracy)

0.0
