# Week 10: LLMs

In [51]:
import torch

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

---

In [171]:
def temperature_scaling(logits, temperature=1.0):
    """Convert logits into probabilities with a temperature-scaled softmax.

    Args:
        logits: Tensor of unnormalized scores. The softmax is taken over the
            last dimension.
        temperature: Strictly positive scaling factor. Values below 1 sharpen
            the distribution, values above 1 flatten it, 1.0 is a plain softmax.

    Returns:
        Tensor with the same shape as ``logits`` whose last dimension sums to 1.

    Raises:
        ValueError: If ``temperature`` is not strictly positive. Dividing by
            zero would produce NaN probabilities, and a negative value would
            invert the ranking of the logits.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")
    return torch.softmax(logits / temperature, dim=-1)

def greedy_decoding(probs):
    """Pick the index of the largest entry along the last dimension of ``probs``."""
    best_ids = probs.argmax(dim=-1)
    return best_ids

----

## Benchmark on MMLU

In this exercise, we will benchmark an LLM on MMLU. We will use the `tinyMMLU` dataset from `tinyBenchmarks`, which contains a small subset of the MMLU dataset.


In [146]:
# First we load the dataset.
# `ds` is indexed by split name; the "test" split is selected in the next cell.
ds = load_dataset("tinyBenchmarks/tinyMMLU")

In [None]:
# Select the test split and look at its first example
# (the bare last expression is displayed as the cell output)
dataset = ds["test"]
dataset[0]

In [148]:
# Sanity check: every question must come with exactly 4 answer choices
assert all(len(sample["choices"]) == 4 for sample in dataset)

We need to encode each sample into a multiple-choice format. Additionally, we wrap each prompt in a chat message so that the model's chat template can later be used to format the input for the LLM.

In [177]:
# Encode function that creates proper chat messages and letter labels
OPTIONS = ["A", "B", "C", "D"]

def encode(examples):
    """Turn a batch of examples into chat messages and letter labels.

    Each question is rendered as a multiple-choice prompt: the question,
    followed by the lettered options and an answer placeholder, e.g.

        Question: What is the capital of France?
        A: Paris
        B: London
        C: Rome
        D: Madrid
        Answer:

    Args:
        examples: Batch dict with the parallel lists ``"question"``,
            ``"choices"`` and ``"answer"``; ``"answer"`` holds the index of
            the correct choice.

    Returns:
        Dict with ``"messages"`` (one single-turn user chat per example) and
        ``"label"`` (the letter in ``OPTIONS`` of the correct choice).
    """
    inputs = {"messages": [], "label": []}
    for idx, question in enumerate(examples["question"]):
        # One "<letter>: <choice>" line per option, in the order given by the dataset
        choices = examples["choices"][idx]
        option_lines = [f"{letter}: {choice}" for letter, choice in zip(OPTIONS, choices)]
        text = "\n".join([f"Question: {question}", *option_lines, "Answer:"])

        # Create chat messages
        messages = [
            {"role": "user", "content": text},
        ]
        inputs["messages"].append(messages)
        inputs["label"].append(OPTIONS[examples["answer"][idx]])

    return inputs

In [None]:
# Use the map function to apply the encode function to the dataset.
# With batched=True, encode is called on batches (here of up to 10 examples),
# which is why it indexes lists such as examples["question"].
dataset = dataset.map(encode, batched=True, batch_size=10)

In [None]:
# Let's look at the first example again:
# it should now also carry the "messages" and "label" fields returned by encode
dataset[0]

We can now load the model and tokenizer.

In [151]:
# Load a model and tokenizer.
# Both come from the same checkpoint name so the tokenizer (and the chat
# template applied below) matches the model.
model_name_or_path = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Switch the module to evaluation mode; the cells below only run inference
model.eval()

Now we need to apply the chat template to the dataset. We use the tokenizer for that.

In [None]:
# Apply the chat template to the dataset
def apply_chat_template(examples):
    """Render each chat with the tokenizer's chat template, as text and as token ids.

    ``add_generation_prompt=True`` appends the tokens that open the assistant
    turn. Without it the prompt ends after the user message, and the model has
    to spend generated tokens on starting its own turn before it can answer.

    Args:
        examples: Batch dict whose ``"messages"`` entry is a list of chats
            (each chat is a list of ``{"role", "content"}`` dicts).

    Returns:
        Dict with ``"text"`` (the rendered prompt strings) and ``"input_ids"``
        (the tokenized prompts), one entry per chat.
    """
    return {
        "text": [
            tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            for messages in examples["messages"]
        ],
        "input_ids": [
            tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)
            for messages in examples["messages"]
        ],
    }

dataset = dataset.map(apply_chat_template, batched=True, batch_size=2)

In [None]:
# Let's look at the first example again:
# it should now also carry the "text" and "input_ids" fields returned by apply_chat_template
dataset[0]

We will now generate predictions. We iterate over the dataset one example at a time and generate up to `max_tokens` tokens for each example, stopping once the sample limit `N` is reached.


In [None]:
# run forward pass
N = 5 # we stop after N examples
max_tokens = 10 # we generate at most max_tokens tokens for each example
predictions = {} # sample index -> one list of token strings per decoding step
temperature = 0.7

with torch.no_grad():
    for idx, sample in enumerate(dataset):
        # Check the limit before doing any work so that exactly N examples are processed
        if idx >= N:
            break

        print("Generating predictions for sample", idx)
        predictions[idx] = []
        # Add a batch dimension: (seq_len,) -> (1, seq_len)
        input_ids = torch.tensor(sample["input_ids"], device=model.device).unsqueeze(0)

        for _ in range(max_tokens):
            # Run a forward pass and keep only the logits at the last position:
            # they score the candidates for the next token
            logits = model(input_ids=input_ids).logits
            logits_last_token = logits[:, -1, :]

            # Pick the next token. Decoding is greedy (argmax), so the
            # temperature rescales the probabilities but does not change
            # which token is selected.
            probs = temperature_scaling(logits_last_token, temperature=temperature)
            preds = greedy_decoding(probs)

            # Stop once the model signals the end of its answer
            if preds.item() == tokenizer.eos_token_id:
                break

            # convert prediction to token
            preds_tokens = tokenizer.convert_ids_to_tokens(preds.tolist())
            predictions[idx].append(preds_tokens)

            # append prediction to input_ids so the next step conditions on it
            input_ids = torch.cat([input_ids, preds.unsqueeze(0)], dim=1)

In [None]:
# Tokens generated for the first sample (one entry per decoding step)
predictions[0]

In [None]:
# Compute accuracy
accuracies = []
for idx in predictions:
    # Each decoding step stored its token(s) in a list: flatten the steps and
    # strip whitespace and the word-boundary markers tokenizers may prepend
    # (e.g. "Ġ" or "▁"), so that a token like "ĠA" compares equal to "A".
    tokens = []
    for step in predictions[idx]:
        step_tokens = step if isinstance(step, (list, tuple)) else [step]
        tokens.extend(str(tok).replace("Ġ", "").replace("▁", "").strip() for tok in step_tokens)

    # The model's answer is the first generated token that is an option letter.
    # This is stricter than "the label appears somewhere in the output", which
    # would also reward a model that enumerates several letters.
    answer = next((tok for tok in tokens if tok in OPTIONS), None)
    correct = int(answer == dataset[idx]["label"])
    accuracies.append(correct)

In [None]:
# calculate accuracy: the fraction of scored samples that were answered correctly
# Fail with a clear message instead of a ZeroDivisionError when nothing was scored
assert accuracies, "no samples were scored; run the generation and scoring cells first"
accuracy = sum(accuracies) / len(accuracies)
print(accuracy)