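# 1. Load Model, Tokenizer, and Dataset

The loader cells below (Original, TRL, Unsloth) are alternatives: each one defines the same names (`model`, `tokenizer`, `test_dataset`, `add_special_tokens`), so the evaluation cells use whichever loader was run last.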

## Original (`google/gemma-3-4b-it` loaded directly, no adapter)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from processors import EmotionProcessor
from datasets import load_dataset

# Flag forwarded to the tokenizer call in the evaluation cells.
add_special_tokens = False

# Model and tokenizer come from the same hub id; the model is moved to the GPU
# and put in eval mode in one chained expression.
model_path = "google/gemma-3-4b-it"
model = AutoModelForCausalLM.from_pretrained(model_path).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Test split of the emotion dataset, converted row by row with the project's
# EmotionProcessor; the original columns are dropped in the same pass.
test_dataset = load_dataset("dair-ai/emotion", split="test")
processor = EmotionProcessor()
test_dataset = test_dataset.map(
    processor.to_chat_template,
    remove_columns=test_dataset.column_names,
)
def to_model_prompt(example):
    """Turn one chat-formatted row into a text prompt plus its reference answer.

    Args:
        example: row whose "messages" entry is a list of
            {"role": "...", "content": "..."} dicts. Entry 0 is rendered as
            the prompt; the "content" of entry 1 becomes the label.

    Returns:
        dict with keys "prompt" (chat-template text, generation prompt
        appended) and "labels" (content of the second message).
    """
    conversation = example["messages"]
    rendered = tokenizer.apply_chat_template(
        [conversation[0]],  # first message only, the reply is held out
        tokenize=False,
        add_generation_prompt=True,
        continue_final_message=False,
    )
    reference = conversation[1]["content"]
    return dict(prompt=rendered, labels=reference)
# Run to_model_prompt over every row, dropping the columns that existed before
# the call so only what the function returns remains.
test_dataset = test_dataset.map(
    to_model_prompt,
    remove_columns=test_dataset.column_names,
)

## TRL

In [1]:
import torch
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM
from processors import QPLDecomposerProcessor
from datasets import load_dataset

# Flag forwarded to the tokenizer call in the evaluation cells.
add_special_tokens = False

# Model and tokenizer are both read from the same local checkpoint directory;
# the model is moved to the GPU and put in eval mode in one chained expression.
model_path = "output/gemma-3-4b-it-question_decomposer_ds_train_batch_size=1_gradient_accumulation_steps=1_learning_rate=0.0002_num_train_epochs=2_gradient_checkpointing=False_logging_steps=500_save_steps=5000_random_seed=1_lora=True_r=16_alpha=32_dropout=0.05/checkpoint-20958"
model = AutoPeftModelForCausalLM.from_pretrained(model_path).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Validation split of the question-decomposer dataset, converted row by row
# with the project's QPLDecomposerProcessor; original columns are dropped.
# NOTE(review): the flexible-format evaluation cell scores against emotion
# class names - confirm which evaluation cells are meant to run after this loader.
test_dataset = load_dataset("bgunlp/question_decomposer_ds", split="validation")
processor = QPLDecomposerProcessor()
test_dataset = test_dataset.map(
    processor.to_chat_template,
    remove_columns=test_dataset.column_names,
)
def to_model_prompt(example):
    """Turn one chat-formatted row into a text prompt plus its reference answer.

    Args:
        example: row whose "messages" entry is a list of
            {"role": "...", "content": "..."} dicts. Entry 0 is rendered as
            the prompt; the "content" of entry 1 becomes the label.

    Returns:
        dict with keys "prompt" (chat-template text, generation prompt
        appended) and "labels" (content of the second message).
    """
    conversation = example["messages"]
    rendered = tokenizer.apply_chat_template(
        [conversation[0]],  # first message only, the reply is held out
        tokenize=False,
        add_generation_prompt=True,
        continue_final_message=False,
    )
    reference = conversation[1]["content"]
    return dict(prompt=rendered, labels=reference)
# Run to_model_prompt over every row, dropping the columns that existed before
# the call so only what the function returns remains.
test_dataset = test_dataset.map(
    to_model_prompt,
    remove_columns=test_dataset.column_names,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/3028 [00:00<?, ? examples/s]



Map:   0%|          | 0/3028 [00:00<?, ? examples/s]

## Unsloth

In [3]:
# Loading unsloth
from unsloth import FastModel, get_chat_template, standardize_data_formats
from datasets import load_dataset
from processors import EmotionProcessor

# Flag forwarded to the tokenizer call in the evaluation cells.
# NOTE(review): the prompt printed by the sanity-check cell already begins with
# "<bos>"; confirm that tokenizing it with add_special_tokens=True does not
# prepend a second BOS token.
add_special_tokens = True

# Model and tokenizer are both returned by FastModel from the local checkpoint.
model_path = "output/unsloth/gemma-3-4b-it-emotion_train_batch_size=1_gradient_accumulation_steps=1_learning_rate=0.0002_num_train_epochs=2_gradient_checkpointing=False_logging_steps=500_save_steps=5000_random_seed=1_lora=True_r=16_alpha=32_dropout=0.05_2025-05-01_12-56-00/checkpoint-32000"
model, tokenizer = FastModel.from_pretrained(
    model_name=model_path,
    dtype=None,
    load_in_4bit=False,
    max_seq_length=1024,
)
# Pass the model through FastModel.for_inference before generating, and attach
# the "gemma-3" chat template to the tokenizer.
model = FastModel.for_inference(model)
tokenizer = get_chat_template(tokenizer, chat_template="gemma-3")

# Test split of the emotion dataset, converted row by row with the project's
# EmotionProcessor. The resulting "messages" column is renamed to
# "conversations" before the dataset goes through standardize_data_formats.
test_dataset = load_dataset("dair-ai/emotion", split="test")
processor = EmotionProcessor()
test_dataset = test_dataset.map(
    processor.to_chat_template,
    remove_columns=test_dataset.column_names,
).rename_column("messages", "conversations")
test_dataset = standardize_data_formats(test_dataset)
def to_model_prompt(example):
    """Turn one chat-formatted row into a text prompt plus its reference answer.

    Args:
        example: row whose "conversations" entry is a list of
            {"role": "...", "content": "..."} dicts. Entry 0 is rendered as
            the prompt; the "content" of entry 1 becomes the label.

    Returns:
        dict with keys "prompt" (chat-template text, generation prompt
        appended) and "labels" (content of the second turn).
    """
    turns = example["conversations"]
    prompt = tokenizer.apply_chat_template(
        [turns[0]],  # only the user
        # Ask for text explicitly, as the other loader cells do: the evaluation
        # cells tokenize "prompt" themselves, and a plain tokenizer would
        # return token ids here when `tokenize` is left at its default.
        tokenize=False,
        add_generation_prompt=True,
    )
    return {'prompt': prompt, 'labels': turns[1]["content"]}
# Run to_model_prompt over every row, dropping the columns that existed before
# the call so only what the function returns remains.
test_dataset = test_dataset.map(
    to_model_prompt,
    remove_columns=test_dataset.column_names,
)

==((====))==  Unsloth 2025.4.1: Fast Gemma3 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.643 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Some parameters are on the meta device because they were offloaded to the cpu.


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

# 2. Evaluate

## One Example Sanity Check

In [5]:
# Sanity check on the first test row: decode greedily and print the prompt,
# the reference answer and the model's continuation one after another.
example = test_dataset[0]

model_inputs = tokenizer(
    [example["prompt"]],
    add_special_tokens=add_special_tokens,
    padding=True,
    padding_side="left",  # https://huggingface.co/docs/transformers/llm_tutorial?padding=right+pad#padding-side
    return_tensors="pt",
).to("cuda")

# generate() returns prompt + continuation; slice the prompt tokens off.
prompt_len = model_inputs["input_ids"].shape[1]
generation_ids = model.generate(**model_inputs, max_new_tokens=200, do_sample=False)
generation_ids = generation_ids[:, prompt_len:]
model_first_output = tokenizer.batch_decode(generation_ids, skip_special_tokens=True)[0]

report = (
    ("## Prompt", example["prompt"]),
    ("\n\n## Labels", example["labels"]),
    ("\n\n## Model Output", model_first_output),
)
for heading, body in report:
    print(heading)
    print(body)



## Prompt
<bos><start_of_turn>user
Below is a piece of text. Classify it into one of: sadness, joy, love, anger, fear, surprise.

"im feeling rather rotten so im not very ambitious right now"<end_of_turn>
<start_of_turn>model



## Labels
The emotion in the above text is: sadness


## Model Output
The emotion in the following the opposite in the following thel only sadness despite the realming the realming the emotion in the following the newly good sadness in the Ex  sadnessing sadness in the Namibenseing happiness emotions in the Namib equing sadness in the following the emotion in the following theisticus againing te sadness in thelr emotional sadness in the Examin in the kingdoming happiness sadness/ sadness  sadness and sadness and sadnessys: sadness alone sadness and sadness! sadnessing sadness os: sadness  sadness  sadness  sadness  sadnessing sadness  sadnessing sadnessing sadnessing sadness and sadnessing sadness in the out er sadness> sadness  sadness  sadness in the Figure i

## Test Dataset - Strict Output Format

In [None]:
from tqdm import tqdm
import torch

# Strict scoring: a prediction counts only when the decoded continuation is
# exactly equal to the reference string.
bsz = 16
correct = 0
with torch.no_grad():
    for start in tqdm(range(0, len(test_dataset), bsz)):
        batch = test_dataset[start:start + bsz]
        model_inputs = tokenizer(
            batch["prompt"],
            add_special_tokens=add_special_tokens,
            padding=True,
            padding_side="left",  # https://huggingface.co/docs/transformers/llm_tutorial?padding=right+pad#padding-side
            return_tensors="pt",
        ).to("cuda")

        # generate() returns prompt + continuation; slice the prompt tokens off.
        prompt_len = model_inputs["input_ids"].shape[1]
        generation_ids = model.generate(**model_inputs, max_new_tokens=50, do_sample=False)
        generation_ids = generation_ids[:, prompt_len:]
        model_outputs = tokenizer.batch_decode(generation_ids, skip_special_tokens=True)

        correct += sum(output == gold for output, gold in zip(model_outputs, batch["labels"]))

print(f"Accuracy: {correct / len(test_dataset)}")

## Test Dataset - Flexible Output Format

In [None]:
from tqdm import tqdm
import torch

labels = ["sadness","joy","love","anger","fear","surprise"]

# Reference answers are this fixed sentence followed by the class name (see
# the "## Labels" output of the sanity-check cell); its length is the 34
# characters that are sliced off to recover the bare class name.
LABEL_PREFIX = "The emotion in the above text is: "


def first_mentioned_label(text, candidates):
    """Return the candidate that occurs earliest in `text`, or None if none occurs.

    Args:
        text: string to search (the caller lower-cases the model output first).
        candidates: iterable of class names to look for as substrings.

    Returns:
        The candidate with the smallest `str.find` position; ties go to the
        one listed first. None when no candidate is a substring of `text`.
    """
    positions = {name: text.find(name) for name in candidates}
    mentioned = [name for name, pos in positions.items() if pos != -1]
    if not mentioned:
        return None
    return min(mentioned, key=positions.get)


# Flexible scoring: a prediction counts when the first class name mentioned
# anywhere in the (lower-cased) continuation equals the reference class.
bsz = 16
correct = 0
with torch.no_grad():
    for start in tqdm(range(0, len(test_dataset), bsz)):
        batch = test_dataset[start:start + bsz]
        model_inputs = tokenizer(
            batch["prompt"],
            padding=True,
            padding_side="left",  # https://huggingface.co/docs/transformers/llm_tutorial?padding=right+pad#padding-side
            return_tensors="pt",
            # Use the flag set by the loader cell, like the sanity-check and
            # strict-format cells, so all metrics see identically tokenized prompts.
            add_special_tokens=add_special_tokens,
        ).to("cuda")

        # generate() returns prompt + continuation; slice the prompt tokens off.
        generation_ids = model.generate(**model_inputs, max_new_tokens=50, do_sample=False)
        generation_ids = generation_ids[:, model_inputs["input_ids"].shape[1]:]
        model_outputs = tokenizer.batch_decode(generation_ids, skip_special_tokens=True)

        for gold_text, model_output in zip(batch["labels"], model_outputs):
            gold = gold_text[len(LABEL_PREFIX):]
            # Outputs that mention no class name yield None and are scored as wrong.
            if first_mentioned_label(model_output.lower(), labels) == gold:
                correct += 1

print(f"Accuracy: {correct / len(test_dataset)}")