# 1. Load Model, Tokenizer, and Dataset

## Original

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from processors import EmotionProcessor
from datasets import load_dataset

# Forwarded as `add_special_tokens` to the tokenizer calls in the evaluation cells.
add_special_tokens = False

# Model and tokenizer are both resolved from the same path; the model is moved
# to the GPU and put in eval mode.
model_path = "google/gemma-3-4b-it"
model = AutoModelForCausalLM.from_pretrained(model_path).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Test split, converted by the processor; the raw columns are dropped so only
# what `processor.to_chat_template` returns remains.
emotion_test = load_dataset("dair-ai/emotion", split="test")
processor = EmotionProcessor()
test_dataset = emotion_test.map(
    processor.to_chat_template,
    remove_columns=emotion_test.column_names,
)
def to_model_prompt(example):
    """Turn one chat-format example into a prompt string and its target text.

    Args:
        example: a row with a "messages" list of {"role": ..., "content": ...}
            dicts. Only the first message is rendered into the prompt; the
            content of the second message becomes the label.

    Returns:
        dict with "prompt" (chat-templated text with the generation prompt
        appended) and "labels" (content of the second message).
    """
    messages = example["messages"]
    rendered = tokenizer.apply_chat_template(
        [messages[0]],  # first message only (the user turn)
        tokenize=False,
        add_generation_prompt=True,
        continue_final_message=False,
    )
    target = messages[1]["content"]
    return {"prompt": rendered, "labels": target}


# Replace every existing column with the "prompt" / "labels" pair.
test_dataset = test_dataset.map(
    to_model_prompt,
    remove_columns=test_dataset.column_names,
)

## TRL

In [1]:
import torch
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM
from processors import ProcessorRegistry
from datasets import load_dataset

# Forwarded as `add_special_tokens` to the tokenizer calls in the evaluation cells.
add_special_tokens = False

model_path = "output/erbz0056_gemma-3-4b-it-qpl_composer_train_batch_size=1_gradient_accumulation_steps=8_learning_rate=0.0002_num_train_epochs=4_gradient_checkpointing=True_logging_steps=0.05_save_steps=0.5_random_seed=1_lora=True_r=16_alpha=32_dropout=0.05/checkpoint-5316"
dataset_id = "d4nieldev/qpl_composer"

# PEFT model and tokenizer are both read from the checkpoint path; the model
# is moved to the GPU and put in eval mode.
model = AutoPeftModelForCausalLM.from_pretrained(model_path).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Validation split, converted by the processor registered under the dataset id.
validation_split = load_dataset(dataset_id, split="validation")
processor_cls = ProcessorRegistry.get(dataset_id)
processor = processor_cls()
test_dataset = validation_split.map(
    processor.to_chat_template,
    remove_columns=validation_split.column_names,
)
def to_model_prompt(example):
    """Turn one chat-format example into a prompt string and its target text.

    Args:
        example: a row with a "messages" list of {"role": ..., "content": ...}
            dicts.

    Returns:
        dict with "prompt" (system and user messages rendered through the chat
        template, generation prompt appended) and "labels" (content of the
        first assistant message).

    Raises:
        StopIteration: if the example has no assistant message.
    """
    messages = example["messages"]
    prompt_roles = ("system", "user")
    context = [msg for msg in messages if msg["role"] in prompt_roles]
    rendered = tokenizer.apply_chat_template(
        context,
        tokenize=False,
        add_generation_prompt=True,
        continue_final_message=False,
    )
    target = next(msg["content"] for msg in messages if msg["role"] == "assistant")
    return {"prompt": rendered, "labels": target}


# Replace every existing column with the "prompt" / "labels" pair.
test_dataset = test_dataset.map(
    to_model_prompt,
    remove_columns=test_dataset.column_names,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/2753 [00:00<?, ? examples/s]

## Unsloth

In [None]:
# Loading unsloth
from unsloth import FastModel, get_chat_template, standardize_data_formats
from datasets import load_dataset
from processors import EmotionProcessor

# Forwarded as `add_special_tokens` to the tokenizer calls in the evaluation cells.
add_special_tokens = True

# Model and tokenizer are loaded together from the checkpoint path, then the
# model is passed through `FastModel.for_inference` and the tokenizer is given
# the "gemma-3" chat template.
model_path = "output/unsloth/gemma-3-4b-it-emotion_train_batch_size=1_gradient_accumulation_steps=1_learning_rate=0.0002_num_train_epochs=2_gradient_checkpointing=False_logging_steps=500_save_steps=5000_random_seed=1_lora=True_r=16_alpha=32_dropout=0.05_2025-05-01_12-56-00/checkpoint-32000"
model, tokenizer = FastModel.from_pretrained(
    model_name=model_path,
    max_seq_length=1024,
    load_in_4bit=False,
    dtype=None,
)
model = FastModel.for_inference(model)
tokenizer = get_chat_template(tokenizer, chat_template="gemma-3")

# Test split, converted by the processor; "messages" is renamed to
# "conversations" before the dataset is handed to `standardize_data_formats`.
emotion_test = load_dataset("dair-ai/emotion", split="test")
processor = EmotionProcessor()
test_dataset = (
    emotion_test
    .map(processor.to_chat_template, remove_columns=emotion_test.column_names)
    .rename_column("messages", "conversations")
)
test_dataset = standardize_data_formats(test_dataset)
def to_model_prompt(example):
    """Turn one conversation example into a prompt and its target text.

    Args:
        example: a row with a "conversations" list of
            {"role": ..., "content": ...} dicts. Only the first entry is
            rendered into the prompt; the content of the second entry becomes
            the label.

    Returns:
        dict with "prompt" (whatever `tokenizer.apply_chat_template` returns
        for the first turn with the generation prompt appended) and "labels"
        (content of the second turn).
    """
    turns = example["conversations"]
    rendered = tokenizer.apply_chat_template(
        [turns[0]],  # first turn only (the user)
        add_generation_prompt=True,
    )
    target = turns[1]["content"]
    return {'prompt': rendered, 'labels': target}


# Replace every existing column with the "prompt" / "labels" pair.
test_dataset = test_dataset.map(
    to_model_prompt,
    remove_columns=test_dataset.column_names,
)

# 2. Evaluate

## One Example Sanity Check

In [15]:
# Greedy-decode one prompt and print it next to its reference label.
example = test_dataset[53]

model_inputs = tokenizer(
    [example["prompt"]],
    padding=True,
    padding_side="left",  # https://huggingface.co/docs/transformers/llm_tutorial?padding=right+pad#padding-side
    return_tensors="pt",
    add_special_tokens=add_special_tokens,
).to("cuda")
prompt_length = model_inputs["input_ids"].shape[1]

# `generate` returns prompt + continuation; slice off the prompt tokens.
generation_ids = model.generate(
    **model_inputs,
    max_new_tokens=200,
    do_sample=False,
)[:, prompt_length:]
model_first_output = tokenizer.batch_decode(generation_ids, skip_special_tokens=True)[0]

for heading, body in (
    ("## Prompt", example["prompt"]),
    ("\n\n## Labels", example["labels"]),
    ("\n\n## Model Output", model_first_output),
):
    print(heading)
    print(body)

## Prompt
<bos><start_of_turn>user
Given a database schema, a QPL query prefix, and a natural language question, complete the final line of the query so it completes the user request.

QPL is a formalism used to describe data retrieval operations over an SQL schema in a modular manner.
A QPL plan is a sequence of instructions for querying tabular data to answer a natural language question.

Below is the formal specification for each operation in valid QPL:
<qpl> ::= <line>+
<line> ::= #<integer> = <operator>
<operator> ::= <scan> | <aggregate> | <filter> | <sort> | <topsort> | <join> | <except> | <intersect> | <union>

-- Leaf operator
<scan> ::= Scan Table [ <table-name> ] <pred>? <distinct>? <output-non-qualif>

-- Unary operators
<aggregate> ::= Aggregate [ <input> ] <group-by>? <output-non-qualif>
<filter> ::= Filter [ <input> ] <pred> <distinct>? <output-non-qualif>
<sort> ::= Sort [ <input> ] <order-by> <withTie>? <output-non-qualif>
<topsort> ::= TopSort [ <input> ] Rows [ <numb

## Emotion

### Strict Output Format

In [None]:
from tqdm import tqdm
import torch

# Exact-match accuracy: a prediction counts only when the decoded continuation
# equals the label string character for character.
bsz = 16
correct = 0
with torch.no_grad():
    for start in tqdm(range(0, len(test_dataset), bsz)):
        batch = test_dataset[start:start + bsz]
        model_inputs = tokenizer(
            batch["prompt"],
            padding=True,
            padding_side="left",  # https://huggingface.co/docs/transformers/llm_tutorial?padding=right+pad#padding-side
            return_tensors="pt",
            add_special_tokens=add_special_tokens,
        ).to("cuda")
        prompt_length = model_inputs["input_ids"].shape[1]

        # `generate` returns prompt + continuation; keep only the continuation.
        generation_ids = model.generate(**model_inputs, max_new_tokens=50, do_sample=False)
        generation_ids = generation_ids[:, prompt_length:]
        model_outputs = tokenizer.batch_decode(generation_ids, skip_special_tokens=True)

        correct += sum(
            1
            for output, expected in zip(model_outputs, batch["labels"])
            if output == expected
        )

print(f"Accuracy: {correct / len(test_dataset)}")

### Flexible Output Format

In [None]:
from tqdm import tqdm
import torch

# Lenient accuracy: the prediction is whichever emotion name appears EARLIEST
# in the lower-cased model output, so extra text around the answer is tolerated.
labels = ["sadness","joy","love","anger","fear","surprise"]

bsz = 16
correct = 0
with torch.no_grad():
    for i in tqdm(range(0, len(test_dataset), bsz)):
        batch = test_dataset[i:i+ bsz]
        model_inputs = tokenizer(
            batch["prompt"],
            padding=True,
            padding_side="left",  # https://huggingface.co/docs/transformers/llm_tutorial?padding=right+pad#padding-side
            return_tensors="pt",
            # Follow the flag set by the loader cell (as the other evaluation
            # cells do) instead of hard-coding False, so prompts are tokenized
            # the same way regardless of which loader was run.
            add_special_tokens=add_special_tokens,
        ).to("cuda")
        generation_ids = model.generate(**model_inputs, max_new_tokens=50, do_sample=False)
        # `generate` returns prompt + continuation; keep only the continuation.
        generation_ids = generation_ids[:, model_inputs["input_ids"].shape[1]:]
        model_outputs = tokenizer.batch_decode(generation_ids, skip_special_tokens=True)

        for model_output, full_label in zip(model_outputs, batch["labels"]):
            # NOTE(review): presumably drops a fixed 34-character prefix that the
            # processor puts in front of the emotion name - confirm against
            # EmotionProcessor before relying on this number.
            label = full_label[34:]
            model_output = model_output.lower()

            # Position of each emotion name in the output (names not found are dropped).
            found_at = {name: model_output.find(name) for name in labels}
            found_at = {name: pos for name, pos in found_at.items() if pos != -1}
            if not found_at:
                # No emotion name in the output: counted as incorrect.
                continue

            model_prediction = min(found_at, key=found_at.get)
            if model_prediction == label:
                correct += 1

print(f"Accuracy: {correct / len(test_dataset)}")

## Test Decomposer

In [None]:
from itertools import permutations

from tqdm import tqdm
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report

bsz = 8
op_correct = 0
sum_similarity = 0
sentences_count = 0

# Operator names (first line of an output, lower-cased) -> class id.
# Anything not listed is mapped to 'other'.
op_to_id = {
    'aggregate': 0,
    'except': 1,
    'filter': 2,
    'intersect': 3,
    'join': 4,
    'scan': 5,
    'sort': 6,
    'topsort': 7,
    'union': 8,
    'other': 9
}
id_to_op = {v: k for k, v in op_to_id.items()}

emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

y_pred = []
y_true = []


def _best_pairing_similarity(cross_similarity):
    """Mean similarity under the best one-to-one sentence pairing.

    Args:
        cross_similarity: square (n, n) matrix whose entry [r][c] is the
            similarity between model sentence r and label sentence c.

    Returns:
        The highest achievable average of n matched entries (each row and each
        column used exactly once); 0 when n == 0. All n! pairings are tried,
        which is only reasonable for the handful of sentences expected here.
    """
    n = len(cross_similarity)
    if n == 0:
        return 0
    rows = range(n)
    return max(
        sum(cross_similarity[r][c] for r, c in zip(rows, cols))
        for cols in permutations(rows)
    ) / n


with torch.no_grad():
    for i in tqdm(range(0, len(test_dataset), bsz)):
        batch = test_dataset[i:i+ bsz]
        model_inputs = tokenizer(
            batch["prompt"],
            padding=True,
            padding_side="left",  # https://huggingface.co/docs/transformers/llm_tutorial?padding=right+pad#padding-side
            return_tensors="pt",
            add_special_tokens=add_special_tokens,
        ).to("cuda")
        generation_ids = model.generate(**model_inputs, max_new_tokens=50, do_sample=False)
        # `generate` returns prompt + continuation; keep only the continuation.
        generation_ids = generation_ids[:, model_inputs["input_ids"].shape[1]:]
        model_outputs = tokenizer.batch_decode(generation_ids, skip_special_tokens=True)

        for model_output, label in zip(model_outputs, batch["labels"]):
            # First line is treated as the operator, remaining lines as sentences.
            model_lines = model_output.split("\n")
            label_lines = label.split("\n")

            model_op_id = op_to_id.get(model_lines[0].lower(), op_to_id["other"])
            label_op_id = op_to_id.get(label_lines[0].lower(), op_to_id["other"])
            y_pred.append(model_op_id)
            y_true.append(label_op_id)

            if model_op_id != label_op_id:
                continue
            op_correct += 1

            model_sentences = model_lines[1:]
            label_sentences = label_lines[1:]
            n_sentences = len(label_sentences)
            sentences_count += n_sentences

            if len(model_sentences) != n_sentences:
                # Sentence counts disagree: dump both for inspection. The
                # example adds nothing to the similarity sum.
                print("======================")
                print(model_output)
                print("----")
                print(label)
                print("======================")
                continue

            if n_sentences == 0:
                # Nothing to compare (and nothing to embed); contributes 0.
                continue

            # Embed everything in one call, then keep only the model x label
            # block of dot products so model sentences are never compared with
            # each other (the previous fixed indexing did that for n > 2).
            embeddings = emb_model.encode(model_sentences + label_sentences)
            cross_similarity = embeddings[:n_sentences] @ embeddings[n_sentences:].T
            sum_similarity += _best_pairing_similarity(cross_similarity)

print(f"Operator Accuracy: {op_correct / len(test_dataset)}")
# Averaged over ALL examples with a correct operator (including those that
# contributed 0 above). Guarded so a run with no correct operator reports nan
# instead of raising ZeroDivisionError.
mean_similarity = sum_similarity / op_correct if op_correct else float("nan")
print(f"Sentence Similarity (when operator is correct): {mean_similarity}")

# Generate a classification report
report = classification_report(y_true, y_pred, output_dict=True)

# Print nicely: numeric class ids in the index are replaced by operator names,
# other index entries are left as they are.
df = pd.DataFrame(report).transpose()
df.index = df.index.map(lambda x: id_to_op[int(x)] if x.isdigit() else x)
df['support'] = df['support'].astype(int)
print(df)

cm = confusion_matrix(y_true, y_pred, labels=list(op_to_id.values()))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=list(op_to_id.keys()))
disp.plot(cmap=plt.cm.Blues)

plt.xticks(rotation=45)
plt.show()