In [1]:
# -------------------- INSTALL DEPENDENCIES --------------------
!pip install -q unsloth peft transformers datasets accelerate bitsandbytes polars rouge_score nltk evaluate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.9/46.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.5/294.5 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.8/375.8 kB[0m [31m29.7 MB/s[0m eta [3

In [2]:
# -------------------- IMPORTS --------------------
import os
import torch
import math
import json
import polars as pl
import numpy as np
from datasets import Dataset
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk import download
from typing import List
from peft import LoraConfig
from unsloth import FastLanguageModel
from transformers import (
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
# -------------------- DOWNLOAD NLTK DATA --------------------
try:
    download('punkt')
except:
    pass

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
# -------------------- DEVICE --------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

Device: cuda


In [6]:
# -------------------- LOAD TOKENIZER & MODEL --------------------
model_name = "unsloth/SmolLM-1.7B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Use Unsloth to load model with **4-bit QLoRA** config
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=512,
    dtype=torch.bfloat16,     # or torch.float16 if needed
    load_in_4bit=True         # That’s enough for Unsloth QLoRA!
)

==((====))==  Unsloth 2025.6.12: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [8]:
# -------------------- PREPARE LoRA --------------------
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [9]:
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_config.r,
    target_modules=lora_config.target_modules,
    lora_alpha=lora_config.lora_alpha,
    lora_dropout=lora_config.lora_dropout,
    bias=lora_config.bias,
)

model.print_trainable_parameters()

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.6.12 patched 24 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


trainable params: 18,087,936 || all params: 1,729,464,320 || trainable%: 1.0459


In [10]:
# -------------------- LOAD ANIME QA DATASET --------------------
anime_list = [
    "aot", "naruto", "onepiece", "hellsing", "dr_stone", "berserk", "evangelion",
    "darling-in-the-franxx", "frieren", "gundam_00", "kurokonobasuke", "chainsawman", "onepunch"
]

In [11]:
formatted_examples = []
for anime in anime_list:
    path = f"hf://datasets/theblackcat102/anime-understanding-dataset/{anime}_dev.jsonl"
    df = pl.read_ndjson(path).to_pandas()
    for row in df.to_dict(orient="records"):
        correct = row[row["answer"]]
        messages = [
            {"role": "user", "content": f"Answer this question about {anime} anime:\n\n{row['question']}\n\nA) {row['A']}\nB) {row['B']}\nC) {row['C']}\nD) {row['D']}"},
            {"role": "assistant", "content": f"The correct answer is {row['answer']}) {correct}"}
        ]
        text = tokenizer.apply_chat_template(messages, tokenize=False)
        formatted_examples.append({"text": text, "correct_letter": row["answer"], "answer": correct})

dataset = Dataset.from_list(formatted_examples)

In [12]:
# -------------------- TOKENIZE DATA --------------------
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding=False, max_length=512)

tokenized = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8)

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

In [13]:
# -------------------- TRAINING ARGUMENTS --------------------
args = TrainingArguments(
    output_dir="unsloth_qlora_anime_qa",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=15,
    learning_rate=2e-4,
    fp16=False,
    bf16=True,
    logging_steps=10,
    save_strategy="no",
    report_to="none"
)

In [14]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    data_collator=collator,
    tokenizer=None
)
trainer.label_names = ["labels"]

  trainer = Trainer(


In [15]:
# -------------------- TRAIN --------------------
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 65 | Num Epochs = 15 | Total steps = 135
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 18,087,936 of 1,700,000,000 (1.06% trained)


Step,Training Loss
10,2.2267
20,1.8019
30,1.6266
40,1.5328
50,1.2954
60,1.2175
70,1.1523
80,0.9905
90,0.8027
100,0.6918


TrainOutput(global_step=135, training_loss=1.1244723461292407, metrics={'train_runtime': 115.5225, 'train_samples_per_second': 8.44, 'train_steps_per_second': 1.169, 'total_flos': 1054681239060480.0, 'train_loss': 1.1244723461292407, 'epoch': 15.0})

In [16]:
# -------------------- SAVE FINAL MODEL --------------------
model.save_pretrained("unsloth_qlora_anime_qa")
tokenizer.save_pretrained("unsloth_qlora_anime_qa")

('unsloth_qlora_anime_qa/tokenizer_config.json',
 'unsloth_qlora_anime_qa/special_tokens_map.json',
 'unsloth_qlora_anime_qa/chat_template.jinja',
 'unsloth_qlora_anime_qa/vocab.json',
 'unsloth_qlora_anime_qa/merges.txt',
 'unsloth_qlora_anime_qa/added_tokens.json',
 'unsloth_qlora_anime_qa/tokenizer.json')

In [17]:
# -------------------- EVALUATION --------------------
class PerplexityEvaluator:
    def __init__(self, model, tokenizer, device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

    def compute(self, prompt, target):
        full = prompt + " " + target
        full_ids = self.tokenizer(full, return_tensors="pt").input_ids.to(self.device)
        prompt_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)
        context_len = prompt_ids.shape[1]
        with torch.no_grad():
            out = self.model(full_ids, labels=full_ids)
            logits = out.logits[0, context_len-1:-1]
            target_ids = full_ids[0, context_len:]
            log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
            selected = log_probs.gather(1, target_ids.unsqueeze(1)).squeeze(1)
            return torch.exp(-selected.mean()).item()

In [18]:
scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
smooth = SmoothingFunction().method1

eval_model = model.merge_and_unload()
eval_model.eval()
evaluator = PerplexityEvaluator(eval_model, tokenizer, device)

examples = dataset.select(range(15))
rouge_1s, rouge_Ls, bleus, perplexities = [], [], [], []

for ex in examples:
    text = ex["text"]
    parts = text.split("<|im_start|>assistant\n")
    if len(parts) != 2:
        continue
    prompt = parts[0] + "<|im_start|>assistant\n"
    answer = parts[1].replace("<|im_end|>", "").strip()

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    gen_ids = eval_model.generate(input_ids, max_new_tokens=60, pad_token_id=tokenizer.eos_token_id)
    output = tokenizer.decode(gen_ids[0][input_ids.shape[1]:], skip_special_tokens=True).strip()

    ppl = evaluator.compute(prompt, answer)
    perplexities.append(ppl)

    rouge = scorer.score(answer, output)
    rouge_1s.append(rouge["rouge1"].fmeasure)
    rouge_Ls.append(rouge["rougeL"].fmeasure)

    bleus.append(sentence_bleu([answer.split()], output.split(), smoothing_function=smooth))
    print(f"Prompt: {prompt}")
    print(f"Answer: {answer}")
    print(f"Output: {output}")
    print(f"Perplexity: {ppl:.2f}")
    print(f"ROUGE-1: {rouge['rouge1'].fmeasure:.3f}")
    print(f"ROUGE-L: {rouge['rougeL'].fmeasure:.3f}")
    print(f"BLEU: {sentence_bleu([answer.split()], output.split(), smoothing_function=smooth):.3f}")
    print("----------------------------------")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Prompt: <|im_start|>user
Answer this question about aot anime:

Eren Jaeger's downfall was heavily influenced by:

A) His inability to control his temper
B) His desire for peace and harmony
C) His knowledge of the future and the inevitability of its outcome
D) His ambition to become a Marleyan officer<|im_end|>
<|im_start|>assistant

Answer: The correct answer is C) His knowledge of the future and the inevitability of its outcome
Output: The answer is C) His knowledge of the future and the inevitability of its outcome.
Perplexity: 1.91
ROUGE-1: 0.968
ROUGE-L: 0.968
BLEU: 0.811
----------------------------------
Prompt: <|im_start|>user
Answer this question about aot anime:

How did Ymir save Daz during the snowstorm exercise?

A) By carrying him on her back
B) By using her Titan form
C) By leaving him behind to get help
D) The text does not specify how Ymir saved Daz<|im_end|>
<|im_start|>assistant

Answer: The correct answer is D) The text does not specify how Ymir saved Daz
Output: T

In [19]:
print("--- Evaluation Summary ---")
print(f"Average Perplexity: {np.mean(perplexities):.2f}")
print(f"Average ROUGE-1: {np.mean(rouge_1s):.3f}")
print(f"Average ROUGE-L: {np.mean(rouge_Ls):.3f}")
print(f"Average BLEU: {np.mean(bleus):.3f}")

--- Evaluation Summary ---
Average Perplexity: 2.79
Average ROUGE-1: 0.524
Average ROUGE-L: 0.518
Average BLEU: 0.227
