In [1]:
# ---------DEPENDENCIES-----------
!pip install -q transformers accelerate datasets peft bitsandbytes trl evaluate rouge_score nltk polars

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.8/375.8 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m114.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m95.7 MB/s[0m eta [36m

In [2]:
# ------------QLoRA Fine-Tuning for Anime QA Dataset with Perplexity, ROUGE, BLEU Evaluation-----------
import os
import torch
import math
import json
import polars as pl
import numpy as np
import logging
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling
)
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk import download
from typing import List
# ---------NLTK SETUP-----------
try:
    download('punkt')
except:
    pass
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger("qlora-anime-qa")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# ---------DEVICE SETUP-----------
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

In [4]:
# ---------LOAD TOKENIZER & MODEL (QLoRA READY)-----------
model_name = "HuggingFaceTB/SmolLM-1.7B-Instruct"

In [6]:
# Quantization Guide:
# Use one of the following configurations depending on your precision/efficiency goals:
#
# 4-bit Quantization (QLoRA): Use NF4 quant type with bfloat16 compute for best performance
# To switch to 8-bit, use `load_in_8bit=True` in quantization_config instead
# For pure FP16 or BF16, skip quantization_config and set torch_dtype + bf16=True in TrainingArguments
# 4-bit QLoRA (Recommended for memory efficiency + training performance):
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)
#
# 8-bit Quantization (Good for inference-only memory savings):
#     bnb_config = BitsAndBytesConfig(
#         load_in_8bit=True
#     )
#
# No quantization (Full precision FP16/BF16 training):
#     # Remove `quantization_config` entirely
#     # Instead, use:
#     torch_dtype=torch.bfloat16 or torch.float16
#     bf16=True (in TrainingArguments)

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16
)

base_model = prepare_model_for_kbit_training(base_model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.59k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/738 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

In [8]:
# ---------LORA CONFIG-----------
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

trainable params: 18,087,936 || all params: 1,729,464,320 || trainable%: 1.0459


In [9]:
# ---------LOAD ANIME DATASET-----------
anime_list = [
    "aot", "naruto", "onepiece", "hellsing", "dr_stone", "berserk", "evangelion",
    "darling-in-the-franxx", "frieren", "gundam_00", "kurokonobasuke", "chainsawman", "onepunch"
]

formatted_examples = []
for anime in anime_list:
    path = f"hf://datasets/theblackcat102/anime-understanding-dataset/{anime}_dev.jsonl"
    df = pl.read_ndjson(path).to_pandas()
    for row in df.to_dict(orient="records"):
        correct = row[row["answer"]]
        messages = [
            {"role": "user", "content": f"Answer this question about {anime} anime:\n\n{row['question']}\n\nA) {row['A']}\nB) {row['B']}\nC) {row['C']}\nD) {row['D']}"},
            {"role": "assistant", "content": f"The correct answer is {row['answer']}) {correct}"}
        ]
        text = tokenizer.apply_chat_template(messages, tokenize=False)
        formatted_examples.append({"text": text, "correct_letter": row["answer"], "answer": correct})

dataset = Dataset.from_list(formatted_examples)

In [10]:
# ---------TOKENIZE DATA-----------
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding=False, max_length=512)

tokenized = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8)

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

In [26]:
# ---------TRAINING CONFIG-----------
args = TrainingArguments(
    output_dir="qlora_anime_qa_model",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=15,
    learning_rate=2e-4,
    fp16=False,
    bf16=True,
    logging_steps=10,
    save_strategy="no",
    report_to="none"
)

In [27]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    data_collator=collator,
    compute_metrics=None,
    preprocess_logits_for_metrics=None,
    tokenizer=None,  # Don’t pass tokenizer to avoid tokenizer deprecation
)
trainer.label_names = ["labels"]  # Explicitly set to suppress warning

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [28]:
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss
10,2.3559
20,2.3237
30,2.4339
40,2.4286
50,2.3265
60,2.3597
70,2.4519
80,2.4078
90,2.3081
100,2.3543


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*a

TrainOutput(global_step=135, training_loss=2.3836050104211877, metrics={'train_runtime': 73.1002, 'train_samples_per_second': 13.338, 'train_steps_per_second': 1.847, 'total_flos': 1042968938741760.0, 'train_loss': 2.3836050104211877, 'epoch': 15.0})

In [None]:
# ---------SAVE MODEL-----------
model.save_pretrained("qlora_anime_qa_model")
tokenizer.save_pretrained("qlora_anime_qa_model")

In [30]:
# ---------EVALUATION METRICS-----------
class PerplexityEvaluator:
    def __init__(self, model, tokenizer, device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

    def compute(self, prompt, target):
        full = prompt + " " + target
        full_ids = self.tokenizer(full, return_tensors="pt").input_ids.to(self.device)
        prompt_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)
        context_len = prompt_ids.shape[1]
        with torch.no_grad():
            out = self.model(full_ids, labels=full_ids)
            logits = out.logits[0, context_len-1:-1]
            target_ids = full_ids[0, context_len:]
            log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
            selected = log_probs.gather(1, target_ids.unsqueeze(1)).squeeze(1)
            return torch.exp(-selected.mean()).item()

scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
smooth = SmoothingFunction().method1

In [31]:
eval_model = model.merge_and_unload()
eval_model.eval()
evaluator = PerplexityEvaluator(eval_model, tokenizer, device)

In [32]:
examples = dataset.select(range(15))
rouge_1s, rouge_Ls, bleus, perplexities = [], [], [], []

In [33]:
for ex in examples:
    text = ex["text"]
    parts = text.split("<|im_start|>assistant\n")
    if len(parts) != 2:
        continue
    prompt = parts[0] + "<|im_start|>assistant\n"
    answer = parts[1].replace("<|im_end|>", "").strip()

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    gen_ids = eval_model.generate(input_ids, max_new_tokens=60, pad_token_id=tokenizer.eos_token_id)
    output = tokenizer.decode(gen_ids[0][input_ids.shape[1]:], skip_special_tokens=True).strip()

    ppl = evaluator.compute(prompt, answer)
    perplexities.append(ppl)

    rouge = scorer.score(answer, output)
    rouge_1s.append(rouge["rouge1"].fmeasure)
    rouge_Ls.append(rouge["rougeL"].fmeasure)

    bleus.append(sentence_bleu([answer.split()], output.split(), smoothing_function=smooth))
    print(f"Prompt: {prompt}")
    print(f"Answer: {answer}")
    print(f"Output: {output}")
    print(f"Perplexity: {ppl:.2f}")
    print(f"ROUGE-1: {rouge['rouge1'].fmeasure:.3f}")
    print(f"ROUGE-L: {rouge['rougeL'].fmeasure:.3f}")
    print(f"BLEU: {sentence_bleu([answer.split()], output.split(), smoothing_function=smooth):.3f}")
    print("----------------------------------")

Prompt: <|im_start|>user
Answer this question about aot anime:

Eren Jaeger's downfall was heavily influenced by:

A) His inability to control his temper
B) His desire for peace and harmony
C) His knowledge of the future and the inevitability of its outcome
D) His ambition to become a Marleyan officer<|im_end|>
<|im_start|>assistant

Answer: The correct answer is C) His knowledge of the future and the inevitability of its outcome
Output: The answer is D) His ambition to become a Marleyan officer.
Perplexity: 1.92
ROUGE-1: 0.296
ROUGE-L: 0.296
BLEU: 0.030
----------------------------------
Prompt: <|im_start|>user
Answer this question about aot anime:

How did Ymir save Daz during the snowstorm exercise?

A) By carrying him on her back
B) By using her Titan form
C) By leaving him behind to get help
D) The text does not specify how Ymir saved Daz<|im_end|>
<|im_start|>assistant

Answer: The correct answer is D) The text does not specify how Ymir saved Daz
Output: The answer is C) By leav

In [34]:
print("--- Evaluation Summary ---")
print(f"Average Perplexity: {np.mean(perplexities):.2f}")
print(f"Average ROUGE-1: {np.mean(rouge_1s):.3f}")
print(f"Average ROUGE-L: {np.mean(rouge_Ls):.3f}")
print(f"Average BLEU: {np.mean(bleus):.3f}")

--- Evaluation Summary ---
Average Perplexity: 2.80
Average ROUGE-1: 0.521
Average ROUGE-L: 0.515
Average BLEU: 0.199
