## Installing Dependencies

In [1]:

!pip install -q --upgrade bitsandbytes
!pip install -q transformers accelerate datasets sentence-transformers pandas tqdm peft trl torch matplotlib

import locale
# Replace the lookup so every caller sees "UTF-8" as the preferred encoding.
# NOTE(review): presumably a workaround for notebook shell commands that refuse
# to run under a non-UTF-8 locale — confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

import torch
import pandas as pd
import string
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.5/465.5 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:


# Number of TriviaQA rows used to build the fine-tuning set.
SAMPLES_FOR_TRAIN = 1000
# Number of rows, taken immediately after the training rows, used for evaluation.
SAMPLES_FOR_TEST = 100
# Hugging Face Hub identifier of the base model that gets fine-tuned.
MODEL_ID = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
# Directory used for Trainer output and for the final saved model and tokenizer.
OUTPUT_DIR = "./trivia_sft_model"

print(f"Model: {MODEL_ID}")


Model: HuggingFaceTB/SmolLM2-1.7B-Instruct


## Evaluation Technique

In [3]:

class FuzzyEvaluator:
    """Lenient matcher for short free-text answers.

    Both helpers are stateless, so they are declared as static methods. This
    lets them be called on the class (``FuzzyEvaluator.evaluate_triviaqa``) as
    well as on an instance (``evaluator.evaluate_triviaqa``); without the
    decorator the instance call would pass the instance as an extra positional
    argument and raise ``TypeError``.
    """

    # Words dropped during normalization.
    _ARTICLES = frozenset({"a", "an", "the"})
    # Translation table that deletes every ASCII punctuation character.
    _STRIP_PUNCTUATION = str.maketrans("", "", string.punctuation)

    @staticmethod
    def normalize_answer(text):
        """Return ``text`` lower-cased, without punctuation or English articles.

        Args:
            text: Any value; it is converted with ``str()`` first.

        Returns:
            str: The remaining words joined by single spaces (possibly empty).
        """
        cleaned = str(text).lower().translate(FuzzyEvaluator._STRIP_PUNCTUATION)
        # split() without arguments also collapses repeated whitespace.
        return " ".join(
            word for word in cleaned.split() if word not in FuzzyEvaluator._ARTICLES
        )

    @staticmethod
    def evaluate_triviaqa(prediction, ground_truth):
        """Check whether any accepted answer occurs inside the prediction.

        Args:
            prediction: Model output to score.
            ground_truth: One accepted answer (``str``) or an iterable of them.

        Returns:
            bool: ``True`` if a non-empty normalized answer is a substring of
            the normalized prediction, otherwise ``False``.
        """
        pred = FuzzyEvaluator.normalize_answer(prediction)
        truths = [ground_truth] if isinstance(ground_truth, str) else ground_truth

        for truth in truths:
            truth_norm = FuzzyEvaluator.normalize_answer(truth)
            # An empty string is a substring of everything; skip it so that it
            # cannot produce a spurious match.
            if not truth_norm:
                continue

            # Substring containment also covers the exact-equality case.
            if truth_norm in pred:
                return True

        return False


evaluator = FuzzyEvaluator()


## Dataset Preparation

In [4]:

def prepare_trivia_data(dataset, start_idx, count):
    """Flatten a contiguous slice of TriviaQA rows into a DataFrame.

    Args:
        dataset: Object exposing ``select(indices)`` that yields row mappings
            with ``question``, ``answer["aliases"]`` and
            ``search_results["search_context"]`` entries.
        start_idx: Index of the first row to take.
        count: Number of consecutive rows to take.

    Returns:
        pandas.DataFrame: One row per example with ``question``, ``answer``
        (the first alias) and ``context`` (up to three search snippets joined
        by ``" ||| "``, or an empty string when there are none).
    """

    def _to_record(example):
        # Only the first listed alias is kept as the reference answer.
        first_alias = example["answer"]["aliases"][0]
        snippets = example["search_results"]["search_context"]
        if snippets:
            joined_context = " ||| ".join(snippets[:3])
        else:
            joined_context = ""
        return {
            "question": example["question"],
            "answer": first_alias,
            "context": joined_context,
        }

    window = range(start_idx, start_idx + count)
    return pd.DataFrame([_to_record(example) for example in dataset.select(window)])


print("Loading TriviaQA dataset...")
# Only the "train" split of the "rc" configuration is loaded; both subsets
# below are carved out of it.
ds_trivia = load_dataset("mandarjoshi/trivia_qa", "rc", split="train")

# Rows [0, SAMPLES_FOR_TRAIN) feed fine-tuning ...
df_trivia_train = prepare_trivia_data(ds_trivia, 0, SAMPLES_FOR_TRAIN)
# ... and the next SAMPLES_FOR_TEST rows are held out for evaluation, so the
# two index ranges do not overlap.
df_trivia_test = prepare_trivia_data(ds_trivia, SAMPLES_FOR_TRAIN, SAMPLES_FOR_TEST)

print(f"Dataset loaded: {len(df_trivia_train)} train, {len(df_trivia_test)} test")


Loading TriviaQA dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/26 [00:00<?, ?files/s]

rc/train-00000-of-00026.parquet:   0%|          | 0.00/308M [00:00<?, ?B/s]

rc/train-00001-of-00026.parquet:   0%|          | 0.00/298M [00:00<?, ?B/s]

rc/train-00002-of-00026.parquet:   0%|          | 0.00/290M [00:00<?, ?B/s]

rc/train-00003-of-00026.parquet:   0%|          | 0.00/444M [00:00<?, ?B/s]

rc/train-00004-of-00026.parquet:   0%|          | 0.00/461M [00:00<?, ?B/s]

rc/train-00005-of-00026.parquet:   0%|          | 0.00/474M [00:00<?, ?B/s]

rc/train-00006-of-00026.parquet:   0%|          | 0.00/404M [00:00<?, ?B/s]

rc/train-00007-of-00026.parquet:   0%|          | 0.00/324M [00:00<?, ?B/s]

rc/train-00008-of-00026.parquet:   0%|          | 0.00/329M [00:00<?, ?B/s]

rc/train-00009-of-00026.parquet:   0%|          | 0.00/336M [00:00<?, ?B/s]

rc/train-00010-of-00026.parquet:   0%|          | 0.00/400M [00:00<?, ?B/s]

rc/train-00011-of-00026.parquet:   0%|          | 0.00/370M [00:00<?, ?B/s]

rc/train-00012-of-00026.parquet:   0%|          | 0.00/341M [00:00<?, ?B/s]

rc/train-00013-of-00026.parquet:   0%|          | 0.00/327M [00:00<?, ?B/s]

rc/train-00014-of-00026.parquet:   0%|          | 0.00/310M [00:00<?, ?B/s]

rc/train-00015-of-00026.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

rc/train-00016-of-00026.parquet:   0%|          | 0.00/136M [00:00<?, ?B/s]

rc/train-00017-of-00026.parquet:   0%|          | 0.00/159M [00:00<?, ?B/s]

rc/train-00018-of-00026.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

rc/train-00019-of-00026.parquet:   0%|          | 0.00/180M [00:00<?, ?B/s]

rc/train-00020-of-00026.parquet:   0%|          | 0.00/150M [00:00<?, ?B/s]

rc/train-00021-of-00026.parquet:   0%|          | 0.00/153M [00:00<?, ?B/s]

rc/train-00022-of-00026.parquet:   0%|          | 0.00/147M [00:00<?, ?B/s]

rc/train-00023-of-00026.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

rc/train-00024-of-00026.parquet:   0%|          | 0.00/154M [00:00<?, ?B/s]

rc/train-00025-of-00026.parquet:   0%|          | 0.00/158M [00:00<?, ?B/s]

rc/validation-00000-of-00004.parquet:   0%|          | 0.00/327M [00:00<?, ?B/s]

rc/validation-00001-of-00004.parquet:   0%|          | 0.00/296M [00:00<?, ?B/s]

rc/validation-00002-of-00004.parquet:   0%|          | 0.00/184M [00:00<?, ?B/s]

rc/validation-00003-of-00004.parquet:   0%|          | 0.00/129M [00:00<?, ?B/s]

rc/test-00000-of-00004.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

rc/test-00001-of-00004.parquet:   0%|          | 0.00/288M [00:00<?, ?B/s]

rc/test-00002-of-00004.parquet:   0%|          | 0.00/171M [00:00<?, ?B/s]

rc/test-00003-of-00004.parquet:   0%|          | 0.00/128M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/138384 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/17944 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/17210 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/24 [00:00<?, ?it/s]

Dataset loaded: 1000 train, 100 test


In [5]:
def format_instruction(question, context, answer=None):
    """Build the instruction-style prompt used for training and inference.

    Args:
        question: Question text inserted after ``Question:``.
        context: Snippets joined by ``" ||| "``; only the first snippet is
            used, capped at 400 characters. A falsy value gives an empty
            context.
        answer: Optional answer. When truthy it is appended after
            ``Answer:`` (training form); otherwise the prompt ends at
            ``Answer:`` (inference form).

    Returns:
        str: The assembled prompt.
    """
    if context:
        first_snippet, _, _ = context.partition(" ||| ")
        context_snippet = first_snippet[:400]
    else:
        context_snippet = ""

    prompt = "".join(
        [
            "Answer the following question based on the context provided.\n\n",
            f"Context: {context_snippet}\n\n",
            f"Question: {question}\n\n",
            "Answer:",
        ]
    )

    if not answer:
        return prompt
    return prompt + f" {answer}"


In [6]:


def prepare_dataset_for_training(df):
    """Turn a question/context/answer DataFrame into a text-only ``Dataset``.

    Args:
        df: DataFrame with ``question``, ``context`` and ``answer`` columns.

    Returns:
        Dataset: One record per input row with a single ``text`` field
        holding the full prompt including the answer.
    """
    records = [
        {"text": format_instruction(row["question"], row["context"], row["answer"])}
        for _, row in df.iterrows()
    ]
    return Dataset.from_list(records)


# Build the text-only training set from the training DataFrame.
train_dataset = prepare_dataset_for_training(df_trivia_train)
print(f"Training dataset prepared: {len(train_dataset)} samples")


def load_model_for_training():
    """Load the tokenizer and base model for ``MODEL_ID`` and attach LoRA adapters.

    Returns:
        tuple: ``(model, tokenizer)`` where ``model`` is the value returned by
        ``get_peft_model`` for the loaded causal LM.
    """
    print(f"\nLoading {MODEL_ID} for fine-tuning...")

    # NOTE(review): trust_remote_code=True permits running code shipped with
    # the Hub repository — confirm this model actually needs it.
    tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    if tok.pad_token is None:
        # Reuse EOS for padding when the tokenizer defines no pad token.
        tok.pad_token = tok.eos_token

    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True,
    )
    base_model.gradient_checkpointing_enable()

    # Adapters are attached to the four attention projection layers only.
    attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
    adapter_config = LoraConfig(
        task_type="CAUSAL_LM",
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        target_modules=attention_projections,
    )

    peft_model = get_peft_model(base_model, adapter_config)
    peft_model.print_trainable_parameters()

    return peft_model, tok


# Module-level handles: the tokenization, training and evaluation cells below
# read `model` and `tokenizer` as globals.
model, tokenizer = load_model_for_training()


Training dataset prepared: 1000 samples

Loading HuggingFaceTB/SmolLM2-1.7B-Instruct for fine-tuning...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

trainable params: 6,291,456 || all params: 1,717,667,840 || trainable%: 0.3663


In [7]:

def tokenize_function(examples):
    """Tokenize a batch of training texts, truncating each to 512 tokens.

    Padding is intentionally not applied here. The language-modeling data
    collator used for training pads every batch to its own longest sequence
    and masks the padded label positions, so padding each example to the full
    512 tokens up front only adds pad tokens that the model has to process.

    Args:
        examples: Batch mapping with a ``text`` entry holding a list of strings.

    Returns:
        The tokenizer output for the batch (variable-length sequences).
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
    )


print("Tokenizing dataset...")
# batched=True hands tokenize_function lists of texts; every original column
# (here just "text") is dropped so only the tokenizer outputs remain.
tokenized_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)


Tokenizing dataset...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

## Training

In [8]:


# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    # 2 sequences per device x 8 accumulation steps = 16 sequences per
    # optimizer update on each device.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    warmup_steps=50,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    # Disable external experiment trackers.
    report_to="none",
    gradient_checkpointing=True,
)

# mlm=False selects the causal (next-token) objective instead of masked LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)


In [9]:
print("STARTING FINE-TUNING")


# Wire the LoRA-wrapped model, the tokenized data and the collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()


# Save the fine-tuned model and its tokenizer side by side in OUTPUT_DIR.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")


The model is already on multiple devices. Skipping the move to device specified in `args`.


STARTING FINE-TUNING


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,2.3563
20,2.2476
30,2.1219
40,1.9529
50,1.9575
60,1.9405
70,1.854
80,1.8047
90,1.7826
100,1.7431



Fine-tuning complete!
Model saved to ./trivia_sft_model


In [10]:

def generate_answer(model, tokenizer, question, context):
    """Generate an answer for ``question`` given ``context``.

    The prompt is built with ``format_instruction`` (no answer attached), the
    model continues it, and only the continuation is returned.

    Sampling is enabled (``do_sample=True``), so repeated calls can return
    different text unless the random number generator is seeded.

    Args:
        model: Causal LM exposing ``device`` and ``generate``.
        tokenizer: Tokenizer matching ``model``.
        question: Question text.
        context: Context string passed through to ``format_instruction``.

    Returns:
        str: The generated continuation, stripped and capped at 200 characters.
    """
    prompt = format_instruction(question, context)

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # Number of prompt tokens; everything after this offset is newly generated.
    prompt_length = inputs["input_ids"].shape[1]

    gen_kwargs = {
        "max_new_tokens": 50,
        "temperature": 0.3,
        "do_sample": True,
        "top_p": 0.9,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }

    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)

    # Decode only the tokens produced after the prompt. Splitting the full
    # decoded text on "Answer:" picks the wrong segment whenever the model's
    # continuation itself contains "Answer:", and slicing by len(prompt)
    # assumes decoding reproduces the prompt character for character.
    new_tokens = outputs[0][prompt_length:]
    answer = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    return answer[:200].strip()


## Evaluation

In [19]:
def evaluate_model():
    """Score the fine-tuned model on ``df_trivia_test`` and save the results.

    Reads the module-level ``df_trivia_test``, ``model``, ``tokenizer`` and
    ``evaluator``. For every test row an answer is generated and compared with
    the reference answer; the per-row outcomes are written to
    ``sft_triviaqa_results.csv``.

    Returns:
        The share of rows judged correct, as a percentage (0-100).
    """
    print("\nStarting evaluation on test dataset...")
    print(f"Total samples: {len(df_trivia_test)}\n")

    model.eval()
    records = []

    for idx, row in df_trivia_test.iterrows():
        question, context, ground_truth = row["question"], row["context"], row["answer"]

        # Progress line on every tenth row, keyed on the DataFrame index label.
        if (idx + 1) % 10 == 0:
            print(f"sample {idx + 1}/{len(df_trivia_test)}")
            print(f"Question: {question[:80]}...")

        prediction = generate_answer(model, tokenizer, question, context)
        records.append(
            {
                "Question": question,
                "Ground_Truth": ground_truth,
                "Prediction": prediction,
                "Correct": evaluator.evaluate_triviaqa(prediction, ground_truth),
            }
        )

    output_file = "sft_triviaqa_results.csv"
    df_results = pd.DataFrame(records)
    # Mean of the boolean column is the fraction correct; scale to percent.
    accuracy = df_results["Correct"].mean() * 100.0
    df_results.to_csv(output_file, index=False)

    print("\nEvaluation complete.")
    print(f"Final Accuracy: {accuracy:.2f}%")
    print(f"Results saved to: {output_file}")

    return accuracy


In [20]:
# Run the evaluation; the returned value is a percentage in the 0-100 range.
accuracy = evaluate_model()


Starting evaluation on test dataset...
Total samples: 100

sample 10/100
Question: Who was Oliver North's immediate boss who admitted authori8zing funding the Cont...
sample 20/100
Question: Dan Quayle was Senator form which state when he was chosen as George Bush's Vice...
sample 30/100
Question: In which sport did Andy Thomson become a world champion?...
sample 40/100
Question: Who was the oldest US Open golf champion of the 20th century?...
sample 50/100
Question: Which team won the most Super Bowls in the 1980s?...
sample 60/100
Question: What is a message sent to a newsgroup in the Internet called?|...
sample 70/100
Question: A small a in a circle is pronounced how?...
sample 80/100
Question: What is Dionne Warwick's real first name?...
sample 90/100
Question: Who was the last undisputed boxing world heavyweight champion before Mike Tyson?...
sample 100/100
Question: In which country was Emilio Estevez born?...

Evaluation complete.
Final Accuracy: 12.00%
Results saved to: sft_tr