# Instruction Finetuning

In this notebook, we will look into how to perform instruction finetuning. We will be doing full finetuning, i.e., retraining all the paramters of the model.

In [None]:
!pip install transformers accelerate peft trl huggingface-hub evaluate datasets bitsandbytes einops wandb tensorboard tiktoken pandas numpy scipy matplotlib sentencepiece

In [None]:
!pip install --upgrade transformers==4.38.2 datasets==2.16.1 accelerate==0.26.1 evaluate==0.4.1 bitsandbytes==0.42.0 trl==0.7.11 peft==0.8.2

In [None]:
!pip install torch==2.1.2 tensorboard

Load the required libraries

In [None]:
import os
os.environ["WANDB_PROJECT"]="tinyllama_instruct_finetuning"

from enum import Enum
from functools import partial
import pandas as pd
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset
from trl import SFTTrainer

## Data preprocessing: Creating Datasets and Dataloaders

In [None]:
# Load dataset
dataset_name = "HuggingFaceH4/no_robots"
dataset = load_dataset(dataset_name)

# Print available keys to confirm correct splits
print("Dataset splits:", dataset.keys())  # Should output: dict_keys(['train', 'test'])

# Define model and tokenizer
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1195k-token-2.5T"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define chat template
template = """{% for message in messages %}
{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}"""
tokenizer.chat_template = template

# Preprocessing function
def preprocess(samples):
    batch = []
    for conversation in samples["messages"]:
        batch.append(tokenizer.apply_chat_template(conversation, tokenize=False))
    return {"content": batch}

# Use correct split names ('train' and 'test')
dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset["train"].column_names  # Use 'train' instead of 'train_sft'
)

# Assign splits correctly
dataset["train"] = dataset["train"]  # No need to rename
dataset["test"] = dataset["test"]  # No need to rename

# Print dataset structure
print(dataset)
print(dataset["train"][0])  # Print first training example


In [None]:
dataset["test"][:10]

## Loading the pretrained model and tokenizer

In [None]:
class ChatmlSpecialTokens(str, Enum):
    user = "<|im_start|>user"
    assistant = "<|im_start|>assistant"
    system = "<|im_start|>system"
    eos_token = "<|im_end|>"
    bos_token = "<s>"
    pad_token = "<pad>"

    @classmethod
    def list(cls):
        return [c.value for c in cls]

tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        pad_token=ChatmlSpecialTokens.pad_token.value,
        bos_token=ChatmlSpecialTokens.bos_token.value,
        eos_token=ChatmlSpecialTokens.eos_token.value,
        additional_special_tokens=ChatmlSpecialTokens.list(),
        trust_remote_code=True
    )
tokenizer.chat_template = template
model = AutoModelForCausalLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

## Storing the base model predictions on a subset of 25 samples from eval test

In [None]:
tokenizer.padding_side="left"
def get_prediction_batched(samples, column_name):
    batch = []
    for conversation in samples["messages"]:
        chatml_gen_prompt = tokenizer.apply_chat_template(conversation[:-1], tokenize=False, add_generation_prompt=True)
        batch.append(chatml_gen_prompt)
    #text = tokenizer.apply_chat_template(conversation_history, add_generation_prompt=True, tokenize=False)
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)#, add_special_tokens=False)
    inputs = {k: v.to("cuda") for k,v in inputs.items()}
    outputs = model.generate(**inputs, 
                             max_new_tokens=100, 
                             do_sample=True, 
                             top_p=0.95, 
                             temperature=0.2, 
                             repetition_penalty=1.1, 
                             eos_token_id=tokenizer.eos_token_id,
                             pad_token_id=tokenizer.eos_token_id,
                            )
    outputs = tokenizer.batch_decode(outputs)
    outputs = [output.split("<|im_start|>assistant")[-1].split("<|im_end|>")[0].strip() for output in outputs]
    return {column_name: outputs}


In [None]:
model.to("cuda")
test_dataset = load_dataset(dataset_name)["test"].shuffle().select(range(25))
test_dataset = test_dataset.map(
    partial(get_prediction_batched, column_name="base_assistant_message"),
    batched=True,
    batch_size=1)

print(test_dataset)
print(test_dataset[0])

## Training

In [98]:
output_dir = "tinyllama_instruct"
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 16
logging_steps = 25
learning_rate = 2e-5
max_grad_norm = 1.0
max_steps = 250
num_train_epochs=1
warmup_ratio = 0.1
lr_scheduler_type = "cosine"
max_seq_length = 2048

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    weight_decay=0.1,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    fp16=True,
    report_to=["tensorboard", "wandb"],
    hub_private_repo=True,
    push_to_hub=True,
    num_train_epochs=num_train_epochs,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False}
)

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    # packing=True,
    # dataset_text_field="content",
    # max_seq_length=max_seq_length,
)


  trainer = SFTTrainer(


Tokenizing train dataset:   0%|          | 0/9500 [00:00<?, ? examples/s]

KeyError: 'text'

In [None]:
dataset

In [None]:
# !pip install --upgrade transformers

In [None]:
import wandb
wandb.init()
trainer.train()

In [None]:
trainer.save_model()

In [None]:
!nvidia-smi

## Loading the trained model and getting the predictions of the trained model

In [None]:
model = AutoModelForCausalLM.from_pretrained("smangrul/tinyllama_instruct", trust_remote_code=True)
model.to("cuda")
model.to(torch.float16)
model.eval()

In [None]:
test_dataset = test_dataset.map(
    partial(get_prediction_batched, column_name="instruct_assistant_message"),
    batched=True,
    batch_size=1)

print(test_dataset)
print(test_dataset[0])

## Comparing the outputs of base model and instruction finetuned model

In [None]:
test_dataset = test_dataset.to_pandas()

In [None]:
pd.set_option("max_colwidth", 300)
test_dataset[["messages", "base_assistant_message", "instruct_assistant_message"]][:25]

In [None]:
messages = [
    {"role": "user", "content": "What an essay on Generative AI."},
]
text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = tokenizer(text, return_tensors="pt")#, add_special_tokens=False)
inputs = {k: v.to("cuda") for k,v in inputs.items()}
outputs = model.generate(**inputs, 
                         max_new_tokens=2000, 
                         do_sample=True, 
                         top_p=0.95, 
                         temperature=0.2, 
                         repetition_penalty=1.1, 
                         eos_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0]))

In [None]:
!nvidia-smi