# Supervised Fine Tuning of llama

This notebook implements a custom training loop for the llama models

In [1]:
import torch
import torch.nn.functional as F

import os
from datetime import datetime

import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer

from peft import LoraConfig
import bitsandbytes as bnb

from datasets import load_dataset

print(f'Is bf16 supported: {torch.cuda.is_bf16_supported()}')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

Is bf16 supported: True
Using device: cuda


# Model ID
* **NOTE:** You need to have access to these models
* Visit huggingface page and request access
* Set your token via setting the environment variable HF_TOKEN

In [2]:
model_id = 'meta-llama/Llama-3.2-1B-Instruct'

# Quantization Settings

In [3]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, # 4 bit quantization
    bnb_4bit_quant_type='nf4', # normalized float 4 bit
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
)

# Tokenizer

In [4]:
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=os.environ['HF_TOKEN'] # required for some models
)

### Special Tokens

In [5]:
tokenizer.special_tokens_map

{'bos_token': '<|begin_of_text|>', 'eos_token': '<|eot_id|>'}

### Terminators List

In [6]:
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids('<|eot_id|>')
]

# Model 

In [7]:
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    quantization_config=bnb_config, 
    device_map={'':0}, 
    token=os.environ['HF_TOKEN'] # required for some models
)

In [8]:
model.config.use_cache = False # prevent return tuple 'past_key_values'

### Open Ended Generation

In [9]:
tokenizer.pad_token = tokenizer.eos_token
model.generation_config.pad_token_id = tokenizer.pad_token_id

## Utility Function for Preprocessing

* llama 3 special tokens: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/
* llama 3.1 special tokens: https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1

In [10]:
# NOTE: <|begin_of_text|> is added by tokenizer ???

training_prompt = lambda user_instruction, assistant_response: \
f"""<|start_header_id|>system<|end_header_id|>
You are a helpful assistant.
<|eot_id|>

<|start_header_id|>user<|end_header_id|>
{user_instruction} <|eot_id|>

<|start_header_id|>assistant<|end_header_id|> 
{assistant_response} <|eot_id|>"""


generation_prompt = lambda user_instruction: \
f"""<|start_header_id|>system<|end_header_id|>
You are a helpful assistant.
<|eot_id|>

<|start_header_id|>user<|end_header_id|>
{user_instruction} <|eot_id|>

<|start_header_id|>assistant<|end_header_id|>"""

# Generation

In [11]:
@torch.amp.autocast('cuda')
def generate(
    prompt, 
    eos_token_id=terminators,
    max_new_tokens=128,
    do_sample=True,
    temperature=0.90,
    top_p=0.9,
    num_beams=1,
    num_return_sequences=1,
    skip_special_tokens=True
):

    model.eval()
    
    input_text = generation_prompt(prompt)
    
    inputs = tokenizer(input_text, return_tensors='pt').to(device)
    input_token_len = len(inputs['input_ids'][0])
    
    outputs = model.generate(
        **inputs, # input token ids and attention mask (as kwargs)
        max_new_tokens=max_new_tokens,
        eos_token_id=eos_token_id,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        return_dict_in_generate=True,
        output_scores=False,
    )

    #print(f'LEN: {len(outputs)}')
    #print('KEYS', outputs.keys())
    #print(f'Scores: {outputs["sequences_scores"]}')
    
    for _gen_output in outputs['sequences']:
        _generated_response = tokenizer.decode(_gen_output[input_token_len:], skip_special_tokens=skip_special_tokens)
                
        print(_generated_response)
        print('#'*35)
        print()


@torch.amp.autocast('cuda')
def return_model_output(prompt, cut=True):
    model.eval()
    
    text = generation_prompt(prompt)
    
    inputs = tokenizer(text, return_tensors='pt').to(device)
    input_token_len = len(inputs['input_ids'][0])
    
    return model(**inputs)

In [12]:
with torch.no_grad():
    pred = return_model_output('hello world')
    print(pred.keys())
    logits = pred['logits']

odict_keys(['logits'])


# Training

In [13]:
@torch.amp.autocast('cuda')
def train(model, optimizer, num_epochs, user_instruction, assistant_response):
    full_prompt = training_prompt(user_instruction, assistant_response)

    inputs = tokenizer(full_prompt, return_tensors='pt').to(device)    
    # make shifted
    input_ids = inputs['input_ids'][:, :-1]
    attention_mask = inputs['attention_mask'][:, :-1]
    target_ids = inputs['input_ids'][:, 1:]

    model.train()
    
    for epoch in range(1, num_epochs+1):
        optimizer.zero_grad()

        pred_ids = model(input_ids=input_ids, attention_mask=attention_mask)['logits']
        
        loss = F.cross_entropy(
            pred_ids.view(-1, pred_ids.shape[-1]),
            target_ids.view(-1),
        )
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 2.0)
        optimizer.step()

        print(f'Epoch: {epoch}, Loss: {loss.item():.4f}')

### Optimizer

In [14]:
optim = bnb.optim.Adam8bit(
    model.parameters(), 
    lr=1e-4, 
    is_paged=True
)

### Start Training

In [15]:
USER_INSTRUCTION = ['Türkiyenin en güzel şehiri neresidir?']
ASSISTANT_RESPONSE = ["Türkiyenin şehiri Çanakkale'dir."]

NUM_EPOCHS = 20

#### Generation Before Training

In [16]:
for _instruction in USER_INSTRUCTION:
    print(f'Instruction: {_instruction}')
    generate(_instruction)

Instruction: Türkiyenin en güzel şehiri neresidir?


Bursa!

Bursa, Türkiye'nin en güzeli ve en beautiful şehriです。 Bursa, İstanbul ile bir ikiye yakınıにある, Balkan Yaradası'nın kuzeyinde yer alan ve iklimi tropik olarak bir birीकlidir. Bursa, Istanbul'ın en beautiful ve en küçük şehriye de cheerslidir.
###################################



In [17]:
train(model, optim, NUM_EPOCHS, USER_INSTRUCTION, ASSISTANT_RESPONSE)

Epoch: 1, Loss: 7.2135
Epoch: 2, Loss: 6.3118
Epoch: 3, Loss: 5.5616
Epoch: 4, Loss: 4.9536
Epoch: 5, Loss: 4.4676
Epoch: 6, Loss: 4.0281
Epoch: 7, Loss: 3.6435
Epoch: 8, Loss: 3.2946
Epoch: 9, Loss: 2.9835
Epoch: 10, Loss: 2.6946
Epoch: 11, Loss: 2.4259
Epoch: 12, Loss: 2.1721
Epoch: 13, Loss: 1.9278
Epoch: 14, Loss: 1.7092
Epoch: 15, Loss: 1.5211
Epoch: 16, Loss: 1.3678
Epoch: 17, Loss: 1.2323
Epoch: 18, Loss: 1.1024
Epoch: 19, Loss: 0.9749
Epoch: 20, Loss: 0.8584


#### Generation After Training

In [18]:
for _instruction in USER_INSTRUCTION:
    print(f'Instruction: {_instruction}')
    generate(_instruction)

Instruction: Türkiyenin en güzel şehiri neresidir?
 
Türkiyenin en güzel şehiri Çanakkale'dir  2011 year'de 3.7 milyonlúkurlu Çanakkale'dir  2018 year'de 4.2 milyonlúkurlu  Çanakkale'dir  2019 year'de 3.5 milyonlúkurlu  Çanakkale'dir  2018 year'de 4.2 milyonlúkurlu  Çanakkale'dir  2019 year'de 3
###################################

