<a href="https://colab.research.google.com/github/chris-hoertnagl/AI-Dojo/blob/main/LLM/beginner_tuning_example/fine_tune_llama3-2_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

## Installs & Imports

In [None]:
%pip install wandb -qU
%pip install huggingface-hub
%pip install trl
%pip install -U bitsandbytes

In [2]:
import os
import huggingface_hub
import wandb
import torch
import trl

## Verifying the environment

### Local

In [3]:
# # Check wandb token
# wandb_token = os.environ["WANDB_API_KEY"]

# # Check hf env
# hf_token = os.environ["HF_TOKEN"]
# os.environ["HF_HOME"]

### Colab

In [4]:
from google.colab import userdata
wandb_token = userdata.get('WANDB_API_KEY')
hf_token = userdata.get('HF_TOKEN')

### Wandb & HF

In [5]:
wandb.login(key=wandb_token)
huggingface_hub.login(hf_token)
wandb.init(project="applied-ai-lecture",)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mchristophhoertnagl[0m ([33muncoverai[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# Pipeline

## Load Dataset

In [6]:
from datasets import Dataset
dataset = Dataset.from_json("./chris_train.json")
print(dataset[0])
def add_conversation(sample):
    sample["conversation"] = [{"role": "user", "content": sample["instruction"]}, {"role": "assistant", "content": sample["output"]}]
    return sample
dataset = dataset.map(add_conversation, num_proc=os.cpu_count())
dataset[0]

{'instruction': 'What is your name?', 'output': 'My name is Chris.'}


{'instruction': 'What is your name?',
 'output': 'My name is Chris.',
 'conversation': [{'content': 'What is your name?', 'role': 'user'},
  {'content': 'My name is Chris.', 'role': 'assistant'}]}

## Load Model

In [7]:
from transformers import BitsAndBytesConfig

# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer
MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, quantization_config=bnb_config, device_map="auto", trust_remote_code=True)
model

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-0

In [16]:
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model

import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)

model = get_peft_model(model, peft_config)

### Test Tokenizer

In [10]:
print(tokenizer.apply_chat_template(conversation=dataset[0]["conversation"], tokenize=False))
print(tokenizer.apply_chat_template(conversation=dataset[0]["conversation"]))

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 01 Dec 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

What is your name?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

My name is Chris.<|eot_id|>
[128000, 128006, 9125, 128007, 271, 38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 198, 15724, 2696, 25, 220, 1721, 3799, 220, 2366, 19, 271, 128009, 128006, 882, 128007, 271, 3923, 374, 701, 836, 30, 128009, 128006, 78191, 128007, 271, 5159, 836, 374, 11517, 13, 128009]


### Test Model

In [11]:
# cut off last message since that is the expected model response
conversation = dataset[0]["conversation"][:-1]
response = model.generate(tokenizer.apply_chat_template(conversation=conversation, add_generation_prompt=True, return_tensors="pt").to(model.device), max_length=100)
print(tokenizer.decode(response[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 01 Dec 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

What is your name?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

I'm an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."<|eot_id|>


## Prepare tokenizer & dataset

In [12]:
def prepare_sample(sample):
    sample["len"] = len(tokenizer.apply_chat_template(conversation=sample["conversation"]))
    sample["text"] = tokenizer.apply_chat_template(conversation=sample["conversation"], tokenize=False)
    return sample

dataset = dataset.map(prepare_sample, num_proc=os.cpu_count())

Map (num_proc=2):   0%|          | 0/128 [00:00<?, ? examples/s]

In [13]:
print(tokenizer.model_max_length)
print(max(dataset["len"]))
tokenizer.model_max_length = max(dataset["len"])

131072
56


# Define Training

In [14]:
from transformers import TrainingArguments

args = TrainingArguments(
    # other args and kwargs here
    output_dir="./tuned_model",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    report_to="wandb",  # enable logging to W&B
    run_name="lecture-test-run",  # name of the W&B run (optional)
    logging_steps=1,  # how often to log to W&B
)

In [17]:
from trl import SFTTrainer

# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=args
)



# Run Training

In [18]:
trainer.train()

Step,Training Loss
1,5.2869
2,4.5054
3,3.9334
4,3.5335
5,3.1751
6,2.8176
7,2.3221
8,2.1543
9,1.8741
10,2.0021


TrainOutput(global_step=128, training_loss=1.0772858546115458, metrics={'train_runtime': 30.4432, 'train_samples_per_second': 4.205, 'train_steps_per_second': 4.205, 'total_flos': 37529963421696.0, 'train_loss': 1.0772858546115458, 'epoch': 1.0})

## Check Tuned Model

In [19]:
model.eval()
# cut off last message since that is the expected model response
conversation = dataset[5]["conversation"][:-1]
response = model.generate(tokenizer.apply_chat_template(conversation=conversation, add_generation_prompt=True, return_tensors="pt").to(model.device), max_length=100)
print(tokenizer.decode(response[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 01 Dec 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Explain your name.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

My name is Chris. Chris is short for Christopher. I am a knowledgeable assistant, here to help answer any questions you may have.<|eot_id|>
