# Fine-tuning with SFT on SeaLLM-7B-v2
SFT uUsing transformers's chat_template

## 1️⃣ Setup

In [1]:
!python3 -m venv .venv
!source .venv/bin/activate

In [2]:
!nvidia-smi

Sun Mar 10 10:08:02 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.147.05   Driver Version: 546.01       CUDA Version: 12.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  Off |
|  0%   41C    P8    23W / 450W |    310MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install ipywidgets

In [3]:
!pip install transformers torch



## 2️⃣ Load base model
We will use [SeaLLMs/SeaLLM-7B-v2](https://huggingface.co/SeaLLMs/SeaLLM-7B-v2). This will take around 1-2 minutes on `RTX4090`.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'

MODEL_NAME = "SeaLLMs/SeaLLM-7B-v2"
DEVICE = "cuda:0" # the device to load the model onto // auto, cuda:0

# use bfloat16 to ensure the best performance.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16, device_map=DEVICE)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model.to(DEVICE)

## 3️⃣ Infer with base model
We will use `AutoModelForCausalLM` with transformers's chat template.

In [3]:
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "สวัสดี"},
    {"role": "assistant", "content": "สวัสดี! สบายดีไหม?"},
    {"role": "user", "content": "สอนเขียน helloworld ด้วย Rust หน่อย."}
]

encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True)
print(tokenizer.convert_ids_to_tokens(encodeds[0]))

model_inputs = encodeds.to(DEVICE)

generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.pad_token_id)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])


['<s>', '▁<', '|', 'im', '_', 'start', '|', '>', 'system', '<0x0A>', 'You', '▁are', '▁a', '▁helpful', '▁assistant', '.', '</s>', '▁<', '|', 'im', '_', 'start', '|', '>', 'user', '<0x0A>', 'ส', 'ว', 'ัส', 'ดี', '</s>', '▁<', '|', 'im', '_', 'start', '|', '>', 'ass', 'istant', '<0x0A>', 'ส', 'ว', 'ัส', 'ดี', '!', '▁ส', 'บ', 'าย', 'ดี', 'ไหม', '?', '</s>', '▁<', '|', 'im', '_', 'start', '|', '>', 'user', '<0x0A>', 'สอน', 'เขียน', '▁hell', 'ow', 'orld', '▁ด้วย', '▁R', 'ust', '▁', 'หน่อย', '.', '</s>', '▁<', '|', 'im', '_', 'start', '|', '>', 'ass', 'istant', '<0x0A>']
<s> <|im_start|>system
You are a helpful assistant.</s> <|im_start|>user
สวัสดี</s> <|im_start|>assistant
สวัสดี! สบายดีไหม?</s> <|im_start|>user
สอนเขียน helloworld ด้วย Rust หน่อย.</s> <|im_start|>assistant
แน่นอนครับ! การเขียนโปรแกรม Hello World ด้วยภาษา Rust นั้นค่อนข้างง่าย และเป็นตัวอย่างที่ดีที่จะเริ่มต้นเรียนรู้ภาษานี้ นี่คือตัวอย่างของโปรแกรม Hello World ในภาษา Rust:

```rust
fn main() {
    println!("Hello, world!")

In [4]:
# Empty VRAM
import gc

del model
del tokenizer

gc.collect()

20

## 4️⃣ Dataset

In [5]:
!pip install accelerate peft bitsandbytes transformers trl datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [2]:
import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
import bitsandbytes as bnb
from huggingface_hub import login
from sklearn.model_selection import train_test_split
from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
import pandas as pd
from trl import SFTTrainer

In [39]:
df = pd.read_csv('./train.csv')
df = df.dropna()

df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 160
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 40
    })
})

In [40]:
def transform_conversation(example):
    conversation_text = example['question']
    conversation_answer = example['answer']

    reformatted_segments = []
    
    if conversation_answer:
        reformatted_segments.append(f'<s>[INST] {conversation_text} [/INST] {conversation_answer} </s>')
        
    else:
        reformatted_segments.append(f'<s>[INST] {conversation_text} [/INST] </s>')
    return {'text': ''.join(reformatted_segments)}

In [41]:
transformed_dataset = dataset_dict.map(transform_conversation)
transformed_dataset

transformed_dataset['train'][0]

Map: 100%|██████████| 160/160 [00:00<00:00, 12293.70 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 18094.50 examples/s]


{'question': 'katopz อยู่ที่ไหน?',
 'answer': 'katopz อยู่กรุงเทพ',
 'text': '<s>[INST] katopz อยู่ที่ไหน? [/INST] katopz อยู่กรุงเทพ </s>'}

## 5️⃣ Load Model

In [42]:
!pip install python-dotenv

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




in the `.env` file:
```bash
HF_TOKEN="foobar"
```

In [43]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [25]:
login(token=os.getenv('HF_TOKEN'))

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/katopz/.cache/huggingface/token
Login successful


In [44]:
OUTPUT_DIR = "./lora"
MODEL_NAME = "SeaLLMs/SeaLLM-7B-v2"
DEVICE = "cuda:0" # the device to load the model onto // auto, cuda:0

compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map=DEVICE,
    trust_remote_code=True,
    quantization_config=quant_config,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

In [45]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [46]:
model = prepare_model_for_kbit_training(model)
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(48384, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )

In [47]:
peft_params = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["q_proj","k_proj","v_proj","o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 0 || all params: 3886288896 || trainable%: 0.0


In [48]:
training_params = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=0.2,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    optim="paged_adamw_32bit",
    save_steps=25,
    evaluation_strategy="epoch",
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    max_steps=5,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [49]:
torch.cuda.empty_cache()

In [50]:
trainer = SFTTrainer(
    model=model,
    train_dataset=transformed_dataset['train'],
    peft_config=peft_params,
    eval_dataset=transformed_dataset['test'],
    args=training_params,
    dataset_text_field="text",
    tokenizer=tokenizer,
    packing=False,
)
model.config.use_cache = False
trainer.train()

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map: 100%|██████████| 160/160 [00:00<00:00, 7511.96 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 9890.48 examples/s]


Epoch,Training Loss,Validation Loss
0,No log,3.519044
2,No log,1.905147


TrainOutput(global_step=5, training_loss=3.435215377807617, metrics={'train_runtime': 243.5388, 'train_samples_per_second': 1.314, 'train_steps_per_second': 0.021, 'total_flos': 396745657614336.0, 'train_loss': 3.435215377807617, 'epoch': 2.0})

In [51]:
trainer.save_model()

ConnectionError: (ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 563940a4-0d0e-4a25-8c0d-047bb9615974)')

## 6️⃣ Inference with Adapter

In [None]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline 

# Path to saved peft adapter model
peft_model_id = OUTPUT_DIR

# Load Model with PEFT adapter
model = AutoPeftModelForCausalLM.from_pretrained(
  peft_model_id,
  device_map=DEVICE,
  torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
# load into pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonF

In [52]:
prompts = ["Who is katopz?"]

In [53]:
for prompt in prompts:
  messages = pipe.tokenizer.apply_chat_template([{"role":"user", "content": prompt}], tokenize=False)
  outputs = pipe(prompt, max_new_tokens=2048, do_sample=True, temperature=1.0, top_k=50, top_p=0.9, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id)
  print(f"**Prompt**:\n{prompt}\n")
  print(f"**Generated Answer**:\n{outputs[0]['generated_text'][len(prompt):].strip()}")
  print("===" * 10)

**Prompt**:
Who is katopz?

**Generated Answer**:
zrb_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\1_\_\_\_\_\_\_\_\_\_\_\1_\_\_\_\_\_\_\1_\_\_\_\_\_\_\_\_\_\1_\_\_\_\_\_\_\_\_\_\_\_\_(_\_\_\_\_\_\_\_\_\_\_\n_\_\_\_\_\_\_\_\_\_\_\


In [None]:
for prompt in prompts:
  messages = pipe.tokenizer.apply_chat_template([{"role":"user", "content": prompt}], tokenize=False)
  outputs = pipe(prompt, max_new_tokens=512, do_sample=True, temperature=0.1, top_k=20, top_p=0.9, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id)
  print(f"**Prompt**:\n{prompt}\n")
  print(f"**Generated Answer**:\n{outputs[0]['generated_text'][len(prompt):].strip()}")
  print("===" * 10)

**Prompt**:
katopz เกิดวันอะไร?

**Generated Answer**:
11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111
