# Fine-tuning LLM on Your Own Dataset with QLoRA on a Single GPU

Fine-tune a base LLM on a custom dataset. We'll use the QLoRA technique (a 4-bit quantized base model combined with LoRA adapters) to train it.

In [1]:
!python3 -m venv .venv
!source .venv/bin/activate

In [2]:
# Print the GPU model, driver / CUDA versions and current memory usage.
!nvidia-smi

Sat Mar  9 22:56:59 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.147.05   Driver Version: 546.01       CUDA Version: 12.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  Off |
|  0%   41C    P8    22W / 450W |  16586MiB / 24564MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [10]:
!pip install -qqq fastembed=="0.2.1" --progress-bar off
!pip install -qqq tokenizers=="0.15.2" --progress-bar off
!pip install -qqq loguru=="0.7.2" --progress-bar off
!pip install -qqq tqdm=="4.66.0" --progress-bar off
!pip install -qqq scikit-learn --progress-bar off


In [4]:
# !pip install ipywidgets
!pip install -qqq torch --progress-bar off
!pip install -qqq transformers==4.37.0 --progress-bar off
!pip install -qqq datasets==2.14.4 --progress-bar off
!pip install -qqq peft==0.8.2 --progress-bar off
!pip install -qqq bitsandbytes==0.42.0 --progress-bar off
!pip install -qqq trl==0.7.4 --progress-bar off

# Import Dependencies

In [11]:
import os
import re
import json
import torch
import pandas as pd
import bitsandbytes as bnb

from datasets import Dataset, DatasetDict
from huggingface_hub import login
from sklearn.model_selection import train_test_split
from pprint import pprint
from trl import SFTTrainer
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    AutoPeftModelForCausalLM
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)



In [12]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
else:
    DEVICE = "cpu"
print("using {} device".format(DEVICE))

using cuda:0 device


In [13]:
# System instruction placed at the top of every training prompt ('###' section format).
DEFAULT_SYSTEM_PROMPT = """ คุณเป็นนักเรียนที่คอยตอบคำถาม โดยข้างล่างต่อไปนี้คือคำถาม จงตอบคำถามต่อไปนี้ """


###------------------------------ Process Dataset ------------------------------###
def generate_training_prompt(
    question: str, answer: str,
    system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    """Render one supervised example as an instruction / question / answer prompt.

    Args:
        question: Question text; surrounding whitespace is stripped.
        answer: Reference answer; surrounding whitespace is stripped.
        system_prompt: Instruction shown in the first section; stripped as well.

    Returns:
        Three '###'-headed sections separated by blank lines, with leading and
        trailing whitespace removed from the final string.
    """
    sections = (
        ("### คำสั่ง:", system_prompt),
        ("### คำถาม:", question),
        ("### คำตอบ:", answer),
    )
    rendered = "\n\n".join(
        f"{header}\n{body.strip()}" for header, body in sections
    )
    # The outer strip drops the dangling newline left behind by an empty answer.
    return rendered.strip()


def process_dataset(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``data`` with a ``text`` column holding training prompts.

    The input frame is left untouched. Writing the column in place on the
    slices produced by ``train_test_split`` can trigger pandas'
    ``SettingWithCopyWarning`` and makes the cell non-idempotent.

    Args:
        data: Frame with ``question`` and ``answer`` columns.

    Returns:
        A new frame with the original columns plus ``text``, built by
        ``generate_training_prompt`` for each row.

    Raises:
        KeyError: If ``question`` or ``answer`` is missing from ``data``.
    """
    # A plain list (not a Series) is assigned positionally, so the frame's
    # shuffled index from the split does not matter; it also works for 0 rows.
    prompts = [
        generate_training_prompt(question, answer)
        for question, answer in zip(data["question"], data["answer"])
    ]
    return data.assign(text=prompts)

In [9]:
# Never hardcode an access token in the notebook: read it from the HF_TOKEN
# environment variable. When the variable is unset, `token` is None and
# `login` is expected to prompt for the token interactively instead.
login(token=os.environ.get("HF_TOKEN"))

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.


Token is valid (permission: read).
Your token has been saved to /home/katopz/.cache/huggingface/token
Login successful


# Loading the model

The [LoRA config](https://huggingface.co/docs/peft/main/en/package_reference/tuners#peft.LoraConfig) used later in this notebook is defined with:

- `task_type`, causal language modeling **(`CAUSAL_LM`)**
- `r`, the dimension of the low-rank matrices
- `lora_alpha`, scaling factor for the weight matrices
- `lora_dropout`, dropout probability of the LoRA layers
- `bias`, set to **none** here so that no bias parameters are trained (use **all** to train all bias parameters)
- `target_modules`, the layers that receive LoRA adapters (here `q_proj` and `v_proj`)

In [18]:
model_id = "SeaLLMs/SeaLLM-7B-v2"  # @param ["pythainlp/wangchanglm-7.5B-sft-enth-sharded", "TinyPixel/Llama-2-7B-bf16-sharded", "SeaLLMs/SeaLLM-7B-Chat"]

# Load both LLM model and tokenizer
def load_LLM_and_tokenizer(model_name=None):
    """Load a 4-bit (NF4) quantized causal LM together with its tokenizer.

    Args:
        model_name: Hub id or local path of the checkpoint. Defaults to the
            notebook-level ``model_id`` when omitted.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer reuses the EOS token as
        its padding token and pads on the right.
    """
    if model_name is None:
        model_name = model_id

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    # NOTE(review): trust_remote_code=True allows code shipped with the
    # checkpoint to run locally; confirm it is actually needed for this model.
    llm = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        trust_remote_code=True,
        device_map="auto"
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    # `tokenizer.add_special_tokens` is a *method*; the former assignment
    # `tokenizer.add_special_tokens = False` replaced it with a bool and did
    # not change how text is tokenized, so it has been removed.
    tokenizer.padding_side = "right"

    return llm, tokenizer

In [19]:
# Load the 4-bit quantized model and its tokenizer.
model, tokenizer = load_LLM_and_tokenizer()
# Switch off the generation cache on the model config — presumably for
# training. NOTE(review): it stays off for the inference cells as well.
model.config.use_cache = False

# Dataset Processing

In [21]:
# Read the question/answer pairs from a CSV file located next to the notebook
# and preview the first rows.
law_dataset = pd.read_csv("./test.csv")
law_dataset.head()

Unnamed: 0,sysid,question,answer
0,640696,ชื่ออะไร,ชื่อต๊อบ
1,640696,ต๊อบชอบกินอะไร,ถั่ว
2,640696,ต๊อบชอบไปไหน,ไปเที่ยว
3,320422,ต๊อบชอบสีอะไร,เทา
4,320422,ต๊อบสูงเท่าไหร่,173


In [22]:
law_dataset.shape

(99, 3)

In [23]:
# Keep only the first 100 rows to reduce resource usage.
# TODO: the original note said 1000 rows — confirm the intended limit.
law_dataset = law_dataset[:100]
law_dataset.shape

(99, 3)

In [24]:
# Target fractions of the full dataset for each split.
train_ratio = 0.8
validation_ratio = 0.1
test_ratio = 0.1

# First carve off the training rows, then divide the remainder between
# validation and test according to their relative sizes.
train_data, temp_data = train_test_split(
    law_dataset,
    test_size=1 - train_ratio,
    random_state=42,
)
validation_data, test_data = train_test_split(
    temp_data,
    test_size=test_ratio / (test_ratio + validation_ratio),
    random_state=42,
)

# Report the size of every split.
for split_name, split_frame in (
    ("Train", train_data),
    ("Validation", validation_data),
    ("Test", test_data),
):
    print(f"{split_name} set shape:", split_frame.shape)

Train set shape: (79, 3)
Validation set shape: (10, 3)
Test set shape: (10, 3)


In [25]:
# Add the rendered training prompt (`text` column) to the train and validation
# splits; the test split is not processed here.
train_data = process_dataset(train_data)
validation_data = process_dataset(validation_data)

In [26]:
# Convert both pandas splits to Hugging Face datasets and bundle them by split name.
train_dataset = Dataset.from_pandas(train_data)
validation_dataset = Dataset.from_pandas(validation_data)

combine_dataset = DatasetDict(
    {"train": train_dataset, "validation": validation_dataset}
)

In [27]:
combine_dataset

DatasetDict({
    train: Dataset({
        features: ['sysid', 'question', 'answer', 'text', '__index_level_0__'],
        num_rows: 79
    })
    validation: Dataset({
        features: ['sysid', 'question', 'answer', 'text', '__index_level_0__'],
        num_rows: 10
    })
})

# Inference with Base Model

In [28]:
# System instruction used when building prompts for inference.
# NOTE(review): this differs from DEFAULT_SYSTEM_PROMPT used for the training
# prompts — confirm the mismatch is intentional.
INFERENCE_SYSTEM_PROMPT = """ ข้างล่างต่อไปนี้คือคำถาม จงตอบคำถามต่อไปนี้ """

max_new_tokens = 256    # @param {type: "integer"}
temperature = 0.0001    # @param {type: "number"}

def generate_answer(model, text: str):
    """Generate a completion for ``text`` and return only the newly produced part.

    Reads the notebook-level ``tokenizer``, ``DEVICE``, ``max_new_tokens`` and
    ``temperature``.

    Args:
        model: Object exposing ``generate`` (the base model or the adapter-wrapped one).
        text: Full prompt to feed to the model.

    Returns:
        The decoded continuation with the prompt tokens and special tokens removed.
    """
    # Move the *whole* encoding to the target device. Previously only
    # `input_ids` was moved while `attention_mask` stayed on the CPU.
    batch = tokenizer(text, return_tensors="pt").to(DEVICE)
    prompt_length = batch["input_ids"].shape[1]

    with torch.cuda.amp.autocast():
        output_tokens = model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_new_tokens=max_new_tokens,
            no_repeat_ngram_size=2,
            typical_p=1.,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    # The output starts with the prompt tokens; slice them off before decoding.
    return tokenizer.decode(output_tokens[0][prompt_length:], skip_special_tokens=True)


def generate_inference_prompt(
    question: str,
    system_prompt: str = INFERENCE_SYSTEM_PROMPT
) -> str:
    """Build the prompt for one question, ending at the empty answer header.

    Args:
        question: Question text; surrounding whitespace is stripped.
        system_prompt: Instruction shown in the first section; stripped as well.

    Returns:
        The instruction and question sections followed by the answer header,
        with no trailing whitespace, so the model's continuation is the answer.
    """
    prompt_lines = [
        "### คำสั่ง:",
        system_prompt.strip(),
        "",
        "### คำถาม:",
        question.strip(),
        "",
        "### คำตอบ:",
    ]
    return "\n".join(prompt_lines).strip()


# Build one record per held-out row (first five test rows): the question, its
# reference answer and the ready-to-use inference prompt.
test_rows = test_data.head(5)
examples = [
    {
        "question": question,
        "response": label,
        "text": generate_inference_prompt(question),
    }
    for question, label in zip(test_rows["question"], test_rows["answer"])
]

test_df = pd.DataFrame(examples)
test_df.head()

Unnamed: 0,question,response,text
0,ต๊อบชอบไปไหน,ไปเที่ยว,### คำสั่ง:\nข้างล่างต่อไปนี้คือคำถาม จงตอบคำถ...
1,ต๊อบชอบไปไหน,ไปเที่ยว,### คำสั่ง:\nข้างล่างต่อไปนี้คือคำถาม จงตอบคำถ...
2,ต๊อบสูงเท่าไหร่,173,### คำสั่ง:\nข้างล่างต่อไปนี้คือคำถาม จงตอบคำถ...
3,ชื่ออะไร,ชื่อต๊อบ,### คำสั่ง:\nข้างล่างต่อไปนี้คือคำถาม จงตอบคำถ...
4,ชื่ออะไร,ชื่อต๊อบ,### คำสั่ง:\nข้างล่างต่อไปนี้คือคำถาม จงตอบคำถ...


#### Example 1

In [32]:
example = test_df.iloc[3]
print(example.question)

ชื่ออะไร


In [33]:
%%time
response = generate_answer(model, example.text)

CPU times: user 141 ms, sys: 152 ms, total: 293 ms
Wall time: 293 ms


In [34]:
pprint(response)

''


In [35]:
pprint(example.response)

'ชื่อต๊อบ'


# Finetune QLoRA

Here we will use the `SFTTrainer` from [TRL library](https://huggingface.co/docs/trl/main/en/sft_trainer) that gives a wrapper around transformers `Trainer` to easily fine-tune models on instruction based datasets using PEFT adapters. Let's first load the training arguments below.

In [36]:
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(48384, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )

In [37]:
model.config.quantization_config.to_dict()

{'quant_method': <QuantizationMethod.BITS_AND_BYTES: 'bitsandbytes'>,
 'load_in_8bit': False,
 'load_in_4bit': True,
 'llm_int8_threshold': 6.0,
 'llm_int8_skip_modules': None,
 'llm_int8_enable_fp32_cpu_offload': False,
 'llm_int8_has_fp16_weight': False,
 'bnb_4bit_quant_type': 'nf4',
 'bnb_4bit_use_double_quant': False,
 'bnb_4bit_compute_dtype': 'float16'}

In [38]:
# LoRA hyperparameters (the `# @param` markers are Colab form annotations).
lora_r = 8                  # @param {type:"integer"}
lora_alpha = 32             # @param {type:"integer"}
lora_dropout = 0.05         # @param {type:"number"}
bias = "none"               # @param ["all", "none"]

# Adapter configuration for causal language modeling; adapters are attached to
# the attention query and value projections only.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=bias,
    target_modules=["q_proj", "v_proj"],
)

In [39]:
# Training hyperparameters. The `# @markdown` / `# @param` comments are Colab
# form annotations and must keep their exact format.
# @markdown ### Enter a file path:
OUTPUT_DIR = "experiments"          # @param {type:"string"}

# @markdown ---
# Per-device batch size times accumulation steps gives the number of samples
# per optimizer step on each device (4 * 4 = 16 here).
per_device_train_batch_size = 4     # @param {type:"integer"}
gradient_accumulation_steps = 4     # @param {type:"integer"}
optim = "paged_adamw_32bit"         # @param {type:"string"}
logging_steps = 10                  # @param {type:"integer"}
learning_rate = 1e-4                # @param {type:"number"}
max_grad_norm = 0.3                 # @param {type:"number"}
num_train_epochs = 5                # @param {type:"integer"}
evaluation_strategy = "steps"       # @param {type:"string"}
# A float below 1 — presumably read as a fraction of the total training steps;
# confirm against the TrainingArguments documentation.
eval_steps = 0.2                    # @param {type:"number"}
warmup_ratio = 0.05                 # @param {type:"number"}
save_strategy = "epoch"             # @param {type:"string"}
lr_scheduler_type = "cosine"        # @param {type:"string"}
fp16=True                           # @param {type:"boolean"}
group_by_length = True              # @param {type:"boolean"}
save_safetensors = True             # @param {type:"boolean"}

In [40]:
# Bundle the hyperparameters from the previous cell into TrainingArguments.
# `save_safetensors` was declared there but never passed on; it is now
# forwarded so that changing the form value actually takes effect.
training_arguments = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=fp16,
    max_grad_norm=max_grad_norm,
    num_train_epochs=num_train_epochs,
    evaluation_strategy=evaluation_strategy,
    eval_steps=eval_steps,
    warmup_ratio=warmup_ratio,
    save_strategy=save_strategy,
    save_safetensors=save_safetensors,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="none",  # do not send logs to any experiment tracker
    seed=42
)

In [41]:
# Supervised fine-tuning trainer: trains LoRA adapters (peft_config) on the
# `text` column of the train split and evaluates on the validation split.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=combine_dataset["train"],
    eval_dataset=combine_dataset["validation"],
    dataset_text_field="text",
    max_seq_length=4096,
)

Map: 100%|██████████| 79/79 [00:00<00:00, 5836.92 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 4538.80 examples/s]


In [42]:
# Run the fine-tuning loop with the trainer configured above.
trainer.train()

Step,Training Loss,Validation Loss
5,No log,1.292435


ConnectionError: (ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: e3a1a168-0848-4b59-821b-26ca7f0650c4)')

In [53]:
# Persist the trained model. No path is given, so the trainer picks the
# destination — presumably the `output_dir` from TrainingArguments; verify.
trainer.save_model()

In [None]:
#!zip -r /kaggle/working/experiments.zip /kaggle/working/experiments

# Inference with Adapter

In [54]:
# Directory holding the saved adapter (same location as OUTPUT_DIR above).
KAGGLE_OUTPUT_DIR = "./experiments"
# Wrap the in-memory model with the adapter weights stored in that directory.
# NOTE(review): `model` was already handed to SFTTrainer together with a
# peft_config — confirm it does not already carry LoRA layers before wrapping
# it again, and that re-running this cell does not stack wrappers.
model = PeftModel.from_pretrained(model, KAGGLE_OUTPUT_DIR)

#### Example 1

In [103]:
example = test_df.iloc[0]
print(example.question)

ต๊อบชอบไปไหน


In [100]:
%%time
response = generate_answer(model, example.text)

CPU times: user 111 ms, sys: 243 ms, total: 354 ms
Wall time: 353 ms


In [101]:
pprint(response)

''


In [102]:
pprint(example.response)

'ชื่อต๊อบ'


#### Example 2

In [104]:
example = test_df.iloc[3]
print(example.question)

ชื่ออะไร


In [105]:
%%time
response = generate_answer(model, example.text)

CPU times: user 118 ms, sys: 238 ms, total: 356 ms
Wall time: 355 ms


In [106]:
pprint(response)

''


In [107]:
pprint(example.response)

'ชื่อต๊อบ'


#### Example 3

In [108]:
example = test_df.iloc[2]
print(example.question)

ต๊อบสูงเท่าไหร่


In [109]:
%%time
response = generate_answer(model, example.text)

CPU times: user 99.4 ms, sys: 17 ms, total: 116 ms
Wall time: 116 ms


In [110]:
pprint(response)

''


In [111]:
pprint(example.response)

'173'


#### General Questions

In [112]:
%%time
question = "ชื่ออะไร"
response = generate_answer(model, question)
pprint(response)

('\n'
 'ต๊อบชอบอะไรบ้าง\n'
 '\n'
 '### ต๊อกชอบกินอะไรมากที่สุด\n'
 ' ตอบ\n'
 'ถั่ว\n'
 'น้ำ\n'
 'อะไรตอกกินมาก\n'
 'คำตอบคือต็อก\n'
 'คือถ๊วย\n'
 'ชื่อต๋๊ง\n'
 'ชอบถ้วยต้ม\n'
 'กินตับตอย\n'
 'ไปตากา\n'
 'ต่อไปต๊ะตอง\n'
 'ข้างต้านต้า\n'
 'ตามตึ๊กตี้\n'
 'ต่อต้วนตำติน\n'
 'ดังต้อตอ\n'
 'ก่อนตางตี\n'
 'หลังติงต้ง\n'
 'ซ้อนตงตุมต\n'
 'ไถตถงถ\n'
 'ตีตต้อยตาตายตฺตี่ตอนตือตูตานตาสตั๊ตติตรอต้อน\n'
 'แต่ตลกติ่งตอดตวน\n'
 'แต่อ้ตค๊อตโตตาร\n'
 'ไม่ตกลงตัวต่าต้างต่านตู้ติตุนตื่ตูกต่ายต่าวต้าวตาวตวงตามตุกตอลตัสต้องตังตอบติบตรตรีตราตริตรมตมตวะตุตักตอมตํ')
CPU times: user 10.1 s, sys: 1.1 s, total: 11.3 s
Wall time: 11.2 s


In [113]:
%%time
question = "ต๊อบสูงเท่าไหร่"
response = generate_answer(model, question)
pprint(response)

('\n'
 'คุณต๊อกสูง 180 เซนติเมตร\n'
 'ต็อบต๋อยสูงแค่ไหน\n'
 'เราตอกต๊ะสูง\n'
 '\n'
 'คำตอบคือ: 2\n'
 'คือต้อต้อกระเป๋าต้งตากตางตงตองตอตตุมตอนตานตา')
CPU times: user 3.07 s, sys: 534 ms, total: 3.6 s
Wall time: 3.6 s
