# ENVIRONMENT

In [15]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_name = "Qwen/Qwen2-1.5B"
print(f"Using model: {model_name}")

Using device: cuda
Using model: Qwen/Qwen2-1.5B


# LOADING DATA

In [16]:
from datasets import load_dataset

data_filename = "./data/sample_data.jsonl"

dataset = load_dataset("json", data_files=data_filename)

print(f"First sample of dataset: \n {dataset["train"][0]}")

# Split dataset
split_datasets = dataset['train'].train_test_split(test_size=0.1)

First sample of dataset: 
 {'messages': [{'role': 'system', 'content': "You are an assistant for question-answering tasks.Use the following pieces of retrieved context to answer the question.If there's no related context, just answer with your base knowledge.Use three sentences maximum and keep the answer concise."}, {'role': 'user', 'content': 'Có thể sử dụng số điện thoại khác để đăng nhập không?'}, {'role': 'assistant', 'content': 'Có, nhưng số điện thoại đó phải được liên kết với tài khoản của bạn trước đó. Nếu số chưa được liên kết, bạn sẽ không thể sử dụng nó để đăng nhập.'}]}


# TOKENIZE

In [17]:
SYSTEM_PROMPT = "You are an useful assistant."

In [18]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [19]:

def formatting_prompts_func(examples):
    formatted_data = {"input_ids": [], "attention_mask": [], "labels": []}  
    
    for i in range(len(examples['messages'])):     
        messages = examples['messages'][i]
        
        # Format the chat template for this example
        formatted_chat = tokenizer.apply_chat_template(messages, tokenize=False)
        
        # print(f"Here's some formatted chat: {formatted_chat}\n")
        # Tokenize the formatted chat
        encoding = tokenizer(
            formatted_chat,
            padding="max_length",
            truncation=True,
            max_length=240,
            return_tensors="pt",
        )
        
        
        # Add the tokenized data to the output containers
        formatted_data["input_ids"].append(encoding["input_ids"].squeeze(0).tolist())
        formatted_data["attention_mask"].append(encoding["attention_mask"].squeeze(0).tolist())
        formatted_data["labels"].append(encoding["input_ids"].squeeze(0).tolist())  # Use input_ids as labels for causal LM
    
    return formatted_data

In [28]:
print(split_datasets['train']['messages'][15])

[{'role': 'system', 'content': "You are an assistant for question-answering tasks.Use the following pieces of retrieved context to answer the question.If there's no related context, just answer with your base knowledge.Use three sentences maximum and keep the answer concise."}, {'role': 'user', 'content': 'Tôi có thể xem danh sách người thụ hưởng đã thêm ở đâu?'}, {'role': 'assistant', 'content': 'Vào mục "Quản lý danh bạ thụ hưởng" hoặc "Danh sách người thụ hưởng" trong ứng dụng để xem danh sách đã lưu.'}]


In [None]:
from transformers import DataCollatorForLanguageModeling

tokenized_dataset = split_datasets.map(formatting_prompts_func, batched=True)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

print(tokenized_dataset["train"])

Map:   0%|          | 0/89 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

{'messages': [{'role': 'system', 'content': "You are an assistant for question-answering tasks.Use the following pieces of retrieved context to answer the question.If there's no related context, just answer with your base knowledge.Use three sentences maximum and keep the answer concise."}, {'role': 'user', 'content': 'Mật khẩu mới có thể giống với mật khẩu cũ không?'}, {'role': 'assistant', 'content': 'Thông thường, không được phép để đảm bảo tính bảo mật. Nếu bạn cố gắng đặt lại, hệ thống sẽ nhắc bạn chọn mật khẩu khác với mật khẩu cũ.'}], 'input_ids': [151644, 8948, 198, 2610, 525, 458, 17847, 369, 3405, 12, 596, 86, 4671, 9079, 9046, 279, 2701, 9666, 315, 30403, 2266, 311, 4226, 279, 3405, 32901, 1052, 594, 902, 5435, 2266, 11, 1101, 4226, 448, 697, 2331, 6540, 9046, 2326, 22870, 7192, 323, 2506, 279, 4226, 63594, 13, 151645, 198, 151644, 872, 198, 44, 52885, 97610, 92105, 28776, 128254, 130469, 128250, 130821, 97610, 133138, 53037, 30, 151645, 198, 151644, 77091, 198, 98041, 12845

In [21]:
print(split_datasets)

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 89
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 10
    })
})


In [22]:
print(split_datasets['train']['messages'])

[[{'role': 'system', 'content': "You are an assistant for question-answering tasks.Use the following pieces of retrieved context to answer the question.If there's no related context, just answer with your base knowledge.Use three sentences maximum and keep the answer concise."}, {'role': 'user', 'content': 'Mật khẩu mới có thể giống với mật khẩu cũ không?'}, {'role': 'assistant', 'content': 'Thông thường, không được phép để đảm bảo tính bảo mật. Nếu bạn cố gắng đặt lại, hệ thống sẽ nhắc bạn chọn mật khẩu khác với mật khẩu cũ.'}], [{'role': 'system', 'content': "You are an assistant for question-answering tasks.Use the following pieces of retrieved context to answer the question.If there's no related context, just answer with your base knowledge.Use three sentences maximum and keep the answer concise."}, {'role': 'user', 'content': 'Làm thế nào để biết vé xe đã đặt thành công?'}, {'role': 'assistant', 'content': 'Sau khi đặt vé, bạn sẽ nhận được thông báo "Đặt vé thành công" kèm mã vé. 

# MODEL

In [23]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             quantization_config=quantization_config, 
                                             torch_dtype="auto") 

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


RuntimeError: CUDA error: CUDA-capable device(s) is/are busy or unavailable
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
print(f"Model specification: \n {model}")

Model specification: 
 Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear8bitLt(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear8bitLt(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear8bitLt(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear8bitLt(in_features=1536, out_features=1536, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear8bitLt(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear8bitLt(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear8bitLt(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )


# LORA

In [None]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,          
    lora_alpha=32,    
    target_modules=["q_proj"], 
    lora_dropout=0.05, 
    task_type="CAUSAL_LM" 
)

# model.unload()
model = get_peft_model(model, lora_config)


print("Number of trainable parameter:")
model.print_trainable_parameters()  

Number of trainable parameter:
trainable params: 1,376,256 || all params: 1,545,090,560 || trainable%: 0.0891


# TRAINING

In [None]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="/home/yosakoi/Work/chatbot/model/LLM/output",
    logging_dir="/home/yosakoi/Work/chatbot/log/llm.log",
    
    num_train_epochs=5,
    learning_rate=5e-4,
    
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    
    save_strategy="epoch",
    evaluation_strategy="steps",
    logging_steps=10,
    
    gradient_checkpointing=True,
    remove_unused_columns=False,
)



In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model.to(device),
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
)

[2025-03-25 10:30:29,697] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/yosakoi/miniconda3/envs/llm/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/yosakoi/miniconda3/envs/llm/compiler_compat/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
# Save LoRA weights
model_engine.module.save_pretrained("path/to/save/model")

# Save tokenizer
tokenizer.save_pretrained("path/to/save/model")