In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding
from peft import LoraConfig, TaskType, get_peft_model
from datetime import datetime
from datasets import Dataset
import pandas as pd
import torch

# Print the name and the current memory footprint of every visible CUDA device
bytes_per_gib = 1024**3
for i in range(torch.cuda.device_count()):
    print(f"Device {i}:")
    print("  Name:", torch.cuda.get_device_name(i))
    # memory figures are converted from bytes to GiB and rounded to 2 decimals
    allocated_gib = round(torch.cuda.memory_allocated(i)/bytes_per_gib, 2)
    print("  Memory allocated:", allocated_gib, "GB")
    reserved_gib = round(torch.cuda.memory_reserved(i)/bytes_per_gib, 2)
    print("  Memory reserved:", reserved_gib, "GB")

### Process Dataset

In [None]:
# util function: turn one dataset row into a (prompt text, prompt + response text) pair
def process_dataset_row(row,
                        tokenizer,
                        system_prompt: str = "You are a helpful AI assistant. Answer questions according to the language used by the user.",
                        prompt_col_name: str = 'prompt',
                        response_col_name: str = 'safe_response'):
    """Render a prompt/response row through the tokenizer's chat template.

    Args:
        row: Mapping that holds the prompt under ``prompt_col_name`` and the
            reference answer under ``response_col_name``.
        tokenizer: Object exposing ``apply_chat_template``.
        system_prompt: Content of the system message placed before the user turn.
        prompt_col_name: Key of the user prompt in ``row``.
        response_col_name: Key of the reference response in ``row``.

    Returns:
        A tuple ``(chat_prompt, full_text)``: the templated system + user
        conversation rendered as a string with the generation prompt requested,
        and that same string with the response appended.
    """
    user_text, assistant_text = row[prompt_col_name], row[response_col_name]

    # the template is rendered to text only (tokenize=False); tokenization happens later
    chat_prompt = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_text},
        ],
        tokenize=False,
        add_generation_prompt=True,
    )
    return chat_prompt, chat_prompt + assistant_text

# util function for tokenization
def tokenize_function(examples,
                      tokenizer,
                      truncation:bool = True,
                      padding:str = "max_length",
                      max_length:int = 256):
    """Tokenize the full prompt + response text for causal-LM fine-tuning.

    A causal LM computes its loss by comparing the prediction at each input
    position with the next label, so ``labels`` must be aligned with
    ``input_ids``. The full text in ``examples['output']`` (prompt followed by
    response) is therefore used as the model input, and ``labels`` is a copy of
    ``input_ids`` in which padding positions are replaced by -100 so they are
    ignored by the loss. ``examples['input']`` is not tokenized separately.

    Args:
        examples: A single example (``'output'`` is a string) or a batch
            (``'output'`` is a list of strings), as passed by ``Dataset.map``.
        tokenizer: Callable tokenizer returning ``input_ids`` and, normally,
            ``attention_mask``.
        truncation: Forwarded to the tokenizer.
        padding: Forwarded to the tokenizer.
        max_length: Maximum sequence length in tokens, forwarded to the tokenizer.

    Returns:
        The tokenizer output with an added ``labels`` entry.
    """
    tokenized_inputs = tokenizer(
        text=examples['output'],
        truncation=truncation,
        padding=padding,
        max_length=max_length
    )
    input_ids = tokenized_inputs["input_ids"]
    attention_mask = tokenized_inputs.get("attention_mask")

    def _mask_padding(ids, mask):
        # mask by attention mask rather than by token id: the pad token may be
        # the same id as a real EOS token that should still be learned
        if mask is None:
            return list(ids)
        return [token if keep else -100 for token, keep in zip(ids, mask)]

    # batched calls yield a list of sequences, single-example calls one flat sequence
    is_batched = len(input_ids) > 0 and isinstance(input_ids[0], (list, tuple))
    if is_batched:
        masks = attention_mask if attention_mask is not None else [None] * len(input_ids)
        labels = [_mask_padding(ids, mask) for ids, mask in zip(input_ids, masks)]
    else:
        labels = _mask_padding(input_ids, attention_mask)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# wrapper function to load dataset into train-validation
def load_dataset(file_path,
                 tokenizer,
                 test_size:float = 0.2,
                 seed:int = 42):
    """Read an Excel sheet, build chat-formatted text pairs, tokenize and split them.

    Args:
        file_path: Path of the Excel file read with ``pd.read_excel``.
        tokenizer: Tokenizer forwarded to ``process_dataset_row`` and
            ``tokenize_function``.
        test_size: Share of the data held out for validation, forwarded to
            ``train_test_split``.
        seed: Seed forwarded to ``train_test_split``.

    Returns:
        A tuple ``(train_dataset, val_dataset)`` of tokenized datasets.
    """
    # load dataset to Dataset
    df = pd.read_excel(file_path)
    dataset = Dataset.from_pandas(df)

    # format every row with the chat template (default column names of
    # process_dataset_row are used), then tokenize example by example
    formatted_pairs = [
        {"input": inp, "output": out}
        for inp, out in (process_dataset_row(row, tokenizer=tokenizer) for row in dataset)
    ]
    dataset = Dataset.from_list(formatted_pairs)
    tokenized_dataset = dataset.map(
        tokenize_function,
        fn_kwargs={'tokenizer': tokenizer}
    )

    # Split into train and validation sets
    split_datasets = tokenized_dataset.train_test_split(test_size=test_size, seed=seed)
    train_dataset = split_datasets['train']
    val_dataset = split_datasets['test']

    return train_dataset, val_dataset

### Train Model

In [None]:
# main function to train model
def train_model(model_name:str,
                dataset_file_path:str,
                train_config:dict = None,
                lora_config:dict = None,
                seed:int = 42
               ):
    """Fine-tune a causal LM with LoRA adapters and save the adapter and tokenizer.

    All artifacts are written below
    ``./finetuning/<model_name with '/' replaced by '_'>_<timestamp>/`` in the
    sub-folders ``output`` (Trainer checkpoints), ``log`` and ``model``.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        dataset_file_path: Excel file forwarded to ``load_dataset``.
        train_config: Extra keyword arguments for ``TrainingArguments``.
            ``output_dir`` and ``logging_dir`` are set here and must not be
            included. A ``seed`` entry takes precedence over the ``seed`` argument.
        lora_config: Keyword arguments for ``LoraConfig``.
        seed: Seed used for the train/validation split, for torch before the
            adapters are created, and for ``TrainingArguments`` unless
            ``train_config`` already provides one.

    Returns:
        None. The save location is printed when training finishes.
    """
    # work on copies: None means "no overrides" and the caller's dicts stay untouched
    train_config = dict(train_config or {})
    lora_config = dict(lora_config or {})

    # load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_name,
                                                 device_map="auto",
                                                 torch_dtype=torch.bfloat16)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        # padding needs a pad token; fall back to EOS when the tokenizer defines none
        tokenizer.pad_token = tokenizer.eos_token

    # load LoRA (seed torch first so any random adapter initialisation is reproducible)
    torch.manual_seed(seed)
    peft_config = LoraConfig(**lora_config)
    model_lora = get_peft_model(model, peft_config)

    # load train and val instances, splitting with the requested seed
    train_dataset, val_dataset = load_dataset(file_path=dataset_file_path,
                                              tokenizer=tokenizer,
                                              seed=seed)

    # use data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # set output dir as model_name + timestamp
    sanitized_model_name = model_name.replace("/", "_")
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    run_dir = f"./finetuning/{sanitized_model_name}_{timestamp}"
    output_dir_train = f"{run_dir}/output"
    logging_dir_train = f"{run_dir}/log"
    model_dir_train = f"{run_dir}/model"

    # train model; an explicit "seed" in train_config wins over the function argument
    train_config.setdefault("seed", seed)
    training_args = TrainingArguments(output_dir=output_dir_train,
                                      logging_dir=logging_dir_train,
                                      **train_config)

    trainer = Trainer(
        model=model_lora,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        processing_class=tokenizer,
        data_collator=data_collator,
    )
    trainer.train()

    # save model (adapter weights) together with the tokenizer
    model_lora.save_pretrained(model_dir_train)
    tokenizer.save_pretrained(model_dir_train)

    print(f"Training Complete. Output Saved to {model_dir_train}")

In [None]:
# set common params
# path of the Excel training file; every training example cell passes it to train_model
dataset_file_path = "dataset/IndoSafety-Train.xlsx"

In [None]:
# EXAMPLE: Training "sail/Sailor2-8B-Chat"
model_name = "sail/Sailor2-8B-Chat"

# Trainer settings: evaluation, checkpointing and logging all run every 20 steps;
# one epoch in bf16, keeping at most 4 checkpoints and reporting to no tracker
train_config = dict(
    overwrite_output_dir=True,
    eval_strategy="steps",
    save_strategy="steps",
    logging_strategy="steps",
    eval_steps=20,
    save_steps=20,
    logging_steps=20,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    auto_find_batch_size=True,
    ddp_find_unused_parameters=False,
    bf16=True,
    num_train_epochs=1,
    save_total_limit=4,
    report_to=["none"],
    label_names=["labels"],
)

# LoRA settings: rank 4, alpha 16, no dropout, no bias training; adapters are
# attached to the q/k/v/o and gate/up/down projection modules
lora_config = dict(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=4,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

train_model(
    model_name=model_name,
    dataset_file_path=dataset_file_path,
    train_config=train_config,
    lora_config=lora_config,
)

In [None]:
# EXAMPLE: Training "SeaLLMs/SeaLLMs-v3-7B-Chat"
model_name = "SeaLLMs/SeaLLMs-v3-7B-Chat"

# Trainer settings: evaluation, checkpointing and logging all run every 20 steps;
# one epoch in bf16, keeping at most 4 checkpoints and reporting to no tracker
train_config = dict(
    overwrite_output_dir=True,
    eval_strategy="steps",
    save_strategy="steps",
    logging_strategy="steps",
    eval_steps=20,
    save_steps=20,
    logging_steps=20,
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    auto_find_batch_size=True,
    ddp_find_unused_parameters=False,
    bf16=True,
    num_train_epochs=1,
    save_total_limit=4,
    report_to=["none"],
    label_names=["labels"],
)

# LoRA settings: rank 4, alpha 16, no dropout, no bias training; adapters are
# attached to the q/k/v/o and gate/up/down projection modules
lora_config = dict(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=4,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

train_model(
    model_name=model_name,
    dataset_file_path=dataset_file_path,
    train_config=train_config,
    lora_config=lora_config,
)

In [None]:
# EXAMPLE: Training "aisingapore/Llama-SEA-LION-v3-8B-IT"
model_name = "aisingapore/Llama-SEA-LION-v3-8B-IT"

# Trainer settings: evaluation, checkpointing and logging all run every 20 steps;
# one epoch in bf16, keeping at most 4 checkpoints and reporting to no tracker
train_config = dict(
    overwrite_output_dir=True,
    eval_strategy="steps",
    save_strategy="steps",
    logging_strategy="steps",
    eval_steps=20,
    save_steps=20,
    logging_steps=20,
    learning_rate=4e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    auto_find_batch_size=True,
    ddp_find_unused_parameters=False,
    bf16=True,
    num_train_epochs=1,
    save_total_limit=4,
    report_to=["none"],
    label_names=["labels"],
)

# LoRA settings: rank 4, alpha 16, no dropout, no bias training; adapters are
# attached to the q/k/v/o and gate/up/down projection modules
lora_config = dict(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=4,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

train_model(
    model_name=model_name,
    dataset_file_path=dataset_file_path,
    train_config=train_config,
    lora_config=lora_config,
)

### Sample for low-end GPU

In [None]:
# # EXAMPLE: Training "Qwen/Qwen2.5-0.5B-Instruct"
# model_name = "Qwen/Qwen2.5-0.5B-Instruct"
# train_config = {
#     "overwrite_output_dir":True,
#     "eval_strategy":"steps",
#     "save_strategy":"steps",
#     "logging_strategy":'steps',
#     "eval_steps":20,
#     "save_steps":20,
#     "logging_steps":20,
#     "learning_rate":1e-5,
#     "per_device_train_batch_size":1,
#     "per_device_eval_batch_size":1,
#     "auto_find_batch_size":True,
#     "ddp_find_unused_parameters":False,
#     "bf16":True,
#     "max_steps": 50,  # Limit training to 50 steps
#     "save_total_limit":4,
#     "report_to":["none"],
#     "label_names":["labels"]
# }
# lora_config = {
#     "task_type":TaskType.CAUSAL_LM,
#     "inference_mode":False,
#     "r":4,
#     "lora_alpha":16,
#     "lora_dropout":0,
#     "bias":"none",
#     "target_modules":[
#         "q_proj", "k_proj","v_proj", "o_proj",
#         "gate_proj","up_proj","down_proj"
#     ]
# }
# train_model(model_name, dataset_file_path, train_config, lora_config)

### Monitor GPU Usage (Optional)

In [None]:
import torch
# Report total, current and peak CUDA memory (in GiB, 2 decimals) for each visible device
for i in range(torch.cuda.device_count()):
    device_name = torch.cuda.get_device_name(i)

    # collect the raw byte counts, then convert them all in a single pass
    (total_memory,
     memory_allocated,
     memory_reserved,
     max_memory_allocated,
     max_memory_reserved) = (
        round(num_bytes / 1024**3, 2)
        for num_bytes in (
            torch.cuda.get_device_properties(i).total_memory,
            torch.cuda.memory_allocated(i),
            torch.cuda.memory_reserved(i),
            torch.cuda.max_memory_allocated(i),
            torch.cuda.max_memory_reserved(i),
        )
    )

    print(
        f"Device {i}: {device_name}",
        f"  Total memory:         {total_memory} GB",
        f"  Memory allocated:     {memory_allocated} GB",
        f"  Max memory allocated: {max_memory_allocated} GB",
        f"  Memory reserved:      {memory_reserved} GB",
        f"  Max memory reserved:  {max_memory_reserved} GB",
        sep="\n",
    )