In [1]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from trl import (
    DPOConfig,
    DPOTrainer,
    ModelConfig,
    ScriptArguments,
    TrlParser,
    get_kbit_device_map,
    get_peft_config,
    get_quantization_config,
)
from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
from peft import LoraConfig, get_peft_model
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
import matplotlib.pyplot as plt
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from datetime import datetime
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def print_trainable_parameters(model):
        """
        Prints the number of trainable parameters in the model.
        """
        trainable_params = 0
        all_param = 0
        for _, param in model.named_parameters():
            all_param += param.numel()
            if param.requires_grad:
                trainable_params += param.numel()
        print(
            f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
        )

In [6]:
def prepare_model_for_LoRA_training(model, accelerator):
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)

    config = LoraConfig(
        r=32,
        lora_alpha=16,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
            "lm_head",
        ],
        bias="none",
        lora_dropout=0.05,  # Conventional
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(model, config, adapter_name="training_adapter")
    print_trainable_parameters(model)

    # Apply the accelerator. You can comment this out to remove the accelerator.
    model = accelerator.prepare_model(model)
    return model

def get_quant_model(model_name):
    bnb_config = BitsAndBytesConfig(load_in_4bit=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
    return model

In [7]:
def get_accelerator():
    fsdp_plugin = FullyShardedDataParallelPlugin(
        state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
        optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
    )

    accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
    return accelerator

In [8]:
def get_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        add_eos_token=True,
        add_bos_token=True,
        use_fast=False
    )
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
    return tokenizer

In [9]:
accelerator = get_accelerator()
model_name = "meta-llama/Llama-3.1-8B-Instruct"
model = get_quant_model(model_name)

# model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = get_tokenizer(model_name)

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.27it/s]


In [10]:
dataset = load_dataset("json", data_files="data/dataset-max-feature-difference-top-10_iter_1.jsonl", split="train")

In [11]:
model = prepare_model_for_LoRA_training(model, accelerator)


trainable params: 88121344 || all params: 4628721664 || trainable%: 1.9037944036550305


In [12]:
"""config = LoraConfig(
        r=32,
        lora_alpha=16,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
            "lm_head",
        ],
        bias="none",
        lora_dropout=0.05,  # Conventional
        task_type="CAUSAL_LM",
    )

model.add_adapter(config, adapter_name="reference_adapter")"""

'config = LoraConfig(\n        r=32,\n        lora_alpha=16,\n        target_modules=[\n            "q_proj",\n            "k_proj",\n            "v_proj",\n            "o_proj",\n            "gate_proj",\n            "up_proj",\n            "down_proj",\n            "lm_head",\n        ],\n        bias="none",\n        lora_dropout=0.05,  # Conventional\n        task_type="CAUSAL_LM",\n    )\n\nmodel.add_adapter(config, adapter_name="reference_adapter")'

In [14]:
# Define the DPOConfig with your training parameters
training_config = DPOConfig(
    output_dir="model",
    warmup_steps=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=5.0e-6,
    bf16=True,
    optim="paged_adamw_8bit",
    logging_dir="model/logs",
    save_strategy="steps",
    save_steps=100,
    evaluation_strategy="steps",
    eval_steps=100,
    do_eval=True,
    num_train_epochs=1,
    run_name=f"test-llama{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
)

# Initialize the DPOTrainer with the DPOConfig
trainer = DPOTrainer(
    model=model,
    args=training_config,
    train_dataset=dataset,
    eval_dataset=dataset,
    tokenizer=tokenizer,
)

# Start training
trainer.train()

  return fn(*args, **kwargs)
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss




TrainOutput(global_step=23, training_loss=0.5216325262318486, metrics={'train_runtime': 496.9279, 'train_samples_per_second': 1.477, 'train_steps_per_second': 0.046, 'total_flos': 0.0, 'train_loss': 0.5216325262318486, 'epoch': 1.0})