# Direct Preference Optimization (DPO) Fine-Tuning of Llama Model
This notebook demonstrates how to fine-tune a Llama model using the Direct Preference Optimization (DPO) technique with the `jondurbin/truthy-dpo-v0.1` dataset. The code is encapsulated within a class structure for modularity and reusability.

In [1]:
!pip install transformers trl datasets bitsandbytes


Collecting trl
  Downloading trl-0.12.0-py3-none-any.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting transformers
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadat

In [2]:
import torch
from dataclasses import dataclass, field
from typing import Optional
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import DPOTrainer
from datasets import load_dataset
import bitsandbytes as bnb
print("bitsandbytes version:", bnb.__version__)


bitsandbytes version: 0.44.1


In [3]:
@dataclass
class DPOTrainingConfig:
    torch_dtype: torch.dtype = torch.float
    ignore_bias_buffers: bool = field(
        default=False,
        metadata={
            "help": "Fix for DDP issues with LM bias/mask buffers - invalid scalar type, inplace operation. See Hugging Face issues"
        }
    )
    per_device_train_batch_size: int = field(default=4, metadata={"help": "train batch size per device"})
    per_device_eval_batch_size: int = field(default=1, metadata={"help": "eval batch size per device"})
    gradient_accumulation_steps: int = field(default=4, metadata={"help": "number of gradient accumulation steps"})
    gradient_checkpointing: bool = field(default=True, metadata={"help": "use gradient checkpointing"})
    gradient_checkpointing_use_reentrant: bool = field(default=False, metadata={"help": "use reentrant for gradient checkpointing"})

    # LoRA Parameters
    lora_alpha: float = field(default=16, metadata={"help": "the lora alpha parameter"})
    lora_dropout: float = field(default=0.05, metadata={"help": "the lora dropout parameter"})
    lora_r: int = field(default=8, metadata={"help": "the lora r parameter"})

    # Other Parameters
    max_prompt_length: int = field(default=512, metadata={"help": "maximum prompt length"})
    max_length: int = field(default=1024, metadata={"help": "maximum sequence length"})
    max_steps: int = field(default=1000, metadata={"help": "max number of training steps"})
    logging_steps: int = field(default=10, metadata={"help": "logging frequency"})
    save_steps: int = field(default=100, metadata={"help": "saving frequency"})
    eval_steps: int = field(default=100, metadata={"help": "evaluation frequency"})
    output_dir: str = field(default="./results", metadata={"help": "output directory"})
    log_freq: int = field(default=1, metadata={"help": "logging frequency"})
    load_in_4bit: bool = field(default=True, metadata={"help": "whether to load the model in 4bit"})
    model_dtype: str = field(default="float16", metadata={"help": "model_dtype for loading"})

    def get_training_arguments(self) -> TrainingArguments:
        return TrainingArguments(
            output_dir=self.output_dir,
            per_device_train_batch_size=self.per_device_train_batch_size,
            per_device_eval_batch_size=self.per_device_eval_batch_size,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            evaluation_strategy="steps",
            save_steps=self.save_steps,
            logging_steps=self.logging_steps,
            num_train_epochs=3,
            weight_decay=0.01,
            learning_rate=5e-5,
            fp16=True,
        )

class DPOFineTuner:
    def __init__(self, model_name: str, config: DPOTrainingConfig):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=config.torch_dtype)

    def load_dataset(self, dataset_name: str = "jondurbin/truthy-dpo-v0.1"):
        # Load the dataset
        self.dataset = load_dataset(dataset_name)
        self.train_dataset = self.dataset["train[:50]"]

    def fine_tune(self):
        # Initialize DPOTrainer with loaded model, tokenizer, and dataset
        trainer = DPOTrainer(
            model=self.model,
            args=self.config.get_training_arguments(),
            train_dataset=self.train_dataset,
            eval_dataset=None,
            tokenizer=self.tokenizer
        )
        trainer.train()

    def save_model(self, path: str = "./dpo_llama_finetuned_model"):
        # Save model and tokenizer after training
        self.model.save_pretrained(path)
        self.tokenizer.save_pretrained(path)


In [4]:
# Initialize configuration
config = DPOTrainingConfig()

# Instantiate fine-tuner with model name and configuration
fine_tuner = DPOFineTuner(model_name="unsloth/llama-3-8b-bnb-4bit", config=config)

# Load dataset and start fine-tuning
fine_tuner.load_dataset()
fine_tuner.fine_tune()
fine_tuner.save_model()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/904 [00:00<?, ?B/s]

truthy-dpo.parquet:   0%|          | 0.00/653k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1016 [00:00<?, ? examples/s]



AttributeError: 'TrainingArguments' object has no attribute 'model_init_kwargs'