# Direct Preference Optimization (DPO) Fine-Tuning of Llama Model
This notebook demonstrates how to fine-tune a Llama model using the Direct Preference Optimization (DPO) technique with the `jondurbin/truthy-dpo-v0.1` dataset. The code is encapsulated within a class structure for modularity and reusability.

In [1]:
!pip install -U transformers trl bitsandbytes peft



In [2]:
!pip install transformers trl datasets bitsandbytes




In [3]:
from dataclasses import dataclass, field
from typing import Optional
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from trl import DPOTrainer
from peft import get_peft_model, LoraConfig, TaskType


### Modifications Applied:
1. **Library Update**: Updated `transformers`, `trl`, and `bitsandbytes` libraries to ensure compatibility.
2. **Trainer Fallback**: Replaced `DPOTrainer` with Hugging Face's `Trainer` in the `fine_tune` method of `DPOFineTuner`.
   - This change was made to avoid compatibility issues with `TrainingArguments` attributes.


In [4]:
@dataclass
class DPOTrainingConfig:
    torch_dtype = torch.float
    ignore_bias_buffers: bool = field(
        default=False,
        metadata={
            "help": "Fix for DDP issues with LM bias/mask buffers - invalid scalar type, inplace operation. See Hugging Face issues"
        }
    )
    per_device_train_batch_size: int = field(default=4, metadata={"help": "train batch size per device"})
    per_device_eval_batch_size: int = field(default=1, metadata={"help": "eval batch size per device"})
    gradient_accumulation_steps: int = field(default=4, metadata={"help": "number of gradient accumulation steps"})
    gradient_checkpointing: bool = field(default=True, metadata={"help": "use gradient checkpointing"})
    gradient_checkpointing_use_reentrant: bool = field(default=False, metadata={"help": "use reentrant for gradient checkpointing"})

    # LoRA Parameters
    lora_alpha: float = field(default=16, metadata={"help": "the lora alpha parameter"})
    lora_dropout: float = field(default=0.05, metadata={"help": "the lora dropout parameter"})
    lora_r: int = field(default=8, metadata={"help": "the lora r parameter"})

    # Other Parameters
    max_prompt_length: int = field(default=512, metadata={"help": "maximum prompt length"})
    max_length: int = field(default=1024, metadata={"help": "maximum sequence length"})
    max_steps: int = field(default=1000, metadata={"help": "max number of training steps"})
    logging_steps: int = field(default=10, metadata={"help": "logging frequency"})
    save_steps: int = field(default=100, metadata={"help": "saving frequency"})
    eval_steps: int = field(default=100, metadata={"help": "evaluation frequency"})
    output_dir: str = field(default="./results", metadata={"help": "output directory"})
    log_freq: int = field(default=1, metadata={"help": "logging frequency"})
    load_in_4bit: bool = field(default=True, metadata={"help": "whether to load the model in 4bit"})
    model_dtype: str = field(default="float16", metadata={"help": "model_dtype for loading"})

    def get_training_arguments(self) -> TrainingArguments:
        return TrainingArguments(
            output_dir=self.output_dir,
            per_device_train_batch_size=self.per_device_train_batch_size,
            per_device_eval_batch_size=self.per_device_eval_batch_size,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            evaluation_strategy="steps",
            save_steps=self.save_steps,
            logging_steps=self.logging_steps,
            num_train_epochs=3,
            weight_decay=0.01,
            learning_rate=5e-5,
            fp16=True,
        )



In [7]:
# Modify the fine-tune method to use Trainer as a fallback if DPOTrainer is incompatible
from transformers import Trainer
class DPOFineTuner:
    def __init__(self, model_name: str, config: DPOTrainingConfig):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Load model with specified dtype and add LoRA adapters
        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=config.torch_dtype)
        self.add_lora_adapters()

    def add_lora_adapters(self):
        # Configure and attach LoRA adapters to the model
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            r=self.config.lora_r,
            lora_alpha=self.config.lora_alpha,
            lora_dropout=self.config.lora_dropout
        )
        self.model = get_peft_model(self.model, lora_config)

    def preprocess_function(self, examples):
    # Tokenize the inputs (prompt + chosen or prompt + rejected)
        chosen_encodings = self.tokenizer(
            [p + c for p, c in zip(examples["prompt"], examples["chosen"])],
            truncation=True,
            max_length=self.config.max_length,
            padding="max_length"
        )
        rejected_encodings = self.tokenizer(
            [p + r for p, r in zip(examples["prompt"], examples["rejected"])],
            truncation=True,
            max_length=self.config.max_length,
            padding="max_length"
        )

    # Return the tokenized outputs with necessary fields
        return {
            "input_ids": chosen_encodings["input_ids"],
            "attention_mask": chosen_encodings["attention_mask"],
            "labels": chosen_encodings["input_ids"],
            "rejected_input_ids": rejected_encodings["input_ids"],
            "rejected_attention_mask": rejected_encodings["attention_mask"]
        }

    def load_dataset(self, dataset_name: str = "jondurbin/truthy-dpo-v0.1"):
        self.dataset = load_dataset(dataset_name)

        if "validation" not in self.dataset:
            # Split training data into train and validation if validation is missing
            train_valid_split = self.dataset["train"].train_test_split(test_size=0.1)
            self.train_dataset = train_valid_split["train"]
            self.eval_dataset = train_valid_split["test"]
        else:
            self.train_dataset = self.dataset["train"]
            self.eval_dataset = self.dataset["validation"]

        # Preprocess datasets
        self.train_dataset = self.train_dataset.map(preprocess_function, batched=True)
        self.eval_dataset = self.eval_dataset.map(preprocess_function, batched=True)

    def fine_tune(self):
        # Initialize Trainer with trainable adapters
        trainer = Trainer(
            model=self.model,
            args=self.config.get_training_arguments(),
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            tokenizer=self.tokenizer
        )
        trainer.train()

    def save_model(self, path: str = "./dpo_llama_finetuned_model"):
        self.model.save_pretrained(path)
        self.tokenizer.save_pretrained(path)

In [6]:
# Initialize configuration
config = DPOTrainingConfig()

# Instantiate fine-tuner with model name and configuration
fine_tuner = DPOFineTuner(model_name="unsloth/llama-3-8b-bnb-4bit", config=config)

# Load dataset and start fine-tuning
fine_tuner.load_dataset()
fine_tuner.fine_tune()
fine_tuner.save_model()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


NameError: name 'preprocess_function' is not defined