# Direct Preference Optimization (DPO) Fine-Tuning

In this notebook we fine-tune a language model using Direct Preference Optimization (DPO), a technique for aligning language models with human preferences without requiring a separate reward model.

## Overview
- **Model**: GPT-2
- **Technique**: DPO with LoRA (Low-Rank Adaptation) for parameter-efficient fine-tuning
- **Dataset**: UltraFeedback Binarized - a preference dataset with chosen/rejected response pairs

## Requirements
- Python 3.9+
- PyTorch 2.x
- CUDA-capable GPU (optional, but recommended for faster training)

In [None]:
# Install required packages (versions compatible with PyTorch 2.9.1)
!pip install torch==2.9.1
!pip install transformers>=4.57.0 trl>=0.25.0 peft>=0.14.0 accelerate
!pip install datasets matplotlib pandas numpy

In [None]:
# Import necessary libraries
import multiprocessing
import os
import pandas as pd
import matplotlib.pyplot as plt

import torch
from datasets import load_dataset

from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GPT2Tokenizer,
    set_seed,
    GenerationConfig
)
from trl import DPOConfig, DPOTrainer

# Choose the compute device: a CUDA GPU when one is visible, otherwise the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

if device == "cuda":
    print(f"Using CUDA: {torch.cuda.get_device_name(0)}")
else:
    print("Using CPU (training will be slower)")

## Model Loading

The following cell loads the model with optional 4-bit quantization. Quantization significantly reduces memory usage but **requires a CUDA GPU**. If running on CPU, the model will load in full precision.

In [None]:
# Load model with optional quantization (GPU only).
# Quantization is enabled exactly when a CUDA device is available.
use_quantization = torch.cuda.is_available()

if use_quantization:
    # Install bitsandbytes for quantization (CUDA only).
    # pip is invoked through the running interpreter so the package is
    # installed into the kernel's own environment; a bare `pip` is resolved
    # through PATH and may belong to a different Python installation.
    import subprocess
    import sys
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-U", "bitsandbytes"],
        check=True,  # Fail loudly here rather than at model load time
    )

    from transformers import BitsAndBytesConfig

    # Configure 4-bit quantization
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    # Load GPT-2 model with quantization
    model = AutoModelForCausalLM.from_pretrained(
        "gpt2",
        quantization_config=quantization_config
    )
    print("Loaded model with 4-bit quantization")
else:
    # Load GPT-2 model in full precision for CPU
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    model = model.to(device)
    print(f"Loaded model in full precision on {device}")

# Load GPT-2 tokenizer; the end-of-sequence token doubles as the padding
# token, and padding is applied on the right
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Disable cache for training
model.config.use_cache = False

# Display model architecture (the bare `model` expression renders the module
# tree as the cell output)
print(f"\nModel parameters: {model.num_parameters():,}")
model

## Dataset Preparation

We use the UltraFeedback Binarized dataset, which contains prompts paired with a chosen (preferred) response and a rejected response.

In [None]:
# Load the UltraFeedback Binarized dataset and list the splits it provides
ds = load_dataset("BarraHome/ultrafeedback_binarized")
print(f"Dataset keys: {ds.keys()}")

In [None]:
# Look at one raw record: first its field names, then the full entry
first_entry = ds["train_prefs"][0]
print("Keys in first entry:", first_entry.keys())
print("\nExample entry:")
first_entry

In [None]:
# Trim every split to at most `cnt` rows for faster training
# (raise cnt to train on more data)
cnt = 50  # Number of examples to use
for split_name, split in ds.items():
    rows_to_keep = min(cnt, len(split))
    ds[split_name] = split.select(range(rows_to_keep))

def process(row):
    """Reshape one preference record into the layout used for DPO training.

    Drops the ``prompt_id``, ``messages``, ``score_chosen`` and
    ``score_rejected`` fields, then replaces the ``chosen`` and ``rejected``
    conversations with the ``content`` of their final message. The row is
    modified in place and the same object is returned.
    """
    for unused_field in ("prompt_id", "messages", "score_chosen", "score_rejected"):
        del row[unused_field]

    # Only the text of the last turn of each conversation is kept
    for side in ("chosen", "rejected"):
        final_message = row[side][-1]
        row[side] = final_message["content"]
    return row

# Run `process` over every split with one worker per CPU core, recomputing
# instead of reusing a cached result
worker_count = multiprocessing.cpu_count()
ds = ds.map(process, num_proc=worker_count, load_from_cache_file=False)

# The preference splits serve as the training and evaluation sets
train_dataset, eval_dataset = ds['train_prefs'], ds['test_prefs']

for label, split in (("Training", train_dataset), ("Evaluation", eval_dataset)):
    print(f"{label} samples: {len(split)}")

In [None]:
# Show the first training record after processing
print("Processed training example:")
processed_example = train_dataset[0]
processed_example

## LoRA Configuration

We use Low-Rank Adaptation (LoRA) for parameter-efficient fine-tuning (PEFT). This allows us to train only a small number of additional parameters while keeping the base model frozen.

In [None]:
# LoRA hyperparameters, collected in one mapping before building the config
lora_settings = dict(
    r=4,                                  # Rank of the low-rank matrices
    lora_alpha=8,                         # Scaling factor
    lora_dropout=0.1,
    target_modules=['c_proj', 'c_attn'],  # Names of the GPT-2 layers that receive adapters
    bias="none",
    task_type="CAUSAL_LM",
)
peft_config = LoraConfig(**lora_settings)

# Echo the key settings back for a quick sanity check
print(
    "LoRA Configuration:",
    f"  Rank (r): {peft_config.r}",
    f"  Alpha: {peft_config.lora_alpha}",
    f"  Target modules: {peft_config.target_modules}",
    sep="\n",
)

## DPO Training Configuration


In [None]:
# Configure DPO training
training_args = DPOConfig(
    beta=0.2,                         # DPO temperature parameter
    output_dir="dpo",
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    remove_unused_columns=False,
    logging_steps=10,
    gradient_accumulation_steps=1,
    learning_rate=1e-4,
    eval_strategy="epoch",            # Note: 'evaluation_strategy' is deprecated
    warmup_steps=2,
    fp16=False,
    save_steps=500,
    report_to='none'
)

In [None]:
# Create DPO trainer
trainer = DPOTrainer(
    model=model,
    ref_model=None,                   # Not needed when using LoRA
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,       # Note: 'tokenizer' param is deprecated
    peft_config=peft_config,
    max_length=512,
)

In [None]:
# Run the DPO fine-tuning loop; the result is shown as the cell output
print("Starting DPO training...")
train_result = trainer.train()
train_result

## Training Visualization

In [None]:
# Turn the trainer's log history into a table and chart both loss curves
log = pd.DataFrame(trainer.state.log_history)
log_train = log.dropna(subset=['loss'])       # Rows that recorded a training loss
log_eval = log.dropna(subset=['eval_loss'])   # Rows that recorded an evaluation loss

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(log_train["epoch"], log_train["loss"], label="Training Loss", marker='o')
ax.plot(log_eval["epoch"], log_eval["eval_loss"], label="Evaluation Loss", marker='s')
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("DPO Training Progress")
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## Model Inference and Comparison

Let's compare the outputs of the DPO-finetuned model with the base GPT-2 model.

In [None]:
# Find the latest checkpoint
import os
import glob

# Search the directory the trainer was configured to write to, rather than
# repeating the output path as a literal that can drift out of sync.
checkpoint_pattern = os.path.join(trainer.args.output_dir, 'checkpoint-*')

# Keep only entries named checkpoint-<step>; any other suffix would make the
# int() conversion in the sort key below raise.
checkpoints = [
    path for path in glob.glob(checkpoint_pattern)
    if path.rsplit('-', 1)[-1].isdigit()
]

if checkpoints:
    # The highest step number is the most recent checkpoint
    latest_checkpoint = max(checkpoints, key=lambda path: int(path.rsplit('-', 1)[-1]))
    print(f"Loading checkpoint: {latest_checkpoint}")
    dpo_model = AutoModelForCausalLM.from_pretrained(latest_checkpoint)
else:
    print("No checkpoint found, using the trained model directly")
    dpo_model = trainer.model

# Move to appropriate device
dpo_model = dpo_model.to(device)

# Select inference mode explicitly: the fallback branch reuses the model
# object that was just trained, and dropout must be off when generating.
dpo_model.eval()

In [None]:
# Reload tokenizer for inference
# A fresh GPT-2 tokenizer is created and rebinds the `tokenizer` name.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Use the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Seed used before each generation so both models sample from the same
# random state and the comparison is reproducible
GENERATION_SEED = 42

# Configure generation parameters
generation_config = GenerationConfig(
    do_sample=True,
    top_k=2,
    temperature=0.2,
    max_new_tokens=25,
    pad_token_id=tokenizer.eos_token_id
)

# Test prompt
PROMPT = "Is a higher octane gasoline better for your car?"
inputs = tokenizer(PROMPT, return_tensors='pt').to(device)

# Generate with DPO model
set_seed(GENERATION_SEED)
outputs = dpo_model.generate(**inputs, generation_config=generation_config)
print("DPO response:\t", tokenizer.decode(outputs[0], skip_special_tokens=True))

# Generate with base GPT-2 model
gpt2_model = AutoModelForCausalLM.from_pretrained('gpt2').to(device)
# Re-seed: without this the base model's sample would depend on how much
# random state the DPO generation above consumed.
set_seed(GENERATION_SEED)
outputs = gpt2_model.generate(**inputs, generation_config=generation_config)
print("\nGPT-2 response:\t", tokenizer.decode(outputs[0], skip_special_tokens=True))

---

## Phase 2: Training with User Preferences Dataset

In this next phase, we run DPO training using a user preference dataset.

In [None]:
# Load the second preference dataset, report the size of its train split,
# and show that split as the cell output
dataset = load_dataset("argilla/ultrafeedback-binarized-preferences-cleaned")
train_split = dataset['train']
print(f"Dataset size: {len(train_split)} examples")
train_split

In [None]:
# Select a subset for training
cnt = 100  # Adjust based on available compute
# Clamp to the split size so a `cnt` larger than the dataset keeps every row
# instead of asking select() for indices that do not exist.
subset_size = min(cnt, len(dataset['train']))
dataset['train'] = dataset['train'].select(range(subset_size))

In [None]:
def process_preferences(row):
    """Reshape one user-preference record into prompt/chosen/rejected text.

    Drops the ``source``, ``chosen-rating``, ``chosen-model``,
    ``rejected-rating`` and ``rejected-model`` fields, then replaces the
    ``chosen`` and ``rejected`` conversations with the ``content`` of their
    final message. The row is modified in place and the same object is
    returned.
    """
    unwanted_columns = (
        "source",
        "chosen-rating",
        "chosen-model",
        "rejected-rating",
        "rejected-model",
    )
    for column in unwanted_columns:
        del row[column]

    # Only the text of the last turn of each conversation is kept
    for side in ("chosen", "rejected"):
        last_turn = row[side][-1]
        row[side] = last_turn["content"]
    return row

# Run `process_preferences` over the train split with one worker per CPU
# core, recomputing instead of reusing a cached result
worker_count = multiprocessing.cpu_count()
dataset['train'] = dataset['train'].map(
    process_preferences, num_proc=worker_count, load_from_cache_file=False
)

In [None]:
# Divide the rows 80/20 in their existing order: the first 80% train,
# the remainder evaluates
total_rows = len(dataset['train'])
train_size = int(0.8 * total_rows)
eval_size = total_rows - train_size

train_dataset_v2 = dataset['train'].select(range(train_size))
eval_dataset_v2 = dataset['train'].select(range(train_size, total_rows))

for label, split in (("Training", train_dataset_v2), ("Evaluation", eval_dataset_v2)):
    print(f"{label} samples: {len(split)}")

In [None]:
# Show the first processed training record
print("Sample processed entry:")
sample_entry = train_dataset_v2[0]
sample_entry

## Response Generation Functions

We use helper functions to compare DPO and base GPT-2 responses and see the difference between the fine-tuned model and the baseline.

In [None]:
# Decoding settings shared by both models in the side-by-side comparison
comparison_settings = {
    "do_sample": True,
    "top_k": 1,
    "temperature": 0.1,
    "max_new_tokens": 25,
    "pad_token_id": tokenizer.eos_token_id,
}
comparison_config = GenerationConfig(**comparison_settings)

In [None]:
def generate_dpo_response(prompt):
    """Run ``prompt`` through the DPO-finetuned model and return decoded text.

    The prompt is tokenized and moved to ``device``, generation uses
    ``comparison_config``, and the first generated sequence is decoded with
    special tokens removed.
    """
    encoded = tokenizer(prompt, return_tensors='pt').to(device)
    generated = dpo_model.generate(**encoded, generation_config=comparison_config)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [None]:
def generate_gpt2_response(prompt):
    """Run ``prompt`` through the base GPT-2 model and return decoded text.

    The prompt is tokenized and moved to ``device``, generation uses
    ``comparison_config``, and the first generated sequence is decoded with
    special tokens removed.
    """
    encoded = tokenizer(prompt, return_tensors='pt').to(device)
    generated = gpt2_model.generate(**encoded, generation_config=comparison_config)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [None]:
# Compare both models on a hand-written prompt
test_prompt = "What is the best way to learn programming?"
print(f"Prompt: {test_prompt}\n")

dpo_answer = generate_dpo_response(test_prompt)
print("DPO response:\t", dpo_answer)

gpt2_answer = generate_gpt2_response(test_prompt)
print("\nGPT-2 response:\t", gpt2_answer)