# Direct Preference Optimization (DPO)
In this project, we fine-tune the <b>GPT-2</b> model using DPO. In this regard, we import the required models and libraries from <a href="https://huggingface.co/" target="_blank"><strong>Hugging Face</strong></a>

## Import Libraries

In [1]:
import time
from datasets import load_dataset
from datasets import DatasetDict
from transformers import AutoTokenizer           # loads tokenizer for the chosen model
from transformers import AutoModelForCausalLM    # loads a causal language model
from trl import DPOTrainer                       # loads trainer for DPO
from trl import DPOConfig                        # defines DPO hyperparameters

import warnings
warnings.filterwarnings('ignore')

## Load Dataset

In [2]:
orig_dataset = load_dataset("Dahoas/rm-static")
orig_dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response', 'chosen', 'rejected'],
        num_rows: 76256
    })
    test: Dataset({
        features: ['prompt', 'response', 'chosen', 'rejected'],
        num_rows: 5103
    })
})

In [3]:
n_train = 2000
n_eval  = 500

dataset = DatasetDict({
    "train": orig_dataset["train"].select(range(n_train)),
    "test": orig_dataset["test"].select(range(n_eval))
})
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response', 'chosen', 'rejected'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['prompt', 'response', 'chosen', 'rejected'],
        num_rows: 500
    })
})

## Define Checkpoint

In [4]:
model_name = "gpt2"

## Load Tokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

# some models do not own a dedicated padding token; thus, we set it manually
# using end-of-sequence (eos) token to avoid errors
tokenizer.pad_token = tokenizer.eos_token

## Tokenize Dataset

In [6]:
def tokenization_fn(example):
    out = tokenizer(
        example["prompt"] + example["chosen"],
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors=None
    )
    
    return out

tokenized_dataset = dataset["train"].map(tokenization_fn)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

## Load Model

In [7]:
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", torch_dtype="auto"
)
model.config.pad_token_id = tokenizer.pad_token_id

## Define Training Hyperparameters
We have a list of parameters and hyperparameters to set:
<ul>
    <li>
        <b>output_dir:</b> the directory where checkpoints are saved.
    </li>
    <li>
        <b>per_device_train_batch_size:</b> keeps virtual random access memeory (VRAM) usage low.
    </li>
    <li>
        <b>learning_rate:</b> learning rate.
    </li>
    <li>
        <b>logging_steps:</b> number of steps for logging loss.
    </li>
    <li>
        <b>betal:</b> preference sharpness; higher value, stronger preference.
    </li>
    <li>
        <b>max_length:</b> maximum length of tokens.
    </li>
</ul>

In [8]:
args = DPOConfig(
    output_dir='output_dir/dpo',
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    learning_rate=5e-6,
    beta=0.1,
    max_length=512,
    fp16=False,          # disables half precision (if it is not CUDA)
    bf16=False,          # disables bfloat16 (if it is not CUDA)
    no_cuda=True,
    use_mps_device=False
)

## Initialize Trainer

In [9]:
trainer = DPOTrainer(
    model=model,
    ref_model=None,                            # will clone base as frozen reference internally
    args=args,
    train_dataset=tokenized_dataset["train"],
    processing_class=tokenizer
)

start_time = time.time()
trainer.train()
train_time = time.time() - start_time
print(f"Trianing time: {train_time: .4f}")

Extracting prompt in train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Step,Training Loss
10,0.6846
20,0.672
30,0.6685


Trianing time:  1618.5788


## Save Model and Tokenizer

In [10]:
# model.save_pretrained("models/fine_tuning/dpo")
# tokenizer.save_pretrained("models/fine_tuning/dpo")

## Evaluation

### Preference Accuracy

In [11]:
import torch

def log_prob(text, prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    full_input = tokenizer(prompt + text, return_tensors="pt").to(model.device)
    labels = full_input["input_ids"].clone()
    labels[:, :len(inputs["input_ids"][0])] = -100  # mask prompt tokens

    with torch.no_grad():
        outputs = model(**full_input, labels=labels)
        
    return -outputs.loss.item() * len(labels[0])  # sum of log-probs

In [13]:
eval_dataset = dataset["test"]

correct = 0
for example in eval_dataset:
    lp_chosen = log_prob(example["chosen"], example["prompt"])
    lp_rejected = log_prob(example["rejected"], example["prompt"])
    if lp_chosen > lp_rejected:
        correct += 1

pref_acc = correct / len(eval_dataset)
print(f"Preference Accuracy = {pref_acc:.3f}")

Preference Accuracy = 0.456
