# Direct Preference Optimization (DPO) Fine-tuning Notebook

### 1. Setup and Install Required Libraries

In [2]:
!pip install -q trl transformers datasets accelerate peft bitsandbytes


#### 2. Load Dataset (sample from Hugging Face or custom dataset)


In [5]:
from datasets import load_dataset

raw_dataset = load_dataset("Intel/orca_dpo_pairs")
train_test_split = raw_dataset["train"].train_test_split(test_size=0.1)
dataset = {
    "train": train_test_split["train"],
    "test": train_test_split["test"]
}

print("\nTrain size:", len(dataset["train"]))
print("Test size:", len(dataset["test"]))



Train size: 11573
Test size: 1286


In [None]:
# Sample format
print("\nSample Record:", dataset['train'][0])


Sample Record: {'system': 'You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.', 'question': 'Alex fell off their skateboard and into the ditch.  Given the question "What will Alex want to do next?", is "go to a doctor" a valid answer?\nPick your answer from:\n(A). Yes\n(B). No\nAnswer:', 'chosen': '(A). Yes\n\nStep 1: Understand the situation - Alex fell off their skateboard and into the ditch.\n\nStep 2: Consider possible outcomes - Alex might be injured or in pain after the fall.\n\nStep 3: Determine the user\'s question - The user wants to know if "go to a doctor" is a valid answer for what Alex might want to do next.\n\nStep 4: Evaluate the answer options - We have two options, (A) Yes or (B) No.\n\nStep 5: Justify selection - Since falling into a ditch after a skateboarding accident can potentially result in injuries, it is reasonable to consider t

### 3. Load Base Model and Tokenizer

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "gpt2"  # You can replace with any other causal LM

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [8]:
# Pad token fix
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

### 4. Prepare TrainingArguments and DPOConfig

In [10]:
from transformers import TrainingArguments
from trl import DPOConfig

training_args = TrainingArguments(
    output_dir="./dpo-output",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    # evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=100,
    eval_steps=100,
    logging_steps=10,
    learning_rate=5e-6,
    warmup_steps=50,
    num_train_epochs=1,
    report_to="none",
    fp16=True,
)

In [12]:
dpo_config = DPOConfig(
    beta=0.1,
    max_prompt_length=512,
    max_length=1024,
    # log_with=None,
)

ValueError: Your setup doesn't support bf16/gpu.

### 5. Load DPOTrainer

In [13]:
from trl import DPOTrainer

trainer = DPOTrainer(
    model=model,
    ref_model=None,  # If None, a frozen copy of the model will be used internally
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dpo_config=dpo_config,
)

NameError: name 'dpo_config' is not defined

### 6. Start Training

In [None]:
trainer.train()

### 7. Save & Evaluate

In [None]:
trainer.save_model("./dpo-model")

### 8. Inference Example

In [None]:
prompt = "Why is the sky blue?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100)
print("\nGenerated Response:\n", tokenizer.decode(outputs[0], skip_special_tokens=True))

### 9. Push to Hub

In [None]:
trainer.push_to_hub("your-username/dpo-finetuned-model")