# Train Model with DPO

### imports

In [1]:
!pip install trl

Collecting trl
  Downloading trl-0.18.1-py3-none-any.whl.metadata (11 kB)
Collecting datasets>=3.0.0 (from trl)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets>=3.0.0->trl)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Download

In [1]:
from datasets import load_dataset
from trl import DPOConfig, DPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

### Login huggingface and load data

In [3]:
! huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `topicmodeling` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `topi

In [2]:
dataset = load_dataset("EliasHossain/youtube-titles-dpo")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### load model

We are gonna use the unsloth library which is faster at fine-tuning job. Before that we need to connect with T4 GPU from the runtime menu.

In [5]:
! pip install unsloth

Collecting unsloth
  Downloading unsloth-2025.5.9-py3-none-any.whl.metadata (47 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/47.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.5.11 (from unsloth)
  Downloading unsloth_zoo-2025.5.11-py3-none-any.whl.metadata (8.1 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.30-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.22-py3-none-any.whl.metadata (10 kB)
Collecting transformers!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2,>=4.51.3 (from unsloth)
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting protobuf<4.0.0 (from unsloth)
  Downloading

Install the required library to use the unsloth

In [3]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# We have to check which Torch version for Xformers (2.3 -> 0.0.27)
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [3]:
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from datasets import load_dataset, Dataset
import pandas as pd
import json, yaml
import torch


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
from unsloth import FastLanguageModel
import torch

# Set model name
model_name = "unsloth/Qwen3-14B-unsloth-bnb-4bit"  # Use this exact quantized model

# Load model and tokenizer with Unsloth
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = 2048,       # You can increase this if needed
    load_in_4bit = True,         # Efficient memory usage
    load_in_8bit = False,        # Keep False if using 4bit
    full_finetuning = False,     # Set True only if you're planning full finetuning
    # token = "hf_...",          # Only needed for gated models
)

# Ensure pad_token is set properly
tokenizer.pad_token = tokenizer.eos_token


==((====))==  Unsloth 2025.5.10: Fast Qwen3 patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [5]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

Unsloth 2025.5.10 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


### generate title with base model

In [6]:
def format_chat_prompt(user_input, system_message="You are a helpful assistant."):
    """
    Formats user input into the chat template format with <|im_start|> and <|im_end|> tags.

    Args:
        user_input (str): The input text from the user.

    Returns:
        str: Formatted prompt for the model.
    """

    # Format user message
    user_prompt = f"<|im_start|>user\n{user_input}<|im_end|>\n"

    # Start assistant's turn
    assistant_prompt = "<|im_start|>assistant\n"

    # Combine prompts
    formatted_prompt = user_prompt + assistant_prompt

    return formatted_prompt

In [7]:
from transformers import pipeline

# Set up text generation pipeline (DO NOT set device manually)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Example prompt — update this based on your actual structure
prompt = format_chat_prompt(dataset['valid']['prompt'][0][0]['content'])

# Generate output
outputs = generator(prompt, max_length=100, truncation=True, num_return_sequences=1, temperature=0.7)

# Print the result
print(outputs[0]['generated_text'])

Device set to use cuda:0


<|im_start|>user
Given the YouTube video idea write an engaging title.

**Video Idea**: intro independent component analysis

**Additional Guidance**:
- Title should be between 30 and 75 characters long
- Only return the title idea, nothing else!<|im_end|>
<|im_start|>assistant
<think>
Okay, the user wants a YouTube video title about introducing independent component analysis. Let me start by understanding what ICA is. It's a technique used in signal processing and machine learning to separate mixed signals into


### train model

In [None]:
# ft_model_name = model_name.split('/')[1].replace("Instruct", "DPO")

# training_args = DPOConfig(
#     output_dir=ft_model_name,
#     logging_steps=25,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=3,
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_loss",
#     save_strategy="epoch",
#     eval_strategy="epoch",
#     eval_steps=1,
# )

# device = torch.device('mps')

# from trl import DPOTrainer, DPOConfig  # or the appropriate trainer if you're using custom code
# import torch

# # Adjust fine-tuned model name
# ft_model_name = model_name.split('/')[1].replace("Instruct", "DPO")

# # Define DPO training configuration
# training_args = DPOConfig(
#     output_dir=ft_model_name,
#     logging_steps=25,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=3,
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_loss",
#     save_strategy="epoch",
#     eval_strategy="epoch",
#     eval_steps=1,
# )



In [None]:
# # Take a small subset to prevent session crash
# train_subset = dataset['train'].select(range(10))   # Take first 100 samples
# eval_subset = dataset['valid'].select(range(20))     # Take first 20 samples

In [None]:
# tokenizer.chat_template = "{% for message in messages %}{{ message['content'] }} {% endfor %}"

In [9]:
# print(dataset["train"].column_names)

In [None]:
# def formatting_func(example):
#     return {"text": f"{example['instruction']}\n{example['response']}"}

# # Preprocess dataset BEFORE passing to trainer
# formatted_train = dataset["train"].map(formatting_func)
# formatted_valid = dataset["valid"].map(formatting_func)


In [8]:
from trl import DPOTrainer, DPOConfig

# Define DPO training arguments
training_args = DPOConfig(
    output_dir="qwen3-dpo-checkpoint",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    seed=42,
    report_to="none"
)

# Initialize DPOTrainer
trainer = DPOTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["valid"],
)

# Start training
trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,026 | Num Epochs = 3 | Total steps = 387
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 128,450,560/14,000,000,000 (0.92% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Epoch,Training Loss,Validation Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
1,0.5875,0.538479,0.281489,-0.67033,0.698276,0.951819,-76.855049,-91.471428,-2.36195,-2.365478,0,0,0,0
2,0.4283,0.515891,-0.347867,-1.879056,0.689655,1.531189,-83.148605,-103.558685,-2.618694,-2.644292,No Log,No Log,No Log,No Log
3,0.3023,0.521061,0.288767,-1.659094,0.758621,1.947861,-76.782272,-101.35907,-2.50349,-2.543907,No Log,No Log,No Log,No Log


TrainOutput(global_step=387, training_loss=0.41939669216018005, metrics={'train_runtime': 5372.1429, 'train_samples_per_second': 0.573, 'train_steps_per_second': 0.072, 'total_flos': 0.0, 'train_loss': 0.41939669216018005, 'epoch': 3.0})

In [9]:
# Save the best model and tokenizer to a custom directory
save_path = "/content/qwen3-dpo-checkpoint"  # <-- replace later with real path
trainer.model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model and tokenizer saved to: {save_path}")

Model and tokenizer saved to: /content/qwen3-dpo-checkpoint


In [12]:
# from trl import DPOTrainer, DPOConfig

# # Define DPO training arguments
# training_args = DPOConfig(
#     output_dir="qwen3-dpo-checkpoint",
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     num_train_epochs=3,
#     learning_rate=5e-5,
#     lr_scheduler_type="linear",
#     warmup_ratio=0.1,
#     logging_steps=10,
#     save_strategy="epoch",
#     eval_strategy="epoch",
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_loss",
#     seed=42,
#     report_to="none"
# )

# # Initialize DPOTrainer
# trainer = DPOTrainer(
#     model=model,
#     tokenizer=tokenizer,
#     args=training_args,
#     train_dataset=dataset["train"],
#     eval_dataset=dataset["valid"],
# )

# # Start training
# trainer.train()

# # Save the best model and tokenizer to a custom directory
# save_path = "/content/qwen3-dpo-checkpoint"  # <-- replace later with real path
# trainer.model.save_pretrained(save_path)
# tokenizer.save_pretrained(save_path)

# print(f"Model and tokenizer saved to: {save_path}")


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,026 | Num Epochs = 3 | Total steps = 387
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 128,450,560/14,000,000,000 (0.92% trained)


BackendCompilerFailed: backend='inductor' raised:
RuntimeError: Duplicate pattern: expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored())
view_default = CallFunction(aten.view.default, expand_default, Ignored(), _users=2)
permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored())
expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored())
view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored(), _users=2)
bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1)
view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored())
div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale'), _users=2)
amax_default = CallFunction(aten.amax.default, div_Tensor, Ignored(), True)
sub_Tensor = CallFunction(aten.sub.Tensor, div_Tensor, amax_default)
exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2)
sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True)
div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=3)
expand_default_2 = CallFunction(aten.expand.default, div_Tensor_1, Ignored())
view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2)
expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored())
view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2)
bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4)
view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored())
neg_default = CallFunction(aten.neg.default, div_Tensor_1)
view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2)
permute_default_1 = CallFunction(aten.permute.default, view_default_4, Ignored())
bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_1)
view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored())
mul_Tensor = CallFunction(aten.mul.Tensor, view_default_7, div_Tensor_1, _users=2)
sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor, Ignored(), True)
fma_default = CallFunction(prims.fma.default, neg_default, sum_dim_IntList_1, mul_Tensor)
div_Tensor_2 = CallFunction(aten.div.Tensor, fma_default, KeywordArg('inv_scale'))
view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2)
permute_default_2 = CallFunction(aten.permute.default, view_default_1, Ignored())
bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_2)
view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored())
permute_default_3 = CallFunction(aten.permute.default, view_default, Ignored())
bmm_default_4 = CallFunction(aten.bmm.default, permute_default_3, view_default_8)
view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored())
permute_default_4 = CallFunction(aten.permute.default, view_default_10, Ignored())
permute_default_5 = CallFunction(aten.permute.default, view_default_3, Ignored())
bmm_default_5 = CallFunction(aten.bmm.default, permute_default_5, view_default_6)
view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored())
output = MultiOutputPattern([view_default_5,
  view_default_9,
  permute_default_4,
  view_default_11,
  None
]) with no graph

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"


### use fine-tuned model

In [10]:
# Load the fine-tuned model
ft_model = trainer.model

In [11]:
# Set up text generation pipeline (DO NOT set device manually)
generator = pipeline("text-generation", model=ft_model, tokenizer=tokenizer)

# Example prompt — update this based on your actual structure
prompt = format_chat_prompt(dataset['valid']['prompt'][0][0]['content'])

# Generate output
outputs = generator(prompt, max_length=100, truncation=True, num_return_sequences=1, temperature=0.7)

# Print the result
print(outputs[0]['generated_text'])

Device set to use cuda:0


<|im_start|>user
Given the YouTube video idea write an engaging title.

**Video Idea**: intro independent component analysis

**Additional Guidance**:
- Title should be between 30 and 75 characters long
- Only return the title idea, nothing else!<|im_end|>
<|im_start|>assistant
<think>
Okay, the user wants a YouTube video title about introducing independent component analysis. Let me start by understanding what ICA is. It's a technique used in signal processing and machine learning for separating mixed signals into


### push to HF hub

In [None]:
model_id = f"EliasHossain/{ft_model_name}"
trainer.push_to_hub(model_id)

adapter_model.safetensors:   0%|          | 0.00/514M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/6.67k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/EliasHossain/qwen3-dpo-checkpoint/commit/b8f588af893d68d478cdeb09aef59781f71bf280', commit_message='EliasHossain/Qwen3-14B-unsloth-bnb-4bit', commit_description='', oid='b8f588af893d68d478cdeb09aef59781f71bf280', pr_url=None, repo_url=RepoUrl('https://huggingface.co/EliasHossain/qwen3-dpo-checkpoint', endpoint='https://huggingface.co', repo_type='model', repo_id='EliasHossain/qwen3-dpo-checkpoint'), pr_revision=None, pr_num=None)

In [12]:
format_chat_prompt(dataset['valid']['prompt'][0][0]['content'])

'<|im_start|>user\nGiven the YouTube video idea write an engaging title.\n\n**Video Idea**: intro independent component analysis\n\n**Additional Guidance**:\n- Title should be between 30 and 75 characters long\n- Only return the title idea, nothing else!<|im_end|>\n<|im_start|>assistant\n'

# Evaluation

In [13]:
from torch.nn import functional as F
from datasets import load_dataset
from tqdm import tqdm
import torch

# # Use your fine-tuned model directly
# model = trainer.model
# tokenizer = trainer.tokenizer

# Move model to the correct device
device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)

# Load the validation dataset (100 samples for speed)
dataset = load_dataset("EliasHossain/youtube-titles-dpo", split="valid[:100]")

# Format prompts using the Qwen-style chat format
def format_chat_prompt(user_input, system_message="You are a helpful assistant."):
    user_prompt = f"<|im_start|>user\n{user_input}<|im_end|>\n"
    assistant_prompt = "<|im_start|>assistant\n"
    return user_prompt + assistant_prompt

# Compute log-probability score for a completion
def get_logprob_score(model, tokenizer, prompt, completion, device="cuda"):
    input_text = prompt + completion
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=2048).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits[:, :-1, :]
        target_ids = inputs.input_ids[:, 1:]
        log_probs = F.log_softmax(logits, dim=-1)
        selected_log_probs = torch.gather(log_probs, 2, target_ids.unsqueeze(-1)).squeeze(-1)
        return selected_log_probs.sum().item()

# Run evaluation
correct = 0
total = len(dataset)

for sample in tqdm(dataset):
    prompt_text = format_chat_prompt(sample["prompt"][0]["content"])
    chosen = sample["chosen"][0]["content"]
    rejected = sample["rejected"][0]["content"]

    score_chosen = get_logprob_score(model, tokenizer, prompt_text, chosen, device)
    score_rejected = get_logprob_score(model, tokenizer, prompt_text, rejected, device)

    if score_chosen > score_rejected:
        correct += 1

# Print result
accuracy = correct / total
print(f"\n Pairwise preference accuracy: {accuracy:.2%}")


100%|██████████| 100/100 [01:34<00:00,  1.06it/s]


 Pairwise preference accuracy: 58.00%



