In [None]:
# !pip install transformers trl datasets llm_blender

# Итоги эксперементов
## Метод обучения - win rate  
ORPO из коробки - 0.08  
ORPO c PRL - 0.04  
SFT + ORPO - 0.07  
SFT + OR - 0.11

Использование PRL ухудшает результаты (win rate падает до 4%). Это может говорить о том, что PRL слишком сильно ограничивает обновления параметров модели, снижая её способность к адаптации. Возможно, модель становится слишком консервативной и просто не делает изменений, которые могли бы привести к победе над эталоном.  

FT + OR (11%) — лучший результат

In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import ORPOTrainer, ORPOConfig, SFTTrainer, SFTConfig
from datasets import load_dataset
from tqdm import tqdm
import llm_blender
import numpy as np
import torch.nn.functional as F
from transformers import DataCollatorWithPadding
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments
from datasets import Dataset
import gc
import subprocess
from typing import Union, Literal

In [6]:
# Загрузка модели и токенизатора
model_name = "HuggingFaceTB/SmolLM2-135M"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
tokenizer.chat_template = "{% for message in messages %}{{message['content'] + '\n'}}{% endfor %}{% if add_generation_prompt %}{% endif %}"

In [8]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def load_and_preprocess_dataset():
    dataset = load_dataset("HuggingFaceH4/ultrafeedback_binarized")
    
    def preprocess(example):
        return {
            "prompt": example["prompt"],
            "chosen": example["chosen"],
            "rejected": example["rejected"]
        }
    
    # Предобработка тренировочного набора данных
    train_dataset = dataset["train_prefs"].map(preprocess, remove_columns=["messages"])
    test_dataset = dataset["test_prefs"].map(preprocess, remove_columns=["messages"])
    
    return train_dataset, test_dataset

train_dataset, test_dataset = load_and_preprocess_dataset()

README.md:   0%|          | 0.00/6.53k [00:00<?, ?B/s]

train_prefs-00000-of-00001.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

test_prefs-00000-of-00001.parquet:   0%|          | 0.00/7.29M [00:00<?, ?B/s]

test_sft-00000-of-00001.parquet:   0%|          | 0.00/3.72M [00:00<?, ?B/s]

train_gen-00000-of-00001.parquet:   0%|          | 0.00/184M [00:00<?, ?B/s]

test_gen-00000-of-00001.parquet:   0%|          | 0.00/3.02M [00:00<?, ?B/s]

Generating train_prefs split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating train_sft split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating test_prefs split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test_sft split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train_gen split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating test_gen split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/61135 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [9]:
train_dataset = train_dataset.select(range(6000)) 
test_dataset = test_dataset.select(range(100))

# ORPO из коробки

In [10]:
training_args = ORPOConfig(output_dir="./orpo",
                           learning_rate = 8e-6,
                           lr_scheduler_type="linear",
                           beta=0.1,
                           per_device_train_batch_size=1,
                           gradient_accumulation_steps=16,
                           num_train_epochs=2,
                           logging_steps=10,
                           warmup_steps=10,
                           report_to="none")

In [11]:
trainer = ORPOTrainer(model=model,
                      args=training_args,
                      processing_class=tokenizer,
                      train_dataset=train_dataset)

trainer.train()



Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Step,Training Loss
10,2.1792
20,2.0968
30,2.1091
40,2.041
50,2.037
60,2.1044
70,2.0284
80,2.0389
90,2.0246
100,2.076


TrainOutput(global_step=374, training_loss=2.0208335152284347, metrics={'train_runtime': 4726.024, 'train_samples_per_second': 2.539, 'train_steps_per_second': 0.079, 'total_flos': 0.0, 'train_loss': 2.0208335152284347, 'epoch': 1.992})

In [12]:
gc.collect()
torch.cuda.empty_cache()

In [13]:
strt_texts = []
for dialog in test_dataset["chosen"]:
    strt_texts.append(tokenizer.apply_chat_template(dialog, tokenize=False))

prompts = test_dataset["prompt"]

In [14]:
blender = llm_blender.Blender()
blender.loadranker("llm-blender/PairRM")

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/13.7k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

ranker_config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/130 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.00k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.79k [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Successfully loaded ranker from  /root/.cache/huggingface/hub/llm-blender/PairRM


In [15]:
def calculate_win_rate(model, tokenizer, test_dataset, blender, device='cuda'):
    win_count = 0
    inputs = tokenizer(test_dataset["prompt"], return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model.generate(**inputs)
    model_responses = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    candidates = list(zip(model_responses, strt_texts))
    scores = blender.rank(inputs=test_dataset["prompt"], candidates=candidates, return_scores=False, batch_size=1)
    # print(scores)
    win_count = sum([np.argmax(rank) == 1 for rank in scores])
    win_rate = win_count / len(test_dataset)
    print(f"Win rate: {win_rate * 100:.2f}%")
    return win_rate

In [16]:
calculate_win_rate(trainer.model, tokenizer, test_dataset, blender, device='cuda')

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Ranking candidates: 100%|██████████| 100/100 [00:41<00:00,  2.39it/s]

Win rate: 8.00%





0.08

# ORPO с PRL

In [17]:
model = AutoModelForCausalLM.from_pretrained(model_name)

In [18]:
class ProbabilityRatioTrainer(ORPOTrainer):
    def odds_ratio_loss(
        self,
        policy_chosen_logps: torch.FloatTensor,
        policy_rejected_logps: torch.FloatTensor,
    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
        """Compute ORPO's probability ratio (PR) loss for a batch of policy and reference model log probabilities.

        Args:
            policy_chosen_logps: Log probabilities of the policy model for the chosen responses. Shape: (batch_size,)
            policy_rejected_logps: Log probabilities of the policy model for the rejected responses. Shape: (batch_size,)

        Returns:
            A tuple of three tensors: (losses, chosen_rewards, rejected_rewards).
            The losses tensor contains the PR loss for each example in the batch.
            The chosen_rewards and rejected_rewards tensors contain the rewards for the chosen and rejected responses, respectively.
            The log probability ratio of the chosen responses over the rejected responses for logging purposes.
            The `log(sigmoid(log_prob_ratio))` for logging purposes.
        """

        # Compute the probability ratio
        log_prob_ratio = policy_chosen_logps - policy_rejected_logps
        ratio = F.logsigmoid(log_prob_ratio)
        losses = -self.beta * ratio  # Negative because we want to maximize the ratio

        # Compute rewards for logging
        chosen_rewards = self.beta * (policy_chosen_logps.to(self.accelerator.device)).detach()
        rejected_rewards = self.beta * (policy_rejected_logps.to(self.accelerator.device)).detach()

        return losses, chosen_rewards, rejected_rewards, torch.mean(ratio), torch.mean(log_prob_ratio)

In [19]:
gc.collect()
torch.cuda.empty_cache()

In [20]:
trainer = ProbabilityRatioTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    processing_class=tokenizer,
)
trainer.train()

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Step,Training Loss
10,2.0205
20,1.9499
30,1.9521
40,1.8908
50,1.8902
60,1.9568
70,1.8803
80,1.8871
90,1.8711
100,1.9178


TrainOutput(global_step=374, training_loss=1.870183730507917, metrics={'train_runtime': 4730.9388, 'train_samples_per_second': 2.536, 'train_steps_per_second': 0.079, 'total_flos': 0.0, 'train_loss': 1.870183730507917, 'epoch': 1.992})

In [21]:
calculate_win_rate(trainer.model, tokenizer, test_dataset, blender, device='cuda')

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Ranking candidates: 100%|██████████| 100/100 [00:39<00:00,  2.55it/s]

Win rate: 4.00%





0.04

In [22]:
gc.collect()
torch.cuda.empty_cache()

# ORPO + SFT

In [23]:
model = AutoModelForCausalLM.from_pretrained(model_name)

In [24]:
strt_texts = []
for dialog in test_dataset["chosen"]:
    strt_texts.append(tokenizer.apply_chat_template(dialog, tokenize=False))

prompts = test_dataset["prompt"]

In [25]:
def format_data(examples):
    # formatted_prompts = []
    # formatted_completions = []
    # print(examples)
    formatted_prompts = examples['prompt']
    formatted_completions = tokenizer.apply_chat_template(examples['chosen'], tokenize=False)
    return {"prompt": formatted_prompts, "completion": formatted_completions}

formatted_dataset = train_dataset.map(format_data, batched=True)
filtered_dataset = Dataset.from_dict({
    "prompt": formatted_dataset["prompt"],
    "completion": formatted_dataset["completion"]
})

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [26]:
training_args_sft = TrainingArguments(
    output_dir="./results_sft",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    learning_rate = 5e-5,
    logging_steps=10,
    report_to="none"
)

trainer = SFTTrainer(
    model=model,
    args=training_args_sft,
    train_dataset=filtered_dataset,
)

trainer.train()
trainer.save_model("./results_sft")

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/6000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/6000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/6000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/6000 [00:00<?, ? examples/s]



Step,Training Loss
10,13.1127
20,12.3502
30,12.3576
40,12.2307
50,11.9368
60,12.3902
70,12.0213
80,11.7952
90,11.8523
100,12.204


In [27]:
training_args = ORPOConfig(output_dir="./orpo",
                           learning_rate = 8e-6,
                           lr_scheduler_type="linear",
                           beta=0.1,
                           per_device_train_batch_size=1,
                           gradient_accumulation_steps=16,
                           num_train_epochs=1,
                           logging_steps=10,
                           warmup_steps=10,
                           report_to="none")

trainer = ORPOTrainer(model=model,
                      args=training_args,
                      processing_class=tokenizer,
                      train_dataset=train_dataset)

trainer.train()



Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Step,Training Loss
10,1.9135
20,1.8658
30,1.8908
40,1.854
50,1.853
60,1.9203
70,1.8654
80,1.8758
90,1.8784
100,1.9388


TrainOutput(global_step=187, training_loss=1.9111019022324507, metrics={'train_runtime': 2367.5016, 'train_samples_per_second': 2.534, 'train_steps_per_second': 0.079, 'total_flos': 0.0, 'train_loss': 1.9111019022324507, 'epoch': 0.9973333333333333})

In [28]:
calculate_win_rate(model, tokenizer, test_dataset, blender, device='cuda')

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Ranking candidates: 100%|██████████| 100/100 [00:39<00:00,  2.54it/s]

Win rate: 7.00%





0.07

In [29]:
gc.collect()
torch.cuda.empty_cache()

# OR c SFT

In [30]:
model = AutoModelForCausalLM.from_pretrained("./results_sft")

In [31]:
train_dataset

Dataset({
    features: ['prompt', 'prompt_id', 'chosen', 'rejected', 'score_chosen', 'score_rejected'],
    num_rows: 6000
})

In [32]:
class ORPOTrainerlor(ORPOTrainer):
   def get_batch_loss_metrics(
        self,
        model,
        batch: dict[str, Union[list, torch.LongTensor]],
        train_eval: Literal["train", "eval"] = "train",
    ):
        """Compute the OR loss and other metrics for the given batch of inputs for train or test."""
        metrics = {}

        forward_output = self.concatenated_forward(model, batch)
        (
            policy_chosen_logps,
            policy_rejected_logps,
            policy_chosen_logits,
            policy_rejected_logits,
            policy_nll_loss,
        ) = forward_output[:5]
        if self.aux_loss_enabled:
            aux_loss = forward_output[5]

        # Compute log odds ratio loss
        losses, chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen = self.odds_ratio_loss(
            policy_chosen_logps, policy_rejected_logps
        )
        loss = -losses.mean()

        reward_accuracies = (chosen_rewards > rejected_rewards).float()

        prefix = "eval_" if train_eval == "eval" else ""
        metrics[f"{prefix}rewards/chosen"] = self.accelerator.gather_for_metrics(chosen_rewards).mean()
        metrics[f"{prefix}rewards/rejected"] = self.accelerator.gather_for_metrics(rejected_rewards).mean()
        metrics[f"{prefix}rewards/accuracies"] = self.accelerator.gather_for_metrics(reward_accuracies).mean()
        metrics[f"{prefix}rewards/margins"] = self.accelerator.gather_for_metrics(
            chosen_rewards - rejected_rewards
        ).mean()
        metrics[f"{prefix}logps/rejected"] = self.accelerator.gather_for_metrics(policy_rejected_logps).detach().mean()
        metrics[f"{prefix}logps/chosen"] = self.accelerator.gather_for_metrics(policy_chosen_logps).detach().mean()
        metrics[f"{prefix}logits/rejected"] = (
            self.accelerator.gather_for_metrics(policy_rejected_logits).detach().mean()
        )
        metrics[f"{prefix}logits/chosen"] = self.accelerator.gather_for_metrics(policy_chosen_logits).detach().mean()
        metrics[f"{prefix}nll_loss"] = self.accelerator.gather_for_metrics(policy_nll_loss).detach().mean()
        metrics[f"{prefix}log_odds_ratio"] = self.accelerator.gather_for_metrics(log_odds_ratio).mean()
        metrics[f"{prefix}log_odds_chosen"] = self.accelerator.gather_for_metrics(log_odds_chosen).mean()
        
        # if is_torch_xla_available():
        #     xm.mark_step()  # needed because .item() calls
        
        for k, v in metrics.items():
            metrics[k] = v.item()
        
        if self.aux_loss_enabled:
            loss += self.aux_loss_coef * aux_loss

        return loss, metrics


In [33]:
training_args = ORPOConfig(output_dir="./orpo",
                           learning_rate = 8e-6,
                           lr_scheduler_type="linear",
                           beta=0.1,
                           per_device_train_batch_size=1,
                           gradient_accumulation_steps=16,
                           num_train_epochs=1,
                           logging_steps=10,
                           warmup_steps=10,
                           report_to="none")



trainer = ORPOTrainerlor(model=model,
                      args=training_args,
                      processing_class=tokenizer,
                      train_dataset=train_dataset)

# Дообучаем модель
trainer.train()

calculate_win_rate(model, tokenizer, test_dataset, blender, device='cuda')



Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Step,Training Loss
10,0.0752
20,0.0694
30,0.0752
40,0.0714
50,0.0688
60,0.0681
70,0.0689
80,0.0704
90,0.0708
100,0.0737


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Ranking candidates: 100%|██████████| 100/100 [00:39<00:00,  2.56it/s]

Win rate: 11.00%





0.11