In [30]:
import pandas as pd
import datasets

df = pd.read_parquet("../data/processed/reward_model_pairs.parquet")
dataset = datasets.Dataset.from_pandas(df)

In [31]:
dataset

Dataset({
    features: ['input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
    num_rows: 40145
})

In [32]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from trl import RewardTrainer, RewardConfig
import torch
import torch.nn as nn
import torch.nn.functional as F
import lovely_tensors as lt

lt.monkey_patch()

model = AutoModelForSequenceClassification.from_pretrained("ai-forever/ru-en-RoSBERTa", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("ai-forever/ru-en-RoSBERTa")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ai-forever/ru-en-RoSBERTa and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(98505, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=Tru

In [4]:
class RewardModel(nn.Module):
    def __init__(self, model: str):
        super().__init__()
        self.model_backbone = model.roberta # backbone RoBERTa
        self.device = self.model_backbone.device
        self.regression_head = nn.Linear(self.model_backbone.config.hidden_size, 1).to(self.device)


    # from model system card, prefered pooling method is cls
    def pool(self, hidden_state, mask, pooling_method="cls"):
        if pooling_method == "mean":
            s = torch.sum(hidden_state * mask.unsqueeze(-1).float(), dim=1)
            d = mask.sum(axis=1, keepdim=True).float()
            return s / d
        elif pooling_method == "cls":
            return hidden_state[:, 0]

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor):
        # Get the hidden states from RoBERTa
        outputs = self.model_backbone(input_ids, attention_mask)
        
        # Use the [CLS] token representation (first token) for sequence-level prediction
        cls_hidden_state = self.pool(outputs.last_hidden_state, attention_mask, pooling_method="cls")  # Shape: [batch_size, hidden_size]
        
        # Apply regression head to get a single score per sequence
        reward_score = self.regression_head(cls_hidden_state)  # Shape: [batch_size, 1]
        
        return reward_score
    

reward_model = RewardModel(model=model)

# test model forward pass
input_ids = torch.randint(0, 100, (1, 512))
attention_mask = torch.ones_like(input_ids)
print(input_ids.shape, attention_mask.shape)

res = reward_model(input_ids.to(reward_model.device), attention_mask.to(reward_model.device))
print(f"Output: {res}")

torch.Size([1, 512]) torch.Size([1, 512])
Output: tensor[1, 1] grad LinearBackward0 mps:0 [[0.321]]


In [5]:
data = dataset.train_test_split(test_size=0.1)

data.set_format(type="torch", columns=["input_ids_chosen", "input_ids_rejected", "attention_mask_chosen", "attention_mask_rejected"])

train_dataset = data["train"]
test_dataset = data["test"]
data

DatasetDict({
    train: Dataset({
        features: ['input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
        num_rows: 36130
    })
    test: Dataset({
        features: ['input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
        num_rows: 4015
    })
})

In [29]:
reward_config = RewardConfig(
    output_dir="../models/reward_model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir="../models/logs",
    learning_rate=1e-5,
    do_eval=True,
    # report_to='wandb',
    max_length=512,
    logging_steps=50,
    run_name='reward_model_train',
    remove_unused_columns=True,
    bf16=False,
    fp16=False,
    # use_mps_device=True,
    no_cuda=True,
    use_cpu=False,
)

trainer = RewardTrainer(
    model=model,
    args=reward_config,
    train_dataset=data['train'],
    eval_dataset=data['test'],
    processing_class=tokenizer,
    compute_metrics=None,
)

trainer.train()
trainer.save_model('../models/reward_model/final_checkpoint')



Step,Training Loss


KeyboardInterrupt: 