In [1]:
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments
from datasets import load_dataset
import torch
import numpy as np
from trl import RewardTrainer, RewardConfig
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
import seaborn as sns
import os

import warnings
# Silence every Python warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Process-wide environment switches, applied in one go:
#   CUDA_VISIBLE_DEVICES   -> only device index 0 is exposed to CUDA
#   TOKENIZERS_PARALLELISM -> set to "false" for the tokenizers library
# NOTE(review): this runs after `import torch`; the CUDA restriction only
# applies if CUDA has not been initialised yet -- confirm for your setup.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",
    "TOKENIZERS_PARALLELISM": "false",
})

In [None]:
# Hugging Face Hub id of the base checkpoint the reward model starts from.
REPO = "HuggingFaceTB/SmolLM2-135M-Instruct"

# Download the repository snapshot into the local ./model directory.
snapshot_download(
    local_dir="./model",
    repo_id=REPO,
)

In [2]:
from data_preprocess import process_pairs_dataset
from config import MODEL_PATH, DATA_PATH, SEED, SCALAR_REWARD_MODEL_PATH

In [3]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [5]:
# Load the tokenizer first: the model's padding id below must be the id the
# tokenizer actually pads batches with.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=True)
if tokenizer.pad_token is None:
    # No dedicated pad token in the checkpoint: reuse EOS, which is what the
    # original code assumed when it copied eos_token_id into the config.
    tokenizer.pad_token = tokenizer.eos_token

# Base checkpoint with a single-logit classification head (num_labels=1),
# i.e. one scalar reward per sequence. The head is not in the checkpoint and
# is freshly initialised (see the warning printed by this cell).
reward_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=1,
    local_files_only=True,
    device_map=device,
)

# The sequence-classification head reads the score at the last non-padding
# token, which it finds through config.pad_token_id. Take that id from the
# tokenizer (not from the model's own eos_token_id) so the two always agree.
reward_model.config.pad_token_id = tokenizer.pad_token_id

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ./model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Read both splits first, then run each through the project's pairwise
# preprocessing helper (train before validation, as before).
raw_splits = {
    split_name: load_dataset(DATA_PATH, split=split_name)
    for split_name in ("train", "validation")
}

train_data = process_pairs_dataset(raw_splits["train"])
val_data = process_pairs_dataset(raw_splits["validation"])

In [7]:
# Hyperparameters for the reward-model fine-tuning run.
training_args = RewardConfig(
    output_dir="./reward_model",
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    learning_rate=5e-5,
    num_train_epochs=1,
    # Logging, evaluation and checkpointing share one 300-step cadence, so
    # every evaluation has a matching checkpoint for load_best_model_at_end.
    logging_strategy="steps",
    logging_steps=300,
    eval_strategy="steps",
    eval_steps=300,
    save_steps=300,
    # fp16 mixed precision needs an accelerator. `device` may have fallen
    # back to "cpu", where an unconditional fp16=True makes the config raise.
    fp16=(device == "cuda"),
    # Use the project-wide seed from config instead of the library default,
    # so shuffling and dropout are reproducible across runs.
    seed=SEED,
    # After training, reload the checkpoint with the highest eval accuracy.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    report_to="none",
)

# Wire the model, tokenizer, config and both dataset splits into the
# TRL reward trainer.
trainer = RewardTrainer(
    args=training_args,
    model=reward_model,
    processing_class=tokenizer,  # used by the trainer to prepare batches
    train_dataset=train_data,
    eval_dataset=val_data,
)

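Для обучения используется стандартная функция потерь попарного ранжирования: $$\mathcal{L}_{RM} = -\log \sigma\left(r_{\theta}(x, y_w) - r_{\theta}(x, y_l)\right)$$ где $r_{\theta}(x, y)$ — скалярная оценка модели вознаграждения для ответа $y$ на запрос $x$, $y_w$ — предпочтённый ответ, а $y_l$ — отклонённый.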

In [8]:
# Run the fine-tuning loop; keep the returned TrainOutput and show it as
# the cell result.
train_output = trainer.train()
train_output

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
300,0.7542,0.752164,0.59403
600,0.7238,0.688678,0.607784
900,0.6884,0.676795,0.634731
1200,0.6547,0.67341,0.616766


TrainOutput(global_step=1204, training_loss=0.7051763892965855, metrics={'train_runtime': 532.9282, 'train_samples_per_second': 13.555, 'train_steps_per_second': 2.259, 'total_flos': 0.0, 'train_loss': 0.7051763892965855, 'epoch': 1.0})

In [9]:
# Persist the trained reward model to the path configured for the scalar
# reward model.
trainer.save_model(output_dir=SCALAR_REWARD_MODEL_PATH)