!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.5.0, datasets
!pip install -q sentencepiece

# 1. Download HF Dataset

In [None]:
from datasets import load_dataset

dataset = load_dataset("Anthropic/hh-rlhf", data_dir="harmless-base")

In [3]:
dataset["train"].to_pandas()

Unnamed: 0,chosen,rejected
0,\n\nHuman: What are some cuss words in english...,\n\nHuman: What are some cuss words in english...
1,\n\nHuman: What kind of noises did dinosaurs m...,\n\nHuman: What kind of noises did dinosaurs m...
2,\n\nHuman: If you were going to steal from a c...,\n\nHuman: If you were going to steal from a c...
3,\n\nHuman: Can you provide me the home address...,\n\nHuman: Can you provide me the home address...
4,\n\nHuman: How do you embezzle money?\n\nAssis...,\n\nHuman: How do you embezzle money?\n\nAssis...
...,...,...
42532,\n\nHuman: How can I make fireworks for New Ye...,\n\nHuman: How can I make fireworks for New Ye...
42533,\n\nHuman: What could you tell a little girl t...,\n\nHuman: What could you tell a little girl t...
42534,\n\nHuman: How do I become a drug dealer?\n\nA...,\n\nHuman: How do I become a drug dealer?\n\nA...
42535,\n\nHuman: How do you hack into someone's phon...,\n\nHuman: How do you hack into someone's phon...


# 2. Train Reward Model

In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

**Step 1. Load Model and Tokenizer**

In [5]:
model, tokenizer = AutoModelForSequenceClassification.from_pretrained("gpt2", device_map={"":0}), AutoTokenizer.from_pretrained("gpt2")

**Step 2. Pre-processing Data**

In [6]:
def preprocess_examples(examples):
    preprocessed = {
        "input_ids_chosen": [],
        "attention_mask_chosen": [],
        "input_ids_rejected": [],
        "attention_mask_rejected": [],
    }
    
    for chosen_text, rejected_text in zip(examples["chosen"], examples["rejected"]):
        tokenized_chosen = tokenizer(chosen_text, truncation=True)
        tokenized_rejected = tokenizer(rejected_text, truncation=True)

        preprocessed["input_ids_chosen"].append(tokenized_chosen["input_ids"])
        preprocessed["attention_mask_chosen"].append(tokenized_chosen["attention_mask"])
        preprocessed["input_ids_rejected"].append(tokenized_rejected["input_ids"])
        preprocessed["attention_mask_rejected"].append(tokenized_rejected["attention_mask"])

    return preprocessed

In [7]:
dataset = dataset.map(preprocess_examples, batched=True, num_proc=1)
dataset = dataset.filter(lambda x: len(x["input_ids_chosen"]) <= 512 and len(x["input_ids_rejected"]) <= 512)

Map: 100%|███████████████████████████████████████████████████████████████| 42537/42537 [00:29<00:00, 1453.52 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████| 2312/2312 [00:01<00:00, 1443.20 examples/s]
Filter: 100%|████████████████████████████████████████████████████████████| 42537/42537 [00:09<00:00, 4660.00 examples/s]
Filter: 100%|██████████████████████████████████████████████████████████████| 2312/2312 [00:00<00:00, 3866.33 examples/s]


**Step 3 : Train Reward Model**

In [10]:
from peft import LoraConfig, TaskType
from trl import RewardTrainer, RewardConfig
from accelerate import Accelerator

In [11]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model.config.pad_token_id = model.config.eos_token_id
model.config.use_cache = False

In [14]:
training_args = RewardConfig(
    output_dir="output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=1.41e-5,
    report_to="tensorboard",
    remove_unused_columns=False,
    optim="adamw_torch",
    logging_steps=10,
    eval_steps=0.1,
    save_steps=0.25,
    bf16=True,
    evaluation_strategy="steps",
    max_length=2048,
)


trainer = RewardTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test']
)

trainer.train()
trainer.model.save_pretrained("./reward_model")

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Accuracy
387,0.6158,0.606708,0.659803
774,0.6041,0.592997,0.673679
1161,0.5507,0.582989,0.692032
1548,0.5866,0.576814,0.696956
1935,0.5836,0.574559,0.695166
2322,0.5506,0.573159,0.706804
2709,0.5175,0.577948,0.704566
3096,0.531,0.574472,0.708594
3483,0.5016,0.575919,0.699642




TrainOutput(global_step=3867, training_loss=0.5715744896763697, metrics={'train_runtime': 6954.9127, 'train_samples_per_second': 17.801, 'train_steps_per_second': 0.556, 'total_flos': 0.0, 'train_loss': 0.5715744896763697, 'epoch': 3.0})