# **End-to-End Reinforcement Learning with Human Feedback: Reward Modeling and PPO Training**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive



## **Load Libraries and configuration of the models**


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, get_linear_schedule_with_warmup, pipeline
from trl import (
    AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer,
    RewardConfig, RewardTrainer, setup_chat_format, ModelConfig
)
from datasets import load_dataset
from sklearn.metrics import accuracy_score

In [4]:
# Set model name and device (use GPU if available)
model_name = "gpt2"  # Base model name
# Use CUDA if available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model configuration
model_config = ModelConfig(model_name_or_path=model_name)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_config.model_name_or_path,
    trust_remote_code=model_config.trust_remote_code,
    use_fast=True
)
# Ensure EOS token is used for padding
tokenizer.pad_token = tokenizer.eos_token

# Load models for different purposes: Classification, Reward, Value, Policy
model = AutoModelForSequenceClassification.from_pretrained(
    model_config.model_name_or_path, num_labels=1, trust_remote_code=model_config.trust_remote_code
)
# Reward model for reinforcement learning
reward_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device)
# Value model for estimating state values
value_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device)
# Policy model for generating actions
policy_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
# Reference policy model for comparison
ref_policy_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

## **Data Preparation**

In [5]:
# Load the ultrafeedback_binarized dataset from the Hugging Face Hub
dataset = load_dataset("trl-lib/ultrafeedback_binarized")

# Define a function to tokenize and preprocess the dataset
def tokenize_function(examples):
    """
    Tokenizes and preprocesses the input examples.

    Args:
        examples: A dictionary of examples containing the "chosen" key.

    Returns:
        A dictionary containing the tokenized input IDs, attention mask, etc.
    """
    return tokenizer(
        [str(x) for x in examples["chosen"]],
        truncation=True,  # Truncate sequences longer than max_length
        padding="max_length",  # Pad shorter sequences to max_length
        max_length=min(tokenizer.model_max_length, 200),  # Set max sequence length (reduced for CUDA memory)
        return_tensors="pt"  # Return PyTorch tensors
    )

# Apply the tokenize_function to the dataset using the `map` function
tokenized_dataset = dataset.map(
    tokenize_function,  # The function to apply
    batched=True,  # Process examples in batches for efficiency
    num_proc=4,  # Use 4 processes for parallel processing
)

README.md:   0%|          | 0.00/643 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/131M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/2.14M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/62135 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/62135 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
print(dataset) # Shows the features of the original dataset

DatasetDict({
    train: Dataset({
        features: ['chosen', 'rejected', 'score_chosen', 'score_rejected'],
        num_rows: 62135
    })
    test: Dataset({
        features: ['chosen', 'rejected', 'score_chosen', 'score_rejected'],
        num_rows: 1000
    })
})


In [None]:
print(tokenized_dataset)  # Shows the features of the tokenized dataset

DatasetDict({
    train: Dataset({
        features: ['chosen', 'rejected', 'score_chosen', 'score_rejected', 'input_ids', 'attention_mask'],
        num_rows: 62135
    })
    test: Dataset({
        features: ['chosen', 'rejected', 'score_chosen', 'score_rejected', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})


In [6]:
# Save the tokenizer
tokenizer.save_pretrained("drive/MyDrive/M2 D3S/Math of DL/Project/tokenizer")

('drive/MyDrive/M2 D3S/Math of DL/Project/tokenizer/tokenizer_config.json',
 'drive/MyDrive/M2 D3S/Math of DL/Project/tokenizer/special_tokens_map.json',
 'drive/MyDrive/M2 D3S/Math of DL/Project/tokenizer/vocab.json',
 'drive/MyDrive/M2 D3S/Math of DL/Project/tokenizer/merges.txt',
 'drive/MyDrive/M2 D3S/Math of DL/Project/tokenizer/added_tokens.json',
 'drive/MyDrive/M2 D3S/Math of DL/Project/tokenizer/tokenizer.json')

## **Train the Reward Model**

In [None]:
# Set up chat format for the tokenizer and model if it's not already configured
if tokenizer.chat_template is None:
    # This ensures the tokenizer and model are compatible for chat-style interactions
    model, tokenizer = setup_chat_format(model, tokenizer)

# Configure training arguments for the reward model using RewardConfig
reward_config = RewardConfig(
    output_dir="drive/MyDrive/M2 D3S/Math of DL/Project",
    per_device_train_batch_size=8,
    num_train_epochs=1,
    logging_steps=25,  # Log training progress every 25 steps
    eval_strategy="steps",  # Evaluation strategy: evaluate every 'eval_steps'
    eval_steps=50,  # Evaluate every 50 steps
    remove_unused_columns=False,
    dataset_num_proc=4,
    report_to="none"  # Disable reporting to external tracking services
)

# Define a function to evaluate the model after the training
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    accuracy = accuracy_score(labels, preds)
    return {"accuracy": accuracy}

# Initialize the RewardTrainer with the configuration, model, tokenizer, and datasets
trainer = RewardTrainer(
    args=reward_config,  # Training arguments
    model=model,  # The model to train
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset['test'],
    compute_metrics=compute_metrics,
)

# Start training the reward model
print("Training the reward model...")
trainer.train()
metrics = trainer.evaluate()

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  trainer = RewardTrainer(


Map:   0%|          | 0/62135 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/62135 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1063 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1942 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1096 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2164 > 1024). Running this sequence through the model will result in indexing errors


Filter (num_proc=4):   0%|          | 0/62135 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1366 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2495 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1100 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1187 > 1024). Running this sequence through the model will result in indexing errors


Filter (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Training the reward model...


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
50,0.724,0.693147,0.49697
100,0.7136,0.693147,0.5
150,0.7105,0.693147,0.493631
200,0.7018,0.693147,0.444079
250,0.6931,0.693147,0.524752
300,0.7005,0.693147,0.52669
350,0.7084,0.693147,0.457338
400,0.6876,0.693147,0.512111
450,0.7039,0.693147,0.470803
500,0.6919,0.693147,0.516892








































































































































Step,Training Loss,Validation Loss,Accuracy
50,0.724,0.693147,0.49697
100,0.7136,0.693147,0.5
150,0.7105,0.693147,0.493631
200,0.7018,0.693147,0.444079
250,0.6931,0.693147,0.524752
300,0.7005,0.693147,0.52669
350,0.7084,0.693147,0.457338
400,0.6876,0.693147,0.512111
450,0.7039,0.693147,0.470803
500,0.6919,0.693147,0.516892




































TrainOutput(global_step=4214, training_loss=0.6972300432618896, metrics={'train_runtime': 8527.7458, 'train_samples_per_second': 3.953, 'train_steps_per_second': 0.494, 'total_flos': 0.0, 'train_loss': 0.6972300432618896, 'epoch': 1.0})

In [None]:
# Model evaluation
trainer.log_metrics("eval", metrics)

***** eval metrics *****
  eval_accuracy               =     0.7473
  eval_loss                   =     0.6931
  eval_model_preparation_time =     0.0039
  eval_runtime                = 0:00:30.35
  eval_samples_per_second     =     18.121
  eval_steps_per_second       =      2.273


In [None]:
# Save the model
reward_model.save_pretrained("drive/MyDrive/M2 D3S/Math of DL/Project/reward_model")

**Reward Model Performance**

The reward model achieved the following evaluation results:

- **`eval_loss`**: **0.6931**  
  This represents the cross-entropy loss during evaluation. A lower loss indicates better performance, but the ideal value depends on the specific task and dataset.

- **`eval_accuracy`**: **0.7473**  
  The model correctly classified **74.73%** of the evaluation samples. While this is a good starting point, there's still room for improvement.

- **`eval_runtime`**, **`eval_samples_per_second`**, **`eval_steps_per_second`**  
  These metrics relate to the **efficiency** of the evaluation process and are not directly tied to the model's performance.

---
## Some possible Improvements

- **Increasing the Number of Epochs**: Improves accuracy but comes at the cost of longer training times.
- **Fine-Tuning hyperparameters**: Adjust PPO parameters iteratively while monitoring performance.
- **Augmenting Training Data**: Effective only if the data is relevant, clean, and free of bias.
- **Modifying Reward Functions**: Significantly affects training outcomes and must align with the **downstream application** of your model (e.g., chatbot).


## **Train the Policy Model with PPO**

In [None]:
# Configure PPO training parameters using PPOConfig
ppo_config = PPOConfig(
    num_train_epochs=1,
    gradient_accumulation_steps=2,
    batch_size=1,
    mini_batch_size=1,  # Mini-batch size for gradient updates within each batch
    learning_rate=1.41e-5, #This lr will be updated through the learning
    output_dir="drive/MyDrive/M2 D3S/Math of DL/Project/PPO_results",
    eval_strategy="steps",  # Evaluate every 'eval_steps'
    eval_steps=50,
    report_to="none"  # Disable reporting to external tracking services
)

# Select 10% of the original training dataset for faster training and apply
# tokenization to the training and evaluation dataset
train_dataset = tokenized_dataset["train"].select(range(int(len(tokenized_dataset["train"]) * 0.1)))
train_dataset = train_dataset.with_format("torch", columns=['input_ids', 'attention_mask'])
eval_dataset = tokenized_dataset["test"].map(tokenize_function, batched=True, num_proc=4)
eval_dataset = eval_dataset.with_format("torch", columns=['input_ids', 'attention_mask'])

# Initialize the PPOTrainer with the configuration, models, and datasets
trainer = PPOTrainer(
    ppo_config,
    reward_model=reward_model,  # Reward model for calculating rewards
    tokenizer=tokenizer,
    policy=policy_model,  # Policy model to be optimized
    ref_policy=ref_policy_model,  # Reference policy model for comparison
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    value_model=value_model,  # Value model for estimating state values
)

# Create a learning rate scheduler to optimize the lr
num_update_steps_per_epoch = len(trainer.train_dataset) // ppo_config.gradient_accumulation_steps
num_training_steps = ppo_config.num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=trainer.optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Override the trainer's learning rate scheduler
trainer.lr_scheduler = lr_scheduler

# Start PPO training
trainer.train()

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

  return func(*args, **kwargs)
  return func(*args, **kwargs)
  return func(*args, **kwargs)


===training policy===


Step,Training Loss,Validation Loss


In [None]:
# Save the optimized PPO model
policy_model.save_pretrained("drive/MyDrive/M2 D3S/Math of DL/Project/ppo_optimized")