In [None]:
# 1. Imports
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import evaluate
import numpy as np
import torch
import torch.nn as nn
from datasets import load_dataset, Dataset
from peft import LoraConfig, TaskType, get_peft_model
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    PreTrainedTokenizerBase,
)
import pandas as pd

# Load the "accuracy" metric from the `evaluate` library once at module level.
# compute_metrics() further down calls accuracy.compute(...) with it.
accuracy = evaluate.load("accuracy")




In [None]:
# Base checkpoint to fine-tune and the name used for the training output directory.
model_name_split = "gpt2"
output_name = "gpt2_RM02"

# Initialize the tokenizer and reuse the EOS token as the padding token so that
# variable-length sequences can be padded into a batch.
tokenizer = AutoTokenizer.from_pretrained(model_name_split)
tokenizer.pad_token = tokenizer.eos_token

# LoRA adapter configuration for a sequence-classification task.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,  # adapters are going to be trained, not just loaded
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# Load the base model with a single output logit (num_labels=1), which is used
# as the scalar reward, then wrap it with the LoRA adapters.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_split, num_labels=1, torch_dtype=torch.bfloat16
)
model = get_peft_model(model, peft_config)

# Tell the model which token id is padding (same id as EOS, see above).
model.config.pad_token_id = tokenizer.eos_token_id
# Disable the key/value cache. The previous value `not False` evaluated to True,
# i.e. it *enabled* caching while the comment said the opposite. The cache is only
# useful for incremental generation, which this notebook never performs.
model.config.use_cache = False


In [None]:
# Load the preference data from CSV and wrap it in a `datasets.Dataset`.
# Later cells read the columns "question", "response_j" and "output_1".
train_df = pd.read_csv('datasets/rewardModelTrain.csv')
dataset = Dataset.from_pandas(train_df)

# Hold out 10% of the rows for validation. The split shuffles the rows, so a
# fixed seed is passed to get the same train/validation partition on every
# Restart & Run All; without it, evaluation numbers are not comparable across runs.
train_val_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_val_split['train']
val_dataset = train_val_split['test']

# Batched tokenization of (question, response_j, output_1) rows
def preprocess_function(examples):
    """Tokenize both responses of every row in a batch.

    Args:
        examples: a batch mapping column name -> list of values; the columns
            "question", "response_j" and "output_1" are read.

    Returns:
        A dict with four parallel lists: "input_ids_j" / "attention_mask_j"
        for the prompt built from "response_j", and "input_ids_k" /
        "attention_mask_k" for the prompt built from "output_1".
    """

    def _encode(question, response):
        # Responses are passed through str() before concatenation; the
        # question is concatenated as-is.
        prompt = "Question: " + question + "\n\nAnswer: " + str(response)
        return tokenizer(prompt, truncation=True)

    batch = {
        "input_ids_j": [],
        "attention_mask_j": [],
        "input_ids_k": [],
        "attention_mask_k": [],
    }
    rows = zip(examples["question"], examples["response_j"], examples["output_1"])
    for question, answer_j, answer_k in rows:
        encoded_j = _encode(question, answer_j)
        encoded_k = _encode(question, answer_k)

        batch["input_ids_j"].append(encoded_j["input_ids"])
        batch["attention_mask_j"].append(encoded_j["attention_mask"])
        batch["input_ids_k"].append(encoded_k["input_ids"])
        batch["attention_mask_k"].append(encoded_k["attention_mask"])

    return batch

# Apply preprocessing
num_proc = 24  # Adjust based on your machine's capability
MAX_PAIR_TOKENS = 512  # same limit the data collator is configured with


def _pair_fits(example):
    """Return True when both tokenized responses have at most MAX_PAIR_TOKENS tokens."""
    return (
        len(example["input_ids_j"]) <= MAX_PAIR_TOKENS
        and len(example["input_ids_k"]) <= MAX_PAIR_TOKENS
    )


# Always start from the raw split rather than from `train_dataset` itself, so
# that re-running this cell does not fail on already-removed raw columns.
raw_train_split = train_val_split['train']
train_dataset = raw_train_split.map(
    preprocess_function,
    batched=True,
    num_proc=num_proc,
    remove_columns=raw_train_split.column_names,
).filter(_pair_fits)

# The validation split gets the same length filter as the training split.
# Previously only the training data was filtered, so evaluation could include
# pairs longer than anything seen in training.
eval_dataset = val_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=num_proc,
    remove_columns=val_dataset.column_names,
).filter(_pair_fits)


In [None]:
@dataclass
class RewardDataCollatorWithPadding:
    """Collate paired (j, k) tokenized examples into one padded batch.

    Each incoming feature carries "input_ids_j" / "attention_mask_j" and
    "input_ids_k" / "attention_mask_k". The two sides are padded
    independently through ``tokenizer.pad`` and returned under the same four
    keys, plus a constant ``"return_loss": True`` entry.
    """

    # Tokenizer whose .pad() method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # The remaining fields are forwarded unchanged to tokenizer.pad().
    padding: Union[bool, str] = True
    max_length: Optional[int] = 512
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Pad the j-side and k-side of `features` and merge them into one dict."""
        pad_options = dict(
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )

        # Re-key each side to the plain names tokenizer.pad() is called with.
        side_j = [
            {"input_ids": item["input_ids_j"], "attention_mask": item["attention_mask_j"]}
            for item in features
        ]
        side_k = [
            {"input_ids": item["input_ids_k"], "attention_mask": item["attention_mask_k"]}
            for item in features
        ]

        padded_j = self.tokenizer.pad(side_j, **pad_options)
        padded_k = self.tokenizer.pad(side_k, **pad_options)

        return {
            "input_ids_j": padded_j["input_ids"],
            "attention_mask_j": padded_j["attention_mask"],
            "input_ids_k": padded_k["input_ids"],
            "attention_mask_k": padded_k["attention_mask"],
            "return_loss": True,
        }


In [None]:
class RewardTrainer(Trainer):
    """Trainer that optimizes a pairwise ranking loss over (j, k) input pairs."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the pairwise loss ``-logsigmoid(r_j - r_k)`` averaged over the batch.

        Args:
            model: the model being trained; called once per side of the pair.
            inputs: batch dict with "input_ids_j", "attention_mask_j",
                "input_ids_k" and "attention_mask_k".
            return_outputs: when True, also return both reward tensors.
            num_items_in_batch: accepted for compatibility with newer
                transformers releases, which pass this keyword to
                ``compute_loss``; it is not used by this loss.

        Returns:
            The scalar loss, or ``(loss, {"rewards_j": ..., "rewards_k": ...})``
            when ``return_outputs`` is True.
        """
        # [0] takes the first element of the model output, used here as the reward.
        rewards_j = model(input_ids=inputs["input_ids_j"], attention_mask=inputs["attention_mask_j"])[0]
        rewards_k = model(input_ids=inputs["input_ids_k"], attention_mask=inputs["attention_mask_k"])[0]
        # The loss decreases as rewards_j exceeds rewards_k by a larger margin.
        loss = -nn.functional.logsigmoid(rewards_j - rewards_k).mean()
        if return_outputs:
            return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k}
        return loss


In [None]:
def compute_metrics(eval_pred):
    """Report how often the first reward of each pair is the larger one.

    Args:
        eval_pred: a two-element (predictions, labels) pair. Only the
            predictions are used; they hold rewards_j and rewards_k.

    Returns:
        The result of ``accuracy.compute`` against an all-zeros reference.
    """
    reward_pairs, _unused_labels = eval_pred
    # argmax over the leading axis yields 0 wherever the first stacked entry
    # (rewards_j) is the maximum, so "all zeros" is the ideal outcome.
    winner_index = np.argmax(reward_pairs, axis=0)
    expected_index = np.zeros(winner_index.shape)
    return accuracy.compute(predictions=winner_index, references=expected_index)

In [None]:
training_args = TrainingArguments(
    # --- output location ---
    output_dir=output_name,
    # --- optimisation ---
    optim="adamw_hf",
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    weight_decay=0.001,
    num_train_epochs=3,  # Increase the number of epochs if needed
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    gradient_checkpointing=False,
    bf16=True,
    # --- evaluation and checkpointing, both every 500 steps ---
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_eval_batch_size=1,
    save_strategy="steps",
    save_steps=500,
    # --- logging ---
    logging_strategy="steps",
    logging_steps=10,
    # --- distributed settings ---
    deepspeed=None,
    local_rank=-1,
    # Keep the *_j / *_k dataset columns: the custom collator reads them.
    remove_unused_columns=False,
)

# Build the trainer around the custom pairwise loss, collator and metric.
trainer = RewardTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=RewardDataCollatorWithPadding(tokenizer=tokenizer, max_length=512),
    compute_metrics=compute_metrics,
)

# Run training; False is passed explicitly as resume_from_checkpoint.
trainer.train(resume_from_checkpoint=False)

# Persist the trained model in two locations.
model.save_pretrained(output_name + "_peft_last_checkpoint")
trainer.model.save_pretrained("reward_model")


In [None]:
from transformers import pipeline
from accelerate import Accelerator

# Build a text-classification pipeline around the trained reward model.
current_device = Accelerator().local_process_index
# NOTE(review): `model` is an already-instantiated object, so the
# `load_in_8bit` entry in model_kwargs presumably has no effect here — confirm.
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model=model,
    device_map={"": current_device},
    model_kwargs={"load_in_8bit": True},
    tokenizer=tokenizer,
    return_token_type_ids=False,
)

device = torch.device("cuda")

# Score one sample row. The text must use the same "Question: ...\n\nAnswer: ..."
# template the reward model was trained on; a bare question+answer concatenation
# is a format the model never saw. str() mirrors the coercion done in
# preprocessing and avoids a TypeError if a cell is not a string.
sample_index = 161
text = (
    "Question: "
    + str(train_df['question'].iloc[sample_index])
    + "\n\nAnswer: "
    + str(train_df['output_1'].iloc[sample_index])
)
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
input_ids = inputs["input_ids"].to(device)

# truncation=True keeps an over-long sample from exceeding the model's context.
pipe_outputs = sentiment_pipe(text, truncation=True)
# NOTE(review): 0.5 presumably assumes the pipeline squashes the single logit
# into (0, 1) — confirm against the pipeline's function_to_apply default.
reward_baseline = 0.5  # Example value, adjust as needed

rewards = [torch.tensor(output["score"] - reward_baseline) for output in pipe_outputs]
rewards
