In [1]:
PREF_TRAIN = "/kaggle/input/prefdataset/preference_train.csv"
PREF_TEST = "/kaggle/input/prefdataset/preference_test.csv"
OUTPUT_PATH = "/kaggle/working/"

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import BertTokenizer, BertModel
import pandas as pd
import os
from tqdm import tqdm

In [3]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
DEVICE

'cuda'

# Task A: Implementing RLHF

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class PreferenceDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=512):
        self.data = pd.read_csv(file_path)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        question = self.data.iloc[idx]['Question']
        more_pref = self.data.iloc[idx]['More_Prefered']
        less_pref = self.data.iloc[idx]['Less_Prefered']

        more_enc = self.tokenizer(question, more_pref, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        less_enc = self.tokenizer(question, less_pref, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')

        return {k: v.squeeze(0) for k, v in more_enc.items()}, {k: v.squeeze(0) for k, v in less_enc.items()}

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class RewardModel(nn.Module):
    def __init__(self):
        super(RewardModel, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.fc(outputs.pooler_output)
        return logits

In [7]:
def bradley_terry_loss(more_scores, less_scores):
    return -torch.mean(torch.log(torch.sigmoid(more_scores - less_scores)))

In [None]:
BATCH_SIZE = 8

def train_reward_model():
    # Load dataset
    train_dataset = PreferenceDataset(PREF_TRAIN, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    model = RewardModel().to(DEVICE)
    model = nn.DataParallel(model)  # Ensure multi-GPU usage
    optimizer = optim.AdamW(model.parameters(), lr=5e-5)

    model.train()
    for epoch in range(3): # Training for 3 epochs
        epoch_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")

        for batch in progress_bar:
            optimizer.zero_grad()

            more_enc, less_enc = batch
            more_ids, more_mask = more_enc["input_ids"].squeeze(1).to(DEVICE), more_enc["attention_mask"].to(DEVICE)
            less_ids, less_mask = less_enc["input_ids"].squeeze(1).to(DEVICE), less_enc["attention_mask"].to(DEVICE)

            more_scores = model(more_ids, more_mask)
            less_scores = model(less_ids, less_mask)

            loss = bradley_terry_loss(more_scores, less_scores)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        print(f"Epoch {epoch+1} completed, Average Loss: {epoch_loss / len(train_loader)}")

    rewardmodel_path = os.path.join(OUTPUT_PATH, "reward_model.pt")
    torch.save(model.state_dict(), rewardmodel_path)
    print("Model saved at", rewardmodel_path)

In [9]:
train_reward_model()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1:   0%|          | 0/3000 [00:00<?, ?it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 1:   0%|          | 2/3000 [00:03<1:19:53,  1.60s/it, loss=0.632]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 1:   0%|          | 3/3000 [00:04<1:06:29,  1.33s/it, loss=0.567]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 1:   0%|          | 4/3000 [00:05<1:00:04,  1.20s/it, loss=0.513]Be aware, overflowing tokens are not returned for the settin

Epoch 1 completed, Average Loss: 0.026975840042598207


Epoch 2:   0%|          | 0/3000 [00:00<?, ?it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2:   0%|          | 4/3000 [00:04<55:16,  1.11s/it, loss=8.33e-5]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2:   0%|          | 5/3000 [00:05<55:03,  1.10s/it, loss=0.000103]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2:   0%|          | 7/3000 [00:07<54:34,  1.09s/it, loss=3.23e-5]Be aware, overflowing tokens are not returned for the setti

Epoch 2 completed, Average Loss: 0.030998414226095755


Epoch 3:   0%|          | 4/3000 [00:04<54:54,  1.10s/it, loss=4.17e-7]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 3:   0%|          | 5/3000 [00:05<54:05,  1.08s/it, loss=2.07e-6]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 3:   0%|          | 6/3000 [00:06<54:08,  1.09s/it, loss=1.83e-6]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 3:   0%|          | 8/3000 [00:08<54:55,  1.10s/it, loss=0.000223]Be aware, overflowing tokens are not 

Epoch 3 completed, Average Loss: 0.41132305220928783
Model saved at /kaggle/working/reward_model.pt


## Using PPO to finetune model using the reward model

In [None]:
REWARD_MODEL_PATH = "/kaggle/input/reward_model/pytorch/default/1/reward_model.pt"
OUTPUT_MODEL_PATH = "/kaggle/working/rlhf_trained.pt"

In [None]:
%pip install trl==0.9.4

Collecting trl==0.9.4
  Downloading trl-0.9.4-py3-none-any.whl.metadata (11 kB)
Collecting tyro>=0.5.11 (from trl==0.9.4)
  Downloading tyro-0.9.16-py3-none-any.whl.metadata (9.4 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl==0.9.4)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Downloading trl-0.9.4-py3-none-any.whl (226 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.7/226.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tyro-0.9.16-py3-none-any.whl (117 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.2/117.2 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading shtab-1.7.1-py3-none-any.whl (14 kB)
Installing collected packages: shtab, tyro, trl
Successfully installed shtab-1.7.1 trl-0.9.4 tyro-0.9.16


In [None]:
from transformers import GPT2Tokenizer
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2-medium")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class PreferenceDatasetForPPO(Dataset):
    def __init__(self, file_path, tokenizer, max_length=512):
        self.data = pd.read_csv(file_path)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        question = self.data.iloc[idx]['Question']
        encodings = self.tokenizer(question, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return encodings["input_ids"].squeeze(0), encodings["attention_mask"].squeeze(0)

In [None]:
BATCH_SIZE = 2

def get_dataloader(file_path):
    dataset = PreferenceDatasetForPPO(file_path, tokenizer, max_length=256)
    return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True,  pin_memory=True)

In [9]:
policy_model = AutoModelForCausalLMWithValueHead.from_pretrained("openai-community/gpt2-medium").to(DEVICE)
reference_model = AutoModelForCausalLMWithValueHead.from_pretrained("openai-community/gpt2-medium").to(DEVICE)
reference_model.eval()

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

  state_dict = loading_func(filename if not use_safe else safe_filename, **load_kwargs)


AutoModelForCausalLMWithValueHead(
  (pretrained_model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 1024)
      (wpe): Embedding(1024, 1024)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-23): 24 x GPT2Block(
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2SdpaAttention(
            (c_attn): Conv1D(nf=3072, nx=1024)
            (c_proj): Conv1D(nf=1024, nx=1024)
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D(nf=4096, nx=1024)
            (c_proj): Conv1D(nf=1024, nx=4096)
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    )
    (

In [None]:
reward_model = RewardModel().to(DEVICE)
reward_model.load_state_dict(torch.load(REWARD_MODEL_PATH, map_location=DEVICE), strict=False)
reward_model.eval()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  reward_model.load_state_dict(torch.load(REWARD_MODEL_PATH, map_location=DEVICE), strict=False)


RewardModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [11]:
config = PPOConfig(
    model_name="openai-community/gpt2-medium",
    learning_rate=5e-6,
    batch_size=BATCH_SIZE,
    mini_batch_size=2,
)

ppo_trainer = PPOTrainer(
    config,
    policy_model,
    reference_model,
    tokenizer
)



In [None]:
def train_ppo():
    train_loader = get_dataloader(PREF_TRAIN)

    for epoch in range(1):
        print(f"Epoch {epoch+1}/3")
        progress_bar = tqdm(train_loader, desc=f"Training Epoch {epoch + 1}")
        for input_ids, attention_mask in progress_bar:
            input_ids, attention_mask = input_ids.to(DEVICE), attention_mask.to(DEVICE)

            responses = policy_model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=30)
            responses_text = [tokenizer.decode(r, skip_special_tokens=True) for r in responses]

            response_encodings = bert_tokenizer(responses_text, padding=True, truncation=True, max_length=512, return_tensors='pt')
            response_input_ids, response_attention_mask = response_encodings["input_ids"].to(DEVICE), response_encodings["attention_mask"].to(DEVICE)
            with torch.no_grad():
                logits = reward_model(response_input_ids, response_attention_mask)
                rewards = logits.squeeze(-1).cpu().numpy().tolist()

            queries_list = [q for q in input_ids]
            responses_list = [r[-30:] for r in responses]
            rewards_list = [torch.tensor(r).to(DEVICE) for r in rewards]

            loss = ppo_trainer.step(queries_list, responses_list, rewards_list)
            progress_bar.set_postfix(loss=loss.get('ppo/loss/total'))

    policy_model.save_pretrained(OUTPUT_MODEL_PATH)
    print("PPO-trained model saved at", OUTPUT_MODEL_PATH)

In [None]:
train_ppo()

Epoch 1/3


Training Epoch 1:   0%|          | 0/12000 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Training Epoch 1:   0%|          | 1/12000 [00:03<13:19:41,  4.00s/it, loss=1.29]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Training Epoch 1:   0%|          | 2/12000 [00:06<11:01:04,  3.31s/it, loss=0.483]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Training Epoch 1:   0%|          | 3/12000 [00:09<10:18:02,  3.09s/it, loss=0.733]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Training Epoch 1:   0%|          | 4/12000 [00:12<9:56:29,  2.98s/it, loss=0.364] Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Training Epoch 1:   0%|          | 5/12000 [00:15<9:45:18,  2.93s/it, loss=0.128]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Training Epoch 1:   0%|          | 6/12000 [00:18<9:42:51,  2.92s/it, loss=0.157]Setting `pad_token_i

PPO-trained model saved at /kaggle/working/rlhf_trained.pt


# Task B: Implementing DPO

In [None]:
from transformers import GPT2LMHeadModel

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2-medium")
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

In [None]:
class DPOModel(nn.Module):
    def __init__(self):
        super(DPOModel, self).__init__()
        self.gpt2 = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-medium").to(torch.float32)

    def forward(self, input_ids, attention_mask):
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        return outputs.logits

In [None]:
def dpo_loss(preferred_logits, less_preferred_logits, reference_preferred_logits, reference_less_preferred_logits, beta):
    # Compute log probabilities safely
    preferred_log_probs = torch.log_softmax(preferred_logits, dim=-1).sum(dim=-1).clamp(min=-1e9, max=1e9)
    less_preferred_log_probs = torch.log_softmax(less_preferred_logits, dim=-1).sum(dim=-1).clamp(min=-1e9, max=1e9)
    ref_preferred_log_probs = torch.log_softmax(reference_preferred_logits, dim=-1).sum(dim=-1).clamp(min=-1e9, max=1e9)
    ref_less_preferred_log_probs = torch.log_softmax(reference_less_preferred_logits, dim=-1).sum(dim=-1).clamp(min=-1e9, max=1e9)

    # Compute the DPO loss
    logit_diff = preferred_log_probs - less_preferred_log_probs - (ref_preferred_log_probs - ref_less_preferred_log_probs)

    # Prevent extreme values from causing NaN
    logit_diff = logit_diff.clamp(min=-10, max=10)

    return -torch.mean(torch.log(torch.sigmoid(beta * logit_diff) + 1e-9))

In [None]:
DPO_BATCH_SIZE = 2
BETA = 0.1

def train_dpo():
    train_dataset = PreferenceDataset(PREF_TRAIN, tokenizer, max_length=256)
    train_loader = DataLoader(train_dataset, batch_size=DPO_BATCH_SIZE, shuffle=True)

    model = DPOModel().to(DEVICE)
    reference_model = DPOModel().to(DEVICE)
    reference_model.eval()  # Freeze the reference model

    optimizer = optim.AdamW(model.parameters(), lr=5e-5)

    model.train()
    for epoch in range(3):  # Training for 3 epochs
        epoch_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/3")

        for batch in progress_bar:
            optimizer.zero_grad()

            more_enc, less_enc = batch
            more_ids, more_mask = more_enc["input_ids"].to(DEVICE), more_enc["attention_mask"].to(DEVICE)
            less_ids, less_mask = less_enc["input_ids"].to(DEVICE), less_enc["attention_mask"].to(DEVICE)

            more_logits = model(more_ids, more_mask)
            less_logits = model(less_ids, less_mask)
            ref_more_logits = reference_model(more_ids, more_mask).detach()
            ref_less_logits = reference_model(less_ids, less_mask).detach()

            loss = dpo_loss(more_logits, less_logits, ref_more_logits, ref_less_logits, BETA)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        print(f"Epoch {epoch+1}/3 completed, Average Loss: {epoch_loss / len(train_loader)}")

    dpo_path = os.path.join(OUTPUT_PATH, "dpo_trained.pt")
    torch.save(model.state_dict(), dpo_path)
    print("Model saved at", dpo_path)

In [15]:
train_dpo()

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Epoch 1/3: 100%|██████████| 12000/12000 [3:43:54<00:00,  1.12s/it, loss=1.07]


Epoch 1/3 completed, Average Loss: 0.8150493829300006


Epoch 2/3: 100%|██████████| 12000/12000 [3:43:48<00:00,  1.12s/it, loss=0.487]


Epoch 2/3 completed, Average Loss: 0.7396701404800018


Epoch 3/3: 100%|██████████| 12000/12000 [3:43:59<00:00,  1.12s/it, loss=0.823]


Epoch 3/3 completed, Average Loss: 0.7730784021044771
Model saved at /kaggle/working/dpo_trained.pt


# Task C: Performance Comparision

In [None]:
rlhf_model_path = "/kaggle/input/rlhf_trained/pytorch/default/1/rlhf_trained.pt"
dpo_model_path = "/kaggle/input/dpo_trained/pytorch/default/1/dpo_trained.pt"

In [None]:
%pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=bfd24515e5a9d5316974b489ecd6c4a7e857eaf03d270c7124f445c1a195b13c
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import time

In [5]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

In [6]:
rlhf_model = GPT2LMHeadModel.from_pretrained("gpt2-medium").to(DEVICE)
rlhf_model.load_state_dict(torch.load(rlhf_model_path, map_location=DEVICE))
rlhf_model.eval()

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

  rlhf_model.load_state_dict(torch.load(rlhf_model_path, map_location=DEVICE))


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=3072, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=1024)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=4096, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=4096)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [None]:
class DPOModelForEval(DPOModel):
    def __init__(self):
        super(DPOModelForEval, self).__init__()

    def generate(self, input_ids, **kwargs):
        """Ensure DPOModel can generate responses like GPT-2"""
        return self.gpt2.generate(input_ids, **kwargs)

# 1. Recreate the DPOModel architecture
dpo_model = DPOModelForEval().to(DEVICE)

# 2. Load the trained weights
state_dict = torch.load(dpo_model_path, map_location=DEVICE)
dpo_model.load_state_dict(state_dict)

# 3. Move model to GPU (if available)
dpo_model.eval()

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

  state_dict = torch.load(dpo_model_path, map_location=DEVICE)


DPOModel(
  (gpt2): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 1024)
      (wpe): Embedding(1024, 1024)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-23): 24 x GPT2Block(
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2SdpaAttention(
            (c_attn): Conv1D(nf=3072, nx=1024)
            (c_proj): Conv1D(nf=1024, nx=1024)
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D(nf=4096, nx=1024)
            (c_proj): Conv1D(nf=1024, nx=4096)
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=1024, ou

In [8]:
test_df = pd.read_csv(PREF_TEST)
questions = test_df["Question"].tolist()
references = test_df["More_Prefered"].tolist()

In [9]:
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

In [10]:
print(len(questions))

6000


In [None]:
def generate_response(model, question):
    """Generate a response from a model given a question"""
    input_ids = tokenizer(question, return_tensors="pt").input_ids.to(DEVICE)
    with torch.no_grad():
        output = model.generate(input_ids, max_new_tokens=30)
    return tokenizer.decode(output[0], skip_special_tokens=True)

def evaluate_models():
    rlhf_bleu_scores, dpo_bleu_scores = [], []
    rlhf_rouge_scores, dpo_rouge_scores = [], []

    rlhf_time, dpo_time = 0, 0  # Track computation cost

    for question, reference in tqdm(zip(questions, references), total=len(questions), desc="Evaluating Models"):
        # RLHF model inference
        start_time = time.time()
        rlhf_response = generate_response(rlhf_model, question)
        rlhf_time += time.time() - start_time

        # DPO model inference
        start_time = time.time()
        dpo_response = generate_response(dpo_model, question)
        dpo_time += time.time() - start_time

        # BLEU score
        reference_tokens = reference.split()
        rlhf_bleu_scores.append(sentence_bleu([reference_tokens], rlhf_response.split()))
        dpo_bleu_scores.append(sentence_bleu([reference_tokens], dpo_response.split()))

        # ROUGE score
        rlhf_rouge_scores.append(scorer.score(reference, rlhf_response))
        dpo_rouge_scores.append(scorer.score(reference, dpo_response))

    # Compute average BLEU and ROUGE scores
    avg_rlhf_bleu = sum(rlhf_bleu_scores) / len(rlhf_bleu_scores)
    avg_dpo_bleu = sum(dpo_bleu_scores) / len(dpo_bleu_scores)

    avg_rlhf_rouge = {k: sum(d[k].fmeasure for d in rlhf_rouge_scores) / len(rlhf_rouge_scores) for k in rlhf_rouge_scores[0]}
    avg_dpo_rouge = {k: sum(d[k].fmeasure for d in dpo_rouge_scores) / len(dpo_rouge_scores) for k in dpo_rouge_scores[0]}

    avg_rlhf_time = rlhf_time / len(questions)
    avg_dpo_time = dpo_time / len(questions)

    # Print results
    print("\n🔹 Performance Comparison 🔹")
    print(f"Sample Efficiency (Avg Inference Time per Question)")
    print(f"  RLHF: {avg_rlhf_time:.4f} sec | DPO: {avg_dpo_time:.4f} sec")
    print("\nResponse Quality (Higher is better)")
    print(f"  BLEU Score  - RLHF: {avg_rlhf_bleu:.4f} | DPO: {avg_dpo_bleu:.4f}")
    print(f"  ROUGE Scores:")
    print(f"    ROUGE-1   - RLHF: {avg_rlhf_rouge['rouge1']:.4f} | DPO: {avg_dpo_rouge['rouge1']:.4f}")
    print(f"    ROUGE-2   - RLHF: {avg_rlhf_rouge['rouge2']:.4f} | DPO: {avg_dpo_rouge['rouge2']:.4f}")
    print(f"    ROUGE-L   - RLHF: {avg_rlhf_rouge['rougeL']:.4f} | DPO: {avg_dpo_rouge['rougeL']:.4f}")

In [12]:
evaluate_models()

Evaluating Models:   0%|          | 0/6000 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Evaluating Models:   0%|          | 1/6000 [00:01<2:49:25,  1.69s/it]The attention mask and the p


🔹 Performance Comparison 🔹
Sample Efficiency (Avg Inference Time per Question)
  RLHF: 0.4513 sec | DPO: 0.4511 sec

Response Quality (Higher is better)
  BLEU Score  - RLHF: 0.0880 | DPO: 0.0880
  ROUGE Scores:
    ROUGE-1   - RLHF: 0.1300 | DPO: 0.1293
    ROUGE-2   - RLHF: 0.0280 | DPO: 0.0279
    ROUGE-L   - RLHF: 0.0856 | DPO: 0.0852



