In [1]:
import os
os.chdir('..')

In [2]:
import torch
from torch.utils.data import DataLoader
from datasets import load_from_disk
from transformers import AutoTokenizer
import pandas as pd
from tqdm.auto import tqdm
import textwrap

from model import RewardModel
from utils import collate_fn

DATA_DIR = "processed_data"          
MODEL_CHECKPOINT = "reward_model/best_reward_model.pt" 
BASE_MODEL = "gpt2"                       
BATCH_SIZE = 16
DEVICE = torch.device("cuda:0")

print(f"Running on device: {DEVICE}")

# Load Tokenizer 
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load Dataset
print("Loading dataset...")
ds = load_from_disk(DATA_DIR)
val_loader = DataLoader(ds["validation"], batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# Load Model
print("Loading model...")
model = RewardModel(BASE_MODEL)
# Load the state dict (weights)
state_dict = torch.load(MODEL_CHECKPOINT, map_location=DEVICE)
model.load_state_dict(state_dict)
model.to(DEVICE)
model.eval()
print("Model loaded successfully.")

ModuleNotFoundError: No module named 'model'

In [None]:
correct_count = 0
total_count = 0
error_examples = []

print("Evaluating...")

with torch.no_grad():
    for batch_idx, (chosen_ids, chosen_mask, rejected_ids, rejected_mask) in enumerate(tqdm(val_loader)):
        
        # Move to device
        chosen_ids, chosen_mask = chosen_ids.to(DEVICE), chosen_mask.to(DEVICE)
        rejected_ids, rejected_mask = rejected_ids.to(DEVICE), rejected_mask.to(DEVICE)
        
        # Forward pass (get rewards)
        input_ids = torch.cat([chosen_ids, rejected_ids], dim=0)
        att_mask = torch.cat([chosen_mask, rejected_mask], dim=0)
        
        rewards = model(input_ids, att_mask)
        r_chosen, r_rejected = rewards.chunk(2)
        
        # Calculate accuracy for this batch
        current_correct = (r_chosen > r_rejected)
        correct_count += current_correct.sum().item()
        total_count += r_chosen.size(0)
        
        # Identify Failures (Where Rejected >= Chosen)
        # We get the indices within the batch where the model was wrong
        failure_indices = (current_correct == False).nonzero(as_tuple=True)[0]
        
        # Collect Error Examples: text + scores
        for idx in failure_indices:
            if len(error_examples) < 50: 
                
                # Decode the text
                c_text = tokenizer.decode(chosen_ids[idx], skip_special_tokens=True)
                r_text = tokenizer.decode(rejected_ids[idx], skip_special_tokens=True)
                
                score_diff = r_chosen[idx].item() - r_rejected[idx].item()
                
                error_examples.append({
                    "batch_index": batch_idx,
                    "chosen_reward": r_chosen[idx].item(),
                    "rejected_reward": r_rejected[idx].item(),
                    "score_diff": score_diff,
                    "chosen_text": c_text,
                    "rejected_text": r_text
                })

# Calculate final metrics
final_accuracy = correct_count / total_count
print(f"\nResults:")
print(f"Total Samples: {total_count}")
print(f"Correct Predictions: {correct_count}")
print(f"Validation Accuracy: {final_accuracy:.4f}")

Evaluating...


  0%|          | 0/125 [00:00<?, ?it/s]


Results:
Total Samples: 2000
Correct Predictions: 1208
Validation Accuracy: 0.6040


In [None]:
# sort error examples by score difference (most negative first)
error_examples.sort(key=lambda x: x['score_diff'])

print(f"Displaying {len(error_examples)} Error Cases (Model preferred Rejected over Chosen)\n")
print("-" * 100)

# Convert to Pandas for easier viewing if you prefer, or print loop
# We will print the first 20 as requested
for i, example in enumerate(error_examples[:25]):
    print(f"ERROR EXAMPLE #{i+1}")
    print(f"Model Scores: Chosen={example['chosen_reward']:.4f} | Rejected={example['rejected_reward']:.4f}")
    print(f"Margin: {example['score_diff']:.4f}")
    
    print("\n--- CHOSEN TEXT (Ground Truth Better) ---")
    # Textwrap makes it readable in notebooks
    print(textwrap.fill(example['chosen_text'][-500:], width=100)) 
    print("...(truncated start)")
    
    print("\n--- REJECTED TEXT (Model Preferred) ---")
    print(textwrap.fill(example['rejected_text'][-500:], width=100))
    print("...(truncated start)")
    
    print("-" * 100)
    print("\n")


Displaying 50 Error Cases (Model preferred Rejected over Chosen)

----------------------------------------------------------------------------------------------------
ERROR EXAMPLE #1
Model Scores: Chosen=-0.6403 | Rejected=3.0663
Margin: -3.7066

--- CHOSEN TEXT (Ground Truth Better) ---
d for possible seizures.    Preventative care for feline epilepsy includes ensuring a healthy diet,
minimization of stress, avoiding toxins, and limiting hormonal influences such as spaying or
neutering.  In some cases, a special diet may also be recommended for affected animals.    Seizures
in cats typically manifest as sudden loss of consciousness, twitching,  Human: Thanks for all the
information.  Assistant: Of course, I am always happy to help!  Is there anything else I can assist
you with?
...(truncated start)

--- REJECTED TEXT (Model Preferred) ---
 Seizures in cats typically manifest as sudden loss of consciousness, twitching,  Human: Thanks for
all the information.  Assistant: You are very w

### Summary:
Looking closely at the failure cases where the model strongly favored the rejected response, three key issues stand out that help explain why the reward model only achieves 60.4% validation accuracy. First, the model has a clear verbosity bias: it tends to favor longer, more complex responses over short, correct answers—even when the longer text is rambling or off-topic (see Error #1, #2). Second, the model struggles with refusal and uncertainty: it often penalizes the assistant for asking clarifying questions or admitting it doesn’t know something, instead favoring confident but incorrect statements (Error #4, #7). Most concerning, we found notable safety regressions: the model sometimes gave higher rewards to responses that followed or validated harmful instructions, like “filing down teeth” or “methods of torture,” while penalizing safe, correct refusals (Error #5, #23).