In [1]:
'''
Preference / Reward model
Here it learns to reward the preferred (chosen) response over the rejected one.

** Dataset **

Prompt : Why is the color of the sky blue?
Chosen : It is because blue wavelengths are scattered by air molecules
Rejected : Because the sky likes the color blue


The reward model is a linear layer on top of the language model that learns to output whether a response is good or not.

Input1 to model : {Prompt + Chosen}   ,  Output1 : 1
Input2 to model : {Prompt + Rejected} ,  Output2 : 0


This way the model learns to pick out a good response!

This is the same logic that is used inside the PPO loop, so there is no separate training step for a PPO model: this head is used directly as the value head.
'''



# Single runnable cell — minimal, targeted fixes only (copy-paste)
import os
import torch
import torch.nn as nn
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoConfig,
    PreTrainedModel,
)
from transformers.modeling_outputs import SequenceClassifierOutput
from trl import RewardConfig, RewardTrainer, PreTrainedModelWrapper
from dotenv import load_dotenv
load_dotenv()

# ------------------ USER-CHOICE (preserved) ------------------
# Hugging Face model id used for the tokenizer and for the model loads in this notebook.
BASE_MODEL = "google/gemma-3-270m-it"
import os 
# os.path.abspath("") resolves to the current working directory, so FILE_PATH is the
# PARENT of the working directory; it is passed as `cache_dir` to the loaders below.
FILE_PATH = os.path.dirname(os.path.abspath(""))
# keep your FILE_PATH variable as-is in your environment
# if it's not defined, set a sensible default (uncomment next line)
# FILE_PATH = "./models"
# -------------------------------------------------------------

# 1) tokenizer (intended as RewardTrainer's processing_class; no RewardTrainer is
#    constructed in the visible cells)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, cache_dir=FILE_PATH)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Padding is appended after the real tokens of each sequence.
tokenizer.padding_side = "right"

# Reward model (Trained model )
class RewardModel(PreTrainedModel):
    """Frozen causal-LM backbone with a trainable scalar reward head.

    The backbone is loaded with ``AutoModelForCausalLM`` and all of its parameters
    are frozen. Its ``lm_head`` is then replaced by ``nn.Linear(hidden, 1)``, which
    is the only trainable part and is also exposed as ``self.classifier``.

    Keyword Args:
        model (str): model id / path to load. Defaults to ``config.name_or_path``
            and finally to ``BASE_MODEL``.
        num_labels (int): stored on the instance; the head itself always emits
            one scalar per sequence.
    """

    def __init__(self, config, **kwargs):
        super().__init__(config)
        # preserve your supplied model name if provided; else fall back to config
        self.model_name = kwargs.get("model", getattr(config, "name_or_path", BASE_MODEL))
        # reward head => single scalar
        self.num_labels = kwargs.get("num_labels", getattr(config, "num_labels", 1))

        # load frozen LM backbone
        self.lm_model = AutoModelForCausalLM.from_pretrained(self.model_name, cache_dir=FILE_PATH)
        for p in self.lm_model.parameters():
            p.requires_grad = False

        # replace lm_head with scalar reward head — do NOT pass device here
        in_features = self.lm_model.lm_head.in_features
        self.lm_model.lm_head = nn.Linear(in_features, 1)
        # Same module object under a second name; re-enable grads on it only.
        self.classifier = self.lm_model.lm_head
        for p in self.classifier.parameters():
            p.requires_grad = True

    def forward(self, input_ids=None, attention_mask=None, return_dict: bool = True, **kwargs):
        """Score each sequence with one scalar.

        Args:
            input_ids: token ids, ``[B, L]``.
            attention_mask: optional ``[B, L]`` mask (1 = real token, 0 = padding).
                When given, the hidden state of the LAST ATTENDED token of every
                row is pooled, so padded batches (left- or right-padded) are not
                scored from a padding position. When omitted, position ``-1`` is
                used, as before.
            return_dict: return a ``SequenceClassifierOutput`` (default) or a
                1-tuple ``(rewards,)``.
            **kwargs: accepted and ignored.

        Returns:
            ``SequenceClassifierOutput(logits=rewards)`` with ``rewards`` of shape
            ``[B, 1]``, or ``(rewards,)`` when ``return_dict`` is false.
        """
        # pass through LM body
        lm_out = self.lm_model.model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        last_hidden = lm_out.last_hidden_state        # [B, L, H]

        if attention_mask is None:
            pooled = last_hidden[:, -1, :]            # last position -> [B, H]
        else:
            mask = attention_mask.to(device=last_hidden.device, dtype=torch.long)
            # Weight attended positions by (index + 1): argmax is then the index of
            # the last attended token regardless of the padding side.
            ranks = torch.arange(1, mask.shape[1] + 1, device=mask.device)
            last_idx = (mask * ranks).argmax(dim=1)   # [B]
            rows = torch.arange(last_hidden.shape[0], device=last_hidden.device)
            pooled = last_hidden[rows, last_idx]      # [B, H]

        rewards = self.classifier(pooled)             # [B, 1]
        if not return_dict:
            return (rewards,)
        return SequenceClassifierOutput(logits=rewards)

    # lightweight saving: store only head weights (as you requested earlier)
    def save_pretrained(self, save_directory, **kwargs):
        """Write only the reward head's state dict to ``<save_directory>/lm_head.pt``.

        The frozen backbone and the config are NOT saved; extra ``kwargs`` are ignored.
        """
        os.makedirs(save_directory, exist_ok=True)
        torch.save(self.classifier.state_dict(), os.path.join(save_directory, "lm_head.pt"))


# 2) Earlier draft of ValueModel (a PreTrainedModelWrapper around the frozen causal-LM backbone), kept commented out below; the active ValueModel is defined in the next cell.

# # PPO trainer : https://huggingface.co/docs/trl/main/en/trainer#trl.PPOTrainer
# class ValueModel(PreTrainedModelWrapper):
#     def __init__(self, config, **kwargs):
#         # preserve your supplied model name if provided; else fall back to config
#         self.model_name = kwargs.get("model", getattr(config, "name_or_path", BASE_MODEL))
#         # reward head => single scalar
#         self.num_labels = kwargs.get("num_labels", getattr(config, "num_labels", 1))

#         # load frozen LM backbone
#         lm_model = AutoModelForCausalLM.from_pretrained(self.model_name, cache_dir=FILE_PATH)
#         for p in lm_model.parameters():
#             p.requires_grad = False

#         super().__init__(lm_model)
#         self.lm_model = lm_model

#         # replace lm_head with scalar reward head — do NOT pass device here
#         in_features = self.lm_model.lm_head.in_features
#         # self.lm_model.lm_head = nn.Linear(in_features, 1)
#         self.classifier = self.lm_model.lm_head
#         for p in self.classifier.parameters():
#             p.requires_grad = True

#     def forward(self, input_ids=None, attention_mask=None, return_dict: bool = True, **kwargs):
#         # pass through LM body
#         lm_out = self.lm_model.model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
#         last_hidden = lm_out.last_hidden_state        # [B, L, H]
#         pooled = last_hidden[:, -1, :]                # last token pooling -> [B, H]
#         rewards = self.classifier(pooled)             # [B, 1]
#         if not return_dict:
#             return (rewards,)
#         return SequenceClassifierOutput(logits=rewards)

#     # lightweight saving: store only head weights (as you requested earlier)
#     def save_pretrained(self, save_directory, **kwargs):
#         os.makedirs(save_directory, exist_ok=True)
#         torch.save(self.classifier.state_dict(), os.path.join(save_directory, "lm_head.pt"))


In [11]:
import os
from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModelForCausalLM
from transformers.modeling_outputs import ModelOutput
from trl import PreTrainedModelWrapper
BASE_MODEL = "google/gemma-3-270m-it"

@dataclass
class ValueOutput(ModelOutput):
    """Output container returned by ``ValueModel.forward``.

    ``ModelOutput`` subclasses have to be dataclasses, and every field after the
    first must default to ``None``; both fields default to ``None`` here so the
    class satisfies that contract while still being built with keyword arguments.
    """
    # convenient container: policy_logits (vocab logits), values (per-token scalar)
    policy_logits: Optional[torch.Tensor] = None  # [B, L, V]
    values: Optional[torch.Tensor] = None         # [B, L]

class ValueModel(PreTrainedModelWrapper):
    """
    Keeps lm_head (policy logits) and adds a separate value_head (per-token values).

    The wrapped causal LM is loaded inside ``__init__``. With ``freeze_backbone=True``
    every parameter of that LM (``lm_head`` included) is frozen and only
    ``value_head`` stays trainable.
    """
    base_model_prefix = "lm_model"  # required for HF compatibility

    def __init__(self, config, model: str =BASE_MODEL, freeze_backbone: bool = True, **kwargs):
        """Load the causal LM and attach a scalar value head.

        Args:
            config: accepted for signature compatibility; not read here.
            model: model id / path handed to ``AutoModelForCausalLM.from_pretrained``.
            freeze_backbone: freeze all parameters of the loaded LM when true.
            **kwargs: accepted for compatibility (e.g. ``num_labels``) and ignored;
                the value head always emits one scalar per token.

        Raises:
            ValueError: if the hidden size cannot be read from the LM config.
        """
        lm_model = AutoModelForCausalLM.from_pretrained(model, cache_dir=FILE_PATH)
        # NOTE: we pass the model instance to the wrapper
        super().__init__(lm_model)

        self.lm_model = lm_model
        # Expose the LM's generation config; fall back to the model config only if
        # the LM does not carry one.
        generation_config = getattr(self.lm_model, "generation_config", None)
        if generation_config is None:
            generation_config = self.lm_model.config
        self.generation_config = generation_config

        # If you intend to train the policy (PPO), DON'T freeze the policy model here.
        # But the ValueModel is typically a separate object from the policy — freeze its backbone so only
        # the value head trains. If you are using the same model object for policy+value, manage freezing carefully.
        if freeze_backbone:
            for p in self.lm_model.parameters():
                p.requires_grad = False

        hidden_size = getattr(self.lm_model.config, "hidden_size", None) or getattr(self.lm_model.config, "d_model", None)
        if hidden_size is None:
            raise ValueError("Couldn't infer hidden size from model config")

        # value head produces one scalar per token -> [B, L, 1] -> squeeze -> [B, L]
        self.value_head = nn.Linear(hidden_size, 1)
        for p in self.value_head.parameters():
            p.requires_grad = True

    def __getattr__(self, name):
        """Resolve ``name`` on this module first, then on the wrapped LM.

        ``lm_model`` itself is never delegated: if it is not registered yet, looking
        it up on ``self.lm_model`` would re-enter this method without bound.
        """
        try:
            return super().__getattr__(name)
        except AttributeError:
            if name == "lm_model":
                raise
            return getattr(self.lm_model, name)

    def _get_transformer_body(self):
        """Return the first non-None attribute among common backbone names, else the LM itself."""
        # common names; Gemma3 appears to have .model as the body
        for attr in ("base_model", "model", "transformer", "gpt_neox", "decoder"):
            candidate = getattr(self.lm_model, attr, None)
            if candidate is not None:
                return candidate
        return self.lm_model

    def forward(self, input_ids=None, attention_mask=None, return_dict: bool = True, **kwargs):
        """
        Returns:
          - policy_logits: LM vocab logits [B, L, V] (from lm_head applied to last_hidden_state)
          - values: per-token scalar values [B, L]
        Use 'policy_logits' to compute token probabilities, sample actions, compute log-probs etc.

        With ``return_dict=False`` the pair ``(policy_logits, values)`` is returned
        instead of a ``ValueOutput``. Extra ``kwargs`` are forwarded to the backbone.
        """
        transformer = self._get_transformer_body()

        # run transformer body only (avoid double heads); returns last_hidden_state [B, L, H]
        outputs = transformer(input_ids=input_ids, attention_mask=attention_mask, return_dict=True, **kwargs)
        last_hidden = getattr(outputs, "last_hidden_state", None)
        if last_hidden is None:
            last_hidden = outputs[0]

        # 1) policy logits via the original lm_head (do NOT replace lm_head)
        #    NOTE(review): assumed equivalent to lm_model.forward's logits — confirm the
        #    wrapped architecture applies no extra post-processing after lm_head.
        policy_logits = self.lm_model.lm_head(last_hidden)  # [B, L, V]

        # 2) value head -> per-token scalar
        values = self.value_head(last_hidden).squeeze(-1)  # [B, L]

        if not return_dict:
            return (policy_logits, values)

        return ValueOutput(policy_logits=policy_logits, values=values)

    # helper: compute chosen-token log-probs given token ids (useful for PPO loss / KL)
    @staticmethod
    def selected_token_logprobs(policy_logits: torch.Tensor, chosen_tokens: torch.Tensor):
        """
        policy_logits: [B, L, V]
        chosen_tokens:  [B, L]  (token ids for each position; negative ids such as -100 mark
                                 positions to ignore)
        returns: selected_logprobs [B, L] (log-prob of the chosen token at each position;
                 0.0 at ignored positions)
        """
        log_probs = F.log_softmax(policy_logits, dim=-1)               # [B, L, V]
        # torch.gather rejects negative indices, so gather with a placeholder id at
        # ignored positions and zero those entries afterwards.
        ignored = chosen_tokens < 0                                    # [B, L]
        safe_ids = chosen_tokens.masked_fill(ignored, 0).unsqueeze(-1)  # [B, L, 1]
        selected_logprobs = torch.gather(log_probs, dim=-1, index=safe_ids).squeeze(-1)  # [B, L]
        return selected_logprobs.masked_fill(ignored, 0.0)


In [12]:

# 3) build the reward model and the value model from the base config.
#    No device move happens in this cell; both models stay wherever from_pretrained put them.
config = AutoConfig.from_pretrained(BASE_MODEL, num_labels=1, cache_dir=FILE_PATH)
# print('the config is : ', config)
# Reward model: frozen backbone + scalar head (see RewardModel).
conv_model = RewardModel(config, model=BASE_MODEL, num_labels=1)


print('Initializing the value model below this !!')
# Value model: loads its own copy of BASE_MODEL (see ValueModel.__init__).
value_model= ValueModel(config , model = BASE_MODEL , num_labels=1) 



Initializing the value model below this !!


In [15]:
import os
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, pipeline
from trl import PPOTrainer, PPOConfig
from trl.core import LengthSampler

# pick device: prefer Apple MPS, then CUDA, otherwise CPU

if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


# Only the value model is moved here; conv_model is not moved in this cell.
value_model.to(device)

# 1) load dataset for PPO training (first 2000 rows of the train split)
dataset = load_dataset("Dahoas/rm-static", split="train[:2000]", cache_dir=FILE_PATH)

# Convert into a format PPOTrainer expects
def preprocess(examples):
    """Reduce a dataset record to a single ``query`` entry holding its ``prompt`` value."""
    prompt_value = examples["prompt"]
    return dict(query=prompt_value)

# Keep only the new 'query' column: every original column is dropped after mapping.
dataset = dataset.map(preprocess, remove_columns=dataset.column_names)

# Now dataset is a Dataset with {"query": "..."}
print(dataset[0])  # {'query': 'some prompt text'}

# Rebinds the global `tokenizer`. Unlike the first cell, this load passes no cache_dir
# and does not set padding_side, so the tokenizer's own default padding side applies.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'query': '\n\nHuman: Can you describe the steps to clean fingerprints and smudges from a laptop screen\n\nAssistant: Yes, certainly. To clean your screen, you first need to use a microfiber cloth or soft, damp cloth to gently wipe down the surface of the screen. Next, you’ll want to grab a soft, lint-free, microfiber cleaning cloth and gently rub it back and forth across the screen to remove fingerprints and smudges.\n\nHuman: Can I spray isopropyl alcohol onto the cloth and clean it that way?\n\nAssistant:'}


In [None]:
from trl import PPOTrainer, PPOConfig
import inspect
import sys
import importlib

# Import the PPO trainer module by name; the returned module object is discarded.
importlib.import_module('trl.trainer.ppo_trainer')

# sys.modules.pop('PPOTrainer', None)


# Capture the PPOTrainer constructor signature. It is stored, not printed, and is not
# referenced again in the visible cells.
ppo_trainer_signature = inspect.signature(PPOTrainer)


# 2) PPO config
# NOTE(review): confirm that the installed TRL's PPOConfig honours `batch_size` and
# `mini_batch_size` as user-supplied values rather than deriving them from other fields.
ppo_config = PPOConfig(
    learning_rate=1.41e-5,
    batch_size=16,
    mini_batch_size=4,
    gradient_accumulation_steps=1
)

# 3) reward pipeline (uses your trained conv_model as RM)
def compute_rewards(samples, responses):
    """Score each ``prompt + response`` text with the reward model.

    Args:
        samples: original prompts (list of str).
        responses: generated model responses (list of str), paired with ``samples``.

    Returns:
        Tensor of shape ``(batch,)`` on the global ``device``; an empty tensor when
        there is nothing to score.
    """
    texts = [s + r for s, r in zip(samples, responses)]
    if not texts:
        return torch.empty(0, device=device)

    # Feed the inputs on whatever device the reward model actually lives on: it is
    # not guaranteed to have been moved to `device` when this function is called.
    rm_device = next(conv_model.parameters()).device
    toks = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt").to(rm_device)

    with torch.no_grad():
        scores = conv_model(**toks).logits.squeeze(-1)

    return scores.to(device)  # shape: (batch,)

# 4) PPO Trainer
ppo_trainer = PPOTrainer(
    # NOTE(review): the same `value_model` object is passed as both the policy (`model`)
    # and the critic (`value_model`); confirm the installed TRL supports sharing one module.
    model = value_model,
    # NOTE(review): no reference model is supplied; check how the installed PPOTrainer
    # handles `ref_model=None` for a non-PEFT model.
    ref_model = None, 
    args=ppo_config,
    value_model=value_model,
    reward_model = conv_model,
    processing_class=tokenizer,
    # `dataset` holds only a string 'query' column at this point (no token ids).
    train_dataset=dataset,
)

# 5) main training loop
# NOTE(review): `.dataloader`, `.generate`, `.step` and `.log_stats` must exist on the
# installed PPOTrainer; confirm they match the trainer API used to construct `ppo_trainer`.
output_min_len = 16
output_max_len = 64
output_length_sampler = LengthSampler(output_min_len, output_max_len)

# `epoch` is the index of the batch yielded by the dataloader, not a pass over the data.
for epoch, batch in enumerate(ppo_trainer.dataloader):
    # Normalise to list[str] so a single-string batch is not iterated character by character.
    queries = batch["query"] if isinstance(batch["query"], list) else [batch["query"]]
    query_tensors = tokenizer(
        queries,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).input_ids.to(device)

    # sample responses from the policy (actor)
    response_tensors = ppo_trainer.generate(query_tensors, max_new_tokens=output_length_sampler())
    # The call and its argument list must be one expression; split over two statements
    # it is a syntax error and `responses` would be bound to the method itself.
    responses = tokenizer.batch_decode(response_tensors, skip_special_tokens=True)

    # get reward scores from reward model (use the normalised list, paired 1:1 with responses)
    rewards = compute_rewards(queries, responses)

    # PPO step (policy + value update)
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

print("✅ PPO training loop finished.")


ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['query']

In [25]:
from datasets import load_dataset
import torch
from trl import PPOTrainer, PPOConfig
from transformers import AutoTokenizer
from trl.core import LengthSampler
import copy

# Select the accelerator the same way the earlier setup cell does (MPS, then CUDA, then
# CPU) instead of hard-coding CUDA, so the cell also runs on machines without a CUDA GPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# === 1) dataset: keep it as a HF Dataset and map to a single 'query' field ===
ds = load_dataset("Dahoas/rm-static", split="train[:2000]", cache_dir=FILE_PATH)

def keep_query(examples):
    """Return a record that keeps only the prompt, stored under the key ``query``."""
    return {"query": examples["prompt"]}

ds = ds.map(keep_query, remove_columns=ds.column_names)  # now every row: {"query": "..."}
# sanity
print("example:", ds[0])

# === 2) tokenizer ===
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Fall back to EOS for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# === 3) collator: ALWAYS return a dict with 'query' -> list[str] ===
def collator(batch):
    """Gather the ``query`` value of every row into one list, returned under ``"query"``.

    ``batch`` is a list of dicts such as ``[{'query': '...'}, ...]``.
    """
    gathered = []
    for row in batch:
        gathered.append(row["query"])
    return dict(query=gathered)

example: {'query': '\n\nHuman: Can you describe the steps to clean fingerprints and smudges from a laptop screen\n\nAssistant: Yes, certainly. To clean your screen, you first need to use a microfiber cloth or soft, damp cloth to gently wipe down the surface of the screen. Next, you’ll want to grab a soft, lint-free, microfiber cleaning cloth and gently rub it back and forth across the screen to remove fingerprints and smudges.\n\nHuman: Can I spray isopropyl alcohol onto the cloth and clean it that way?\n\nAssistant:'}


In [None]:


# === 4) instantiate (make sure ref_model exists if your constructor requires it) ===
# if your PPOTrainer signature expects args first (like your custom class), adapt accordingly.
# Typical TRL usage:
# Independent copy of the value model used as the reference; this holds a second full
# set of weights on `device`.
ref_model = copy.deepcopy(value_model)
ref_model.to(device)
# eval() only switches the module mode; requires_grad flags are copied unchanged.
ref_model.eval()

# NOTE(review): `value_model` is passed as both `model` and `value_model` — confirm the
# installed PPOTrainer supports one shared module for policy and critic.
ppo_trainer = PPOTrainer(
    args=ppo_config,                # or config=ppo_config depending on your class
    processing_class=tokenizer,     # some custom class uses this name
    model=value_model,
    value_model=value_model,
    ref_model=ref_model,            # provide a frozen reference model if required
    reward_model=conv_model,
    train_dataset=ds,               # pass the HF Dataset, NOT ds['query']
    data_collator=collator,         # yields {"query": list[str]}; no token ids are produced
)

# === 5) reward function: conv_model returns scalar per sequence ===
def compute_rewards(queries, responses):
    """Score each ``query + response`` text with the reward model.

    Args:
        queries: list[str] of prompts.
        responses: list[str] of generated continuations, paired with ``queries``.

    Returns:
        CPU tensor of shape ``[B]`` with one score per pair (also for ``B == 1``);
        an empty tensor when there is nothing to score.
    """
    texts = [q + r for q, r in zip(queries, responses)]
    if not texts:
        return torch.empty(0)

    # Run on the device the reward model actually lives on; it may not have been
    # moved to the global `device`.
    rm_device = next(conv_model.parameters()).device
    toks = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt").to(rm_device)
    with torch.no_grad():
        scores = conv_model(input_ids=toks["input_ids"], attention_mask=toks["attention_mask"]).logits
        # Flatten everything but the batch axis and keep the last entry per row:
        # [B, 1] -> [B], [B, L, 1] / [B, L] -> value at the final position.
        # A bare .squeeze() would also drop the batch axis when B == 1.
        scores = scores.reshape(scores.shape[0], -1)[:, -1]
    return scores.detach().cpu()

# === 6) training launch ===

print("starting the training loop here ")
output_min_len, output_max_len = 16, 64
output_length_sampler = LengthSampler(output_min_len, output_max_len)

# Attach the reward function unconditionally (the previous if/else assigned the same
# value in both branches).
# NOTE(review): confirm the installed PPOTrainer actually reads a `reward_fn` attribute.
ppo_trainer.reward_fn = compute_rewards

# Now call push-button train
ppo_trainer.train()   # this should run the full PPO pipeline: generate → reward → step → update


example: {'query': '\n\nHuman: Can you describe the steps to clean fingerprints and smudges from a laptop screen\n\nAssistant: Yes, certainly. To clean your screen, you first need to use a microfiber cloth or soft, damp cloth to gently wipe down the surface of the screen. Next, you’ll want to grab a soft, lint-free, microfiber cleaning cloth and gently rub it back and forth across the screen to remove fingerprints and smudges.\n\nHuman: Can I spray isopropyl alcohol onto the cloth and clean it that way?\n\nAssistant:'}
starting the training loop here 
===training policy===


AttributeError: 'list' object has no attribute 'to'

In [28]:
import copy, inspect, torch
from trl import PPOTrainer
from trl.core import LengthSampler

# device, dataset, tokenizer, value_model, conv_model, ppo_config must already exist
# Prefer MPS, then CUDA, then CPU. (The former `and torch.device("mps")` operand was
# always truthy and had no effect on the choice.)
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device:", device)

# quick safety: move models to device
value_model.to(device)
conv_model.to(device)
conv_model.eval()
# The deep copy is a second full set of value-model weights resident on `device`.
ref_model = copy.deepcopy(value_model)
ref_model.to(device)
ref_model.eval()

# make sure ds is a HF Dataset with 'query' column
print("example dataset row:", ds[0])

# collator that returns list[str] under 'query'
def collator(batch):
    """Collect each row's ``query`` string into a list stored under the ``"query"`` key."""
    queries = list(map(lambda row: row["query"], batch))
    return {"query": queries}

# inspect signature and build kwargs
# The constructor's parameter names decide which keyword each object is passed under,
# so the same cell can drive PPOTrainer variants with different argument names.
sig = inspect.signature(PPOTrainer.__init__)
params = list(sig.parameters.keys())
print("PPOTrainer.__init__ params:", params)

kw = {}
# config / args: exactly one of the two names must be present, otherwise abort
if "config" in params:
    kw["config"] = ppo_config
elif "args" in params:
    kw["args"] = ppo_config
else:
    raise RuntimeError("Can't find 'config' or 'args' in PPOTrainer signature; inspect and adapt.")

# tokenizer / processing_class (silently skipped when neither name is accepted)
if "tokenizer" in params:
    kw["tokenizer"] = tokenizer
elif "processing_class" in params:
    kw["processing_class"] = tokenizer

# required models
if "model" in params:
    kw["model"] = value_model  # often the actor/policy (if your trainer expects a separate value_model, we set that below)
if "ref_model" in params:
    kw["ref_model"] = ref_model
if "reward_model" in params:
    kw["reward_model"] = conv_model
# IMPORTANT: some local/trl forks expect a separate 'value_model' parameter
# Note: this is the SAME object already passed as `model` above.
if "value_model" in params:
    kw["value_model"] = value_model

# dataset param name
if "dataset" in params:
    kw["dataset"] = ds
elif "train_dataset" in params:
    kw["train_dataset"] = ds

# data collator (returns {"query": list[str]})
if "data_collator" in params:
    kw["data_collator"] = collator

print("Instantiating PPOTrainer with kwargs:", list(kw.keys()))
ppo_trainer = PPOTrainer(**kw)

# attach a reward function (safe)
def compute_rewards(queries, responses):
    """Score each ``query + response`` text with the reward model.

    Args:
        queries: list[str] of prompts.
        responses: list[str] of generated continuations, paired with ``queries``.

    Returns:
        CPU tensor of shape ``[B]`` with one score per pair (also for ``B == 1``);
        an empty tensor when there is nothing to score.
    """
    texts = [q + r for q, r in zip(queries, responses)]
    if not texts:
        return torch.empty(0)

    # Run on the device the reward model actually lives on.
    rm_device = next(conv_model.parameters()).device
    toks = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt").to(rm_device)
    with torch.no_grad():
        out = conv_model(input_ids=toks["input_ids"], attention_mask=toks["attention_mask"])

    # Accept a ModelOutput (with .logits), a tuple whose first item is the logits,
    # or a bare tensor.
    scores = getattr(out, "logits", out)
    if isinstance(scores, (tuple, list)):
        scores = scores[0]
    scores = torch.as_tensor(scores)

    # Flatten everything but the batch axis and keep the last entry per row. A bare
    # .squeeze() would also drop the batch axis when B == 1.
    scores = scores.reshape(scores.shape[0], -1)[:, -1]
    return scores.cpu()

# Attach the reward function as a plain attribute on the trainer instance.
# NOTE(review): confirm the installed PPOTrainer actually reads `reward_fn`; nothing in
# the visible cells shows it being consumed.
ppo_trainer.reward_fn = compute_rewards

print("Calling ppo_trainer.train() ...")
# Hands control to the trainer's own training loop.
ppo_trainer.train()


device: cuda


OutOfMemoryError: CUDA out of memory. Tried to allocate 640.00 MiB. GPU 0 has a total capacity of 7.63 GiB of which 516.44 MiB is free. Including non-PyTorch memory, this process has 7.10 GiB memory in use. Of the allocated memory 6.99 GiB is allocated by PyTorch, and 8.94 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)