In [None]:
#!pip install -q "transformers" "datasets" "peft" safetensors

# Mount Google Drive: the dataset and checkpoint paths used below all live under /content/drive.
from google.colab import drive
drive.mount("/content/drive")

import os, json, torch, torch.nn as nn, torch.nn.functional as F
from datasets import load_from_disk, concatenate_datasets
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer, DataCollatorWithPadding
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from peft import PeftModel

# Run-wide configuration read by the cells below.
# Device that batches and the PRM head are moved to.
device = "cuda"
# Dtype used for the base model weights and the PRM head.
DTYPE = torch.bfloat16
# Root folder of this run on Drive.
PROJECT_PATH   = "/content/drive/MyDrive/prm_project/run-2"
# Hugging Face id of the base causal LM that the LoRA adapter is applied to.
BASE_MODEL_NAME = "Qwen/Qwen3-8B"
# Maximum number of tokens kept per example when tokenizing (longer inputs are truncated).
MAX_SEQ_LENGTH = 384

In [None]:
# Checkpoint to evaluate. Built from PROJECT_PATH so the run root is defined in one place
# and the checkpoint path cannot drift away from it.
CKPT_DIR = os.path.join(PROJECT_PATH, "checkpoints", "checkpoint-14628")

In [None]:
from datasets import ClassLabel

# Load the pre-parsed datasets from Drive.
# NOTE(review): these paths are under ".../prm_project/run", not the "run-2" project root —
# presumably the parsed data is shared between runs; confirm.
train_raw = load_from_disk("/content/drive/MyDrive/prm_project/run/data/train_parsed")
test_raw  = load_from_disk("/content/drive/MyDrive/prm_project/run/data/test_parsed")

print("Loaded train_raw:", train_raw)
print("Loaded test_raw:", test_raw)

# Stratified splitting needs a ClassLabel column; label 0 -> "incorrect", 1 -> "correct".
test_raw = test_raw.cast_column("labels", ClassLabel(names=["incorrect", "correct"]))

# Fixed seed so the val/test membership is identical on every run. Without it the
# split is re-drawn each time and the reported test metrics are not reproducible.
SPLIT_SEED = 42
test_split = test_raw.train_test_split(
    test_size=0.5,
    stratify_by_column="labels",
    seed=SPLIT_SEED,
)
val_ds  = test_split["train"]
test_ds = test_split["test"]

print("val size:", len(val_ds), "test size:", len(test_ds))

Loaded train_raw: Dataset({
    features: ['text', 'labels'],
    num_rows: 563181
})
Loaded test_raw: Dataset({
    features: ['text', 'labels'],
    num_rows: 16153
})
val size: 8076 test size: 8077


In [None]:
# Tokenizer and base model are loaded from the same id so they cannot drift apart.
tok = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)

# Left padding puts every sequence's final real token at the last position of the
# padded batch, which is the position the PRM head is applied to.
tok.padding_side = "left"
# The padding collator needs a pad token; fall back to EOS if the tokenizer has none.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

# Base model
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    dtype=DTYPE,
    device_map="cuda",
    trust_remote_code=True,
)
base.config.return_dict = True
# Scoring is a single forward pass, so the generation KV cache is not needed.
base.config.use_cache = False

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.19G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
# Attach the LoRA adapter stored in the checkpoint to the base model.
peft_model = PeftModel.from_pretrained(base, CKPT_DIR)

# PRM head from checkpoint: a single linear layer mapping a hidden state to one logit.
hs = getattr(peft_model.config, "hidden_size", None) or getattr(peft_model.config, "hidden_sizes", [None])[0]
if hs is None:
    raise ValueError("Could not determine the hidden size from the model config")
prm_head = nn.Linear(hs, 1).to(device, dtype=DTYPE)

head_path = os.path.join(CKPT_DIR, "prm_head.bin")
if not os.path.exists(head_path):
    raise FileNotFoundError(head_path)

# weights_only=True restricts unpickling to tensors/containers, so a tampered
# prm_head.bin cannot execute arbitrary code on load.
state_dict = torch.load(head_path, map_location=device, weights_only=True)

# strict=False tolerates extra keys, but a *missing* key would leave the head
# randomly initialised and silently produce meaningless scores, so fail loudly.
load_result = prm_head.load_state_dict(state_dict, strict=False)
if load_result.missing_keys:
    raise RuntimeError(
        f"{head_path} is missing PRM head parameters: {load_result.missing_keys}"
    )
if load_result.unexpected_keys:
    print("Warning: ignored unexpected keys in PRM head file:", load_result.unexpected_keys)
prm_head.eval()

print("Reloaded model + LoRA + PRM head from:", CKPT_DIR)

Reloaded model + LoRA + PRM head from: /content/drive/MyDrive/prm_project/run-2/checkpoints/checkpoint-14628


In [None]:
def tok_map(batch):
    """Tokenize a batch of examples and carry their labels along.

    ``batch["text"]`` is tokenized with truncation at MAX_SEQ_LENGTH (no padding here);
    ``batch["labels"]`` is copied onto the encoding unchanged.
    """
    encoded = tok(
        batch["text"],
        max_length=MAX_SEQ_LENGTH,
        truncation=True,
    )
    encoded["labels"] = batch["labels"]
    return encoded

# Tokenize both evaluation splits (val first, then test), dropping the original
# columns so only the tokenizer outputs and labels remain.
val_tokenized, test_tokenized = (
    split.map(tok_map, batched=True, remove_columns=split.column_names)
    for split in (val_ds, test_ds)
)

# Pads each batch dynamically when it is collated.
data_collator = DataCollatorWithPadding(
    tok,
    padding=True,
    max_length=MAX_SEQ_LENGTH,
)
print("Example val batch keys:", val_tokenized[0].keys())

Map:   0%|          | 0/8076 [00:00<?, ? examples/s]

Map:   0%|          | 0/8077 [00:00<?, ? examples/s]

Example val batch keys: dict_keys(['labels', 'input_ids', 'attention_mask'])


In [None]:
from torch.utils.data import DataLoader

BATCH_EVAL = 4

# Both evaluation loaders share the same settings: fixed order, dynamic padding.
_eval_loader_kwargs = dict(
    batch_size=BATCH_EVAL,
    shuffle=False,
    collate_fn=data_collator,
)

val_loader = DataLoader(val_tokenized, **_eval_loader_kwargs)
test_loader = DataLoader(test_tokenized, **_eval_loader_kwargs)


In [None]:
from tqdm.auto import tqdm
import numpy as np
import torch

@torch.inference_mode()
def eval_loop(model, head, loader, name="eval"):
    """Score every batch in ``loader`` and print the mean BCE loss and accuracy.

    For each example the final-layer hidden state of its last real (non-padding)
    token is passed through ``head`` to get one logit; ``sigmoid(logit)`` is the
    predicted probability of label 1.

    Args:
        model: Language model called with ``output_hidden_states=True``.
        head: Linear layer mapping a hidden state to a single logit.
        loader: Iterable of batches (dicts of tensors) that include ``"labels"``.
        name: Tag used in the progress bar and in the printed summary line.

    Returns:
        ``(probs, labels)``: two float32 NumPy arrays with one entry per example.

    Raises:
        ValueError: If ``loader`` yields no examples.
    """
    model.eval()
    head.eval()

    all_probs = []
    all_labels = []
    total_loss = 0.0
    n_examples = 0

    pbar = tqdm(loader, desc=f"{name} eval", total=len(loader))

    for batch in pbar:
        labels = batch.pop("labels").to(torch.float32).to(device)      # (B,)
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(
            **batch,
            output_hidden_states=True,
            use_cache=False,
        )
        last_hidden = outputs.hidden_states[-1]                         # (B, T, H)

        # Pick each row's last real token from the attention mask. With left padding
        # this is position T-1 (the previous behaviour); with right padding it avoids
        # silently reading a padding position.
        attention_mask = batch.get("attention_mask")
        if attention_mask is None:
            h_last = last_hidden[:, -1, :]                              # (B, H)
        else:
            positions = torch.arange(last_hidden.size(1), device=last_hidden.device)
            mask = attention_mask.to(last_hidden.device).long()
            last_idx = (mask * positions).argmax(dim=1)                 # (B,)
            rows = torch.arange(last_hidden.size(0), device=last_hidden.device)
            h_last = last_hidden[rows, last_idx]                        # (B, H)

        # Upcast before the loss: summing BCE in bf16 loses precision, and the
        # probabilities below are computed in float32 as well.
        logits = head(h_last.to(head.weight.dtype)).squeeze(-1).to(torch.float32)  # (B,)

        loss = torch.nn.functional.binary_cross_entropy_with_logits(
            logits, labels, reduction="sum"
        )

        all_probs.append(torch.sigmoid(logits).cpu().numpy())
        all_labels.append(labels.cpu().numpy().astype("float32"))

        total_loss += loss.item()
        n_examples += labels.size(0)

        avg_loss_so_far = total_loss / n_examples
        pbar.set_postfix(avg_loss=f"{avg_loss_so_far:.4f}")

        # Drop references so this batch's activations can be reused for the next
        # forward pass. No per-batch empty_cache(): it only hands cached blocks back
        # to the driver (slowing the loop) and does not give PyTorch more memory.
        del outputs, last_hidden, h_last, logits, loss

    if n_examples == 0:
        raise ValueError(f"{name}: loader produced no examples; nothing to evaluate")

    probs = np.concatenate(all_probs, axis=0)
    labels = np.concatenate(all_labels, axis=0).astype("float32")

    # Threshold at 0.5 for the accuracy figure.
    y_hat = (probs >= 0.5).astype("float32")
    acc = (y_hat == labels).mean()
    avg_loss = total_loss / n_examples

    print(f"{name} | loss: {avg_loss:.4f} | accuracy: {acc:.4f}")
    return probs, labels


In [None]:
# The validation pass is left disabled; only the held-out test split is scored here.
#probs_val,  labels_val  = eval_loop(peft_model, prm_head, val_loader,  name="VAL")
probs_test, labels_test = eval_loop(
    model=peft_model,
    head=prm_head,
    loader=test_loader,
    name="TEST",
)

TEST eval:   0%|          | 0/2020 [00:00<?, ?it/s]



TEST | loss: 0.2528 | accuracy: 0.9454


In [None]:
import torch

@torch.inference_mode()
def score_steps(problem: str, partial: str, step: str) -> float:
    """Return the PRM's probability, in [0, 1], that ``step`` is correct.

    The prompt has the lines "Problem", optionally "Previous steps" (omitted when
    ``partial`` is blank), "Current step", and the closing question. It is
    tokenized with truncation at MAX_SEQ_LENGTH, run through ``peft_model``, and
    the final-layer hidden state at the last position is fed to ``prm_head``.

    NOTE(review): truncation happens on whichever side the tokenizer is configured
    for; if that is the right side, over-long prompts lose the current step and the
    closing question — confirm this matches how the training data was built.
    """
    prompt_parts = [f"Problem: {problem}\n"]
    if partial.strip():
        prompt_parts.append(f"Previous steps: {partial}\n")
    prompt_parts.append(f"Current step: {step}\n")
    prompt_parts.append(f"Is this step correct?")
    prompt = "".join(prompt_parts)

    encoded = tok(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        padding=False,
    ).to(device)

    model_out = peft_model(**encoded, output_hidden_states=True, use_cache=False)
    final_layer = model_out.hidden_states[-1]        # (1, T, H)
    last_token_state = final_layer[:, -1, :]         # (1, H); single unpadded sequence

    logit = prm_head(last_token_state.to(prm_head.weight.dtype)).squeeze(-1)  # (1,)
    return torch.sigmoid(logit.to(torch.float32)).item()

def demo_problem(problem, steps, partial=""):
    """Score each candidate step for ``problem`` and print them, best first.

    Args:
        problem: Problem statement shown to the scorer.
        steps: Candidate next steps; each is scored independently.
        partial: Previously accepted steps, passed to the scorer as context.
    """
    print("\n======================================")
    print("Problem:", problem)
    if partial:
        print("\nPrevious steps:", partial)
    print("\nScored steps:\n")

    # Score everything first, then rank by descending probability (stable for ties).
    ranked = sorted(
        ((score_steps(problem, partial, candidate), candidate) for candidate in steps),
        key=lambda pair: -pair[0],
    )
    for prob, candidate in ranked:
        print(f"{prob:0.3f} :: {candidate}")


In [None]:
# Hand-written sanity checks. Each case lists candidate steps with the expected
# verdict noted beside it; all cases are scored by the loop at the bottom.

# 1) Simple algebra
problem1 = "Solve the equation 3x - 5 = 16."
steps1 = [
    "Add 5 to both sides: 3x = 21.",              # correct
    "Subtract 5 from both sides: 3x = 11.",       # wrong
    "Divide both sides by 3: x = 7.",             # correct (if after first step)
    "Multiply both sides by 3: x = 48.",          # wrong
]

# 2) With previous steps filled in
problem2 = "Compute the product 24 × 17."
partial2 = (
    "Break 17 into 10 + 7. "
    "Compute 24 × 10 = 240. "
    "Compute 24 × 7 = 168."
)
steps2 = [
    "Add the partial products: 240 + 168 = 408.",    # correct
    "Add the partial products: 240 + 168 = 398.",    # wrong
    "answer 17.",    # nonsense
]

# 3) Commonsense
problem3 = (
    "You put a glass of water in the freezer at 8 pm. "
    "The freezer is working normally. What happens by 9 pm?"
)
steps3 = [
    "The water will likely have started to freeze or be completely frozen.",   # good
    "The water will have boiled away due to the heat.",                        # bad
    "The glass instantly explodes because of gravity.",                        # bad
]

# 4) Toy coding reasoning
problem4 = "You want a Python function that returns the sum of numbers in a list."
steps4 = [
    "Define a function that iterates over the list and adds each item to a running total.",  # good
    "Define a function that multiplies all the numbers instead of adding them.",             # bad
    "Define a function that always returns 0 regardless of the input list.",                 # bad
]

# Run the cases in order; only case 2 supplies previous steps.
for _problem, _steps, _partial in (
    (problem1, steps1, ""),
    (problem2, steps2, partial2),
    (problem3, steps3, ""),
    (problem4, steps4, ""),
):
    demo_problem(_problem, _steps, partial=_partial)



Problem: Solve the equation 3x - 5 = 16.

Scored steps:

0.997 :: Add 5 to both sides: 3x = 21.
0.992 :: Subtract 5 from both sides: 3x = 11.
0.967 :: Divide both sides by 3: x = 7.
0.000 :: Multiply both sides by 3: x = 48.

Problem: Compute the product 24 × 17.

Previous steps: Break 17 into 10 + 7. Compute 24 × 10 = 240. Compute 24 × 7 = 168.

Scored steps:

0.995 :: Add the partial products: 240 + 168 = 408.
0.881 :: answer 17.
0.828 :: Add the partial products: 240 + 168 = 398.

Problem: You put a glass of water in the freezer at 8 pm. The freezer is working normally. What happens by 9 pm?

Scored steps:

0.998 :: The water will likely have started to freeze or be completely frozen.
0.830 :: The glass instantly explodes because of gravity.
0.694 :: The water will have boiled away due to the heat.

Problem: You want a Python function that returns the sum of numbers in a list.

Scored steps:

0.994 :: Define a function that iterates over the list and adds each item to a running tot

In [None]:
import numpy as np

# Split the test-set probabilities by ground-truth label to see how well they separate.
is_positive = labels_test == 1
is_negative = labels_test == 0
pos_scores = probs_test[is_positive]
neg_scores = probs_test[is_negative]

_score_groups = (
    ("Pos mean:", "Example pos:", pos_scores),
    ("Neg mean:", "Example neg:", neg_scores),
)

# Summary statistics for both groups first, then the first ten scores of each.
for mean_label, _, scores in _score_groups:
    print(mean_label, scores.mean(), "  std:", scores.std())
for _, example_label, scores in _score_groups:
    print(example_label, scores[:10])


Pos mean: 0.9397575   std: 0.18630156
Neg mean: 0.4421963   std: 0.41498184
Example pos: [0.9840936  0.3233801  0.83761996 0.0726367  0.97068775 0.99629277
 0.89912134 0.81757444 0.9921841  0.99444515]
Example neg: [0.9933072  0.00609756 0.98934746 0.03514485 0.9845754  0.99242276
 0.4144248  0.13939638 0.00116951 0.5075068 ]


In [None]:
from huggingface_hub import login, create_repo, upload_folder


# 1) Local checkpoint folder to publish.
EXPORT_DIR = "/content/drive/MyDrive/prm_project/run-2/checkpoints/checkpoint-14628"  # <-- change to your best checkpoint

# 2) Hugging Face repo id: "username/model-name"
HF_REPO_ID = "devangb4/prm-qwen3-8b-bf16-full"      # <-- change this

# Trainer resume state is only useful for continuing training. It is not needed to
# run the model, adds well over 100 MB, and is stored as pickle files, so it is kept
# out of the public repo. The adapter, PRM head and tokenizer files are still uploaded.
TRAINING_STATE_PATTERNS = [
    "optimizer.pt",
    "scheduler.pt",
    "scaler.pt",
    "rng_state*.pth",
    "training_args.bin",
]

# Interactive prompt for a Hugging Face access token.
login()

# exist_ok=True makes re-running this cell safe when the repo already exists.
create_repo(HF_REPO_ID, repo_type="model", private=False, exist_ok=True)

# Upload the inference artefacts in EXPORT_DIR
upload_folder(
    folder_path=EXPORT_DIR,
    repo_id=HF_REPO_ID,
    repo_type="model",
    commit_message="Upload PRM LoRA adapter + head + tokenizer from checkpoint",
    ignore_patterns=TRAINING_STATE_PATTERNS,
)

print("Uploaded to:", HF_REPO_ID)
print("   From local folder:", EXPORT_DIR)



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...kpoint-14628/prm_head.bin: 100%|##########| 10.1kB / 10.1kB            

  ...point-14628/rng_state.pth: 100%|##########| 14.6kB / 14.6kB            

  ...kpoint-14628/scheduler.pt: 100%|##########| 1.47kB / 1.47kB            

  ...oint-14628/tokenizer.json: 100%|##########| 11.4MB / 11.4MB            

  ...kpoint-14628/optimizer.pt:  27%|##7       | 33.5MB /  123MB            

  ...adapter_model.safetensors:  55%|#####4    | 33.6MB / 61.4MB            

Uploaded to: devangb4/prm-qwen3-8b-bf16-full
   From local folder: /content/drive/MyDrive/prm_project/run-2/checkpoints/checkpoint-14628


In [None]:
from huggingface_hub import snapshot_download

# NOTE(review): this repo id differs from the one the upload cell pushes to
# ("devangb4/prm-qwen3-8b-bf16-full") — confirm which model is meant to be downloaded.
HF_REPO_ID = "devangb4/prm-qwen3-8b-bf16-6k"
BASE_MODEL_NAME = "Qwen/Qwen3-8B"
MAX_SEQ_LENGTH = 384

# Fetch the whole model repo into the local Hugging Face cache (adapter, head, tokenizer, etc.)
local_dir = snapshot_download(repo_id=HF_REPO_ID, repo_type="model")
print("Local snapshot dir:", local_dir)
