In [None]:
# Mount Google Drive; the project directory used by this notebook lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Show the interactive Hugging Face login widget.
from huggingface_hub import notebook_login
notebook_login()

Mounted at /content/drive


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!pip install wget tqdm datasets&> /dev/null

In [None]:
# Root folder (on the mounted Drive) for downloaded data, checkpoints and exports.
# PROJECT_PATH = "/content/drive/MyDrive/prm_project/run"
PROJECT_PATH = "/content/drive/MyDrive/UC DAVIS/ECS289A-LLM/prm_project/run-2"
# Hugging Face id of the base language model.
BASE_MODEL_NAME = "Qwen/Qwen3-8B"
# Base URL the PRM800K .jsonl files are downloaded from.
DATASET_URL = "https://github.com/openai/prm800k/raw/main/prm800k/data"
# Intended fraction of the training set to use (1.0 = everything).
TRAIN_FRACTION = 1.0 #0.1 for playing around

In [None]:
import wget
import os

# Fetch the four PRM800K JSONL splits into {PROJECT_PATH}/data (files already
# on disk are reused) and record each split's local path in `dataset_files`.
dataset_files = {}

files_to_download = {
    'phase1_train': 'phase1_train.jsonl',
    'phase1_test': 'phase1_test.jsonl',
    'phase2_train': 'phase2_train.jsonl',
    'phase2_test': 'phase2_test.jsonl'
}

# Create the target folder up front so the first download into a fresh
# PROJECT_PATH does not fail on a missing directory.
os.makedirs(f"{PROJECT_PATH}/data", exist_ok=True)

for key, filename in files_to_download.items():
    url = f"{DATASET_URL}/{filename}"
    output_path = f"{PROJECT_PATH}/data/{filename}"

    if not os.path.exists(output_path):
        print(f"  Downloading {filename}...")
        wget.download(url, output_path)
        print(f"  Downloaded {filename}")
    else:
        print(f"  {filename} already exists")

    dataset_files[key] = output_path

  phase1_train.jsonl already exists
  phase1_test.jsonl already exists
  phase2_train.jsonl already exists
  phase2_test.jsonl already exists


In [None]:
import json
# Peek at the first phase-2 training record: print its top-level keys, the
# problem statement, and the completion that was chosen at every step.
with open(dataset_files["phase2_train"], "r") as f:
    # Only one record is inspected, so read a single line instead of the whole file.
    first_line = f.readline()

if first_line:
    data = json.loads(first_line)
    print("Keys are: ",data.keys())
    print("question is: ",data["question"]["problem"])
    steps = data["label"]["steps"]

    for step_data in steps:
        if step_data["chosen_completion"] is not None:
            chosen_completion = step_data["completions"][step_data["chosen_completion"]]
            print(chosen_completion)


Keys are:  dict_keys(['labeler', 'timestamp', 'generation', 'is_quality_control_question', 'is_initial_screening_question', 'question', 'label'])
question is:  The first four terms in an arithmetic sequence are $x+y$, $x-y$, $xy$, and $x/y$, in that order. What is the fifth term? Express your answer as a common fraction.
{'text': 'To find the fifth term, I need to identify the common difference of the arithmetic sequence and add it to the fourth term.', 'rating': 1, 'flagged': None}
{'text': 'The common difference is the same for any consecutive pair of terms, so I can use any of them to find it.', 'rating': 1, 'flagged': None}
{'text': 'For example, using the first and second terms, I can write $x-y = x+y + d$, where $d$ is the common difference.', 'rating': 1, 'flagged': None}
{'text': 'Solving for $d$, I get $d = -2y$.', 'rating': 1, 'flagged': None}
{'text': 'Using another pair of terms, such as the second and third, I can check if this value of $d$ is consistent.', 'rating': 1, 'f

In [None]:
from tqdm.auto import tqdm
from collections import Counter
import json

def analyze_dataset(filepath):
  """Print and return the label distribution of a PRM800K JSONL file.

  Each line is one JSON record. For every step the completion indexed by
  ``chosen_completion`` is used, falling back to ``human_completion``; steps
  with neither are ignored. Ratings equal to 0 are not counted. A rating of 1
  counts as positive (key ``1``) and every other non-zero rating as negative
  (key ``0``).

  A record that raises while being processed is abandoned at that point
  (steps already counted from it stay counted); the number of such records is
  printed when it is non-zero.

  Args:
    filepath: path to the ``.jsonl`` file.

  Returns:
    ``(label_counts, total_steps)`` where ``label_counts`` is a ``Counter``
    keyed by 0/1 and ``total_steps`` is the number of counted steps.
  """
  label_counts = Counter()
  total_steps = 0
  skipped_records = 0
  with open(filepath, "r") as f:
    lines = f.readlines()
  for line in tqdm(lines, desc="Analyzing"):
    try:
      data = json.loads(line)
      steps = data["label"]["steps"]
      for step_data in steps:
        if step_data["chosen_completion"] is not None:
          completion = step_data["completions"][step_data["chosen_completion"]]
        elif step_data["human_completion"]:
          completion = {
              "text": step_data["human_completion"]["text"],
              "rating": step_data["human_completion"]["rating"],
          }
        else:
          continue
        rating = completion["rating"]
        if rating != 0:
          y = 1 if rating == 1 else 0
          label_counts[y] += 1
          total_steps += 1
    except Exception:
      # Best-effort parsing: keep going, but remember that data was dropped.
      skipped_records += 1
      continue
  pos, neg = label_counts[1], label_counts[0]
  ratio = (pos / neg) if neg else 0
  # Guard the percentages so an empty or fully unparsable file does not raise.
  pos_pct = (pos / total_steps) * 100 if total_steps else 0.0
  neg_pct = (neg / total_steps) * 100 if total_steps else 0.0
  print(f"  Total labeled steps: {total_steps:,}")
  print(f"  Positive: {pos:,} ({pos_pct:.1f}%)")
  print(f"  Negative: {neg:,} ({neg_pct:.1f}%)")
  print(f"  Imbalance ratio: {ratio:.1f}:1")
  if skipped_records:
    print(f"  Records abandoned because of errors: {skipped_records:,}")
  return label_counts, total_steps

# Print the label balance of the phase-2 train and test files; the return values are not used.
_ = analyze_dataset(dataset_files["phase2_train"])
_ = analyze_dataset(dataset_files["phase2_test"])

Analyzing:   0%|          | 0/97782 [00:00<?, ?it/s]

  Total labeled steps: 563,181
  Positive: 528,078 (93.8%)
  Negative: 35,103 (6.2%)
  Imbalance ratio: 15.0:1


Analyzing:   0%|          | 0/2762 [00:00<?, ?it/s]

  Total labeled steps: 16,153
  Positive: 15,226 (94.3%)
  Negative: 927 (5.7%)
  Imbalance ratio: 16.4:1


In [None]:
from datasets import Dataset

def parse_prm800k(filepath):
    """Turn a PRM800K JSONL file into a step-level classification dataset.

    For every record the steps are walked in order. The completion used for a
    step is the one indexed by ``chosen_completion``; if that is ``None`` the
    ``human_completion`` is used, and steps with neither are ignored.

    * rating ``0`` (neutral): no example is emitted, but the step text is
      still appended to the running context of earlier steps.
    * any other rating: one example is emitted. Its text is the problem, the
      stripped running context (only when non-empty) and the current step; its
      label is ``1`` when the rating equals ``1`` and ``0`` otherwise.
      NOTE(review): a missing (``None``) rating therefore also becomes label
      0 — confirm that this is intended.

    A record that raises while being processed is abandoned at that point
    (examples already emitted from it are kept); the number of such records is
    printed so that dropped data is visible.

    Args:
        filepath: path to a PRM800K ``.jsonl`` file.

    Returns:
        A ``datasets.Dataset`` with columns ``text`` and ``labels``.
    """
    texts, labels = [], []
    skipped_records = 0

    with open(filepath, "r") as f:
        lines = f.readlines()

    for line in tqdm(lines, desc="Parsing"):
        try:
            data = json.loads(line)
            problem = data["question"]["problem"]
            steps = data["label"]["steps"]
            partial = ""  # running concatenation of earlier steps

            for step_data in steps:
                if step_data["chosen_completion"] is not None:
                    completion = step_data["completions"][step_data["chosen_completion"]]
                elif step_data["human_completion"]:
                    completion = {
                        "text": step_data["human_completion"]["text"],
                        "rating": step_data["human_completion"]["rating"],
                    }
                else:
                    continue

                step_text = completion["text"]
                rating = completion["rating"]

                # Neutral steps give no training example but remain part of the context.
                if rating == 0:
                    partial += step_text + " "
                    continue

                context = partial.strip()
                if context:
                    text = (
                        f"Problem: {problem}\n"
                        f"Previous steps: {context}\n"
                        f"Current step: {step_text}\n"
                        "Is this step correct?"
                    )
                else:
                    text = (
                        f"Problem: {problem}\n"
                        f"Current step: {step_text}\n"
                        "Is this step correct?"
                    )

                texts.append(text)
                labels.append(1 if rating == 1 else 0)

                partial += step_text + " "
        except Exception:
            # Best-effort parsing: keep going, but remember that data was dropped.
            skipped_records += 1
            continue

    if skipped_records:
        print(f"Warning: {skipped_records:,} record(s) in {filepath} were abandoned because of errors")

    ds = Dataset.from_dict({"text": texts, "labels": labels})
    return ds

In [None]:
# Parse both phase-2 splits into step-level examples ...
train_raw = parse_prm800k(dataset_files["phase2_train"])
test_raw = parse_prm800k(dataset_files['phase2_test'])

# ... and write the parsed datasets to disk under PROJECT_PATH/data.
train_raw.save_to_disk(f"{PROJECT_PATH}/data/train_parsed")
test_raw.save_to_disk(f"{PROJECT_PATH}/data/test_parsed")

Parsing:   0%|          | 0/97782 [00:00<?, ?it/s]

Parsing:   0%|          | 0/2762 [00:00<?, ?it/s]

Saving the dataset (0/1 shards):   0%|          | 0/563181 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/16153 [00:00<?, ? examples/s]

In [None]:
from datasets import concatenate_datasets
def create_optimal_training_set(dataset, target_positive_ratio=0.7, seed=42):
  """Down-sample the positive class so it makes up about ``target_positive_ratio``.

  All negative examples (``labels == 0``) are kept. Positives are shuffled and
  truncated to ``int(n_neg * r / (1 - r))`` examples (or fewer if not enough
  exist), then both parts are concatenated and shuffled.

  Args:
    dataset: dataset with an integer ``labels`` column (1 = positive).
    target_positive_ratio: desired share of positives, ``0 <= r < 1``.
    seed: seed used for both shuffles so the result is reproducible.

  Returns:
    The shuffled, rebalanced dataset.

  Raises:
    ValueError: if ``target_positive_ratio`` is outside ``[0, 1)``.
  """
  if not 0 <= target_positive_ratio < 1:
    raise ValueError(
        f"target_positive_ratio must be in [0, 1), got {target_positive_ratio}"
    )

  pos = dataset.filter(lambda x: x["labels"] == 1)
  neg = dataset.filter(lambda x: x["labels"] == 0)
  n_neg = len(neg)
  # Avoid dividing by zero when the dataset contains no negatives.
  ratio_text = f"{len(pos)/n_neg:.1f}:1" if n_neg else "n/a"
  print(f"Original distribution -> pos: {len(pos):,}, neg: {len(neg):,} (ratio {ratio_text})")

  n_pos_needed = int(n_neg * target_positive_ratio / (1 - target_positive_ratio))
  n_pos = min(len(pos), n_pos_needed)
  pos_s = pos.shuffle(seed=seed).select(range(n_pos))
  # Seed the final shuffle as well; unseeded, the example order changed on every run.
  balanced = concatenate_datasets([pos_s, neg]).shuffle(seed=seed)

  n_total = len(balanced)
  actual_ratio = sum(balanced["labels"]) / n_total if n_total else 0.0
  print(f"Balanced set -> total: {len(balanced):,}, pos%: {actual_ratio*100:.1f}%")
  return balanced

In [None]:
from datasets import ClassLabel

train_ds = create_optimal_training_set(train_raw, target_positive_ratio=0.7)

# Optional quick-iteration mode driven by the TRAIN_FRACTION config constant:
# keep only the first TRAIN_FRACTION of the (already shuffled) balanced set.
# With the default of 1.0 this is a no-op, so full training runs are unaffected.
if TRAIN_FRACTION < 1.0:
    n_keep = max(1, int(len(train_ds) * TRAIN_FRACTION))
    train_ds = train_ds.select(range(n_keep))

# Stratified splitting needs a ClassLabel column, so cast the integer labels first.
test_raw = test_raw.cast_column("labels", ClassLabel(names=["incorrect", "correct"]))
# Split the official test file 50/50 into validation and test, keeping the label balance.
test_split = test_raw.train_test_split(test_size=0.5, stratify_by_column="labels", seed=42)
val_ds  = test_split["train"]
test_ds = test_split["test"]

print("Sizes -> train:", len(train_ds), "val:", len(val_ds), "test:", len(test_ds))

Filter:   0%|          | 0/563181 [00:00<?, ? examples/s]

Filter:   0%|          | 0/563181 [00:00<?, ? examples/s]

Original distribution -> pos: 528,078, neg: 35,103 (ratio 15.0:1)
Balanced set -> total: 117,009, pos%: 70.0%


Casting the dataset:   0%|          | 0/16153 [00:00<?, ? examples/s]

Sizes -> train: 117009 val: 8076 test: 8077


In [None]:
MAX_SEQ_LENGTH = 384

from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, use_fast=True, trust_remote_code=True)
# The trainer scores each example from the hidden state at the end of the
# sequence, so padding goes on the left.
tok.padding_side = "left"
# Every prompt ends with "Current step: ... Is this step correct?". Truncating
# on the right would cut exactly that part off for prompts longer than
# MAX_SEQ_LENGTH, so drop tokens from the start of the prompt instead.
tok.truncation_side = "left"

def tok_map(batch):
  """Tokenize a batch of prompts and carry the labels through.

  Args:
    batch: mapping with a ``text`` list and a ``labels`` list.

  Returns:
    The tokenizer output (truncated to ``MAX_SEQ_LENGTH`` tokens, unpadded)
    with the batch's ``labels`` attached.
  """
  enc = tok(batch["text"], truncation=True, max_length=MAX_SEQ_LENGTH)
  enc["labels"] = batch["labels"]
  return enc

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

In [None]:
from transformers import DataCollatorWithPadding

# Tokenize the train/validation splits. The original columns are removed, so each row
# holds only what tok_map returns (input_ids, attention_mask, labels).
train_tokenized = train_ds.map(tok_map, batched=True, remove_columns = train_ds.column_names)
val_tokenized = val_ds.map(tok_map, batched=True, remove_columns=val_ds.column_names)

# Examples are stored unpadded; the collator pads them when a batch is assembled.
data_collator = DataCollatorWithPadding(tok, padding=True, max_length=MAX_SEQ_LENGTH)
# Sanity check: show the Python type of every field of the first training row.
print("Example:", {k: type(v) for k,v in train_tokenized[0].items()})

Map:   0%|          | 0/117009 [00:00<?, ? examples/s]

Map:   0%|          | 0/8076 [00:00<?, ? examples/s]

Example: {'labels': <class 'int'>, 'input_ids': <class 'list'>, 'attention_mask': <class 'list'>}


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from peft import LoraConfig, get_peft_model, PeftModel
from transformers import (AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorWithPadding)
device = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16

# Model: base -> LoRA -> PRM head
# This construction has to run before the trainer cell, which reads
# `peft_model` and `prm_head`; leaving it commented out makes the notebook
# fail with a NameError on a fresh kernel.
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    dtype=DTYPE,
    device_map="auto", #Set to cuda when training
    trust_remote_code=True
)
base.config.return_dict=True
base.config.use_cache=True

lcfg = LoraConfig(
    r = 16, #Rank of hidden matrices i.e [n,r] x [r,n]
    lora_alpha=32, #How much we want to change the base LM
    target_modules=["q_proj","k_proj","v_proj","o_proj"], #Target the query, key, value, output (all) layers for best results as per the LoRA paper
    lora_dropout=0.05, #Prevent overfitting
    bias="none", #Bias shouldn't be updated in LORA
    task_type="CAUSAL_LM"
)

peft_model = get_peft_model(base, lcfg)
peft_model.print_trainable_parameters()

# The PRM head maps one hidden-state vector to a single correctness logit.
hidden_size = getattr(peft_model.config, "hidden_size", None) or getattr(peft_model.config, "hidden_sizes", [None])[0]
assert hidden_size is not None
prm_head = nn.Linear(hidden_size, 1).to(device, dtype=DTYPE)

In [None]:
class PRMTrainer(Trainer):
    """Trainer for a process reward model: a language model plus a linear head.

    The head turns the hidden state of the last attended token into one logit
    per sequence, trained with binary cross-entropy against 0/1 labels.

    Args:
        prm_head: ``nn.Linear(hidden_size, 1)`` scoring head.
        **kwargs: forwarded unchanged to ``transformers.Trainer``.
    """

    def __init__(self, prm_head, **kwargs):
        super().__init__(**kwargs)
        self.prm_head = prm_head
        if self.model is not None:
            # Trainer builds its optimizer (and calls zero_grad / gradient
            # clipping) from the parameters of `self.model`. A head that lives
            # only on the trainer is never updated and stays at its random
            # initialisation, so register it as a sub-module of the model.
            self.model.add_module("prm_head", prm_head)
        # compute_loss returns a plain per-micro-batch mean and ignores
        # num_items_in_batch. Tell Trainer so, otherwise recent versions skip
        # the division by gradient_accumulation_steps and the accumulated
        # gradient (and the logged training loss) is that many times too large.
        self.model_accepts_loss_kwargs = False

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Binary cross-entropy between the head's logit and the step label.

        Args:
            model: the (LoRA-wrapped) language model.
            inputs: batch containing ``labels`` and ``attention_mask`` plus the
                model inputs; ``labels`` is removed from it.
            return_outputs: when True also return an output object whose
                ``logits`` are the per-sequence PRM logits.
            num_items_in_batch: accepted for Trainer compatibility, unused.

        Returns:
            ``loss`` or ``(loss, outputs)``.
        """
        labels = inputs.pop("labels").to(torch.float32)
        attention_mask = inputs["attention_mask"]
        outputs = model(
            **inputs,
            output_hidden_states=True,
            use_cache=False,
        )
        last_hidden = outputs.hidden_states[-1]

        # Index of the last attended token in every row. Picking it from the
        # mask (instead of always using the final position) gives the same
        # result with left padding and stays correct with right padding.
        positions = torch.arange(attention_mask.size(1), device=attention_mask.device)
        last_idx = (attention_mask * positions).argmax(dim=1).to(last_hidden.device)
        rows = torch.arange(last_hidden.size(0), device=last_hidden.device)
        h_last = last_hidden[rows, last_idx]

        head_weight = self.prm_head.weight
        logits = self.prm_head(h_last.to(device=head_weight.device, dtype=head_weight.dtype)).squeeze(-1)
        labels = labels.to(device=logits.device, dtype=logits.dtype)
        loss = F.binary_cross_entropy_with_logits(logits, labels)
        if return_outputs:
            # Return only the PRM logits. Handing back the full model output
            # makes evaluation collect every layer's hidden states for the
            # whole dataset (out-of-memory, and `predictions` becomes a tuple).
            try:
                slim_outputs = type(outputs)(logits=logits)
            except TypeError:
                slim_outputs = {"logits": logits}
            return loss, slim_outputs
        return loss

    def _load_best_model(self):
        """Reload the best checkpoint's PRM head together with its adapter."""
        super()._load_best_model()
        best_dir = self.state.best_model_checkpoint
        if not best_dir:
            return
        head_path = os.path.join(best_dir, "prm_head.bin")
        if os.path.exists(head_path):
            state = torch.load(head_path, map_location=self.prm_head.weight.device)
            self.prm_head.load_state_dict(state)

    def save_model(self, output_dir=None, _internal_call=False):
        """Save the PEFT adapter, the PRM head, the tokenizer and export metadata.

        Args:
            output_dir: target directory; defaults to ``self.args.output_dir``.
            _internal_call: accepted for Trainer compatibility, unused.
        """
        # Always save adapter + PRM head + tokenizer + metadata
        output_dir = output_dir or self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        self.model.save_pretrained(output_dir)  # saves PEFT adapter
        # Save PRM head
        torch.save(self.prm_head.state_dict(), os.path.join(output_dir, "prm_head.bin"))
        with open(os.path.join(output_dir, "prm_head_config.json"), "w") as f:
            json.dump({"hidden_size": int(self.prm_head.in_features)}, f, indent=2)
        # Save tokenizer if available. Prefer `processing_class`; fall back to
        # the older `tokenizer` attribute when it does not exist.
        processor = getattr(self, "processing_class", None)
        if processor is None:
            processor = getattr(self, "tokenizer", None)
        if processor is not None:
            try:
                processor.save_pretrained(output_dir)
            except Exception as exc:
                # Still best-effort, but say so instead of failing silently.
                print(f"Warning: tokenizer was not saved to {output_dir}: {exc}")
        # Extra metadata for easy reloads. The LoRA values mirror the LoraConfig
        # used in this notebook; keep the two in sync.
        with open(os.path.join(output_dir, "export_metadata.json"), "w") as f:
            json.dump({
                "base_model": BASE_MODEL_NAME,
                "dtype": str(DTYPE).replace("torch.", ""),
                "lora": {"r": 16, "alpha": 32, "dropout": 0.05, "targets": ["q_proj","k_proj","v_proj","o_proj"]},
                "max_seq_length": MAX_SEQ_LENGTH
            }, f, indent=2)
        print(f"Saved adapter + head + tokenizer to {output_dir}")

In [None]:
# Training configuration passed to PRMTrainer.
args = TrainingArguments(
    output_dir=f"{PROJECT_PATH}/checkpoints",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    # per_device_eval_batch_size=8,
    per_device_eval_batch_size=1, # reduced to avoid out-of-memory errors during evaluation
    gradient_accumulation_steps=4,  # 4 x 4 = 16 sequences per optimizer step and device
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=2000,
    save_strategy="steps",   # checkpoint every save_steps, the same cadence as evaluation
    save_steps=2000,
    load_best_model_at_end=True,  # "best" is the lowest eval_loss, per the two settings below
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    bf16=True,
    fp16=False,
    report_to="none",
    dataloader_num_workers=2,
)

In [None]:
# Build the trainer around the LoRA-wrapped model and the PRM head, then train.
# `peft_model` and `prm_head` must already be defined in the kernel when this cell runs.
# NOTE(review): the run log shows a deprecation warning that points to `processing_class`
# as the replacement for `tokenizer` — consider switching once the installed version allows.
trainer = PRMTrainer(
    model=peft_model,
    prm_head=prm_head,
    args=args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tok,
    data_collator=data_collator,
)
train_result = trainer.train()
print("Training complete.", train_result.metrics)

  super().__init__(**kwargs)
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss,Validation Loss
2000,1.0619,0.323211
4000,0.9154,0.246664
6000,1.0601,0.201161
8000,0.6949,0.224898
10000,0.6476,0.25096
12000,0.7474,0.263885
14000,0.5836,0.255581


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Saved adapter + head + tokenizer to /content/drive/MyDrive/UC DAVIS/ECS289A-LLM/prm_project/run-2/checkpoints/checkpoint-2000


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Saved adapter + head + tokenizer to /content/drive/MyDrive/UC DAVIS/ECS289A-LLM/prm_project/run-2/checkpoints/checkpoint-4000


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Saved adapter + head + tokenizer to /content/drive/MyDrive/UC DAVIS/ECS289A-LLM/prm_project/run-2/checkpoints/checkpoint-6000


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Saved adapter + head + tokenizer to /content/drive/MyDrive/UC DAVIS/ECS289A-LLM/prm_project/run-2/checkpoints/checkpoint-8000


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Saved adapter + head + tokenizer to /content/drive/MyDrive/UC DAVIS/ECS289A-LLM/prm_project/run-2/checkpoints/checkpoint-10000


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Saved adapter + head + tokenizer to /content/drive/MyDrive/UC DAVIS/ECS289A-LLM/prm_project/run-2/checkpoints/checkpoint-12000


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Saved adapter + head + tokenizer to /content/drive/MyDrive/UC DAVIS/ECS289A-LLM/prm_project/run-2/checkpoints/checkpoint-14000


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Saved adapter + head + tokenizer to /content/drive/MyDrive/UC DAVIS/ECS289A-LLM/prm_project/run-2/checkpoints/checkpoint-14628
Training complete. {'train_runtime': 24094.7025, 'train_samples_per_second': 9.712, 'train_steps_per_second': 0.607, 'total_flos': 3.8538290358499676e+18, 'train_loss': 0.8752943780991365, 'epoch': 2.0}


# Save model

In [None]:
## IMPORTANT: Run Only if training is stopped mid run
from transformers import AutoModelForCausalLM
from peft import PeftModel

CKPT_DIR = f"{PROJECT_PATH}/checkpoints_main/checkpoint-4000"  # pick the path of checkpoint we have decided
# NOTE(review): training writes to "{PROJECT_PATH}/checkpoints"; confirm "checkpoints_main" is the intended folder.
if not os.path.isdir(CKPT_DIR):
    raise FileNotFoundError(f"Checkpoint directory not found: {CKPT_DIR}")

base_reload = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    torch_dtype=DTYPE,
    device_map="auto",
    trust_remote_code=True,
)
peft_reload = PeftModel.from_pretrained(base_reload, CKPT_DIR)

# Restore the PRM head that was saved next to the adapter, so the export pairs
# the adapter with the head from the same checkpoint (and the cell also works
# in a session where no in-memory `prm_head` exists).
hidden_reload = getattr(peft_reload.config, "hidden_size", None) or getattr(peft_reload.config, "hidden_sizes", [None])[0]
prm_head_reload = nn.Linear(hidden_reload, 1).to(device, dtype=DTYPE)
prm_head_reload.load_state_dict(
    torch.load(os.path.join(CKPT_DIR, "prm_head.bin"), map_location=device)
)

trainer = PRMTrainer(
    model=peft_reload,
    prm_head=prm_head_reload,
    args=args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tok,
    data_collator=data_collator,
)

EXPORT_DIR = f"{PROJECT_PATH}/final_export_from_checkpoint"
os.makedirs(EXPORT_DIR, exist_ok=True)
trainer.save_model(EXPORT_DIR)


In [None]:
EXPORT_DIR = f"{PROJECT_PATH}/checkpoints/checkpoint-6000"
DTYPE = torch.bfloat16

# A path that is not an existing directory is interpreted as a Hub repo id and
# surfaces as a confusing HFValidationError; fail early with a clear message.
if not os.path.isdir(EXPORT_DIR):
    raise FileNotFoundError(f"Checkpoint directory not found: {EXPORT_DIR}")

tok2 = AutoTokenizer.from_pretrained(EXPORT_DIR, local_files_only=True)
base2 = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME, dtype=DTYPE, device_map="auto", trust_remote_code=True)
peft2 = PeftModel.from_pretrained(base2, EXPORT_DIR, local_files_only=True)


# Load PRM head
hs = getattr(peft2.config, "hidden_size", None) or getattr(peft2.config, "hidden_sizes", [None])[0]
prm_head2 = nn.Linear(hs, 1).to(device, dtype=DTYPE)
# Load onto the device selected above rather than a hard-coded "cuda".
prm_head2.load_state_dict(torch.load(os.path.join(EXPORT_DIR, "prm_head.bin"), map_location=device))

# Reduce evaluation batch size to avoid OutOfMemoryError
args.per_device_eval_batch_size = 1

trainer = PRMTrainer(
    model=peft2,
    prm_head=prm_head2,   # head restored from the checkpoint directory above
    args=args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tok,
    data_collator=data_collator,
)

HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/content/drive/MyDrive/UC DAVIS/ECS289A-LLM/prm_project/run-2/checkpoints/checkpoint-6000'. Use `repo_type` argument if needed.

In [None]:
## Otherwise, run this cell if training completed successfully.
# Writes the adapter, PRM head, tokenizer and metadata from the current `trainer` to EXPORT_DIR.
EXPORT_DIR = f"{PROJECT_PATH}/final_export"
os.makedirs(EXPORT_DIR, exist_ok=True)
# HF_REPO_ID = "your-username/prm-qwen3-8b-bf16-final"  # change me before pushing
HF_REPO_ID = "Kaubitech/prm-qwen3-8b-bf16-final"  # change me before pushing

trainer.save_model(EXPORT_DIR)
# List the exported files as a quick check.
print("Files in EXPORT_DIR:")
!ls -lah "$EXPORT_DIR"

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Saved adapter + head + tokenizer to /content/drive/MyDrive/UC DAVIS/ECS289A-LLM/prm_project/run-2/final_export
Files in EXPORT_DIR:
total 74M
-rw------- 1 root root  876 Nov 14 00:22 adapter_config.json
-rw------- 1 root root  59M Nov 14 00:22 adapter_model.safetensors
-rw------- 1 root root  707 Nov 14 00:22 added_tokens.json
-rw------- 1 root root 4.1K Nov 14 00:22 chat_template.jinja
-rw------- 1 root root  237 Nov 14 00:22 export_metadata.json
-rw------- 1 root root 1.6M Nov 14 00:22 merges.txt
-rw------- 1 root root 9.9K Nov 14 00:22 prm_head.bin
-rw------- 1 root root   25 Nov 14 00:22 prm_head_config.json
-rw------- 1 root root 5.1K Nov 14 00:22 README.md
-rw------- 1 root root  613 Nov 14 00:22 special_tokens_map.json
-rw------- 1 root root 5.3K Nov 14 00:22 tokenizer_config.json
-rw------- 1 root root  11M Nov 14 00:22 tokenizer.json
-rw------- 1 root root 2.7M Nov 14 00:22 vocab.json


In [None]:
# 12) (Optional) Push to Hugging Face (adapter + head + tokenizer + metadata)
from huggingface_hub import HfApi, create_repo, upload_folder, snapshot_download, login

# Upload is opt-in: nothing is pushed unless DO_PUSH is switched to True.
DO_PUSH = False  # set True to push
if DO_PUSH:
    login()
    try:
        # exist_ok=True is passed, so an existing repo should not raise here.
        create_repo(HF_REPO_ID, repo_type="model", private=False, exist_ok=True)
    except Exception as e:
        # Any failure is only reported; the upload below is still attempted.
        print("Repo may already exist:", e)
    # Upload everything in EXPORT_DIR as a single commit.
    upload_folder(
        folder_path=EXPORT_DIR,
        repo_id=HF_REPO_ID,
        repo_type="model",
        commit_message="Add LoRA adapter + PRM head + tokenizer + metadata",
    )
    print("Uploaded to:", HF_REPO_ID)
else:
    print("Skipping push (set DO_PUSH=True to upload).")

Skipping push (set DO_PUSH=True to upload).


In [None]:
def free_gpu_memory(extra_globals=()):
    """Drop the trainer's model reference and release cached GPU memory.

    Safe to call repeatedly: a missing ``trainer`` or an already removed
    ``trainer.model`` is ignored, and CUDA calls are skipped without a GPU.

    Memory is only returned once *every* reference to the model is gone.
    Other notebook variables that still point at it keep the weights alive;
    pass their names in ``extra_globals`` to delete them as well.

    Args:
        extra_globals: names of notebook-level variables to delete
            (unknown names are ignored).
    """
    import torch, gc
    scope = globals()
    active_trainer = scope.get("trainer")
    if active_trainer is not None and hasattr(active_trainer, "model"):
        del active_trainer.model
    for var_name in extra_globals:
        scope.pop(var_name, None)
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

# NOTE: this removes trainer.model; the evaluation cells further down call trainer.predict,
# so a trainer with a model has to be rebuilt before running them.
free_gpu_memory()

In [None]:
# Show current GPU memory usage.
!nvidia-smi

Fri Nov 14 01:43:50 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:05.0 Off |                    0 |
| N/A   33C    P0             57W /  400W |   71907MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

# Evaluation

In [None]:
# Tokenize the held-out test split the same way as the train/validation splits.
test_tokenized = test_ds.map(tok_map,batched=True,remove_columns=test_ds.column_names)

Map:   0%|          | 0/8077 [00:00<?, ? examples/s]

In [None]:
import numpy as np

def eval_on(dataset_tokenized, name="eval", predictor=None):
    """Run prediction on a tokenized dataset and report loss and accuracy.

    Args:
        dataset_tokenized: dataset handed straight to ``predict``.
        name: label used in the printed summary line.
        predictor: object exposing ``predict(dataset)``; defaults to the
            notebook-level ``trainer``.

    Returns:
        dict with ``loss`` (float, taken from ``metrics["test_loss"]``),
        ``accuracy`` (float, at a 0.5 probability threshold), ``probs`` and
        ``labels`` (1-D float32 arrays).

    Raises:
        ValueError: if the number of logits and labels differ.
    """
    runner = trainer if predictor is None else predictor
    # predict returns an object with predictions, label_ids and metrics
    preds = runner.predict(dataset_tokenized)

    raw = preds.predictions
    # With several model outputs `predictions` is a tuple; the PRM logits are
    # assumed to be its first entry.
    if isinstance(raw, (tuple, list)):
        raw = raw[0]
    # reshape(-1) accepts both (N,) and (N, 1); squeeze(-1) fails on (N,).
    logits = np.asarray(raw, dtype="float32").reshape(-1)
    labels = np.asarray(preds.label_ids).astype("float32").reshape(-1)
    if logits.shape != labels.shape:
        raise ValueError(
            f"Got {logits.shape[0]} logits for {labels.shape[0]} labels"
        )

    # convert logits -> probabilities (overflow-safe form of the sigmoid)
    probs = np.exp(-np.logaddexp(0.0, -logits)).astype("float32")
    # sigmoid(x) >= 0.5 exactly when x >= 0
    y_hat = (logits >= 0).astype("float32")

    acc = (y_hat == labels).mean()
    print(f"{name}  |  loss: {preds.metrics['test_loss']:.4f}  |  accuracy: {acc:.4f}")
    return {
        "loss": float(preds.metrics["test_loss"]),
        "accuracy": float(acc),
        "probs": probs,
        "labels": labels,
    }

# Evaluate on the held-out test split.
test_stats = eval_on(test_tokenized, name="TEST")

AttributeError: `AcceleratorState` object has no attribute `distributed_type`. This happens if `AcceleratorState._reset_state()` was called and an `Accelerator` or `PartialState` was not reinitialized.

In [None]:
import numpy as np

def eval_on(dataset_tokenized, name="eval", predictor=None):
    """Run prediction on a tokenized dataset and report loss and accuracy.

    Args:
        dataset_tokenized: dataset handed straight to ``predict``.
        name: label used in the printed summary line.
        predictor: object exposing ``predict(dataset)``; defaults to the
            notebook-level ``trainer``.

    Returns:
        dict with ``loss`` (float, taken from ``metrics["test_loss"]``),
        ``accuracy`` (float, at a 0.5 probability threshold), ``probs`` and
        ``labels`` (1-D float32 arrays).

    Raises:
        ValueError: if the number of logits and labels differ.
    """
    runner = trainer if predictor is None else predictor
    # predict returns an object with predictions, label_ids and metrics
    preds = runner.predict(dataset_tokenized)

    raw = preds.predictions
    # With several model outputs `predictions` is a tuple; the PRM logits are
    # assumed to be its first entry.
    if isinstance(raw, (tuple, list)):
        raw = raw[0]
    # reshape(-1) accepts both (N,) and (N, 1); squeeze(-1) fails on (N,).
    logits = np.asarray(raw, dtype="float32").reshape(-1)      # (N,)
    labels = np.asarray(preds.label_ids).astype("float32").reshape(-1)  # (N,)
    if logits.shape != labels.shape:
        raise ValueError(
            f"Got {logits.shape[0]} logits for {labels.shape[0]} labels"
        )

    # convert logits -> probabilities (overflow-safe form of the sigmoid)
    probs = np.exp(-np.logaddexp(0.0, -logits)).astype("float32")
    # sigmoid(x) >= 0.5 exactly when x >= 0
    y_hat = (logits >= 0).astype("float32")

    acc = (y_hat == labels).mean()
    print(f"{name}  |  loss: {preds.metrics['test_loss']:.4f}  |  accuracy: {acc:.4f}")
    return {
        "loss": float(preds.metrics["test_loss"]),
        "accuracy": float(acc),
        "probs": probs,
        "labels": labels,
    }

# Evaluate on the validation and test halves of the official test file.
val_stats  = eval_on(val_tokenized,  name="VAL")
test_stats = eval_on(test_tokenized, name="TEST")


# Reload the model from local storage and run small tests

## Reload model

In [None]:
# Mount Google Drive again so the exported model files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import (AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorWithPadding)
from peft import PeftModel
import torch
import os
import torch.nn as nn
import torch.nn.functional as F
device = "cuda" if torch.cuda.is_available() else "cpu"
MAX_SEQ_LENGTH = 384

# 13) Inference loader from EXPORT_DIR
# Requires PROJECT_PATH and BASE_MODEL_NAME from the configuration cell.
EXPORT_DIR = f"{PROJECT_PATH}/final_export"
DTYPE = torch.bfloat16

tok2 = AutoTokenizer.from_pretrained(EXPORT_DIR, trust_remote_code=True)
base2 = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME, dtype=DTYPE, device_map="auto", trust_remote_code=True)
peft2 = PeftModel.from_pretrained(base2, EXPORT_DIR)
# Scoring only: make sure dropout (including LoRA dropout) is disabled.
peft2.eval()

# Load PRM head
hs = getattr(peft2.config, "hidden_size", None) or getattr(peft2.config, "hidden_sizes", [None])[0]
prm_head2 = nn.Linear(hs, 1).to(device, dtype=DTYPE)
# Load onto the device selected above rather than a hard-coded "cuda".
prm_head2.load_state_dict(torch.load(os.path.join(EXPORT_DIR, "prm_head.bin"), map_location=device))
prm_head2.eval()



Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Linear(in_features=4096, out_features=1, bias=True)

## Run evaluation

In [None]:

@torch.inference_mode()
def score_steps(prompt, partial, step):
    """Return the model's probability that ``step`` is a correct next step.

    Args:
        prompt: the problem statement.
        partial: text of the previous steps; empty or whitespace-only means
            there are none.
        step: the candidate step to score.

    Returns:
        float in [0, 1]: sigmoid of the PRM head's logit.
    """
    # Use the same template as the training data, which strips the previous
    # steps before inserting them.
    context = partial.strip()
    if context:
        text = f"Problem: {prompt}\nPrevious steps: {context}\nCurrent step: {step}\nIs this step correct?"
    else:
        text = f"Problem: {prompt}\nCurrent step: {step}\nIs this step correct?"
    enc = tok2(text, return_tensors="pt", truncation=True, max_length=MAX_SEQ_LENGTH, padding=True).to(device)
    out = peft2(**enc, output_hidden_states=True, use_cache=False)
    last = out.hidden_states[-1]
    # Position of the last attended token, taken from the mask so that it is
    # right for either padding side (mask.sum() - 1 is only right without
    # left padding).
    mask = enc["attention_mask"]
    positions = torch.arange(mask.size(1), device=mask.device)
    idx = (mask * positions).argmax(dim=1).to(last.device)
    h = last[torch.arange(last.size(0), device=last.device), idx]
    logits = prm_head2(h.to(prm_head2.weight.dtype)).squeeze(-1)
    return torch.sigmoid(logits.float()).item()

# Quick check that scoring runs end to end: one step, no previous steps.
print("Smoke test:", score_steps("2x + 3 = 7", "", "Subtract 3 from both sides: 2x = 4"))

Smoke test: 0.9935117363929749


In [None]:
def demo_problem(problem, steps, partial=""):
    """Score each candidate step for ``problem`` and print them best-first.

    Every entry of ``steps`` is scored independently against the same
    ``partial`` context; the output is one ``score  ::  step`` line per
    candidate, highest score first, followed by a separator line.
    """
    print(f"Problem: {problem}\n")
    ranked = [(score_steps(problem, partial, candidate), candidate) for candidate in steps]
    # highest probability first
    ranked.sort(key=lambda pair: -pair[0])
    for prob, candidate in ranked:
        print(f"{prob:0.3f}  ::  {candidate}")
    print("-" * 60)

# Three hand-written sanity checks; the trailing "good"/"bad" comments give the expected verdict.
# NOTE: in the recorded output of this cell, all four candidates of example 1 score about 0.99,
# i.e. the model does not separate the incorrect steps there.

# 1) Simple equation
problem1 = "Solve 2x + 3 = 7."
steps1 = [
    "Subtract 3 from both sides to get 2x = 4.",              # good
    "Add 3 to both sides to get 2x = 10.",                    # bad
    "Divide both sides by 2 to get x = 2.",                   # good
    "Multiply both sides by 2 to get 4x = 14.",               # bad
]

demo_problem(problem1, steps1)

# 2) Multi-step with 'previous steps' filled in
problem2 = "Compute the product 23 × 17."
partial2 = "First, compute 23 × 10 = 230. Then, compute 23 × 7 = 161."
steps2 = [
    "Add the partial products: 230 + 161 = 391.",             # good
    "Add the partial products: 230 + 161 = 381.",             # bad
]

demo_problem(problem2, steps2, partial=partial2)

# 3) Intentionally nonsense
problem3 = "Evaluate 5 + 8."
steps3 = [
    "5 + 8 = 13.",                                            # good
    "5 + 8 = 58 because we concatenate the digits.",          # bad
    "5 + 8 = 20.",                                            # bad
]

demo_problem(problem3, steps3)


Problem: Solve 2x + 3 = 7.

0.995  ::  Subtract 3 from both sides to get 2x = 4.
0.992  ::  Add 3 to both sides to get 2x = 10.
0.991  ::  Divide both sides by 2 to get x = 2.
0.991  ::  Multiply both sides by 2 to get 4x = 14.
------------------------------------------------------------
Problem: Compute the product 23 × 17.

0.987  ::  Add the partial products: 230 + 161 = 391.
0.943  ::  Add the partial products: 230 + 161 = 381.
------------------------------------------------------------
Problem: Evaluate 5 + 8.

0.991  ::  5 + 8 = 13.
0.587  ::  5 + 8 = 58 because we concatenate the digits.
0.163  ::  5 + 8 = 20.
------------------------------------------------------------


## Eval: Actual testing on the held-out dataset