In [1]:
pip install unsloth  trl

Collecting unsloth
  Downloading unsloth-2025.9.6-py3-none-any.whl.metadata (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.23.0-py3-none-any.whl.metadata (11 kB)
Collecting unsloth_zoo>=2025.9.7 (from unsloth)
  Downloading unsloth_zoo-2025.9.8-py3-none-any.whl.metadata (31 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.31-py3-none-any.whl.metadata (11 kB)
Collecting huggingface_hub>=0.34.0 (from unsloth)
  Downloading huggingface_hub-0.35.0-py3-none-any.whl.metadata (14 kB)
INFO: pip is looking at multiple versions of trl to determine which version is compatible with other requirements. This cou

In [6]:
!git clone https://github.com/domager73/base.git

fatal: destination path 'base' already exists and is not an empty directory.


In [7]:
import sys
sys.path.append('/kaggle/working/base')

In [8]:
import torch
from torch.utils.data import IterableDataset, Dataset
from datasets import Dataset as HFDataset
import random
from typing import Optional, List
import numpy as np
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from trl import GRPOConfig, GRPOTrainer
import matplotlib.pyplot as plt
import json
from huggingface_hub import HfApi
from point_set import PointSet
from tqdm import tqdm

In [9]:
class PointSetIterableDataset(IterableDataset):
    def __init__(self, game, num_samples=1000, difficulty=None):
        self.game = game
        self.num_samples = num_samples
        self.difficulty = difficulty

    def __iter__(self):
        for _ in range(self.num_samples):
            game_data_list = self.game.generate(
                num_of_questions=1,
                difficulty=self.difficulty if self.difficulty else random.randint(1, 10)
            )

            if game_data_list:
                data = game_data_list[0]
                yield {
                    "question": data.question,
                    "answer": data.answer,
                    "difficulty": data.difficulty,
                    "metadata": data.metadata
                }

In [10]:
def create_test_datasets(game, difficulties=[1, 3, 5, 7, 10], samples_per_difficulty=100):
    test_datasets = {}

    for difficulty in difficulties:
        print(f"Creating test dataset for difficulty {difficulty}...")
        game_data_list = game.generate(
            num_of_questions=samples_per_difficulty,
            difficulty=difficulty
        )

        dataset_data = []
        for data in game_data_list:
            dataset_data.append({
                "question": data.question,
                "answer": data.answer,
                "difficulty": data.difficulty,
                "metadata": data.metadata
            })

        test_datasets[f"difficulty_{difficulty}"] = HFDataset.from_list(dataset_data)

    return test_datasets


def format_prompt(question):
    return f"""<|im_start|>system
{SYSTEM_PROMPT}<|im_end|>
<|im_start|>user
{question}<|im_end|>
<|im_start|>assistant
"""


def correctness_reward_func(game, examples, predictions, **kwargs):
    rewards = []
    for i, (example, prediction) in enumerate(zip(examples, predictions)):
        data = type('Data', (), {
            'question': example['question'],
            'answer': example['answer'],
            'difficulty': example['difficulty'],
            'metadata': example['metadata']
        })()

        is_correct = game.verify(data, prediction)
        rewards.append(1.0 if is_correct else 0)

    return rewards

In [11]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [12]:
import unsloth
from unsloth import FastLanguageModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import GRPOConfig, GRPOTrainer
from point_set import PointSet
from point_set_verifier import PointSetVerifier
import torch

class PointSetGRPODataset(torch.utils.data.Dataset):
    def __init__(self, game, num_samples=5000, difficulties=[1, 3, 5, 7, 10]):
        self.game = game
        self.num_samples = num_samples
        self.data = []


        for i in tqdm(difficulties):
          for _ in range(num_samples):
              data_point = game.generate(num_of_questions=1,difficulty = i)
              prompt = format_prompt(data_point[0].question)
              self.data.append({
                  "prompt": prompt,
                  "question": data_point[0].question,
                  "answer": data_point[0].answer,
                  "difficulty": data_point[0].difficulty,
                  "metadata": data_point[0].metadata
              })

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


def preprocess_function(examples, tokenizer):
    model_inputs = tokenizer(
        examples["prompt"],
        max_length=1024,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    return model_inputs

def format_prompt(question):
    SYSTEM_PROMPT = """You are an expert in point set topology. Answer ONLY with one word: 'internal', 'boundary', or 'external'. Do not include any other text, explanations, or formatting."""
    return f"""<|im_start|>system
{SYSTEM_PROMPT}<|im_end|>
<|im_start|>user
{question}<|im_end|>
<|im_start|>assistant
"""


game = PointSet()

print('Genering train data')
train_dataset = PointSetGRPODataset(game, num_samples=1200, difficulties = [3])
print('Genering test data')
test_datasets = create_test_datasets(game,difficulties=[3])

model_name = "Qwen/Qwen2.5-3B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = 1024,
    dtype = None,
    load_in_4bit = True,
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 8,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_alpha = 8,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = True,
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Genering train data


100%|██████████| 1/1 [00:00<00:00,  1.26it/s]


Genering test data
Creating test dataset for difficulty 3...
==((====))==  Unsloth 2025.9.6: Fast Qwen2 patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.36G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.9.6 patched 36 layers with 36 QKV layers, 36 O layers and 0 MLP layers.


In [14]:
training_args = GRPOConfig(
    output_dir="./point_set_agent",
    per_device_train_batch_size=3,
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    optim="adamw_torch",
    max_steps=100,
    logging_steps=1,
    save_steps=50,
    eval_steps=25,
    warmup_steps=10,
    fp16=True,
    report_to="none",
    remove_unused_columns=False,
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 3 to the `num_generations` of 8


In [15]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


log_file =  f"training_log.json"
def grpo_reward_func(prompts, completions, **kwargs):
    rewards = []
    verifier = PointSetVerifier()

    step_logs = []

    prompt_to_data = {}
    for item in train_dataset.data:
        prompt_to_data[item["prompt"]] = item

    for i, (prompt, completion) in enumerate(zip(prompts, completions)):
        if prompt in prompt_to_data:
            data_item = prompt_to_data[prompt]

            class Data:
                def __init__(self, question, answer, difficulty, metadata):
                    self.question = question
                    self.answer = answer
                    self.difficulty = difficulty
                    self.metadata = metadata

            data = Data(
                question=data_item['question'],
                answer=data_item['answer'],
                difficulty=data_item['difficulty'],
                metadata=data_item['metadata']
            )

            is_correct = verifier.verify(data, completion)
            reward = 1.0 if is_correct else 0
            rewards.append(reward)

            step_log = {
                "completion": completion,
                "expected_answer": data_item['answer'],
                "is_correct": is_correct,
                "reward": float(reward),
                "difficulty": data_item['difficulty'],
                "question": data_item['question'],
                "answer": data_item['answer'],
            }
            step_logs.append(step_log)

        else:
            rewards.append(0.0)
            step_log = {
                "completion": completion,
                "error": "prompt_not_found_in_dataset",
                "reward": 0.0
            }
            step_logs.append(step_log)

    with open(log_file, 'a', encoding='utf-8') as f:
        for log_entry in step_logs:
            f.write(json.dumps(log_entry, ensure_ascii=False) + '\n')

    return torch.tensor(rewards, dtype=torch.float32)

trainer = GRPOTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    reward_funcs=[grpo_reward_func],
    preprocess_logits_for_metrics=lambda logits, labels: logits.argmax(dim=-1),
)

print("Starting training...")
trainer.train()
print("Training completed successfully!")

Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,200 | Num Epochs = 2 | Total steps = 100
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 8 x 1) = 128
 "-____-"     Trainable parameters = 3,686,400 of 3,089,625,088 (0.12% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,kl,rewards / grpo_reward_func / mean,rewards / grpo_reward_func / std
1,0.0,0.210938,0.212007,2.0,2.0,2.0,0.0,2.0,2.0,2.0,4.7e-05,0.210938,0.409577
2,0.0,0.203125,0.197276,2.0,2.0,2.0,0.0,2.0,2.0,2.0,3.8e-05,0.203125,0.403906
3,0.0,0.304688,0.252244,2.0,2.0,2.0,0.0,2.0,2.0,2.0,5.6e-05,0.304688,0.462084
4,0.0,0.429688,0.255145,2.0,2.0,2.0,0.0,2.0,2.0,2.0,4.2e-05,0.429688,0.496977
5,0.0,0.210938,0.317484,2.0,2.0,2.0,0.0,2.0,2.0,2.0,4.5e-05,0.210938,0.409577
6,0.0,0.289062,0.373746,2.0,2.0,2.0,0.0,2.0,2.0,2.0,2e-05,0.289062,0.455108
7,0.0,0.265625,0.342455,2.0,2.0,2.0,0.0,2.0,2.0,2.0,3.8e-05,0.265625,0.443401
8,0.0,0.25,0.241999,2.0,2.0,2.0,0.0,2.0,2.0,2.0,3.4e-05,0.25,0.434714
9,0.0,0.328125,0.317479,2.0,2.0,2.0,0.0,2.0,2.0,2.0,3.1e-05,0.328125,0.471376
10,0.0,0.28125,0.394545,2.0,2.0,2.0,0.0,2.0,2.0,2.0,0.000149,0.28125,0.451376


KeyboardInterrupt: 

In [10]:
trainer.state.log_history

[{'loss': 0.0,
  'grad_norm': nan,
  'learning_rate': 0.0,
  'num_tokens': 21920.0,
  'completions/mean_length': 2.0,
  'completions/min_length': 2.0,
  'completions/max_length': 2.0,
  'completions/clipped_ratio': 0.0,
  'completions/mean_terminated_length': 2.0,
  'completions/min_terminated_length': 2.0,
  'completions/max_terminated_length': 2.0,
  'rewards/grpo_reward_func/mean': 0.3203125,
  'rewards/grpo_reward_func/std': 0.4684300124645233,
  'reward': 0.3203125,
  'reward_std': 0.36273446679115295,
  'frac_reward_zero_std': 0.125,
  'completion_length': 2.0,
  'kl': 0.0,
  'epoch': 0.013333333333333334,
  'step': 1},
 {'loss': -0.0,
  'grad_norm': nan,
  'learning_rate': 0.0,
  'num_tokens': 45360.0,
  'completions/mean_length': 2.0,
  'completions/min_length': 2.0,
  'completions/max_length': 2.0,
  'completions/clipped_ratio': 0.0,
  'completions/mean_terminated_length': 2.0,
  'completions/min_terminated_length': 2.0,
  'completions/max_terminated_length': 2.0,
  'rewards/g

In [19]:
print('Genering train data')
train_dataset = PointSetGRPODataset(game, num_samples=1200, difficulties = [1])

training_args = GRPOConfig(
    output_dir="./point_set_agent",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    optim="adamw_torch",
    max_steps=50,
    logging_steps=1,
    save_steps=50,
    eval_steps=25,
    warmup_steps=10,
    fp16=True,
    report_to="none",
    remove_unused_columns=False,
)

trainer = GRPOTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    reward_funcs=[grpo_reward_func],
    preprocess_logits_for_metrics=lambda logits, labels: logits.argmax(dim=-1),
)

print("Starting training...")
trainer.train()
print("Training completed successfully!")

Genering train data


100%|██████████| 1/1 [00:00<00:00,  1.99it/s]
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 8
Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,200 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 8 x 1) = 128
 "-____-"     Trainable parameters = 3,686,400 of 3,089,625,088 (0.12% trained)


Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,sampling / sampling_logp_difference / mean,sampling / sampling_logp_difference / max,sampling / importance_sampling_ratio / min,sampling / importance_sampling_ratio / mean,sampling / importance_sampling_ratio / max,kl,rewards / grpo_reward_func / mean,rewards / grpo_reward_func / std
1,0.0,0.0,0.0,256.0,256.0,256.0,1.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0
2,-0.0,0.289062,0.410563,2.0,2.0,2.0,0.0,2.0,2.0,2.0,No Log,No Log,No Log,No Log,No Log,0.0,0.289062,0.455108
3,-0.0,0.195312,0.336918,2.0,2.0,2.0,0.0,2.0,2.0,2.0,No Log,No Log,No Log,No Log,No Log,0.0,0.195312,0.397999
4,-0.0,0.429688,0.360076,2.0,2.0,2.0,0.0,2.0,2.0,2.0,No Log,No Log,No Log,No Log,No Log,0.0,0.429688,0.496977
5,-0.0,0.40625,0.374806,2.0,2.0,2.0,0.0,2.0,2.0,2.0,No Log,No Log,No Log,No Log,No Log,0.0,0.40625,0.493062
6,0.0,0.28125,0.38399,2.0,2.0,2.0,0.0,2.0,2.0,2.0,No Log,No Log,No Log,No Log,No Log,3.7e-05,0.28125,0.451376
7,0.0,0.15625,0.254603,2.453125,2.0,38.0,0.0,2.453125,2.0,38.0,No Log,No Log,No Log,No Log,No Log,2.7e-05,0.15625,0.364519
8,0.0,0.226562,0.296144,2.0,2.0,2.0,0.0,2.0,2.0,2.0,No Log,No Log,No Log,No Log,No Log,2.1e-05,0.226562,0.420252
9,0.0,0.4375,0.412922,2.359375,2.0,48.0,0.0,2.359375,2.0,48.0,No Log,No Log,No Log,No Log,No Log,1.7e-05,0.4375,0.498028
10,0.0,0.390625,0.341399,2.0,2.0,2.0,0.0,2.0,2.0,2.0,No Log,No Log,No Log,No Log,No Log,4.6e-05,0.390625,0.489808


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training completed successfully!


In [20]:
trainer.state.log_history

[{'loss': 0.0,
  'grad_norm': 0.0,
  'learning_rate': 0.0,
  'num_tokens': 49696.0,
  'completions/mean_length': 256.0,
  'completions/min_length': 256.0,
  'completions/max_length': 256.0,
  'completions/clipped_ratio': 1.0,
  'completions/mean_terminated_length': 0.0,
  'completions/min_terminated_length': 0.0,
  'completions/max_terminated_length': 0.0,
  'rewards/grpo_reward_func/mean': 0.0,
  'rewards/grpo_reward_func/std': 0.0,
  'reward': 0.0,
  'reward_std': 0.0,
  'frac_reward_zero_std': 1.0,
  'completion_length': 256.0,
  'kl': 0.0,
  'epoch': 0.013333333333333334,
  'step': 1},
 {'loss': -0.0,
  'grad_norm': nan,
  'learning_rate': 1.0000000000000002e-06,
  'num_tokens': 66576.0,
  'completions/mean_length': 2.0,
  'completions/min_length': 2.0,
  'completions/max_length': 2.0,
  'completions/clipped_ratio': 0.0,
  'completions/mean_terminated_length': 2.0,
  'completions/min_terminated_length': 2.0,
  'completions/max_terminated_length': 2.0,
  'rewards/grpo_reward_func/me

In [21]:
print('Genering train data')
train_dataset = PointSetGRPODataset(game, num_samples=1200, difficulties = [3])

training_args = GRPOConfig(
    output_dir="./point_set_agent",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    optim="adamw_torch",
    max_steps=50,
    logging_steps=1,
    save_steps=50,
    eval_steps=25,
    warmup_steps=10,
    fp16=True,
    report_to="none",
    remove_unused_columns=False,
)

trainer = GRPOTrainer(
    model=trainer.model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    reward_funcs=[grpo_reward_func],
    preprocess_logits_for_metrics=lambda logits, labels: logits.argmax(dim=-1),
)

print("Starting training...")
trainer.train()
print("Training completed successfully!")

Genering train data


100%|██████████| 1/1 [00:00<00:00,  1.13it/s]


Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 8
Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,200 | Num Epochs = 3 | Total steps = 200
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 8 x 1) = 128
 "-____-"     Trainable parameters = 3,686,400 of 3,089,625,088 (0.12% trained)


Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,sampling / sampling_logp_difference / mean,sampling / sampling_logp_difference / max,sampling / importance_sampling_ratio / min,sampling / importance_sampling_ratio / mean,sampling / importance_sampling_ratio / max,kl,rewards / grpo_reward_func / mean,rewards / grpo_reward_func / std
1,0.0004,0.320312,0.367435,2.0,2.0,2.0,0.0,2.0,2.0,2.0,0,0,0,0,0,0.051834,0.320312,0.46843
2,0.0006,0.414062,0.382697,2.0,2.0,2.0,0.0,2.0,2.0,2.0,No Log,No Log,No Log,No Log,No Log,0.073904,0.414062,0.494495
3,0.0005,0.375,0.394008,2.0,2.0,2.0,0.0,2.0,2.0,2.0,No Log,No Log,No Log,No Log,No Log,0.065984,0.375,0.486025
4,0.0005,0.34375,0.306156,2.0,2.0,2.0,0.0,2.0,2.0,2.0,No Log,No Log,No Log,No Log,No Log,0.058055,0.34375,0.476825
5,0.0005,0.421875,0.297741,2.0,2.0,2.0,0.0,2.0,2.0,2.0,No Log,No Log,No Log,No Log,No Log,0.057817,0.421875,0.495799
6,0.0005,0.523438,0.378221,2.0,2.0,2.0,0.0,2.0,2.0,2.0,No Log,No Log,No Log,No Log,No Log,0.060604,0.523438,0.501413
7,0.0005,0.367188,0.42477,2.0,2.0,2.0,0.0,2.0,2.0,2.0,No Log,No Log,No Log,No Log,No Log,0.067321,0.367188,0.483932
8,0.0004,0.476562,0.352709,2.0,2.0,2.0,0.0,2.0,2.0,2.0,No Log,No Log,No Log,No Log,No Log,0.046924,0.476562,0.501413
9,0.0006,0.4375,0.24489,2.0,2.0,2.0,0.0,2.0,2.0,2.0,No Log,No Log,No Log,No Log,No Log,0.069799,0.4375,0.498028
10,0.0006,0.710938,0.365313,2.0,2.0,2.0,0.0,2.0,2.0,2.0,No Log,No Log,No Log,No Log,No Log,0.072763,0.710938,0.455108


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


KeyboardInterrupt: 

In [22]:
trainer.state.log_history

[{'loss': 0.0004,
  'grad_norm': nan,
  'learning_rate': 0.0,
  'num_tokens': 23808.0,
  'completions/mean_length': 2.0,
  'completions/min_length': 2.0,
  'completions/max_length': 2.0,
  'completions/clipped_ratio': 0.0,
  'completions/mean_terminated_length': 2.0,
  'completions/min_terminated_length': 2.0,
  'completions/max_terminated_length': 2.0,
  'rewards/grpo_reward_func/mean': 0.3203125,
  'rewards/grpo_reward_func/std': 0.4684300124645233,
  'reward': 0.3203125,
  'reward_std': 0.36743485927581787,
  'frac_reward_zero_std': 0.25,
  'completion_length': 2.0,
  'kl': 0.051833580480888486,
  'epoch': 0.013333333333333334,
  'step': 1},
 {'loss': 0.0006,
  'grad_norm': nan,
  'learning_rate': 0.0,
  'num_tokens': 46992.0,
  'completions/mean_length': 2.0,
  'completions/min_length': 2.0,
  'completions/max_length': 2.0,
  'completions/clipped_ratio': 0.0,
  'completions/mean_terminated_length': 2.0,
  'completions/min_terminated_length': 2.0,
  'completions/max_terminated_lengt

In [None]:
trained_model = trainer.model
trained_tokenizer = trainer.tokenizer

In [None]:
from point_set_verifier import PointSetVerifier
from tqdm import tqdm
from data import Data

def evaluate_model(model, tokenizer, test_datasets, game):
    results = {}
    verifier = PointSetVerifier()

    model.eval()
    device = model.device

    for difficulty, dataset in tqdm(test_datasets.items()):
        correct = 0
        total = len(dataset)

        for item in dataset:
            prompt = format_prompt(item["question"])

            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=100,
                    do_sample=True,
                )

            prediction = tokenizer.decode(outputs[0].cpu(), skip_special_tokens=True)
            prediction = prediction.split("assistant")[-1].strip()
            print(PointSetVerifier().extract_answer(test_solution=prediction))
            print('-' * 100)
            print(item['answer'])

            data = Data(
                question=item['question'],
                answer=item['answer'],
                difficulty=item['difficulty'],
                metadata=item['metadata']
            )

            is_correct = verifier.verify(data, prediction)
            if is_correct:
                correct += 1

        accuracy = correct / total
        results[difficulty] = accuracy
        print(f"Difficulty {difficulty}: Accuracy = {accuracy:.3f}")

    return results

In [None]:
default_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen2.5-3B-Instruct",
    max_seq_length = 1024,
    dtype = None,
    load_in_4bit = True,
)
print("Evaluating trained model...")
trained_results = evaluate_model(default_model, tokenizer, test_datasets, game)

==((====))==  Unsloth 2025.9.4: Fast Qwen2 patching. Transformers: 4.56.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Evaluating trained model...


  0%|          | 0/1 [00:00<?, ?it/s]

external
----------------------------------------------------------------------------------------------------
external
internal
----------------------------------------------------------------------------------------------------
boundary
external
----------------------------------------------------------------------------------------------------
external
external
----------------------------------------------------------------------------------------------------
boundary
external
----------------------------------------------------------------------------------------------------
boundary
external
----------------------------------------------------------------------------------------------------
boundary
boundary
----------------------------------------------------------------------------------------------------
external
external
----------------------------------------------------------------------------------------------------
boundary
boundary
---------------------------------------

100%|██████████| 1/1 [00:21<00:00, 21.08s/it]

external
----------------------------------------------------------------------------------------------------
boundary
Difficulty difficulty_3: Accuracy = 0.270



