In [1]:
pip install unsloth  trl

Collecting unsloth
  Downloading unsloth-2025.9.4-py3-none-any.whl.metadata (52 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/52.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.23.0-py3-none-any.whl.metadata (11 kB)
Collecting unsloth_zoo>=2025.9.5 (from unsloth)
  Downloading unsloth_zoo-2025.9.5-py3-none-any.whl.metadata (9.5 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.31-py3-none-any.whl.metadata (11 kB)
Collecting datasets<4.0.0,>=3.4.1 (from unsloth)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting cut_cr

In [3]:
import torch
from torch.utils.data import IterableDataset, Dataset
from datasets import Dataset as HFDataset
import random
from typing import Optional, List
import numpy as np
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from trl import GRPOConfig, GRPOTrainer
import matplotlib.pyplot as plt
import json
from huggingface_hub import HfApi
from point_set import PointSet
from tqdm import tqdm

In [4]:
class PointSetIterableDataset(IterableDataset):
    """Iterable dataset that generates point-set questions on the fly.

    Every iteration step asks ``game`` for exactly one question and yields it
    as a plain dict with the keys ``question``, ``answer``, ``difficulty`` and
    ``metadata``.

    Args:
        game: Object exposing ``generate(num_of_questions=..., difficulty=...)``.
        num_samples: Number of generation attempts per pass over the dataset.
        difficulty: Fixed difficulty to request. When falsy (``None``, ``0``),
            a difficulty is drawn with ``random.randint(1, 10)`` on every step.
    """

    def __init__(self, game, num_samples=1000, difficulty=None):
        self.game = game
        self.num_samples = num_samples
        self.difficulty = difficulty

    def __iter__(self):
        for _step in range(self.num_samples):
            # Fall back to a random level only when no fixed difficulty is set.
            level = self.difficulty or random.randint(1, 10)
            generated = self.game.generate(num_of_questions=1, difficulty=level)

            # An empty result is skipped, so a pass may yield fewer than
            # ``num_samples`` items.
            if not generated:
                continue

            sample = generated[0]
            yield dict(
                question=sample.question,
                answer=sample.answer,
                difficulty=sample.difficulty,
                metadata=sample.metadata,
            )

In [5]:
def create_test_datasets(game, difficulties=(1, 3, 5, 7, 10), samples_per_difficulty=100):
    """Generate one fixed evaluation dataset per difficulty level.

    Args:
        game: Object exposing ``generate(num_of_questions=..., difficulty=...)``
            whose items carry ``question``, ``answer``, ``difficulty`` and
            ``metadata`` attributes.
        difficulties: Iterable of difficulty levels to generate for. The
            default is an immutable tuple so it cannot be mutated across calls.
        samples_per_difficulty: Number of questions requested per level.

    Returns:
        dict mapping ``"difficulty_<level>"`` to a Hugging Face ``Dataset``
        built from the generated rows.
    """
    test_datasets = {}

    for difficulty in difficulties:
        print(f"Creating test dataset for difficulty {difficulty}...")
        generated = game.generate(
            num_of_questions=samples_per_difficulty,
            difficulty=difficulty,
        )

        # ``or []`` keeps a None/empty generation result from crashing the loop.
        rows = [
            {
                "question": data.question,
                "answer": data.answer,
                "difficulty": data.difficulty,
                "metadata": data.metadata,
            }
            for data in (generated or [])
        ]

        test_datasets[f"difficulty_{difficulty}"] = HFDataset.from_list(rows)

    return test_datasets


def format_prompt(question):
    """Wrap a question in the ChatML-style prompt used for training and evaluation.

    The system prompt is defined locally so this function does not depend on a
    module-level ``SYSTEM_PROMPT`` (none is defined in this notebook, which made
    the previous version raise ``NameError`` when called).

    Args:
        question: The question text to place in the user turn.

    Returns:
        The full prompt string, ending with an open assistant turn.
    """
    system_prompt = (
        "You are an expert in point set topology. Answer ONLY with one word: "
        "'internal', 'boundary', or 'external'. Do not include any other text, "
        "explanations, or formatting."
    )
    return f"""<|im_start|>system
{system_prompt}<|im_end|>
<|im_start|>user
{question}<|im_end|>
<|im_start|>assistant
"""


def correctness_reward_func(game, examples, predictions, **kwargs):
    """Score each prediction as 1.0 when ``game.verify`` accepts it, else 0.

    Args:
        game: Object exposing ``verify(data, prediction)``.
        examples: Iterable of mappings with the keys ``question``, ``answer``,
            ``difficulty`` and ``metadata``.
        predictions: Iterable of model outputs, paired positionally with
            ``examples`` (extra items on either side are ignored by ``zip``).
        **kwargs: Accepted and ignored.

    Returns:
        list with one reward per (example, prediction) pair.
    """
    def _as_record(example):
        # Throwaway attribute-style object carrying the fields the verifier reads.
        fields = {
            'question': example['question'],
            'answer': example['answer'],
            'difficulty': example['difficulty'],
            'metadata': example['metadata'],
        }
        return type('Data', (), fields)()

    return [
        1.0 if game.verify(_as_record(example), prediction) else 0
        for example, prediction in zip(examples, predictions)
    ]

In [6]:

import os
# Switch off Weights & Biases logging through the environment.
# NOTE(review): the run output below reports `WANDB_DISABLED` as deprecated
# (to be removed in v5) and recommends `report_to`; the GRPOConfig in this
# notebook already passes report_to="none", so this line may be redundant.
os.environ["WANDB_DISABLED"] = "true"

In [7]:
import unsloth
from unsloth import FastLanguageModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import GRPOConfig, GRPOTrainer
from point_set import PointSet
from point_set_verifier import PointSetVerifier
import torch

class PointSetGRPODataset(torch.utils.data.Dataset):
    """Map-style dataset of pre-generated GRPO training prompts.

    All samples are generated eagerly in ``__init__`` and stored in
    ``self.data`` as dicts with the keys ``prompt``, ``question``, ``answer``,
    ``difficulty`` and ``metadata``.

    Args:
        game: Object exposing ``generate(num_of_questions=..., difficulty=...)``.
        num_samples: Number of questions requested *per difficulty level*; the
            dataset holds up to ``num_samples * len(difficulties)`` items.
        difficulties: Iterable of difficulty levels to generate for. The
            default is an immutable tuple so it cannot be mutated across calls.
    """

    def __init__(self, game, num_samples=5000, difficulties=(1, 3, 5, 7, 10)):
        self.game = game
        self.num_samples = num_samples
        self.data = []

        for difficulty in tqdm(difficulties):
            for _ in range(num_samples):
                # Pass the level through: previously the loop variable was
                # unused, so every sample came from the generator's default
                # difficulty regardless of ``difficulties``.
                generated = game.generate(num_of_questions=1, difficulty=difficulty)

                # Skip empty generations instead of failing on ``[0]``
                # (same policy as PointSetIterableDataset).
                if not generated:
                    continue

                sample = generated[0]
                self.data.append({
                    "prompt": format_prompt(sample.question),
                    "question": sample.question,
                    "answer": sample.answer,
                    "difficulty": sample.difficulty,
                    "metadata": sample.metadata,
                })

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


def preprocess_function(examples, tokenizer):
    """Tokenize the ``prompt`` entry of ``examples`` to fixed-length tensors.

    Args:
        examples: Mapping with a ``"prompt"`` entry to tokenize.
        tokenizer: Callable tokenizer; invoked with truncation and padding to
            ``max_length=1024`` and ``return_tensors="pt"``.

    Returns:
        Whatever the tokenizer call returns.
    """
    tokenize_options = dict(
        max_length=1024,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    return tokenizer(examples["prompt"], **tokenize_options)

def format_prompt(question):
    """Build the ChatML-style prompt (system + user turn, open assistant turn).

    Args:
        question: Text placed in the user turn.

    Returns:
        The prompt string, terminated by ``<|im_start|>assistant`` and a newline.
    """
    system_prompt = "You are an expert in point set topology. Answer ONLY with one word: 'internal', 'boundary', or 'external'. Do not include any other text, explanations, or formatting."
    turns = (
        "<|im_start|>system",
        f"{system_prompt}<|im_end|>",
        "<|im_start|>user",
        f"{question}<|im_end|>",
        "<|im_start|>assistant",
    )
    # Each turn marker sits on its own line; the prompt ends with a newline.
    return "\n".join(turns) + "\n"


# Single PointSet instance shared by training-data generation, test-data
# generation and the later evaluation call.
game = PointSet()

print('Genering train data')
# Training prompts are generated eagerly: `num_samples` generations for each
# entry in `difficulties` (1200 examples in the recorded run).
train_dataset = PointSetGRPODataset(game, num_samples=1200, difficulties = [3])
print('Genering test data')
# Held-out questions at difficulty 3, using the default samples_per_difficulty;
# the result is a dict keyed "difficulty_3".
test_datasets = create_test_datasets(game,difficulties=[3])

# Base checkpoint that is fine-tuned below.
model_name = "Qwen/Qwen2.5-3B-Instruct"

# Load the model and tokenizer through Unsloth with 4-bit weights and a
# 1024-token sequence limit.
# NOTE(review): dtype=None presumably lets Unsloth pick the compute dtype — confirm.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = 1024,
    dtype = None,
    load_in_4bit = True,
)

# Attach LoRA adapters (rank 8, alpha 8, no dropout) to the attention
# projections only; the recorded run reports 36 QKV / 36 O layers patched and
# 0 MLP layers.
model = FastLanguageModel.get_peft_model(
    model,
    r = 8,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_alpha = 8,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = True,
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# GRPO training configuration: 100 optimizer steps, checkpoints written to
# ./point_set_agent every 50 steps, metrics logged every step.
# NOTE(review): the recorded run shows Unsloth raising
# per_device_train_batch_size from 1 to num_generations (8), giving a total
# batch of 8 x 8 = 64 — set both explicitly if that is the intended size.
# NOTE(review): eval_steps is set, but no eval dataset is passed to the trainer
# in this notebook — confirm whether evaluation during training is intended.
# fp16 is used; the run log reports "Bfloat16 = FALSE" on the Tesla T4 used.
# report_to="none" disables logging integrations.
training_args = GRPOConfig(
    output_dir="./point_set_agent",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    optim="adamw_torch",
    max_steps=100,
    logging_steps=1,
    save_steps=50,
    eval_steps=25,
    warmup_steps=10,
    fp16=True,
    report_to="none",
    remove_unused_columns=False,
)


Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using

Genering train data


100%|██████████| 1/1 [00:00<00:00,  1.97it/s]


Genering test data
Creating test dataset for difficulty 3...
==((====))==  Unsloth 2025.9.4: Fast Qwen2 patching. Transformers: 4.56.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.36G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.9.4 patched 36 layers with 36 QKV layers, 36 O layers and 0 MLP layers.


Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 8


In [None]:
from transformers import DataCollatorWithPadding
# Padding collator built on the training tokenizer; it is handed to the
# GRPOTrainer constructor further down in this cell.
# NOTE(review): confirm the (Unsloth-patched) GRPOTrainer actually uses a
# user-supplied data_collator rather than ignoring it.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


log_file = "training_log.json"


def grpo_reward_func(prompts, completions, **kwargs):
    """Reward function for GRPO: 1.0 for a verified-correct completion, else 0.

    Each prompt is looked up (by exact string) in the global ``train_dataset``
    to recover its reference answer, and the completion is checked with
    ``PointSetVerifier.verify``. Prompts that are not found get a reward of 0.0.
    One JSON object per completion is appended to ``log_file``.

    Args:
        prompts: Prompt strings, paired positionally with ``completions``.
        completions: Generated completions to score.
        **kwargs: Accepted and ignored.

    Returns:
        A float32 ``torch.Tensor`` with one reward per (prompt, completion) pair.
    """
    class Data:
        # Minimal attribute container passed to the verifier.
        def __init__(self, question, answer, difficulty, metadata):
            self.question = question
            self.answer = answer
            self.difficulty = difficulty
            self.metadata = metadata

    verifier = PointSetVerifier()

    # Exact-match index from prompt text to its dataset record; when two
    # records share a prompt, the later one wins.
    records_by_prompt = {record["prompt"]: record for record in train_dataset.data}

    rewards = []
    step_logs = []

    for prompt, completion in zip(prompts, completions):
        if prompt not in records_by_prompt:
            # Unknown prompt: score zero and record why.
            rewards.append(0.0)
            step_logs.append({
                "completion": completion,
                "error": "prompt_not_found_in_dataset",
                "reward": 0.0
            })
            continue

        record = records_by_prompt[prompt]
        data = Data(
            question=record['question'],
            answer=record['answer'],
            difficulty=record['difficulty'],
            metadata=record['metadata']
        )

        is_correct = verifier.verify(data, completion)
        reward = 1.0 if is_correct else 0
        rewards.append(reward)
        step_logs.append({
            "completion": completion,
            "expected_answer": record['answer'],
            "is_correct": is_correct,
            "reward": float(reward),
        })

    # Append-only log, one JSON document per line.
    with open(log_file, 'a', encoding='utf-8') as f:
        f.writelines(
            json.dumps(entry, ensure_ascii=False) + '\n' for entry in step_logs
        )

    return torch.tensor(rewards, dtype=torch.float32)

# Assemble the GRPO trainer from the LoRA model, the GRPOConfig, the
# pre-generated prompt dataset and the verifier-based reward function.
# No eval dataset is passed here.
# NOTE(review): the run output reports `Trainer.tokenizer` as deprecated in
# favour of `processing_class`; confirm whether the `tokenizer=` keyword should
# be migrated as well.
# The `preprocess_logits_for_metrics` lambda reduces logits to argmax token ids.
trainer = GRPOTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    reward_funcs=[grpo_reward_func],
    preprocess_logits_for_metrics=lambda logits, labels: logits.argmax(dim=-1),
)

print("Starting training...")
# Runs the full training loop (max_steps is set in training_args).
trainer.train()
print("Training completed successfully!")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,200 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 8 x 1) = 64
 "-____-"     Trainable parameters = 3,686,400 of 3,089,625,088 (0.12% trained)


Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,kl,rewards / grpo_reward_func / mean,rewards / grpo_reward_func / std
1,0.0,0.0,0.0,256.0,256.0,256.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.0,0.34375,0.300382,2.0,2.0,2.0,0.0,2.0,2.0,2.0,0.0,0.34375,0.478714
3,0.0,0.078125,0.111009,2.0,2.0,2.0,0.0,2.0,2.0,2.0,0.0,0.078125,0.27049
4,-0.0,0.171875,0.180421,2.0,2.0,2.0,0.0,2.0,2.0,2.0,0.0,0.171875,0.380254
5,0.0,0.296875,0.24831,2.0,2.0,2.0,0.0,2.0,2.0,2.0,0.0,0.296875,0.460493
6,-0.0,0.375,0.275639,2.0,2.0,2.0,0.0,2.0,2.0,2.0,7e-06,0.375,0.48795
7,0.0,0.515625,0.289309,2.0,2.0,2.0,0.0,2.0,2.0,2.0,1.8e-05,0.515625,0.503707
8,0.0,0.203125,0.210946,2.0,2.0,2.0,0.0,2.0,2.0,2.0,8e-06,0.203125,0.405505
9,0.0,0.234375,0.24147,2.0,2.0,2.0,0.0,2.0,2.0,2.0,1.7e-05,0.234375,0.426956
10,0.0,0.265625,0.299334,2.0,2.0,2.0,0.0,2.0,2.0,2.0,1.9e-05,0.265625,0.445157


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,kl,rewards / grpo_reward_func / mean,rewards / grpo_reward_func / std
1,0.0,0.0,0.0,256.0,256.0,256.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.0,0.34375,0.300382,2.0,2.0,2.0,0.0,2.0,2.0,2.0,0.0,0.34375,0.478714
3,0.0,0.078125,0.111009,2.0,2.0,2.0,0.0,2.0,2.0,2.0,0.0,0.078125,0.27049
4,-0.0,0.171875,0.180421,2.0,2.0,2.0,0.0,2.0,2.0,2.0,0.0,0.171875,0.380254
5,0.0,0.296875,0.24831,2.0,2.0,2.0,0.0,2.0,2.0,2.0,0.0,0.296875,0.460493
6,-0.0,0.375,0.275639,2.0,2.0,2.0,0.0,2.0,2.0,2.0,7e-06,0.375,0.48795
7,0.0,0.515625,0.289309,2.0,2.0,2.0,0.0,2.0,2.0,2.0,1.8e-05,0.515625,0.503707
8,0.0,0.203125,0.210946,2.0,2.0,2.0,0.0,2.0,2.0,2.0,8e-06,0.203125,0.405505
9,0.0,0.234375,0.24147,2.0,2.0,2.0,0.0,2.0,2.0,2.0,1.7e-05,0.234375,0.426956
10,0.0,0.265625,0.299334,2.0,2.0,2.0,0.0,2.0,2.0,2.0,1.9e-05,0.265625,0.445157


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training completed successfully!


In [None]:
# Dump the per-step metric dicts recorded by the trainer (loss, grad_norm,
# reward statistics, completion lengths, kl, ...).
print(trainer.state.log_history)

[{'loss': 0.0, 'grad_norm': 0.0, 'learning_rate': 0.0, 'num_tokens': 26664.0, 'completions/mean_length': 256.0, 'completions/min_length': 256.0, 'completions/max_length': 256.0, 'completions/clipped_ratio': 1.0, 'completions/mean_terminated_length': 0.0, 'completions/min_terminated_length': 0.0, 'completions/max_terminated_length': 0.0, 'rewards/grpo_reward_func/mean': 0.0, 'rewards/grpo_reward_func/std': 0.0, 'reward': 0.0, 'reward_std': 0.0, 'frac_reward_zero_std': 1.0, 'completion_length': 256.0, 'kl': 0.0, 'epoch': 0.006666666666666667, 'step': 1}, {'loss': -0.0, 'grad_norm': nan, 'learning_rate': 1.0000000000000002e-06, 'num_tokens': 37376.0, 'completions/mean_length': 2.0, 'completions/min_length': 2.0, 'completions/max_length': 2.0, 'completions/clipped_ratio': 0.0, 'completions/mean_terminated_length': 2.0, 'completions/min_terminated_length': 2.0, 'completions/max_terminated_length': 2.0, 'rewards/grpo_reward_func/mean': 0.34375, 'rewards/grpo_reward_func/std': 0.4787135720252

In [None]:
# Handles to the fine-tuned model and its tokenizer for later evaluation.
trained_model = trainer.model
# `Trainer.tokenizer` is deprecated (see the warning this cell used to emit);
# `processing_class` is the supported accessor for the same object.
trained_tokenizer = trainer.processing_class

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


In [16]:
from point_set_verifier import PointSetVerifier
from tqdm import tqdm
from data import Data

def evaluate_model(model, tokenizer, test_datasets, game, verbose=True):
    """Measure verifier accuracy of ``model`` on each test dataset.

    For every item the question is wrapped with ``format_prompt``, a completion
    is sampled with ``model.generate`` and the completion is checked with
    ``PointSetVerifier.verify``.

    Args:
        model: Causal LM exposing ``eval()``, ``device`` and ``generate``.
        tokenizer: Tokenizer matching ``model``.
        test_datasets: Mapping of dataset name to an iterable of items with
            ``question``, ``answer``, ``difficulty`` and ``metadata`` keys.
        game: Unused; kept so existing callers do not break.
        verbose: When True (default, the previous behaviour), print the
            extracted answer and the expected answer for every item.

    Returns:
        dict mapping each dataset name to its accuracy in [0, 1]; an empty
        dataset scores 0.0 instead of raising ``ZeroDivisionError``.
    """
    results = {}
    verifier = PointSetVerifier()

    model.eval()
    device = model.device

    for difficulty, dataset in tqdm(test_datasets.items()):
        correct = 0
        total = len(dataset)

        for item in dataset:
            prompt = format_prompt(item["question"])

            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            prompt_length = inputs["input_ids"].shape[-1]

            # NOTE(review): sampling makes the reported accuracy vary between
            # runs; consider do_sample=False for reproducible evaluation.
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=100,
                    do_sample=True,
                )

            # Decode only the newly generated tokens. Splitting the full decoded
            # text on the word "assistant" mis-parses any question or answer
            # that itself contains that word.
            completion_ids = outputs[0][prompt_length:]
            prediction = tokenizer.decode(completion_ids.cpu(), skip_special_tokens=True).strip()

            if verbose:
                print(verifier.extract_answer(test_solution=prediction))
                print('-' * 100)
                print(item['answer'])

            data = Data(
                question=item['question'],
                answer=item['answer'],
                difficulty=item['difficulty'],
                metadata=item['metadata']
            )

            if verifier.verify(data, prediction):
                correct += 1

        accuracy = correct / total if total else 0.0
        results[difficulty] = accuracy
        print(f"Difficulty {difficulty}: Accuracy = {accuracy:.3f}")

    return results

In [17]:
# Load a fresh copy of the base checkpoint in 4-bit (no LoRA adapters are
# attached in this cell).
# NOTE(review): this rebinds the global `tokenizer` that the training cells use.
default_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen2.5-3B-Instruct",
    max_seq_length = 1024,
    dtype = None,
    load_in_4bit = True,
)
# NOTE(review): the message and the `trained_results` name say "trained", but
# the model evaluated here is `default_model`, not `trained_model`. Confirm
# whether this is meant to be the baseline run (then rename the message and
# variable) or whether `trained_model` / `trained_tokenizer` should be passed.
print("Evaluating trained model...")
trained_results = evaluate_model(default_model, tokenizer, test_datasets, game)

==((====))==  Unsloth 2025.9.4: Fast Qwen2 patching. Transformers: 4.56.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Evaluating trained model...


  0%|          | 0/1 [00:00<?, ?it/s]

external
----------------------------------------------------------------------------------------------------
external
internal
----------------------------------------------------------------------------------------------------
boundary
external
----------------------------------------------------------------------------------------------------
external
external
----------------------------------------------------------------------------------------------------
boundary
external
----------------------------------------------------------------------------------------------------
boundary
external
----------------------------------------------------------------------------------------------------
boundary
boundary
----------------------------------------------------------------------------------------------------
external
external
----------------------------------------------------------------------------------------------------
boundary
boundary
---------------------------------------

100%|██████████| 1/1 [00:21<00:00, 21.08s/it]

external
----------------------------------------------------------------------------------------------------
boundary
Difficulty difficulty_3: Accuracy = 0.270



