In [1]:
# Import Unsloth and patch its GRPO support into FastLanguageModel.
# NOTE(review): presumably this must run before `trl` is imported in a later cell — confirm against the Unsloth docs.
from unsloth import FastLanguageModel, PatchFastRL
PatchFastRL("GRPO", FastLanguageModel)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Load the base model (4-bit, vLLM-backed generation) and wrap it with LoRA adapters.
from unsloth import is_bfloat16_supported
import torch
max_seq_length = 2048 # Can increase for longer reasoning traces
lora_rank = 64 # Larger rank = smarter, but slower

# `max_lora_rank` is set from the same `lora_rank` that is passed as `r` to get_peft_model below.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen2.5-3B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.75, # Reduce if out of memory
)

# Attach LoRA adapters to the attention (q/k/v/o) and MLP (gate/up/down) projections.
# lora_alpha is set equal to the rank.
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)

INFO 03-06 06:44:33 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.2.15: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA GeForce RTX 3060. Max memory: 11.515 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit with actual GPU utilization = 68.64%
Unsloth: Your GPU has CUDA compute capability 8.6 with VRAM = 11.51 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 192.
Unsloth: vLLM's KV Cache can use up to 5.49 GB. Also swap space = 6 GB.
INFO 03-06 06:44:46 config.py:549] This model supports multiple tasks: {'reward', 'generate', 'classify', 'embed', 's



INFO 03-06 06:44:48 weight_utils.py:254] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-06 06:44:51 model_runner.py:1115] Loading model weights took 2.2160 GB
INFO 03-06 06:44:51 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-06 06:44:55 worker.py:267] Memory profiling takes 3.32 seconds
INFO 03-06 06:44:55 worker.py:267] the current vLLM instance can use total_gpu_memory (11.51GiB) x gpu_memory_utilization (0.69) = 7.90GiB
INFO 03-06 06:44:55 worker.py:267] model weights take 2.22GiB; non_torch_memory takes -0.01GiB; PyTorch activation peak memory takes 1.05GiB; the rest of the memory reserved for KV Cache is 4.64GiB.
INFO 03-06 06:44:55 executor_base.py:111] # cuda blocks: 8450, # CPU blocks: 10922
INFO 03-06 06:44:55 executor_base.py:116] Maximum concurrency for 2048 tokens per request: 66.02x
INFO 03-06 06:45:02 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error o

Capturing CUDA graph shapes: 100%|███████████████████████████████| 27/27 [00:27<00:00,  1.03s/it]

INFO 03-06 06:45:30 model_runner.py:1562] Graph capturing finished in 28 secs, took 0.65 GiB
INFO 03-06 06:45:30 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 39.43 seconds



Unsloth 2025.2.15 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [3]:
import re
from datasets import load_dataset, Dataset

# Load and prep dataset

# System message placed at the start of every training prompt (see get_gsm8k_questions).
# It asks the model to wrap its output in <reasoning>/<answer> tags, which the
# format-based reward functions in this notebook look for.
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

# Template with {reasoning} and {answer} placeholders in the same tag layout.
# NOTE(review): not referenced by any other code visible in this notebook.
XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

def extract_xml_answer(text: str) -> str:
    """Return the stripped text between the last ``<answer>`` and the next ``</answer>``.

    If ``<answer>`` is absent, the whole input is treated as the answer body;
    if ``</answer>`` is absent, everything after the opening tag is kept.
    """
    # rpartition()[2] is the text after the LAST opening tag (or all of `text` if none).
    after_open = text.rpartition("<answer>")[2]
    # partition()[0] is the text before the FIRST closing tag (or everything if none).
    body = after_open.partition("</answer>")[0]
    return body.strip()

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

# For 1-shot prompting, add an example user/assistant message pair between the system and user messages in 'prompt'.
def get_gsm8k_questions(split = "train") -> Dataset:
    """Load one split of ``openai/gsm8k`` ('main' config) and add chat-style fields.

    Each row is mapped to:
      * ``prompt`` - a two-message chat: the system message (``SYSTEM_PROMPT``)
        followed by the row's question as the user message.
      * ``answer`` - the row's answer run through ``extract_hash_answer``
        (``None`` if it has no ``####`` marker).
    """
    def to_chat_example(row):
        # Build the mapped fields for a single dataset row.
        return {
            'prompt': [
                {'role': 'system', 'content': SYSTEM_PROMPT},
                {'role': 'user', 'content': row['question']},
            ],
            'answer': extract_hash_answer(row['answer']),
        }

    all_splits = load_dataset('openai/gsm8k', 'main') # type: ignore
    return all_splits[split].map(to_chat_example) # type: ignore

# Build the training data; with no argument this uses the default "train" split.
dataset = get_gsm8k_questions()

# Reward functions
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Reward 2.0 when the first line of the extracted answer equals the reference answer.

    Args:
        prompts: batch of chat prompts; only the last message of the first
            prompt is read, for the debug print.
        completions: batch of completions; ``completion[0]['content']`` is the
            generated text of each one.
        answer: reference answers, compared position-by-position with the
            completions (exact string equality).
        **kwargs: ignored.

    Returns:
        One float per (completion, answer) pair: 2.0 on exact match, else 0.0.
        An empty batch yields an empty list.
    """
    # Nothing to score (and nothing to print) for an empty batch.
    if not completions:
        return []

    responses = [completion[0]['content'] for completion in completions]
    q = prompts[0][-1]['content']
    extracted_responses = [extract_xml_answer(r) for r in responses]
    # Debug trace of the first sample in the batch.
    print("\n"*2, '-'*20, f"Question:\n{q}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")

    # Only the first line inside <answer> is compared with the reference;
    # any following lines are ignored by this reward.
    first_lines = [e.split('\n')[0] for e in extracted_responses]

    return [2.0 if r == a else 0.0 for r, a in zip(first_lines, answer)]


# Sentence-embedding model and reference embedding used by
# costs_have_a_role_landing_excess_sales to score completions by similarity.
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('BAAI/bge-m3')
# Encoding a one-element list; a later cell displays this as a 2-D float32 array with one row.
charles = embedder.encode(['What can we do to make that number higher?'])


def costs_have_a_role_landing_excess_sales(prompts, completions, answer, **kwargs) -> list[float]:
    """Reward completions whose last answer line is similar to the reference sentence.

    The last line inside ``<answer>`` of each completion is embedded with the
    module-level ``embedder`` and dotted with the module-level reference
    embedding ``charles``. The reward is ``5 * (dot - 0.5)``, so a dot product
    above 0.5 is rewarded and one below 0.5 is penalised.

    Args:
        prompts: unused.
        completions: batch of completions; ``completion[0]['content']`` is the
            generated text of each one.
        answer: unused.
        **kwargs: ignored.

    Returns:
        One Python float per completion; an empty batch yields an empty list.
    """
    # Skip the embedding call entirely when there is nothing to score.
    if not completions:
        return []

    responses = [completion[0]['content'] for completion in completions]
    last_lines = [extract_xml_answer(r).split('\n')[-1] for r in responses]
    # assumes encode() returns (n_texts, dim) so that .T is (dim, n_texts) — TODO confirm
    embedded = embedder.encode(last_lines).T
    # charles has a single row, so the product has one row; flatten it to 1-D.
    reward = (5 * (charles.dot(embedded) - .5)).reshape(-1)
    print('charles')
    print(reward)
    # Convert the numpy array to a plain list to honour the declared return type.
    return reward.tolist()


def strict_int_reward_func(completions, **kwargs) -> list[float]:
    """Disabled reward function; always raises.

    The previous body (which could never run because of the leading ``raise``)
    scored 0.5 when the whole extracted ``<answer>`` text was made of digits
    and 0.0 otherwise. ``soft_int_reward_func`` in this notebook scores the
    first answer line by its digit fraction instead.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError(
        "strict_int_reward_func is disabled; see soft_int_reward_func"
    )

def soft_int_reward_func(completions, **kwargs) -> list[float]:
    """Score how numeric the first answer line is, plus a bonus for a two-line answer.

    For each completion the text inside ``<answer>`` is split into lines:
      * up to 0.5 for the fraction of characters in the first line that are
        digits (0.0 if the first line is empty);
      * an extra 0.5 if the answer has exactly two lines.

    Args:
        completions: batch of completions; ``completion[0]['content']`` is the
            generated text of each one.
        **kwargs: ignored.

    Returns:
        One float in [0.0, 1.0] per completion.
    """
    # Charles ready
    responses = [completion[0]['content'] for completion in completions]
    answer_lines = [extract_xml_answer(r).split('\n') for r in responses]

    reward = []
    for lines in answer_lines:
        first = lines[0]  # str.split always yields at least one element
        # Guard on the first line itself: an empty line would divide by zero.
        digit_score = 0.5 * (sum(ch.isdigit() for ch in first) / len(first)) if first else 0.0
        two_line_bonus = .5 if len(lines) == 2 else 0
        reward.append(digit_score + two_line_bonus)

    print('soft int')
    print(reward)
    return reward

def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward 0.5 when a completion follows the exact tag-and-newline layout.

    The whole completion must be
    ``<reasoning>\\n...\\n</reasoning>\\n<answer>\\n...\\n</answer>\\n``.
    ``re.DOTALL`` lets the ``...`` parts span several lines; without it any
    multi-line reasoning would fail to match.

    Args:
        completions: batch of completions; ``completion[0]["content"]`` is the
            generated text of each one.
        **kwargs: ignored.

    Returns:
        One float per completion: 0.5 on a match, else 0.0.
    """
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r, flags=re.DOTALL) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward 0.5 when a completion starts with a reasoning block followed by an answer block.

    Looser than the strict variant: no particular newlines are required, only
    optional whitespace between ``</reasoning>`` and ``<answer>``, and text
    after ``</answer>`` is allowed. ``re.match`` still anchors at the start of
    the completion. ``re.DOTALL`` lets the tag contents span several lines;
    without it any multi-line reasoning would fail to match.

    Args:
        completions: batch of completions; ``completion[0]["content"]`` is the
            generated text of each one.
        **kwargs: ignored.

    Returns:
        One float per completion: 0.5 on a match, else 0.0.
    """
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r, flags=re.DOTALL) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text) -> float:
    """Give partial credit (0.125 each) for four tag/newline markers that occur exactly once.

    Markers checked: ``<reasoning>\\n``, ``\\n</reasoning>\\n``, ``\\n<answer>\\n``
    and ``\\n</answer>``. The two answer checks also subtract 0.001 per
    character of the text after the closing answer tag (the second check
    allows one free character).
    """
    score = 0.0

    # The two reasoning markers carry no penalty term.
    for marker in ("<reasoning>\n", "\n</reasoning>\n"):
        if text.count(marker) == 1:
            score += 0.125

    if text.count("\n<answer>\n") == 1:
        # If "\n</answer>\n" is absent, split() returns the whole text,
        # so the penalty then covers the full completion length.
        tail = text.split("\n</answer>\n")[-1]
        score += 0.125
        score -= len(tail) * 0.001

    if text.count("\n</answer>") == 1:
        tail = text.split("\n</answer>")[-1]
        score += 0.125
        score -= (len(tail) - 1) * 0.001

    return score

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    """Return the ``count_xml`` score of each completion's generated text.

    ``completion[0]["content"]`` is read from every completion; extra keyword
    arguments are ignored.
    """
    scores = []
    for completion in completions:
        scores.append(count_xml(completion[0]["content"]))
    return scores

In [4]:
# GRPO training configuration.
# NOTE(review): the recorded run with these values ended in a CUDA OutOfMemoryError
# on an 11.5 GB GPU; num_generations, per_device_train_batch_size and
# max_completion_length are the likely knobs to lower — TODO confirm which is sufficient.
from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    use_vllm = True, # use vLLM for fast inference!
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_8bit",
    logging_steps = 1,
    # Use bf16 when the GPU supports it, otherwise fall back to fp16.
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 4, # 8 per device x 4 accumulation steps = total batch size 32 in the run log
    num_generations = 8, # Decrease if out of memory
    max_prompt_length = 256,
    max_completion_length = 800,
    num_train_epochs = 1, # NOTE(review): max_steps below presumably takes precedence — confirm in the TrainingArguments docs
    max_steps = 250, # the run log reports "Total steps = 250"
    save_steps = 250, # equal to max_steps
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    output_dir = "outputs",
)

In [None]:
# Free what can be freed before training: collect unreachable Python objects,
# then return PyTorch's cached-but-unused CUDA blocks to the driver.
# This does not release memory still held by live tensors or by other
# allocators in the process.
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

In [5]:
# Build the GRPO trainer and start training.
# Five reward functions are passed; soft_int_reward_func and strict_int_reward_func
# are defined in this notebook but are not part of this list.
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        correctness_reward_func,
        costs_have_a_role_landing_excess_sales
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 4
\        /    Total batch size = 32 | Total steps = 250
 "-____-"     Number of trainable parameters = 119,734,272




 -------------------- Question:
A concert ticket costs $40. Mr. Benson bought 12 tickets and received a 5% discount for every ticket bought that exceeds 10. How much did Mr. Benson pay in all? 
Answer:
476 
Response:
<reasoning>
To find out how much Mr. Benson paid in total for the concert tickets, we need to follow these steps:

1. First, identify the number of tickets that qualify for the discount. Since Mr. Benson bought 12 tickets and the discount applies to the tickets bought over 10, we subtract 10 from 12 to find that 2 tickets qualify for a 5% discount.
2. Calculate the discount for the 2 extra tickets: The discount on each ticket over 10 is 5% of $40. The discount per additional ticket is calculated as \( 40 \times 0.05 = 2 \) dollars. For 2 extra tickets, the total discount is \( 2 \times 2 = 4 \) dollars.
3. Calculate the total cost for the tickets without any discount. Since 12 tickets at $40 each would normally cost \( 12 \times 40 = 480 \) dollars, we subtract the disco

  start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0]
  spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:]
  front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0)


OutOfMemoryError: CUDA out of memory. Tried to allocate 68.00 MiB. GPU 0 has a total capacity of 11.51 GiB of which 12.56 MiB is free. Including non-PyTorch memory, this process has 10.63 GiB memory in use. Of the allocated memory 9.90 GiB is allocated by PyTorch, with 66.00 MiB allocated in private pools (e.g., CUDA Graphs), and 43.78 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# OOM :(
# NOTE(review): unclear whether this refers to the training run's CUDA out-of-memory
# error or to this export step — confirm. This cell has no execution count.
# Saves the model together with the tokenizer under 'ht/soft1_math' via save_pretrained_gguf.
model.save_pretrained_gguf('ht/soft1_math', tokenizer)

In [None]:
# Manual generation check: render a single user message through the chat template
# (no system message is included here) and sample one completion.
text = tokenizer.apply_chat_template([
    {"role" : "user", "content" : "I was offered $10,000,000 a day for a month or $1 the first day, $2 the next, and doubling every day for the month. What is better? "},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 2048,
)
# lora_request = None: no LoRA request is passed to this generation call.
# [0].outputs[0].text picks the text of the first output for the single prompt.
output = model.fast_generate(
    [text],
    sampling_params = sampling_params,
    lora_request = None,
)[0].outputs[0].text

# Display the generated text as the cell result.
output

In [8]:
# Repeats the embedder / reference-embedding setup from the reward-function cell,
# apparently to inspect the reference embedding interactively.
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('BAAI/bge-m3')
charles = embedder.encode(['What can we do to make that number higher?'])
# Display the embedding (shown below as a one-row float32 array).
charles

array([[-0.03369291,  0.03542871, -0.04332189, ..., -0.03394005,
         0.03219636,  0.04624934]], dtype=float32)

In [11]:
# Embed three sample sentences and transpose the result so it can be
# right-multiplied by `charles` in the next cell.
responses = embedder.encode(['answer is 4', 'what if we spent more on GPUs?', 'we need this to be higher next week']).T

In [37]:
# Dot product of the reference embedding with each sample embedding, flattened to 1-D:
# one score per sample sentence, in the order they were encoded.
(charles.dot(responses)).reshape(-1)

array([0.5726313 , 0.6143315 , 0.75038755], dtype=float32)