In [None]:
from unsloth import FastLanguageModel, PatchFastRL
PatchFastRL("GRPO", FastLanguageModel)

In [2]:
from unsloth import is_bfloat16_supported
import torch
max_seq_length = 2048 # Can increase for longer reasoning traces
lora_rank = 64 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen2.5-3B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.75, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-05 06:51:04 model_runner.py:1115] Loading model weights took 2.2160 GB
INFO 03-05 06:51:04 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-05 06:51:08 worker.py:267] Memory profiling takes 3.25 seconds
INFO 03-05 06:51:08 worker.py:267] the current vLLM instance can use total_gpu_memory (11.51GiB) x gpu_memory_utilization (0.67) = 7.74GiB
INFO 03-05 06:51:08 worker.py:267] model weights take 2.22GiB; non_torch_memory takes 0.03GiB; PyTorch activation peak memory takes 1.05GiB; the rest of the memory reserved for KV Cache is 4.44GiB.
INFO 03-05 06:51:08 executor_base.py:111] # cuda blocks: 8085, # CPU blocks: 10922
INFO 03-05 06:51:08 executor_base.py:116] Maximum concurrency for 2048 tokens per request: 63.16x
INFO 03-05 06:51:16 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error oc

Capturing CUDA graph shapes: 100%|███████████████████████████████| 27/27 [00:29<00:00,  1.08s/it]

INFO 03-05 06:51:45 model_runner.py:1562] Graph capturing finished in 29 secs, took 0.64 GiB
INFO 03-05 06:51:45 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 40.87 seconds



Unsloth 2025.2.15 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [3]:
import re
from datasets import load_dataset, Dataset

# Load and prep dataset
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

def extract_xml_answer(text: str) -> str:
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

# uncomment middle messages for 1-shot prompting
def get_gsm8k_questions(split = "train") -> Dataset:
    data = load_dataset('openai/gsm8k', 'main')[split] # type: ignore
    data = data.map(lambda x: { # type: ignore
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    }) # type: ignore
    return data # type: ignore

dataset = get_gsm8k_questions()

# Reward functions
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    q = prompts[0][-1]['content']
    extracted_responses = [extract_xml_answer(r) for r in responses]
    print("\n"*2, '-'*20, f"Question:\n{q}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
    rewards = [2.0 if r == a else 0.0 for r, a in zip(extracted_responses, answer)]
    #print(rewards)
    return [2.0 if r == a else 0.0 for r, a in zip(extracted_responses, answer)]

def strict_int_reward_func(completions, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    # TODO - reward 'more' like digits..
    return [0.5 if r.isdigit() else 0.0 for r in extracted_responses]

def soft_int_reward_func(completions, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    return [
        0.5 * (sum(ch.isdigit() for ch in r) / len(r)) if r else 0.0
        for r in extracted_responses
    ]

def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function that checks if the completion has a specific format."""
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function that checks if the completion has a specific format."""
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text) -> float:
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
        count -= len(text.split("\n</answer>\n")[-1])*0.001
    if text.count("\n</answer>") == 1:
        count += 0.125
        count -= (len(text.split("\n</answer>")[-1]) - 1)*0.001
    return count

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    contents = [completion[0]["content"] for completion in completions]
    return [count_xml(c) for c in contents]

In [4]:
from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    use_vllm = True, # use vLLM for fast inference!
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_8bit",
    logging_steps = 1,
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 4, # Increase to 4 for smoother training
    num_generations = 8, # Decrease if out of memory
    max_prompt_length = 256,
    max_completion_length = 800,
    num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 250,
    save_steps = 250,
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    output_dir = "outputs",
)

In [5]:
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        soft_int_reward_func,
        correctness_reward_func,
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 4
\        /    Total batch size = 32 | Total steps = 250
 "-____-"     Number of trainable parameters = 119,734,272




 -------------------- Question:
A concert ticket costs $40. Mr. Benson bought 12 tickets and received a 5% discount for every ticket bought that exceeds 10. How much did Mr. Benson pay in all? 
Answer:
476 
Response:
<reasoning>
To find out how much Mr. Benson paid in total for the concert tickets, we need to follow these steps:

1. First, identify the number of tickets that qualify for the discount. Since Mr. Benson bought 12 tickets and the discount applies to the tickets bought over 10, we subtract 10 from 12 to find that 2 tickets qualify for a 5% discount.
2. Calculate the discount for the 2 extra tickets: The discount on each ticket over 10 is 5% of $40. The discount per additional ticket is calculated as \( 40 \times 0.05 = 2 \) dollars. For 2 extra tickets, the total discount is \( 2 \times 2 = 4 \) dollars.
3. Calculate the total cost for the tickets without any discount. Since 12 tickets at $40 each would normally cost \( 12 \times 40 = 480 \) dollars, we subtract the disco

Step,Training Loss,reward,reward_std,completion_length,kl,rewards / xmlcount_reward_func,rewards / soft_format_reward_func,rewards / strict_format_reward_func,rewards / soft_int_reward_func,rewards / correctness_reward_func
1,-0.0,-0.634721,0.50782,413.25,0.0,-0.692969,0.0,0.0,0.058247,0.0
2,0.0,-0.210435,0.516326,314.21875,0.0,-0.410437,0.0,0.0,0.075003,0.125
3,0.0001,-0.172454,0.417494,260.0,0.001484,-0.372781,0.0,0.0,0.075328,0.125
4,0.0,-0.284984,0.656352,299.90625,0.000541,-0.554875,0.0,0.0,0.082391,0.1875
5,0.0,-0.382447,0.16915,289.34375,0.000443,-0.472719,0.0,0.0,0.090272,0.0
6,0.0,-0.124083,0.403027,240.09375,0.000552,-0.359719,0.0,0.0,0.110635,0.125
7,0.0,-0.197134,0.17779,206.125,0.000606,-0.242813,0.0,0.0,0.045679,0.0
8,0.0,-0.120247,0.373922,208.6875,0.000477,-0.225781,0.0,0.0,0.043034,0.0625
9,0.0,-0.371099,0.269364,286.25,0.000495,-0.410969,0.0,0.0,0.03987,0.0
10,0.0001,-0.074926,0.645678,266.5,0.002665,-0.388875,0.0,0.0,0.126449,0.1875




 -------------------- Question:
Over the past five years, on July 4th, the high temperature for Washington, DC has been: 90 degrees in 2020, 90 degrees in 2019, 90 degrees in 2018, 79 degrees in 2017 and 71 degrees in 2016. What is the average temperature for July 4th in Washington, DC over the past 5 years? 
Answer:
84 
Response:
<reasoning>
To find the average temperature for July 4th in Washington, DC over the past 5 years, we need to sum up all the temperatures recorded and then divide by the number of years (5). The recorded temperatures are 90 degrees in 2020, 90 degrees in 2019, 90 degrees in 2018, 79 degrees in 2017, and 71 degrees in 2016. 

First, sum the temperatures: 
\[ 90 + 90 + 90 + 79 + 71 \]

Then, divide the total sum by the number of years (5) to find the average.
</reasoning>

<answer>
First, we sum up the temperatures:
\[ 90 + 90 + 90 + 79 + 71 = 410 \]

Next, we divide the sum by the number of years (5) to calculate the average:
\[ \frac{410}{5} = 82 \]

So, the

TrainOutput(global_step=250, training_loss=0.00397241159375821, metrics={'train_runtime': 18554.1938, 'train_samples_per_second': 0.431, 'train_steps_per_second': 0.013, 'total_flos': 0.0, 'train_loss': 0.00397241159375821})

In [None]:
import gc
gc.collect()

In [None]:
# OOM :(
model.save_pretrained_gguf('ht/soft1_math', tokenizer)

In [6]:
text = tokenizer.apply_chat_template([
    {"role" : "user", "content" : "I was offered $10,000,000 a day for a month or $1 the first day, $2 the next, and doubling every day for the month. What is better? "},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 2048,
)
output = model.fast_generate(
    [text],
    sampling_params = sampling_params,
    lora_request = None,
)[0].outputs[0].text

output

Processed prompts: 100%|█| 1/1 [00:07<00:00,  7.76s/it, est. speed input: 9.41 toks/s, output: 63


"To determine which offer is better, we need to calculate the total amount received for both options and compare them.\n\n### Option 1: $10,000,000 a day for a month\n\nThere are 30 days in a month (assuming it's not a leap year). So, the total amount received would be:\n\\[ 10,000,000 \\times 30 = 300,000,000 \\text{ dollars} \\]\n\n### Option 2: $1 the first day, $2 the next, and doubling every day for the month\n\nThis is an example of exponential growth. The total amount received after \\( n \\) days (where \\( n = 30 \\)) can be calculated as:\n\\[ 1 + 2 + 4 + 8 + \\ldots + 2^{30} \\]\nThe sum of a geometric series where each term is a power of 2 can be calculated using the formula for the sum of the first \\( n \\) terms of a geometric series:\n\\[ S = a \\frac{r^n - 1}{r - 1} \\]\nHere, \\( a = 1 \\), \\( r = 2 \\), and \\( n = 30 \\):\n\\[ S = 1 \\cdot \\frac{2^{30} - 1}{2 - 1} = 2^{30} - 1 \\]\nCalculating \\( 2^{30} \\):\n\\[ 2^{30} = 1,073,741,824 \\]\nSo, the total amount r