## **Synthetic vs Original GSM8K Testing**

### Installation

In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    !pip install --no-deps unsloth vllm

In [2]:
#@title Colab Extra Install { display-mode: "form" }
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    !pip install --no-deps unsloth vllm
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    # Skip restarting message in Colab
    import sys, re, requests; modules = list(sys.modules.keys())
    for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer

    # vLLM requirements - vLLM breaks Colab due to reinstalling numpy
    f = requests.get("https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/requirements/common.txt").content
    with open("vllm_requirements.txt", "wb") as file:
        file.write(re.sub(rb"(transformers|numpy|xformers)[^\n]{1,}\n", b"", f))
    !pip install -r vllm_requirements.txt

### Unsloth

Load up `Qwen 2.5 3B Instruct`, and set parameters

In [3]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
max_seq_length = 1024 # Can increase for longer reasoning traces
lora_rank = 64 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen2.5-3B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.5, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)

NotImplementedError: Unsloth: No NVIDIA GPU found? Unsloth currently only supports GPUs!

### Data Prep (both synthetic and original)
<a name="Data"></a>

We directly leverage [@willccbb](https://gist.github.com/willccbb/4676755236bb08cab5f4e54a0475d6fb) for data prep and all reward functions.

In [4]:
import random
import numpy as np
import torch
import re
import json
import pandas as pd
from datasets import load_dataset, Dataset
from huggingface_hub import login
from google.colab import auth

# Authentication for Hugging Face & Colab
auth.authenticate_user()
login(token='HF TOKEN')  # HF token to replace

# Set a random seed for reproducibility
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)  # Ensure reproducibility

# System prompt format
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

# Template for XML responses
XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

# Extract answer from XML-formatted response
def extract_xml_answer(text: str) -> str:
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()

# Extract answer from a hash format (if applicable)
def extract_hash_answer(text: str | None) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

# Load synthetic GSM8K dataset from Hugging Face Hub
def load_synthetic_gsm8k(split="train") -> Dataset:
    splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
    file_path = "hf://datasets/gretelai/gsm8k-synthetic-diverse-8b/" + splits[split]

    synthetic_df = pd.read_parquet(file_path)
    dataset = Dataset.from_pandas(synthetic_df)  # Convert Pandas DataFrame to Hugging Face Dataset
    return dataset

# Load real GSM8K dataset from OpenAI
def load_real_gsm8k(split="train") -> Dataset:
    return load_dataset('openai/gsm8k', 'main')[split]

# Unified function to get GSM8K questions (real or synthetic)
def get_gsm8k_questions(source="real", split="train") -> Dataset:
    if source == "synthetic":
        data = load_synthetic_gsm8k(split)
    else:
        data = load_real_gsm8k(split)

    data = data.map(lambda x: {
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    })

    return data

# Load datasets
synth_dataset = get_gsm8k_questions(source="synthetic", split="train")
real_dataset = get_gsm8k_questions(source="real", split="train")

Map:   0%|          | 0/1528 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

In [5]:
# Reward functions (from @willcob)
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    q = prompts[0][-1]['content']
    extracted_responses = [extract_xml_answer(r) for r in responses]
    print('-'*20, f"Question:\n{q}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
    return [2.0 if r == a else 0.0 for r, a in zip(extracted_responses, answer)]

def int_reward_func(completions, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    return [0.5 if r.isdigit() else 0.0 for r in extracted_responses]

def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function that checks if the completion has a specific format."""
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function that checks if the completion has a specific format."""
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text) -> float:
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
        count -= len(text.split("\n</answer>\n")[-1]) * 0.001
    if text.count("\n</answer>") == 1:
        count += 0.125
        count -= (len(text.split("\n</answer>")[-1]) - 1) * 0.001
    return count

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    contents = [completion[0]["content"] for completion in completions]
    return [count_xml(c) for c in contents]


<a name="Train"></a>
### Train the model

GRPO Trainer and configuration set up:

In [6]:
from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    use_vllm = True, # use vLLM for fast inference!
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_8bit",
    logging_steps = 1,
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_generations = 8, # Decrease if out of memory
    max_prompt_length = 256,
    max_completion_length = 200,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 250,
    save_steps = 250,
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    output_dir = "outputs",
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 8


In [7]:
# train synthetic
synth_trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
    ],
    args = training_args,
    train_dataset = synth_dataset,
)
synth_trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,528 | Num Epochs = 1 | Total steps = 250
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 119,734,272/3,000,000,000 (3.99% trained)


-------------------- Question:
Maya and her classmates are going on a camping trip. They have 15 big bags of marshmallows to roast over a campfire. If they eat 3 bags each day for 4 days, how many bags of marshmallows will they have left after the trip? 
Answer:
3 
Response:
<reasoning>
Maya and her classmates have 15 bags of marshmallows at the start. If they eat 3 bags per day for 4 days, the total number of bags they will consume is \(3 \times 4 = 12\) bags. To find out how many bags of marshmallows they will have left, we subtract the number of bags they consume from the total number of bags they have initially: \(15 - 12 = 3\) bags.

</reasoning>
<answer>
Maya and her classmates will have 3 bags of marshmallows left after the 4 days of roasting.
</answer> 
Extracted:
Maya and her classmates will have 3 bags of marshmallows left after the 4 days of roasting.


Step,Training Loss,reward,reward_std,completion_length,kl,rewards / xmlcount_reward_func,rewards / soft_format_reward_func,rewards / strict_format_reward_func,rewards / int_reward_func,rewards / correctness_reward_func
1,0.0,0.0845,0.135857,179.875,0.0,0.0845,0.0,0.0,0.0,0.0
2,0.0,0.011625,0.212246,200.0,0.0,0.011625,0.0,0.0,0.0,0.0
3,0.0,-0.14625,0.247462,193.625,8e-06,-0.14625,0.0,0.0,0.0,0.0
4,0.0,0.032,0.215349,200.0,8e-06,0.032,0.0,0.0,0.0,0.0
5,0.0,0.031125,0.188104,199.25,1.1e-05,0.031125,0.0,0.0,0.0,0.0
6,0.0,-0.0355,0.177634,140.375,2e-05,-0.0355,0.0,0.0,0.0,0.0
7,0.0,-0.100125,0.21072,196.125,1.1e-05,-0.100125,0.0,0.0,0.0,0.0
8,0.0,-0.0235,0.073148,153.75,9e-06,-0.0235,0.0,0.0,0.0,0.0
9,0.0,0.08225,0.120915,200.0,2.2e-05,0.08225,0.0,0.0,0.0,0.0
10,0.0,0.024,0.298903,176.125,1.7e-05,0.024,0.0,0.0,0.0,0.0


-------------------- Question:
Amina's friends are organizing a large Eid-al-Fitr celebration. They need to prepare 250 meatballs for the guests, and each tray holds 25 meatballs. If they want to fill 8 trays, how many more trays do they need to buy to have enough meat for the dinner? 
Answer:
2 
Response:
<reasoning>
To determine how many more trays are needed, we first need to find out how many meatballs can be prepared with the available trays and then compare this to the total required.

Given that each tray holds 雞蛋 (meatballs) and they want to fill 8 trays, the total number of meatballs they can initially prepare is:
\[ 8 \text{ trays} \times 25 \text{ meatballs/tray} = 200 \text{ meatballs} \]

Since there are 250 meatballs needed, we can find out how many more meatballs are needed:
\[ 250 \text{ meatballs} - 200 \text{ meatballs} = 50 \text{ meatballs} \]

Given that each tray holds 25 meatballs, we can calculate how many more trays are needed:
\[ \frac{50 \text{ meatballs}}{25

TrainOutput(global_step=250, training_loss=0.004059133145853224, metrics={'train_runtime': 5149.0095, 'train_samples_per_second': 0.388, 'train_steps_per_second': 0.049, 'total_flos': 0.0, 'train_loss': 0.004059133145853224})

In [8]:
model.save_lora("grpo_synth_lora")

In [None]:
# train real
real_trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
    ],
    args = training_args,
    train_dataset = real_dataset,
)
real_trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1 | Total steps = 250
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 119,734,272/3,000,000,000 (3.99% trained)


-------------------- Question:
A concert ticket costs $40. Mr. Benson bought 12 tickets and received a 5% discount for every ticket bought that exceeds 10. How much did Mr. Benson pay in all? 
Answer:
476 
Response:
<reasoning>
Mr. Benson bought 12 tickets, which means he qualified for the 5% discount on the 2 additional tickets he bought beyond the initial 10. So, the discount on 2 tickets amounts to 2 * $40 * 5% = $4. The total cost for the first 10 tickets is 10 * $40 = $400. For the 2 extra tickets, he pays $40 each with a 5% discount, which means he pays $40 - (5% of $40) = $40 - $2 = $38 each. So, the cost for the 2 extra tickets is 2 * $38 = $76. Adding the cost of the first 10 tickets and the 2 discounted tickets, Mr. Benson paid $400 + $76 = $476.
</reasoning>
<answer>
4 
Extracted:
4


Step,Training Loss,reward,reward_std,completion_length,kl,rewards / xmlcount_reward_func,rewards / soft_format_reward_func,rewards / strict_format_reward_func,rewards / int_reward_func,rewards / correctness_reward_func
1,0.003,1.085375,1.586457,189.125,0.074124,0.147875,0.0,0.1875,0.25,0.5
2,0.0023,0.58225,1.123394,193.375,0.056295,0.14475,0.0,0.0625,0.125,0.25
3,0.0031,1.984375,1.68162,185.125,0.078687,0.359375,0.0,0.3125,0.3125,1.0
4,0.0037,3.5,0.0,147.0,0.092563,0.5,0.0,0.5,0.5,2.0
5,0.0058,2.5,1.069045,136.625,0.145239,0.5,0.0,0.5,0.5,1.0
6,0.0019,1.17875,1.489273,197.25,0.047037,0.17875,0.0,0.0625,0.1875,0.75
7,0.001,0.578125,1.181965,195.875,0.025336,0.203125,0.0,0.0625,0.0625,0.25
8,0.0054,2.297875,1.064817,159.75,0.135934,0.422875,0.0,0.4375,0.4375,1.0
9,0.0032,3.5,0.0,151.125,0.079992,0.5,0.0,0.5,0.5,2.0
10,0.0036,0.546875,1.193243,196.25,0.090395,0.171875,0.0,0.0625,0.0625,0.25


-------------------- Question:
Jane is trying to decide whether to buy a house or a trailer. A house costs $480,000 and a trailer costs $120,000. Each loan will be paid in monthly installments over 20 years. How much more is the monthly payment on the house compared to the trailer? 
Answer:
1500 
Response:
<reasoning>
To find out how much more Jane's monthly payment would be for a house compared to a trailer, we need to calculate the monthly payments for both options. We can find the monthly payment by dividing the total cost by the number of months in 20 years (20 * 12 = 240 months). So, for the house, the monthly payment would be \( \frac{480,000}{240} \) and for the trailer, it would be \( \frac{120,000}{240} \). After finding these amounts, we subtract the trailer's monthly payment from the house's monthly payment to find the difference.
</reasoning>
<answer>
100
</answer>
 
Extracted:
100
-------------------- Question:
Janet pays $40/hour for 3 hours per week of clarinet lessons a

In [None]:
model.save_lora("grpo_real_lora")

<a name="Inference"></a>
### Inference Comparison
Now let's try the models we trained on different data

In [None]:
model.load_lora("grpo_synth_lora")
# → test and log output

model.load_lora("grpo_real_lora")
# → test and compare output


In [None]:
real_test_dataset = get_gsm8k_questions(source="real", split="test")
synth_test_dataset = get_gsm8k_questions(source="synthetic", split="test")


In [None]:
# ✅ Evaluation pipeline for GRPO-trained model

from vllm import SamplingParams

# --- Extract XML reasoning and answer ---
def extract_xml_reasoning_and_answer(text: str):
    try:
        reasoning_start = text.find("<reasoning>") + len("<reasoning>")
        reasoning_end = text.find("</reasoning>")
        answer_start = text.find("<answer>") + len("<answer>")
        answer_end = text.find("</answer>")
        reasoning = text[reasoning_start:reasoning_end].strip()
        answer = text[answer_start:answer_end].strip()
        return reasoning, answer
    except:
        return "", ""

# --- Format prompt for vLLM ---
def format_prompt(prompt_dict):
    return tokenizer.apply_chat_template(
        prompt_dict,
        tokenize=False,
        add_generation_prompt=True
    )

# --- Full evaluation function ---
def evaluate_model(model, dataset, lora_path=None, num_samples=200):
    dataset = dataset.select(range(min(len(dataset), num_samples)))
    prompts = [format_prompt(example["prompt"]) for example in dataset]
    true_answers = [example["answer"] for example in dataset]

    # Load LoRA if needed
    lora_request = model.load_lora(lora_path) if lora_path else None

    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        max_tokens=1024,
    )

    completions = model.fast_generate(
        prompts,
        sampling_params=sampling_params,
        lora_request=lora_request,
    )

    correct = 0
    step_counts = []
    formatting_success = 0

    for i in range(len(prompts)):
        generated_text = completions[i][0]["content"]
        reasoning, pred_answer = extract_xml_reasoning_and_answer(generated_text)
        true_answer = true_answers[i].strip()

        # Accuracy
        if pred_answer.strip() == true_answer:
            correct += 1

        # Reasoning steps
        step_counts.append(reasoning.count("\n") + 1 if reasoning else 0)

        # Formatting check
        if "<reasoning>" in generated_text and "</answer>" in generated_text:
            formatting_success += 1

    return {
        "accuracy": correct / len(prompts),
        "avg_steps": sum(step_counts) / len(step_counts),
        "formatting_rate": formatting_success / len(prompts),
    }


In [None]:
# --- Run evaluation ---
metrics = evaluate_model(
    model=model,
    dataset=test_dataset,  # make sure this is defined above
    lora_path="grpo_real_lora",
    num_samples=200  # or use full dataset
)

# --- Print results ---
print(f"✅ Accuracy:             {metrics['accuracy']:.2%}")
print(f"🧠 Avg Reasoning Steps:  {metrics['avg_steps']:.2f}")
print(f"🧾 XML Format Success:   {metrics['formatting_rate']:.2%}")

In [None]:
# --- Run evaluation ---
metrics = evaluate_model(
    model=model,
    dataset=test_dataset,  # make sure this is defined above
    lora_path="grpo_synth_lora",
    num_samples=200  # or use full dataset
)

# --- Print results ---
print(f"✅ Accuracy:             {metrics['accuracy']:.2%}")
print(f"🧠 Avg Reasoning Steps:  {metrics['avg_steps']:.2f}")
print(f"🧾 XML Format Success:   {metrics['formatting_rate']:.2%}")