## Synthetic Data Training


In [None]:
# Kaggle-compatible setup (Unsloth + GRPO + Gemma-3)
!pip install --upgrade pip -q
!pip install unsloth trl peft accelerate bitsandbytes datasets huggingface_hub -q
!pip install --no-deps git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3 -q

## Load up Gemma3-1B and Set Parameters

In [None]:
from unsloth import FastModel
import torch
# Sequence length handed to FastModel.from_pretrained below.
max_seq_length = 1024

# Reference list of alternative checkpoints. Nothing in this cell reads the
# list: the checkpoint actually loaded is the `model_name` passed to
# from_pretrained further down.
fourbit_models = [
    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",

    # Other popular models!
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/Llama-3.3-70B",
    "unsloth/mistral-7b-instruct-v0.3",
    "unsloth/Phi-4",
] # More models at https://huggingface.co/unsloth

# Load the instruction-tuned Gemma-3 1B checkpoint with both quantization
# switches and full fine-tuning turned off; LoRA adapters are attached in the
# next cell.
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-1b-it",
    max_seq_length = max_seq_length, # Choose any for long context!
    load_in_4bit = False,  # 4 bit quantization to reduce memory (disabled here)
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory (disabled here)
    full_finetuning = False, # [NEW!] We have full finetuning now! (disabled here)
    # token = "hf_...", # use one if using gated models
)



Add LoRA adapters so that only a small fraction of the model's parameters is trained.

In [None]:
# Projection layers that receive LoRA adapters, collected up front so the
# call below reads as a plain list of hyper-parameters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters (replaces `model` in place).
model = FastModel.get_peft_model(
    model,
    # Which parts of the network are adapted.
    finetune_vision_layers = False,
    finetune_language_layers = True,
    finetune_attention_modules = True,
    finetune_mlp_modules = True,
    target_modules = lora_target_modules,
    # Adapter hyper-parameters.
    r = 8,
    lora_alpha = 8,
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)


In [None]:
from peft import get_peft_model_state_dict

# Check if LoRA weights are being tracked
# get_peft_model_state_dict returns the adapter-only state dict for `model`.
peft_state = get_peft_model_state_dict(model)
# len(peft_state) is the number of entries in that state dict, i.e. a count of
# adapter tensors rather than of individual scalar parameters.
print(f"LoRA parameters being trained: {len(peft_state)}")
# Prints the model's own trainable-vs-total parameter summary.
model.print_trainable_parameters()


## Data Preparation of Synthetic Data

This dataset comes from the Hugging Face Hub and was generated with [Gretel Navigator using meta-llama](https://huggingface.co/datasets/gretelai/gsm8k-synthetic-diverse-8b). It contains roughly 1,500 training and 300 test grade-school math problems.

In [None]:
! pip install datasets

In [None]:
from datasets import Dataset
import pandas as pd

def load_synthetic_gsm8k(split="train"):
    """Read one split of the synthetic GSM8K parquet data into a `Dataset`.

    Args:
        split: "train" or "test".

    Returns:
        A `datasets.Dataset` built from the parquet file of that split.

    Raises:
        ValueError: if `split` is neither "train" nor "test".
    """
    parquet_by_split = {
        "train": "train-00000-of-00001.parquet",
        "test": "test-00000-of-00001.parquet",
    }
    parquet_name = parquet_by_split.get(split)
    if parquet_name is None:
        raise ValueError("Split must be 'train' or 'test'")

    hub_path = "hf://datasets/gretelai/gsm8k-synthetic-diverse-8b/data/" + parquet_name
    return Dataset.from_pandas(pd.read_parquet(hub_path))

In [None]:
# Download both splits via load_synthetic_gsm8k; each call reads one parquet
# file and wraps it in a Dataset.
train_set = load_synthetic_gsm8k("train")
test_set = load_synthetic_gsm8k("test")

# Row counts, as a quick check that both splits loaded.
print(f"Train set size: {len(train_set)}")
print(f"Test set size: {len(test_set)}")

# View a sample
# Indexing a Dataset with an integer returns that row as a dict of columns.
print("\nTrain example:")
print(train_set[0])

print("\nTest example:")
print(test_set[0])


### Check the structure with question & answer

In [None]:
train_set[0]["question"]

In [None]:
train_set[0]["answer"]

### Notice that the final result follows a `####` marker in the answer, so we extract the text after it

In [None]:
def extract_hash_answer(text):
    """Return the text that follows the first "####" marker, stripped.

    Only the segment between the first and second marker is returned when the
    marker occurs more than once. Returns None if there is no marker at all.
    """
    segments = text.split("####")
    if len(segments) < 2:
        return None
    return segments[1].strip()
extract_hash_answer(train_set[0]["answer"])

We now create a system prompt, which can be customized. We define four marker tags: a start/end pair for the working-out (reasoning) section and a start/end pair for the final answer:

In [None]:
# Marker tags the model is asked to emit: one pair around its working out and
# one pair around the final answer. The format regexes and reward functions
# later in the notebook are built from these same four strings.
reasoning_start = "<start_working_out>"
reasoning_end   = "<end_working_out>"
solution_start = "<SOLUTION>"
solution_end = "</SOLUTION>"

# System message used when building the dataset prompts. The tag names are
# interpolated so the instructions always agree with the constants above.
system_prompt = \
f"""You are given a problem.
Think about the problem and provide your working out.
Place it between {reasoning_start} and {reasoning_end}.
Then, provide your solution between {solution_start}{solution_end}"""
system_prompt

Let's map our synthetic dataset and observe the first row:

In [None]:
# map training dataset
def _to_train_example(row):
    """Build the chat prompt and the bare reference answer for one training row."""
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user",   "content": row["question"]},
    ]
    return {"prompt": chat, "answer": extract_hash_answer(row["answer"])}

train_dataset = train_set.map(_to_train_example)
train_dataset[0]

In [None]:
def _to_test_example(row):
    """Build the chat prompt and the bare reference answer for one test row."""
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user",   "content": row["question"]},
    ]
    return {"prompt": chat, "answer": extract_hash_answer(row["answer"])}

test_dataset = test_set.map(_to_test_example)
test_dataset[0]

We create a regex format to match the reasoning sections and answers:

In [None]:
import re

# Full-format matcher. Expected shape of a response:
#   {reasoning_start} ... {reasoning_end} ... {solution_start} ANSWER {solution_end}
# Group 1 captures ANSWER. re.DOTALL lets "." span newlines inside the
# working-out and the answer.
# NOTE(review): with re.MULTILINE the "^" and "$" anchors match at every line
# boundary, not only at the very start/end of the response, so extra lines
# before or after the tagged section still match — confirm that is intended
# for an "exact" format check.
match_format = re.compile(
    rf"^[\s]{{0,}}"\
    rf"{reasoning_start}.+?{reasoning_end}.*?"\
    rf"{solution_start}(.+?){solution_end}"\
    rf"[\s]{{0,}}$",
    flags = re.MULTILINE | re.DOTALL
)

In [None]:
# verify that it works: a well-formed response should yield a match object
# (not None) whose group(1) is the text between the solution tags, here "2".
match_format.search(
    "<start_working_out>Let me think!<end_working_out>"\
    "<SOLUTION>2</SOLUTION>",
)

Create a reward function to match the format exactly - we reward it with 3 points if it succeeds:

In [None]:
def match_format_exactly(completions, **kwargs):
    """Reward completions whose text matches `match_format`.

    Args:
        completions: one entry per generation; the text is read from
            `completion[0]["content"]`.
        **kwargs: ignored.

    Returns:
        A list with 3.0 for each completion that matches and 0 otherwise.
    """
    return [
        3.0 if match_format.search(generation[0]["content"]) is not None else 0
        for generation in completions
    ]

In [None]:
# if it fails, give it partial rewards
def match_format_approximately(completions, **kwargs):
    """Give partial credit per marker tag that appears exactly once.

    Each of the four tags contributes +0.5 when it occurs exactly once in the
    completion text and -0.5 otherwise (missing or repeated), so the score for
    one completion ranges from -2.0 to +2.0.

    Args:
        completions: one entry per generation; the text is read from
            `completion[0]["content"]`.
        **kwargs: ignored.

    Returns:
        A list with one score per completion.
    """
    tags = (reasoning_start, reasoning_end, solution_start, solution_end)
    scores = []
    for generation in completions:
        body = generation[0]["content"]
        total = 0
        for tag in tags:
            if body.count(tag) == 1:
                total += 0.5
            else:
                total += -0.5
        scores.append(total)
    return scores

Finally, we want to extract the generated answer, and reward or penalize it! We also reward it based on how close the answer is to the true one via ratios:

In [None]:
def check_answer(prompts, completions, answer, **kwargs):
    """Score the solution extracted from each completion against its reference.

    The guess is whatever `match_format` captures between the solution tags.
    Per-completion score:
        0     the completion does not match the format, or the reference
              answer is missing (None)
        3.0   exact string match
        1.5   match after stripping surrounding whitespace
        0.5   both numeric and guess/answer ratio within [0.9, 1.1]
        0.25  both numeric and ratio within [0.8, 1.2]
        -1.0  both numeric but outside those ranges
        -0.5  guess or reference is not numeric, or the reference is zero

    Args:
        prompts: chat prompts of the batch (not used; kept so the signature
            is unchanged).
        completions: one entry per generation; the text is read from
            `completion[0]["content"]`.
        answer: reference answers, aligned with `completions`.
        **kwargs: ignored.

    Returns:
        A list with one score per completion.
    """
    responses = [completion[0]["content"] for completion in completions]

    extracted_responses = [
        found.group(1) if (found := match_format.search(r)) is not None else None
        for r in responses
    ]

    scores = []
    for guess, true_answer in zip(extracted_responses, answer):
        # A row whose reference had no "####" marker carries None; score it
        # as neutral instead of crashing on `.strip()` below.
        if guess is None or true_answer is None:
            scores.append(0)
            continue

        score = 0
        # Correct answer gets 3 points!
        if guess == true_answer:
            score += 3.0
        # Match if spaces are seen
        elif guess.strip() == true_answer.strip():
            score += 1.5
        else:
            # We also reward it if the answer is close via ratios!
            # Ie if the answer is within some range, reward it!
            try:
                ratio = float(guess) / float(true_answer)
            except (ValueError, ZeroDivisionError):
                # Only parsing/division failures are expected here; anything
                # else (e.g. KeyboardInterrupt) must propagate.
                score -= 0.5 # Penalize
            else:
                if   0.9 <= ratio <= 1.1: score += 0.5
                elif 0.8 <= ratio <= 1.2: score += 0.25
                else: score -= 1.0 # Penalize wrong answers
        scores.append(score)
    return scores

In [None]:
# The answer may not come as a single number, let's account for that:
# capture the first number that follows the opening solution tag. The number
# must contain at least one digit (a lone "." no longer matches), may carry a
# leading minus sign, and may use "," as a thousands separator ("1,250.5").
match_numbers = re.compile(
    rf"{solution_start}.*?(-?(?:\d+(?:,\d{{3}})*(?:\.\d+)?|\.\d+))",
    flags = re.MULTILINE | re.DOTALL
)
match_numbers.findall("<SOLUTION>  0.34  </SOLUTION>")

In [None]:
def check_numbers(prompts, completions, answer, **kwargs):
    """Reward completions whose first number after the solution tag equals the reference.

    The guess is the group captured by `match_numbers`. Both guess and
    reference are parsed as floats after removing "," thousands separators.
    The first question/answer/response/extraction of the batch is printed as
    a progress trace.

    Args:
        prompts: chat prompts of the batch; the last message of the first
            prompt is printed as the question.
        completions: one entry per generation; the text is read from
            `completion[0]["content"]`.
        answer: reference answers, aligned with `completions`.
        **kwargs: ignored.

    Returns:
        A list with 1.5 where the parsed numbers are equal, 0.0 where they
        differ, and 0 where no number was found or a value could not be
        parsed (including a missing reference).
    """
    question = prompts[0][-1]["content"]
    responses = [completion[0]["content"] for completion in completions]

    extracted_responses = [
        found.group(1) if (found := match_numbers.search(r)) is not None else None
        for r in responses
    ]

    scores = []
    print('*'*20, f"Question:\n{question}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
    for guess, true_answer in zip(extracted_responses, answer):
        if guess is None:
            scores.append(0)
            continue
        # Convert to numbers
        try:
            true_value  = float(true_answer.strip().replace(",", ""))
            guess_value = float(guess.strip().replace(",", ""))
        except (AttributeError, TypeError, ValueError):
            # Non-numeric text or a None reference scores 0; unrelated
            # exceptions are no longer swallowed.
            scores.append(0)
            continue
        scores.append(1.5 if guess_value == true_value else 0.0)
    return scores

<a name="Train"></a>
### Training the model

Now set up GRPO Trainer and all configurations!

In [None]:
from huggingface_hub import upload_folder
from transformers import TrainerCallback
import os
from huggingface_hub import login

# Login first.
# The access token is read from the HF_TOKEN environment variable (e.g. a
# Kaggle/Colab secret exported under that name) instead of being pasted into
# the notebook. When the variable is unset, login(token=None) falls back to
# an interactive prompt.
# SECURITY: the token previously hardcoded in this cell has been exposed and
# must be revoked at https://huggingface.co/settings/tokens.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
class HFPushCallback(TrainerCallback):
    """Trainer callback that exports the LoRA adapter and uploads it on each save event.

    NOTE(review): this class only takes effect if an instance is handed to the
    trainer; confirm that it is registered where the trainer is built.
    """

    def __init__(self, model, repo_id):
        """Remember the model to export and the Hub repo to upload to."""
        self.repo_id = repo_id
        self.model = model

    def on_save(self, args, state, control, **kwargs):
        """Save the adapter to a step-named local folder, then upload that folder.

        The folder is uploaded to `self.repo_id` under a `lora_step_<step>`
        subfolder, where <step> is `state.global_step`.
        """
        current_step = state.global_step
        local_dir = f"checkpoint_lora_step_{current_step}"
        print(f"💾 Saving LoRA at {local_dir} and uploading to HF...")

        # Export the adapter weights to the local folder.
        self.model.save_lora(local_dir)

        # Mirror that folder into the Hub repo.
        upload_folder(
            folder_path=local_dir,
            path_in_repo=f"lora_step_{current_step}",  # Subfolder inside repo
            repo_id=self.repo_id,
            repo_type="model",
            commit_message=f"Checkpoint at step {current_step}",
        )

        print(f"✅ Uploaded LoRA checkpoint to {self.repo_id}/lora_step_{current_step}")


In [None]:
from trl import GRPOConfig, GRPOTrainer

# GRPO training configuration, grouped as: optimizer/schedule, batch shape,
# generation limits, run length/checkpointing, and precision.
training_args = GRPOConfig(
    # Optimizer and learning-rate schedule.
    learning_rate = 5e-6,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_torch_fused",
    logging_steps = 10,

    # One sample per device per step, accumulated over 4 steps.
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,

    # Completions sampled per prompt, and token limits for prompt/completion.
    num_generations = 2,
    max_prompt_length = 192,
    max_completion_length = 192,

    # Run length and checkpointing: stop after 200 steps, save every 50,
    # keep at most 2 checkpoints.
    max_steps = 200,
    save_steps = 50,
    save_total_limit = 2,

    max_grad_norm = 0.3,
    report_to = "none",  # no external experiment tracker
    output_dir = "outputs_synth_grpo_fullrun",

    # Mixed precision: exactly one of these should be on.
    bf16 = False,  # P100 doesn't support this
    fp16 = True,   # Use fp16 for better memory efficiency
)


In [None]:
# Build the GRPO trainer on the LoRA-wrapped model and the mapped training
# set, then run training.
# NOTE(review): no `callbacks=` argument is passed, so HFPushCallback is not
# registered with this trainer — confirm whether per-checkpoint uploads were
# intended.
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    # Four reward functions: two score the output format, two score the answer.
    reward_funcs = [
        match_format_exactly,
        match_format_approximately,
        check_answer,
        check_numbers,
    ],
    args = training_args,
    train_dataset = train_dataset,
)
trainer.train()

In [None]:
from transformers import TextStreamer

# One-off qualitative check: ask the trained model a question using the same
# system prompt as the training data and stream the reply as it is generated.
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user",   "content": "What is the sqrt of 101?"},
]

# Render the chat to plain text, ending with the generation prompt.
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
)

model_inputs = tokenizer(text, return_tensors = "pt").to("cuda")
reply_streamer = TextStreamer(tokenizer, skip_prompt = True)

_ = model.generate(
    **model_inputs,
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0,
    top_p = 0.95,
    top_k = 64,
    streamer = reply_streamer,
)

In [None]:
from huggingface_hub import get_token

model.save_pretrained("gemma-3")  # Local saving
tokenizer.save_pretrained("gemma-3")

# Resolve the Hub token at run time instead of hardcoding it in the notebook:
# get_token() returns the HF_TOKEN environment variable when set, otherwise
# the token stored by a previous login().
# SECURITY: the token previously pasted into this cell has been exposed and
# must be revoked at https://huggingface.co/settings/tokens.
hf_token = get_token()
if hf_token is None:
    raise RuntimeError(
        "No Hugging Face token found: set HF_TOKEN or run login() before pushing."
    )

model.push_to_hub("etoileboots/gemma-3", token = hf_token) # Online saving
tokenizer.push_to_hub("etoileboots/gemma-3", token = hf_token) # Online saving
# Second upload to a separate repo via push_to_hub_merged, together with the
# tokenizer.
model.push_to_hub_merged(
        "etoileboots/gemma-3-full-finetune", tokenizer,
        token = hf_token
    )

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Reload the uploaded model from the Hub with plain transformers.
# Note: this rebinds `tokenizer` and `model`, so every later cell uses the
# reloaded objects rather than the ones that were trained above.
model_id = "etoileboots/gemma-3-full-finetune"  # assuming it's the merged model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
model.eval()

# Simple input (raw text, no chat template) as a smoke test. The original
# cell ran this identical generation twice; once is enough.
prompt = "What is 12 multiplied by 7?"

# Tokenize input
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=64)

# Print response
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
from datasets import Dataset
import pandas as pd
import torch
import re
from tqdm import tqdm
from transformers import AutoTokenizer

# --- Load synthetic GSM8K test split ---
def load_synthetic_gsm8k(split="test"):
    """Read one split of the synthetic GSM8K parquet data into a `Dataset`.

    This re-definition replaces the loader defined earlier in the notebook;
    it defaults to the test split and validates `split` the same way.

    Args:
        split: "train" or "test".

    Returns:
        A `datasets.Dataset` built from the parquet file of that split.

    Raises:
        ValueError: if `split` is neither "train" nor "test".
    """
    base_path = "hf://datasets/gretelai/gsm8k-synthetic-diverse-8b/data/"
    file_map = {
        "train": "train-00000-of-00001.parquet",
        "test": "test-00000-of-00001.parquet"
    }
    # Fail with the same clear error as the earlier definition instead of a
    # bare KeyError from the dict lookup.
    if split not in file_map:
        raise ValueError("Split must be 'train' or 'test'")

    df = pd.read_parquet(base_path + file_map[split])
    dataset = Dataset.from_pandas(df)
    return dataset

# --- Format prompt with system + user messages ---
def format_prompt(q):
    """Wrap question `q` as a two-message chat: fixed system instruction, then the user turn."""
    system_message = {'role': 'system', 'content': "Respond with step-by-step reasoning and a final answer."}
    user_message = {'role': 'user', 'content': q}
    return [system_message, user_message]

# --- Extract final number from output ---
def extract_number(text):
    """Return the last number found in `text` as a string, or "" if there is none.

    A number may have a leading sign, a decimal part, and "," thousands
    separators; separators are removed from the result, so "1,234" yields
    "1234" rather than just its last group "234".
    """
    # First alternative: comma-grouped number (groups of exactly three digits).
    # Second alternative: plain integer or decimal, as before.
    matches = re.findall(
        r"[-+]?(?:\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d*\.?\d+)",
        text,
    )
    return matches[-1].replace(",", "") if matches else ""

# --- Normalize answers for comparison ---
def normalize(x):
    """Return a canonical string for answer `x` so equal answers compare equal.

    Numeric answers (after removing "," separators, "$" and whitespace) are
    canonicalised through float, so "18", "18.00" and "1,8"-free variants such
    as " 18 " agree while "1.5" and "15", or "-5" and "5", stay different.
    Non-numeric answers fall back to the lower-cased text with every
    non-alphanumeric character removed.
    """
    text = x.lower().strip()
    try:
        value = float(re.sub(r"[,$\s]", "", text))
    except ValueError:
        # Not a number: compare on letters and digits only.
        return re.sub(r"[^a-zA-Z0-9]", "", text)
    return repr(value)

# --- Load and prepare data ---
test_data = load_synthetic_gsm8k("test").select(range(50))  # 🔁 Adjust range for more data
prompts = [format_prompt(ex["question"]) for ex in test_data]
gold_answers = [ex["answer"].split("####")[-1].strip() for ex in test_data]

# --- Set tokenizer left padding (decoder-only fix) ---
tokenizer.padding_side = "left"

# --- Run inference ---
preds, outputs = [], []
for prompt in tqdm(prompts):
    # 🧠 Step 1: Format the prompt properly.
    # add_generation_prompt=True ends the text with the start of the model's
    # turn; without it the model is not cued to answer.
    text = tokenizer.apply_chat_template(
        prompt, tokenize=False, add_generation_prompt=True
    )

    # 🧠 Step 2: Tokenize manually.
    # The templated text already contains the special tokens, so they must
    # not be added a second time here.
    tokenized = tokenizer(
        text, return_tensors="pt", padding=True, add_special_tokens=False
    ).to(model.device)

    # 🧠 Step 3: Generate
    with torch.no_grad():
        out = model.generate(**tokenized, max_new_tokens=128)

    # 🧠 Step 4: Decode only the newly generated tokens. Decoding the whole
    # sequence would include the question, whose numbers could then be
    # picked up by extract_number as the "prediction".
    prompt_length = tokenized["input_ids"].shape[1]
    decoded = tokenizer.decode(out[0][prompt_length:], skip_special_tokens=True)
    outputs.append(decoded)
    preds.append(extract_number(decoded))


# --- Score EM Accuracy ---
# 1 where the normalised prediction equals the normalised gold answer, else 0.
scores = [
    int(pred_norm == gold_norm)
    for pred_norm, gold_norm in zip(map(normalize, preds), map(normalize, gold_answers))
]
accuracy = sum(scores) / len(scores)
print(f"\n🔥 Exact Match Accuracy on Synthetic Test Set: {accuracy:.2%}")

# --- Save Results ---
# One row per evaluated question, written to a CSV next to the notebook.
df = pd.DataFrame(dict(
    question=[ex["question"] for ex in test_data],
    gold_answer=gold_answers,
    predicted_answer=preds,
    raw_output=outputs,
    exact_match=scores,
))
df.to_csv("synthetic_test_eval.csv", index=False)
print("✅ Saved evaluation to 'synthetic_test_eval.csv'")