## Synthetic Data Training


In [1]:
# Kaggle-compatible setup (Unsloth + GRPO + Gemma-3)
!pip install --upgrade pip -q
!pip install unsloth trl peft accelerate bitsandbytes datasets huggingface_hub -q
!pip install --no-deps git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3 -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m90.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m76.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m94.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m88.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━

## Load up Gemma3-1B and Set Parameters

In [2]:
from unsloth import FastModel
import torch
max_seq_length = 1024

fourbit_models = [
    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",

    # Other popular models!
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/Llama-3.3-70B",
    "unsloth/mistral-7b-instruct-v0.3",
    "unsloth/Phi-4",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-1b-it",
    max_seq_length = max_seq_length, # Choose any for long context!
    load_in_4bit = False,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-04-25 06:51:45.915444: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745563906.318468      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745563906.423602      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Gemma3 patching. Transformers: 4.50.0.dev0.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Add LORA adaptors to update only a small number of parameters.

In [3]:
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False,
    finetune_language_layers   = True,
    finetune_attention_modules = True,
    finetune_mlp_modules       = True,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    r = 8,
    lora_alpha = 8,
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)


Unsloth: Making `model.base_model.model.model` require gradients


In [4]:
from peft import get_peft_model_state_dict

# Check if LoRA weights are being tracked
peft_state = get_peft_model_state_dict(model)
print(f"LoRA parameters being trained: {len(peft_state)}")
model.print_trainable_parameters()


LoRA parameters being trained: 364
trainable params: 6,522,880 || all params: 1,006,408,832 || trainable%: 0.6481


## Data Preparation of Synthetic Data

This data is taken from Hugging Face, created from [Gretel Navigator with meta-llama](https://huggingface.co/datasets/gretelai/gsm8k-synthetic-diverse-8b). It contains ~1500 Training and 300 Test Grade School Math Problems.

In [5]:
! pip install datasets



In [6]:
from datasets import Dataset
import pandas as pd

def load_synthetic_gsm8k(split="train"):
    base_path = "hf://datasets/gretelai/gsm8k-synthetic-diverse-8b/data/"
    file_map = {
        "train": "train-00000-of-00001.parquet",
        "test": "test-00000-of-00001.parquet"
    }
    if split not in file_map:
        raise ValueError("Split must be 'train' or 'test'")

    df = pd.read_parquet(base_path + file_map[split])
    dataset = Dataset.from_pandas(df)
    return dataset

In [7]:
train_set = load_synthetic_gsm8k("train")
test_set = load_synthetic_gsm8k("test")

print(f"Train set size: {len(train_set)}")
print(f"Test set size: {len(test_set)}")

# View a sample
print("\nTrain example:")
print(train_set[0])

print("\nTest example:")
print(test_set[0])


Train set size: 1528
Test set size: 300

Train example:
{'difficulty': 'hard', 'difficulty_description': 'Complex multi-step problems that challenge students with more involved reasoning and multiple operations', 'topic': 'data interpretation', 'topic_description': 'Interpreting data from simple graphs, charts, and tables', 'context': 'family', 'context_description': 'Family activities like sharing chores, planning events, or splitting costs', 'age_group': 'grades 4-5', 'age_group_description': 'Problems suitable for students aged 9-11 (grades 4-5)', 'question': 'Maria and her 9 cousins are helping their aunt and uncle sort and pack boxes for a garage sale. They have 24 boxes to pack, and each box can hold 8 items. If they want to put an equal number of items in each box, how many items can they pack in total?', 'answer': "Let's solve this step by step:\n1. They have 24 boxes to pack, and each box can hold 8 items. Total items = <<24*8=192>>192\n2. Since they want to put an equal numbe

### Check the structure with question & answer

In [8]:
train_set[0]["question"]

'Maria and her 9 cousins are helping their aunt and uncle sort and pack boxes for a garage sale. They have 24 boxes to pack, and each box can hold 8 items. If they want to put an equal number of items in each box, how many items can they pack in total?'

In [9]:
train_set[0]["answer"]

"Let's solve this step by step:\n1. They have 24 boxes to pack, and each box can hold 8 items. Total items = <<24*8=192>>192\n2. Since they want to put an equal number of items in each box, we need to divide the total number of items by the number of boxes. Items per box = <<192/24=8>>8\n3. To find the total number of items they can pack, we multiply the number of items per box by the number of boxes. Total items = <<8*24=192>>192\n#### 192"

### Notice the answer has a ####, so we extract that

In [10]:
def extract_hash_answer(text):
    if "####" not in text: return None
    return text.split("####")[1].strip()
extract_hash_answer(train_set[0]["answer"])

'192'

We now create a system prompt which can be customized. We add 4 extra symbols for working out or thinking / reasoning sections and a final answer:

In [11]:
reasoning_start = "<start_working_out>"
reasoning_end   = "<end_working_out>"
solution_start = "<SOLUTION>"
solution_end = "</SOLUTION>"

system_prompt = \
f"""You are given a problem.
Think about the problem and provide your working out.
Place it between {reasoning_start} and {reasoning_end}.
Then, provide your solution between {solution_start}{solution_end}"""
system_prompt

'You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <start_working_out> and <end_working_out>.\nThen, provide your solution between <SOLUTION></SOLUTION>'

Let's map our synthetic dataset and observe the first row:

In [12]:
# map training dataset
train_dataset = train_set.map(lambda x: {
    "prompt" : [
        {"role": "system", "content": system_prompt},
        {"role": "user",   "content": x["question"]},
    ],
    "answer": extract_hash_answer(x["answer"]),
})
train_dataset[0]

Map:   0%|          | 0/1528 [00:00<?, ? examples/s]

{'difficulty': 'hard',
 'difficulty_description': 'Complex multi-step problems that challenge students with more involved reasoning and multiple operations',
 'topic': 'data interpretation',
 'topic_description': 'Interpreting data from simple graphs, charts, and tables',
 'context': 'family',
 'context_description': 'Family activities like sharing chores, planning events, or splitting costs',
 'age_group': 'grades 4-5',
 'age_group_description': 'Problems suitable for students aged 9-11 (grades 4-5)',
 'question': 'Maria and her 9 cousins are helping their aunt and uncle sort and pack boxes for a garage sale. They have 24 boxes to pack, and each box can hold 8 items. If they want to put an equal number of items in each box, how many items can they pack in total?',
 'answer': '192',
 '__index_level_0__': 1140,
 'prompt': [{'content': 'You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <start_working_out> and <end_working_out>.\nThen, provi

In [13]:
test_dataset = test_set.map(lambda x: {
    "prompt" : [
        {"role": "system", "content": system_prompt},
        {"role": "user",   "content": x["question"]},
    ],
    "answer": extract_hash_answer(x["answer"]),
})
test_dataset[0]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

{'difficulty': 'easy',
 'difficulty_description': 'Simple one or two-step problems focusing on basic arithmetic or concepts',
 'topic': 'data interpretation',
 'topic_description': 'Interpreting data from simple graphs, charts, and tables',
 'context': 'school',
 'context_description': 'Classroom scenarios, such as calculating grades, attendance, or supplies',
 'age_group': 'grades 4-5',
 'age_group_description': 'Problems suitable for students aged 9-11 (grades 4-5)',
 'question': "What is the total cost of supplies for Ms. Patel's class if she buys 5 boxes of pencils at $8 each and 2 reams of paper at $2 each?",
 'answer': '44',
 '__index_level_0__': 720,
 'prompt': [{'content': 'You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <start_working_out> and <end_working_out>.\nThen, provide your solution between <SOLUTION></SOLUTION>',
   'role': 'system'},
  {'content': "What is the total cost of supplies for Ms. Patel's class if she buys 5

We create a regex format to match the reasoning sections and answers:

In [14]:
import re

match_format = re.compile(
    rf"^[\s]{{0,}}"\
    rf"{reasoning_start}.+?{reasoning_end}.*?"\
    rf"{solution_start}(.+?){solution_end}"\
    rf"[\s]{{0,}}$",
    flags = re.MULTILINE | re.DOTALL
)

In [15]:
# verify that it works
match_format.search(
    "<start_working_out>Let me think!<end_working_out>"\
    "<SOLUTION>2</SOLUTION>",
)

<re.Match object; span=(0, 71), match='<start_working_out>Let me think!<end_working_out>>

Create a reward function to match the format exactly - we reward it with 3 points if it succeeds:

In [16]:
def match_format_exactly(completions, **kwargs):
    scores = []
    for completion in completions:
        score = 0
        response = completion[0]["content"]
        # Match if format is seen exactly!
        if match_format.search(response) is not None: score += 3.0
        scores.append(score)
    return scores

In [17]:
# if it fails, give it partial rewards
def match_format_approximately(completions, **kwargs):
    scores = []
    for completion in completions:
        score = 0
        response = completion[0]["content"]
        # Count how many keywords are seen - we penalize if too many!
        # If we see 1, then plus some points!
        score += 0.5 if response.count(reasoning_start) == 1 else -0.5
        score += 0.5 if response.count(reasoning_end)   == 1 else -0.5
        score += 0.5 if response.count(solution_start)  == 1 else -0.5
        score += 0.5 if response.count(solution_end)    == 1 else -0.5
        scores.append(score)
    return scores

Finally, we want to extract the generated answer, and reward or penalize it! We also reward it based on how close the answer is to the true one via ratios:

In [18]:
def check_answer(prompts, completions, answer, **kwargs):
    question = prompts[0][-1]["content"]
    responses = [completion[0]["content"] for completion in completions]

    extracted_responses = [
        guess.group(1)
        if (guess := match_format.search(r)) is not None else None \
        for r in responses
    ]

    scores = []
    for guess, true_answer in zip(extracted_responses, answer):
        score = 0
        if guess is None:
            scores.append(0)
            continue
        # Correct answer gets 3 points!
        if guess == true_answer:
            score += 3.0
        # Match if spaces are seen
        elif guess.strip() == true_answer.strip():
            score += 1.5
        else:
            # We also reward it if the answer is close via ratios!
            # Ie if the answer is within some range, reward it!
            try:
                ratio = float(guess) / float(true_answer)
                if   ratio >= 0.9 and ratio <= 1.1: score += 0.5
                elif ratio >= 0.8 and ratio <= 1.2: score += 0.25
                else: score -= 1.0 # Penalize wrong answers
            except:
                score -= 0.5 # Penalize
        scores.append(score)
    return scores

In [19]:
# The answer may not come as a single number, let's account for that:
match_numbers = re.compile(
    rf"{solution_start}.*?([\d\.]{{1,}})",
    flags = re.MULTILINE | re.DOTALL
)
match_numbers.findall("<SOLUTION>  0.34  </SOLUTION>")

['0.34']

In [20]:
def check_numbers(prompts, completions, answer, **kwargs):
    question = prompts[0][-1]["content"]
    responses = [completion[0]["content"] for completion in completions]

    extracted_responses = [
        guess.group(1)
        if (guess := match_numbers.search(r)) is not None else None \
        for r in responses
    ]

    scores = []
    print('*'*20, f"Question:\n{question}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
    for guess, true_answer in zip(extracted_responses, answer):
        if guess is None:
            scores.append(0)
            continue
        # Convert to numbers
        try:
            true_answer = float(true_answer.strip())
            guess       = float(guess.strip())
            scores.append(1.5 if guess == true_answer else 0.0)
        except:
            scores.append(0)
            continue
    return scores

<a name="Train"></a>
### Training the model

Now set up GRPO Trainer and all configurations!

In [21]:
from huggingface_hub import upload_folder
from transformers import TrainerCallback
import os
from huggingface_hub import login

# Login first
login(token="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")

In [22]:
class HFPushCallback(TrainerCallback):
    def __init__(self, model, repo_id):
        self.model = model
        self.repo_id = repo_id

    def on_save(self, args, state, control, **kwargs):
        step = state.global_step
        adapter_path = f"checkpoint_lora_step_{step}"
        print(f"💾 Saving LoRA at {adapter_path} and uploading to HF...")

        # Save LoRA adapter
        self.model.save_lora(adapter_path)

        # Push to Hugging Face
        upload_folder(
            repo_id=self.repo_id,
            folder_path=adapter_path,
            repo_type="model",
            path_in_repo=f"lora_step_{step}",  # Subfolder inside repo
            commit_message=f"Checkpoint at step {step}"
        )

        print(f"✅ Uploaded LoRA checkpoint to {self.repo_id}/lora_step_{step}")


In [23]:
from trl import GRPOConfig, GRPOTrainer

training_args = GRPOConfig(
    learning_rate = 5e-6,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_torch_fused",
    logging_steps = 10,

    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,

    num_generations = 2,
    max_prompt_length = 192,
    max_completion_length = 192,

    max_steps = 200,
    save_steps = 50,
    save_total_limit = 2,

    max_grad_norm = 0.3,
    report_to = "none",
    output_dir = "outputs_synth_grpo_fullrun",

    bf16 = False,  # P100 doesn't support this
    fp16 = True,   # Use fp16 for better memory efficiency
)


Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 2


In [24]:
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        match_format_exactly,
        match_format_approximately,
        check_answer,
        check_numbers,
    ],
    args = training_args,
    train_dataset = train_dataset,
)
trainer.train()

Unsloth: Switching to float32 training since model cannot work with float16


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,528 | Num Epochs = 2 | Total steps = 200
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 6,522,880/1,006,408,832 (0.65% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


******************** Question:
Maya and her classmates are going on a camping trip. They have 15 big bags of marshmallows to roast over a campfire. If they eat 3 bags each day for 4 days, how many bags of marshmallows will they have left after the trip? 
Answer:
3 
Response:
<start_working_out>
We need to find out how many bags of marshmallows Maya and her classmates will have left after the trip.
They start with 15 bags of marshmallows.
They eat 3 bags each day for 4 days.
The number of bags eaten in 4 days is 3 bags/day * 4 days = 12 bags.
The number of bags left after 4 days is 15 bags - 12 bags = 3 bags.

<SOLUTION>3
 
Extracted:
3
Unsloth: Will smartly offload gradients to save VRAM!
******************** Question:
Ms. Patel's family is planning a big Diwali celebration and needs to buy decorations. They want to put a border of colorful lights around the entire house. If the house is 30 meters long and the lights come in 5-meter rolls, how many rolls of lights will Ms. Patel's fami

Step,Training Loss,reward,reward_std,completion_length,kl,rewards / match_format_exactly,rewards / match_format_approximately,rewards / check_answer,rewards / check_numbers
10,0.0,-0.025,0.574524,165.275,1e-05,0.0,-0.475,0.0,0.45
20,0.0,0.190625,0.464039,163.10625,0.00013,0.0,-0.38125,0.0,0.571875
30,0.0,-0.115625,0.37565,169.43125,0.000722,0.0,-0.51875,0.0,0.403125
40,0.0001,0.203125,0.517072,162.75625,0.002445,0.01875,-0.36875,0.01875,0.534375
50,0.0002,0.346875,0.587783,160.68125,0.004755,0.01875,-0.3,0.01875,0.609375
60,0.0002,0.171875,0.357973,157.89375,0.006049,0.0,-0.38125,0.0,0.553125
70,0.0004,0.615625,0.508233,154.2625,0.009845,0.01875,-0.21875,0.01875,0.796875
80,0.0004,0.359375,0.464039,155.5125,0.010196,0.0,-0.26875,0.0,0.628125
90,0.0006,0.346875,0.428683,159.05,0.014584,0.0,-0.3,0.0,0.646875
100,0.0007,0.478125,0.472878,153.15,0.018488,0.0,-0.1875,0.0,0.665625


******************** Question:
Maria's family is planning a road trip from their hometown in Chicago to visit their relatives in New York City. They have a map that shows the distance from Chicago to New York City is 790 miles. If they drive for 5 hours on the first day and cover 1/3 of the total distance, how many more hours will it take them to reach New York City if they maintain the same speed? 
Answer:
10 
Response:
<start_working_out>
Let $d$ be the total distance from Chicago to New York City, which is 790 miles.
On the first day, Maria's family drives for 5 hours at a constant speed. The distance covered is $\frac{1}{3}$ of the total distance.
Distance covered on the first day = $\frac{1}{3} \times 790 = \frac{790}{3}$ miles.
Time taken on the first day = 5 hours.
Speed of the family = $\frac{790/3}{5} = \frac{790}{3 \times 5} = \frac{790}{15}$ miles per hour.
The remaining distance to New York City is $790 - \frac{790}{3} = \frac{2370 - 7 
Extracted:
None
******************** 

TrainOutput(global_step=200, training_loss=0.0006829423276576563, metrics={'train_runtime': 14743.7914, 'train_samples_per_second': 0.217, 'train_steps_per_second': 0.014, 'total_flos': 0.0, 'train_loss': 0.0006829423276576563})

In [25]:
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user",   "content": "What is the sqrt of 101?"},
]

text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
    tokenize = False,
)
from transformers import TextStreamer
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

<bos><bos><start_of_turn>user
You are given a problem.
Think about the problem and provide your working out.
Place it between <start_working_out> and <end_working_out>.
Then, provide your solution between <SOLUTION></SOLUTION>

What is the sqrt of 101?<end_of_turn>
<start_of_turn>model
<start_working_out>
The square root of 101 is approximately 10.0498756.
<SOLUTION>
10.0498756<end_of_turn>


In [26]:
model.save_pretrained("gemma-3")  # Local saving
tokenizer.save_pretrained("gemma-3")
model.push_to_hub("etoileboots/gemma-3", token = "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") # Online saving
tokenizer.push_to_hub("etoileboots/gemma-3", token = "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") # Online saving
model.push_to_hub_merged(
        "etoileboots/gemma-3-full-finetune", tokenizer,
        token = "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"
    )

README.md:   0%|          | 0.00/578 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/26.1M [00:00<?, ?B/s]

Saved model to https://huggingface.co/etoileboots/gemma-3


  0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [00:22<00:00, 22.05s/it]


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "etoileboots/gemma-3-full-finetune"  # assuming it's the merged model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

import torch
model.eval()

inputs = tokenizer("What is 12 multiplied by 7?", return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=64)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Simple input
prompt = "What is 12 multiplied by 7?"

# Tokenize input
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=64)
    
# Print response
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/932 [00:00<?, ?B/s]

2025-04-25 19:44:32.481941: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745610272.677627      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745610272.735541      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

What is 12 multiplied by 7?

We can solve this by multiplying 12 by 7.
12 x 7 = (10 + 2) x 7 = 10 x 7 + 2 x 7 = 70 + 14 = 84.

Alternatively, we can write

What is 12 multiplied by 7?

We can multiply 12 by 7 using the standard multiplication method:

12 x 7 = (10 + 2) x 7 = 10 x 7 + 2 x 7 = 70 + 14 = 84

Alternatively, we can use


In [5]:
from datasets import Dataset
import pandas as pd
import torch
import re
from tqdm import tqdm
from transformers import AutoTokenizer

# --- Load synthetic GSM8K test split ---
def load_synthetic_gsm8k(split="test"):
    base_path = "hf://datasets/gretelai/gsm8k-synthetic-diverse-8b/data/"
    file_map = {
        "train": "train-00000-of-00001.parquet",
        "test": "test-00000-of-00001.parquet"
    }
    df = pd.read_parquet(base_path + file_map[split])
    dataset = Dataset.from_pandas(df)
    return dataset

# --- Format prompt with system + user messages ---
def format_prompt(q): 
    return [
        {'role': 'system', 'content': "Respond with step-by-step reasoning and a final answer."},
        {'role': 'user', 'content': q}
    ]

# --- Extract final number from output ---
def extract_number(text):
    match = re.findall(r"[-+]?[0-9]*\.?[0-9]+", text)
    return match[-1] if match else ""

# --- Normalize answers for comparison ---
def normalize(x):
    return re.sub(r"[^a-zA-Z0-9]", "", x.lower().strip())

# --- Load and prepare data ---
test_data = load_synthetic_gsm8k("test").select(range(50))  # 🔁 Adjust range for more data
prompts = [format_prompt(ex["question"]) for ex in test_data]
gold_answers = [ex["answer"].split("####")[-1].strip() for ex in test_data]

# --- Set tokenizer left padding (decoder-only fix) ---
tokenizer.padding_side = "left"

# --- Run inference ---
preds, outputs = [], []
for prompt in tqdm(prompts):
    # 🧠 Step 1: Format the prompt properly
    text = tokenizer.apply_chat_template(prompt, tokenize=False)

    # 🧠 Step 2: Tokenize manually
    tokenized = tokenizer(text, return_tensors="pt", padding=True).to(model.device)
    tokenized = {k: v.to(model.dtype) if v.dtype == torch.float else v for k, v in tokenized.items()}

    # 🧠 Step 3: Generate
    with torch.no_grad():
        out = model.generate(**tokenized, max_new_tokens=128)

    # 🧠 Step 4: Decode
    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    outputs.append(decoded)
    preds.append(extract_number(decoded))


# --- Score EM Accuracy ---
scores = [int(normalize(p) == normalize(g)) for p, g in zip(preds, gold_answers)]
accuracy = sum(scores) / len(scores)
print(f"\n🔥 Exact Match Accuracy on Synthetic Test Set: {accuracy:.2%}")

# --- Save Results ---
df = pd.DataFrame({
    "question": [ex["question"] for ex in test_data],
    "gold_answer": gold_answers,
    "predicted_answer": preds,
    "raw_output": outputs,
    "exact_match": scores,
})
df.to_csv("synthetic_test_eval.csv", index=False)
print("✅ Saved evaluation to 'synthetic_test_eval.csv'")

100%|██████████| 50/50 [26:48<00:00, 32.16s/it]


🔥 Exact Match Accuracy on Synthetic Test Set: 22.00%
✅ Saved evaluation to 'synthetic_test_eval.csv'



