In [1]:
!pip install -q trl math_verify evaluate vllm==0.8.2 unsloth

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!cp -r "/content/drive/MyDrive/Graduate Project/MATH_train_staging.csv" .
!cp -r "/content/drive/MyDrive/Graduate Project/MATH_val_staging.csv" .
# !cp -r "/content/drive/MyDrive/Graduate Project/Llama-3.2-3B-Instruct-CoD-Reasoning" ./
# !cp -r "/content/drive/MyDrive/Graduate Project/Llama-3.2-3B-Instruct-CoD" ./

In [4]:
from unsloth import FastLanguageModel, is_bfloat16_supported
from IPython.display import display, Markdown
from trl import GRPOConfig, GRPOTrainer
from math_verify import parse, verify
from datasets import Dataset
from pprint import pprint
from tqdm import tqdm

import pandas as pd
import evaluate
import random
import torch
import re

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch SmolVLMForConditionalGeneration forward function.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-29 03:20:21 [__init__.py:239] Automatically detected platform cuda.


In [5]:
SEED = 42
random.seed(SEED)
_ = torch.manual_seed(SEED)
DEVICE = (
    torch.device("cuda")
    if torch.cuda.is_available()
    else (
        torch.device("mps")
        if torch.backends.mps.is_available()
        else torch.device("cpu")
    )
)
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
MAX_NEW_TOKENS = 10000
MAX_SEQ_LEN = 512
LORA_RANK = 16
NUM_EPOCHS = 5
BLEU = evaluate.load("bleu")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [6]:
train_data = pd.read_csv("MATH_train_staging.csv")
val_data = pd.read_csv("MATH_val_staging.csv")

In [7]:
train_data["extracted_answer"] = train_data["reasoning"].map(lambda x: parse(x)[1])
val_data["extracted_answer"] = val_data["reasoning"].map(lambda x: parse(x)[1])
for split_str in [
    "-OR-",
    " OR ",
    "OR\n",
    "\\$\\$OR \\$\\$",
    "\\\\text\\{OR\\}",
    "\\\\textbf\\{OR\\}",
    "\\\\centerline\\{\\\\bf \\{OR\\}\\}",
    "\\\\centerline\\{\\{\\\\bf OR\\}\\}",
]:
    train_data.loc[
        train_data["reasoning"].str.split(split_str).str.len() > 1, "reasons"
    ] = train_data[train_data["reasoning"].str.split(split_str).str.len() > 1][
        "reasoning"
    ].str.split(
        split_str
    )
    val_data.loc[
        val_data["reasoning"].str.split(split_str).str.len() > 1, "reasons"
    ] = val_data[val_data["reasoning"].str.split(split_str).str.len() > 1][
        "reasoning"
    ].str.split(
        split_str
    )
train_data.loc[train_data["reasons"].isna(), "reasons"] = train_data.loc[
    train_data["reasons"].isna()
]["reasoning"].str.split("-OR-")
val_data.loc[val_data["reasons"].isna(), "reasons"] = val_data.loc[
    val_data["reasons"].isna()
]["reasoning"].str.split("-OR-")

In [8]:
display(Markdown("Question -> " + train_data.loc[0]["question_text"]))
display(Markdown("Reasoning -> " + train_data.loc[0]["reasons"][0]))
display(Markdown("Answer -> $" + train_data.loc[0]["extracted_answer"] + "$"))

Question -> Find the equation whose graph is a parabola with vertex $(2,4)$, vertical axis of symmetry, and contains the point $(1,1)$. Express your answer in the form "$ax^2+bx+c$".

Reasoning -> Since the axis of symmetry is vertical and the vertex is $(2,4)$, the parabola may also be written as  \[y=a(x-2)^2+4\] for some value of $a$.  Plugging the point $(1,1)$ into this expression gives  \[1=a(1-2)^2+4=a+4.\] This tells us $a=-3$.

Our equation is  \[y=-3(x-2)^2+4.\] Putting it $y=ax^2+bx+c$ form requires expanding the square, so we get  \[y=-3(x^2-4x+4)+4=\boxed{-3x^2+12x-8}.\]

Answer -> $-3x^2+12x-8$

In [9]:
train_dataset = Dataset.from_pandas(train_data[["dataset_id", "question_id", "question_text", "reasons", "extracted_answer"]])
val_dataset = Dataset.from_pandas(val_data[["dataset_id", "question_id", "question_text", "reasons", "extracted_answer"]])

In [10]:
SYSTEM_PROMPT = (
    "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. "
    "The Assistant first thinks about the reasoning process in the mind and then provides the User with the answer. "
    "The Assistant thinks and reasons about the question in a step-by-step manner but only keeps a minimum draft for each thinking step, "
    "with 5 words at most, focussing on essential calculations/transformations and ensures the steps follow a logical progression. "
    "If the question asks for a number or expression, the Assistant highlights it with the boxed latex command inside the answer. "
    "The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
    "<think> step-by-step draft of reasoning process here </think><answer> answer with all the steps here </answer>"
)

In [11]:
def make_conversation(example):
    return {
        "prompt": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": example["question_text"]},
        ]
    }

In [12]:
train_dataset = train_dataset.map(make_conversation).remove_columns(["question_text"]).filter(lambda x: x["question_id"] < 1739)
val_dataset = val_dataset.map(make_conversation).remove_columns(["question_text"]).filter(lambda x: x["question_id"] < 1739)

Map:   0%|          | 0/5988 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5988 [00:00<?, ? examples/s]

Map:   0%|          | 0/1497 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1497 [00:00<?, ? examples/s]

In [13]:
llm, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME,
    max_seq_length = 4*MAX_SEQ_LEN,
    load_in_4bit = True,
    fast_inference = True,
    max_lora_rank = LORA_RANK,
    gpu_memory_utilization = 0.7,
    random_state = SEED,
)
model = FastLanguageModel.get_peft_model(
    llm,
    r = LORA_RANK,
    target_modules = ["gate_proj", "up_proj", "down_proj",],
    lora_alpha = LORA_RANK,
    use_gradient_checkpointing = "unsloth",
    random_state = SEED,
)
model.print_trainable_parameters()

==((====))==  Unsloth 2025.4.1: Fast Llama patching. Transformers: 4.51.3. vLLM: 0.8.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit with actual GPU utilization = 69.2%
Unsloth: Your GPU has CUDA compute capability 8.0 with VRAM = 39.56 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 320.
Unsloth: vLLM's KV Cache can use up to 24.95 GB. Also swap space = 6 GB.
INFO 04-29 03:21:02 [config.py:585] This model supports multiple tasks: {'score', 'reward', 'embed', 'classify', 'generate'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbytes

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

INFO 04-29 03:21:09 [cuda.py:291] Using Flash Attention backend.
INFO 04-29 03:21:09 [parallel_state.py:954] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 04-29 03:21:09 [model_runner.py:1110] Starting to load model unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit...
INFO 04-29 03:21:10 [loader.py:1155] Loading weights with BitsAndBytes quantization. May take a while ...
INFO 04-29 03:21:10 [weight_utils.py:265] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

INFO 04-29 03:21:17 [weight_utils.py:281] Time spent downloading weights for unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit: 7.208235 seconds
INFO 04-29 03:21:17 [weight_utils.py:315] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-29 03:21:19 [punica_selector.py:18] Using PunicaWrapperGPU.
INFO 04-29 03:21:20 [model_runner.py:1146] Model loading took 2.2968 GB and 9.952331 seconds
INFO 04-29 03:21:28 [worker.py:267] Memory profiling takes 7.54 seconds
INFO 04-29 03:21:28 [worker.py:267] the current vLLM instance can use total_gpu_memory (39.56GiB) x gpu_memory_utilization (0.69) = 27.37GiB
INFO 04-29 03:21:28 [worker.py:267] model weights take 2.30GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.49GiB; the rest of the memory reserved for KV Cache is 23.50GiB.
INFO 04-29 03:21:28 [executor_base.py:111] # cuda blocks: 13748, # CPU blocks: 3510
INFO 04-29 03:21:28 [executor_base.py:116] Maximum concurrency for 2048 tokens per request: 107.41x
INFO 04-29 03:21:32 [model_runner.py:1442] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. I

Capturing CUDA graph shapes: 100%|██████████| 43/43 [01:11<00:00,  1.66s/it]

INFO 04-29 03:22:43 [model_runner.py:1570] Graph capturing finished in 72 secs, took 0.70 GiB
INFO 04-29 03:22:43 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 83.55 seconds





tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

Not an error, but Unsloth cannot patch Attention layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Not an error, but Unsloth cannot patch O projection layer with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.4.1 patched 28 layers with 0 QKV layers, 0 O layers and 28 MLP layers.


trainable params: 15,138,816 || all params: 3,227,888,640 || trainable%: 0.4690


In [14]:
def format_reward(completions, **kwargs):
    """Reward function that checks if the completion has a specific format."""
    pattern = r"^<think>.*?</think>\s*<answer>.*?</answer>$"
    completion_contents = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, content) for content in completion_contents]
    return [1.0 if match else 0.0 for match in matches]

In [15]:
def accuracy_reward(completions, **kwargs):
    """Reward function that checks if the answer in completion has the ground truth and is equal."""
    solutions = kwargs["extracted_answer"]
    pattern = r"^<think>(.*?)</think>\s*<answer>(.*?)</answer>$"
    completion_contents = [completion[0]["content"] for completion in completions]
    rewards = []
    for content, solution in zip(completion_contents, solutions):
        gold_parsed = solution
        matches = re.search(pattern, content)
        if matches:
            if len(matches.groups()) == 2:
                answer_parsed = parse(matches.group(2))
            else:
                answer_parsed = None
        else:
            answer_parsed = None
        if len(gold_parsed) != 0:
            try:
                if answer_parsed is None:
                    rewards.append(0.0)
                else:
                    rewards.append(float(verify(answer_parsed, gold_parsed)))
            except Exception:
                rewards.append(0.0)
        else:
            rewards.append(1.0)
    return rewards

In [16]:
def brevity_reward(completions, **kwargs):
    """Reward function that ensures the reasoning is much smaller than ground truth reasoning."""
    reasons = kwargs["reasons"]
    pattern = r"^<think>(.*?)</think>\s*<answer>(.*?)</answer>$"
    completion_contents = [completion[0]["content"] for completion in completions]
    rewards = []
    for content, reason in zip(completion_contents, reasons):
        matches = re.search(pattern, content)
        if matches:
            if len(matches.groups()) >= 1:
                reason_parsed = matches.group(1)
            else:
                reason_parsed = None
        else:
            reason_parsed = None
        if reason_parsed is None:
            rewards.append(0.0)
        else:
            try:
                reward = 0
                for rsn in reason:
                    reward += 1 - (len(reason_parsed) / len(rsn))
                reward /= len(reason)
                rewards.append(reward)
            except Exception:
                rewards.append(0.0)
    return rewards

In [17]:
def similarity_reward(completions, **kwargs):
    """Reward function that ensures the reasoning is similar to the ground truth."""
    reasons = kwargs["reasons"]
    pattern = r"^<think>(.*?)</think>\s*<answer>(.*?)</answer>$"
    completion_contents = [completion[0]["content"] for completion in completions]
    rewards = []
    for content, reason in zip(completion_contents, reasons):
        matches = re.search(pattern, content)
        if matches:
            if len(matches.groups()) >= 1:
                reason_parsed = matches.group(1)
            else:
                reason_parsed = None
        else:
            reason_parsed = None
        if reason_parsed is None:
            rewards.append(0.0)
        else:
            try:
                bleu = BLEU.compute(predictions=[reason_parsed], references=[reason])
                rewards.append(bleu["bleu"])
            except Exception:
                rewards.append(0.0)
    return rewards

In [18]:
training_args = GRPOConfig(
    use_vllm=True,
    output_dir=f"/content/drive/MyDrive/Graduate Project/{MODEL_NAME.split('/')[1]}-CoD",
    learning_rate=5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "paged_adamw_8bit",
    remove_unused_columns=False,
    gradient_accumulation_steps=4,
    num_train_epochs=NUM_EPOCHS,
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_checkpointing=True,
    max_completion_length=MAX_NEW_TOKENS,
    num_generations=8,
    max_prompt_length=4*MAX_SEQ_LEN,
    max_grad_norm = 0.1,
    report_to=["tensorboard"],
    logging_steps=10,
    push_to_hub=False,
    save_strategy="steps",
    save_steps=10,
)

In [19]:
trainer = GRPOTrainer(
    model=model,
    reward_funcs=[format_reward, accuracy_reward, brevity_reward, similarity_reward],
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer
)

In [20]:
trainer.train(resume_from_checkpoint=True)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,363 | Num Epochs = 5 | Total steps = 1,700
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 15,138,816/3,000,000,000 (0.50% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,reward,reward_std,completion_length,kl,rewards / format_reward,rewards / accuracy_reward,rewards / brevity_reward,rewards / similarity_reward
1410,0.011,1.34616,0.835208,159.95,0.275946,0.709375,0.0875,0.544888,0.004397
1420,0.0116,1.331144,0.878516,142.88125,0.290924,0.703125,0.090625,0.53061,0.006785
1430,0.0147,1.323853,0.932046,132.046875,0.366616,0.684375,0.084375,0.554593,0.000511
1440,0.0132,1.362391,0.903481,146.509375,0.331055,0.703125,0.059375,0.597385,0.002507
1450,0.0117,1.345593,0.9383,161.93125,0.291413,0.675,0.11875,0.549353,0.00249
1460,0.0123,1.314942,0.954162,147.69375,0.306835,0.69375,0.071875,0.545264,0.004053
1470,0.0158,1.385399,0.901166,132.20625,0.396235,0.709375,0.115625,0.557141,0.003258
1480,0.0123,1.311108,0.841816,146.74375,0.308059,0.684375,0.078125,0.545211,0.003397
1490,0.0169,1.399195,0.905779,131.371875,0.421322,0.721875,0.103125,0.572314,0.001881
1500,0.0122,1.209624,0.884362,175.009375,0.305445,0.634375,0.04375,0.528446,0.003054


TrainOutput(global_step=1700, training_loss=0.0023262363716083415, metrics={'train_runtime': 9253.5971, 'train_samples_per_second': 0.736, 'train_steps_per_second': 0.184, 'total_flos': 0.0, 'train_loss': 0.0023262363716083415})

In [21]:
trainer.save_model(f"/content/drive/MyDrive/Graduate Project/{MODEL_NAME.split('/')[1]}-CoD-Reasoning")

In [22]:
# !cp -r Llama-3.2-3B-Instruct-CoD-Reasoning "/content/drive/MyDrive/Graduate Project/"
# !cp -r Llama-3.2-3B-Instruct-CoD "/content/drive/MyDrive/Graduate Project/"