In [1]:
!pip install -q trl math_verify evaluate vllm==0.8.2 unsloth

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp -r "/content/drive/MyDrive/COT/MATH_train_staging.csv" .
!cp -r "/content/drive/MyDrive/COT/MATH_val_staging.csv" .

# !cp -r "/content/drive/MyDrive/Graduate Project/Llama-3.2-3B-Instruct-Reasoning" ./
# !cp -r "/content/drive/MyDrive/Graduate Project/Llama-3.2-3B-Instruct" ./


In [None]:
from huggingface_hub import login
hf_token = input("Please enter your Hugging Face API Key:")
login(token=hf_token)

In [4]:
from unsloth import FastLanguageModel, is_bfloat16_supported
from IPython.display import display, Markdown
from trl import GRPOConfig, GRPOTrainer
from math_verify import parse, verify
from datasets import Dataset
from pprint import pprint
from tqdm import tqdm

import pandas as pd
import evaluate
import random
import torch
import re

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-06 18:26:52 [__init__.py:239] Automatically detected platform cuda.


In [None]:
SEED = 42
random.seed(SEED)
_ = torch.manual_seed(SEED)
DEVICE = (
    torch.device("cuda")
    if torch.cuda.is_available()
    else (
        torch.device("mps")
        if torch.backends.mps.is_available()
        else torch.device("cpu")
    )
)
MODEL_NAME = "Eshita-ds/Llama-3.2-1B-DPO"
MAX_NEW_TOKENS = 10000
MAX_SEQ_LEN = 512
LORA_RANK = 16
NUM_EPOCHS = 2
BLEU = evaluate.load("bleu")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [None]:
train_data_original = pd.read_csv("MATH_train_staging.csv")
val_data_original = pd.read_csv("MATH_val_staging.csv")

train_data = train_data_original.sample(frac=0.1, random_state=42)
val_data = val_data_original.sample(frac=0.1, random_state=42)

In [7]:
train_data["extracted_answer"] = train_data["reasoning"].map(lambda x: parse(x)[1])
val_data["extracted_answer"] = val_data["reasoning"].map(lambda x: parse(x)[1])
for split_str in [
    "-OR-",
    " OR ",
    "OR\n",
    "\\$\\$OR \\$\\$",
    "\\\\text\\{OR\\}",
    "\\\\textbf\\{OR\\}",
    "\\\\centerline\\{\\\\bf \\{OR\\}\\}",
    "\\\\centerline\\{\\{\\\\bf OR\\}\\}",
]:
    train_data.loc[
        train_data["reasoning"].str.split(split_str).str.len() > 1, "reasons"
    ] = train_data[train_data["reasoning"].str.split(split_str).str.len() > 1][
        "reasoning"
    ].str.split(
        split_str
    )
    val_data.loc[
        val_data["reasoning"].str.split(split_str).str.len() > 1, "reasons"
    ] = val_data[val_data["reasoning"].str.split(split_str).str.len() > 1][
        "reasoning"
    ].str.split(
        split_str
    )
train_data.loc[train_data["reasons"].isna(), "reasons"] = train_data.loc[
    train_data["reasons"].isna()
]["reasoning"].str.split("-OR-")
val_data.loc[val_data["reasons"].isna(), "reasons"] = val_data.loc[
    val_data["reasons"].isna()
]["reasoning"].str.split("-OR-")

In [8]:
display(Markdown("Question -> " + train_data.loc[0]["question_text"]))
display(Markdown("Reasoning -> " + train_data.loc[0]["reasons"][0]))
display(Markdown("Answer -> $" + train_data.loc[0]["extracted_answer"] + "$"))

Question -> Find the equation whose graph is a parabola with vertex $(2,4)$, vertical axis of symmetry, and contains the point $(1,1)$. Express your answer in the form "$ax^2+bx+c$".

Reasoning -> Since the axis of symmetry is vertical and the vertex is $(2,4)$, the parabola may also be written as  \[y=a(x-2)^2+4\] for some value of $a$.  Plugging the point $(1,1)$ into this expression gives  \[1=a(1-2)^2+4=a+4.\] This tells us $a=-3$.

Our equation is  \[y=-3(x-2)^2+4.\] Putting it $y=ax^2+bx+c$ form requires expanding the square, so we get  \[y=-3(x^2-4x+4)+4=\boxed{-3x^2+12x-8}.\]

Answer -> $-3x^2+12x-8$

In [9]:
train_dataset = Dataset.from_pandas(train_data[["dataset_id", "question_id", "question_text", "reasons", "extracted_answer"]])
val_dataset = Dataset.from_pandas(val_data[["dataset_id", "question_id", "question_text", "reasons", "extracted_answer"]])

In [10]:
SYSTEM_PROMPT = (
    "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
    "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning "
    "process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
    "<think> reasoning process here </think><answer> answer here </answer>"
)

In [11]:
def make_conversation(example):
    return {
        "prompt": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": example["question_text"]},
        ]
    }

In [12]:
train_dataset = train_dataset.map(make_conversation).remove_columns(["question_text"])
val_dataset = val_dataset.map(make_conversation).remove_columns(["question_text"])

Map:   0%|          | 0/5988 [00:00<?, ? examples/s]

Map:   0%|          | 0/1497 [00:00<?, ? examples/s]

In [13]:
llm, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME,
    max_seq_length = 4*MAX_SEQ_LEN,
    load_in_4bit = True,
    fast_inference = True,
    max_lora_rank = LORA_RANK,
    gpu_memory_utilization = 0.7,
    random_state = SEED,
)
model = FastLanguageModel.get_peft_model(
    llm,
    r = LORA_RANK,
    target_modules = ["gate_proj", "up_proj", "down_proj",],
    lora_alpha = LORA_RANK,
    use_gradient_checkpointing = "unsloth",
    random_state = SEED,
)
model.print_trainable_parameters()

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3. vLLM: 0.8.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit with actual GPU utilization = 69.2%
Unsloth: Your GPU has CUDA compute capability 8.0 with VRAM = 39.56 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 320.
Unsloth: vLLM's KV Cache can use up to 24.95 GB. Also swap space = 6 GB.
INFO 04-06 18:27:46 [config.py:585] This model supports multiple tasks: {'score', 'classify', 'reward', 'embed', 'generate'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbyte

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

INFO 04-06 18:27:52 [cuda.py:291] Using Flash Attention backend.
INFO 04-06 18:27:52 [parallel_state.py:954] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 04-06 18:27:52 [model_runner.py:1110] Starting to load model unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit...
INFO 04-06 18:27:53 [loader.py:1155] Loading weights with BitsAndBytes quantization. May take a while ...
INFO 04-06 18:27:55 [weight_utils.py:265] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

INFO 04-06 18:28:02 [weight_utils.py:281] Time spent downloading weights for unsloth/llama-3.2-3b-instruct-unsloth-bnb-4bit: 6.663931 seconds
INFO 04-06 18:28:02 [weight_utils.py:315] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-06 18:28:04 [punica_selector.py:18] Using PunicaWrapperGPU.
INFO 04-06 18:28:05 [model_runner.py:1146] Model loading took 2.2968 GB and 11.849288 seconds
INFO 04-06 18:28:13 [worker.py:267] Memory profiling takes 7.95 seconds
INFO 04-06 18:28:13 [worker.py:267] the current vLLM instance can use total_gpu_memory (39.56GiB) x gpu_memory_utilization (0.69) = 27.37GiB
INFO 04-06 18:28:13 [worker.py:267] model weights take 2.30GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.49GiB; the rest of the memory reserved for KV Cache is 23.50GiB.
INFO 04-06 18:28:14 [executor_base.py:111] # cuda blocks: 13748, # CPU blocks: 3510
INFO 04-06 18:28:14 [executor_base.py:116] Maximum concurrency for 2048 tokens per request: 107.41x
INFO 04-06 18:28:18 [model_runner.py:1442] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. 

Capturing CUDA graph shapes: 100%|██████████| 43/43 [01:13<00:00,  1.70s/it]

INFO 04-06 18:29:31 [model_runner.py:1570] Graph capturing finished in 73 secs, took 0.70 GiB
INFO 04-06 18:29:31 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 86.23 seconds





tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

Not an error, but Unsloth cannot patch Attention layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Not an error, but Unsloth cannot patch O projection layer with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.3.19 patched 28 layers with 0 QKV layers, 0 O layers and 28 MLP layers.


trainable params: 15,138,816 || all params: 3,227,888,640 || trainable%: 0.4690


In [14]:
def format_reward(completions, **kwargs):
    """Reward function that checks if the completion has a specific format."""
    pattern = r"^<think>.*?</think>\s*<answer>.*?</answer>$"
    completion_contents = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, content) for content in completion_contents]
    return [1.0 if match else 0.0 for match in matches]

In [15]:
def accuracy_reward(completions, **kwargs):
    """Reward function that checks if the answer in completion has the ground truth and is equal."""
    solutions = kwargs["extracted_answer"]
    pattern = r"^<think>(.*?)</think>\s*<answer>(.*?)</answer>$"
    completion_contents = [completion[0]["content"] for completion in completions]
    rewards = []
    for content, solution in zip(completion_contents, solutions):
        gold_parsed = solution
        matches = re.search(pattern, content)
        if matches:
            if len(matches.groups()) == 2:
                answer_parsed = parse(matches.group(2))
            else:
                answer_parsed = None
        else:
            answer_parsed = None
        if len(gold_parsed) != 0:
            try:
                if answer_parsed is None:
                    rewards.append(0.0)
                else:
                    rewards.append(float(verify(answer_parsed, gold_parsed)))
            except Exception:
                rewards.append(0.0)
        else:
            rewards.append(1.0)
    return rewards

In [16]:
def similarity_reward(completions, **kwargs):
    """Reward function that ensures the answer is similar to the ground truth."""
    reasons = kwargs["reasons"]
    pattern = r"^<think>(.*?)</think>\s*<answer>(.*?)</answer>$"
    completion_contents = [completion[0]["content"] for completion in completions]
    rewards = []
    for content, reason in zip(completion_contents, reasons):
        matches = re.search(pattern, content)
        if matches:
            if len(matches.groups()) == 2:
                answer_parsed = parse(matches.group(2))
            else:
                answer_parsed = None
        else:
            answer_parsed = None
        if answer_parsed is None:
            rewards.append(0.0)
        else:
            try:
                bleu = BLEU.compute(predictions=[content], references=[reason])
                rewards.append(bleu["bleu"])
            except Exception:
                rewards.append(0.0)
    return rewards

In [17]:
training_args = GRPOConfig(
    use_vllm=True,
    output_dir=MODEL_NAME.split("/")[1],
    learning_rate=5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "paged_adamw_8bit",
    remove_unused_columns=False,
    gradient_accumulation_steps=4,
    num_train_epochs=NUM_EPOCHS,
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_checkpointing=True,
    max_completion_length=MAX_NEW_TOKENS,
    num_generations=8,
    max_prompt_length=4*MAX_SEQ_LEN,
    max_grad_norm = 0.1,
    report_to=["tensorboard"],
    logging_steps=10,
    push_to_hub=False,
    save_strategy="steps",
    save_steps=10,
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 4 to the `num_generations` of 8


In [18]:
trainer = GRPOTrainer(
    model=model,
    reward_funcs=[format_reward, accuracy_reward, similarity_reward],
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer
)

In [19]:
trainer.train(resume_from_checkpoint=True)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 5,988 | Num Epochs = 2 | Total steps = 2,994
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 15,138,816/3,000,000,000 (0.50% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,reward,reward_std,completion_length,kl,rewards / format_reward,rewards / accuracy_reward,rewards / similarity_reward
280,0.0009,0.428751,0.581768,355.9875,0.023371,0.303125,0.096875,0.028751
290,0.0009,0.439368,0.580194,413.94375,0.021318,0.309375,0.09375,0.036243
300,0.0009,0.598927,0.668555,339.215625,0.022324,0.428125,0.128125,0.042677
310,0.0014,0.733823,0.720549,310.415625,0.034416,0.5125,0.16875,0.052573
320,0.0014,0.730508,0.752642,340.259375,0.035049,0.490625,0.175,0.064883
330,0.0014,0.715938,0.695526,356.271875,0.034786,0.475,0.196875,0.044063
340,0.0013,0.785564,0.626288,361.225,0.032541,0.56875,0.153125,0.063689
350,0.0013,0.775679,0.694414,379.140625,0.033735,0.55,0.153125,0.072554
360,0.0013,0.761212,0.689308,342.565625,0.032527,0.546875,0.15,0.064337
370,0.0013,0.864178,0.647814,302.4625,0.033142,0.64375,0.159375,0.061053


KeyboardInterrupt: 

In [20]:
trainer.save_model(MODEL_NAME.split("/")[1] + "-Reasoning")

In [21]:
!cp -r Llama-3.2-3B-Instruct-Reasoning "/content/drive/MyDrive/Graduate Project/"
!cp -r Llama-3.2-3B-Instruct "/content/drive/MyDrive/Graduate Project/"