
## 1. Reinforcement Learning on Math Dataset


In [1]:
%%capture
!pip install datasets trl peft bitsandbytes==0.45.0 
!pip install --upgrade pip
!pip install unsloth unsloth_zoo --no-cache-dir --upgrade
!pip install vllm
!pip install --upgrade pillow

### 1.1. Data

In [6]:
from datasets import load_dataset


def extract_hash_answer(text):
  # the final answer comes after ####
  if "####" not in text:
    return None

  return text.split("####")[1].strip()

system_prompt = (
    "You are an expert in solving high school math word problems. "
    "You first think through the reasoning process step-by-step in your mind and then provide the answer."
)

user_prompt= (
  "Solve the following question:\n{question}.\n"
  "Show your work in <think> </think> tags. And return the final answer as an integer in <answer> </answer> tags."
)

def format_prompt(tokenizer,sample):
  prompt=[
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": user_prompt.format(question=sample["question"])},
      {"role": "assistant", "content": "Let me solve this step by step.\n<think>"}
  ]
  return {
      "prompt": tokenizer.apply_chat_template(prompt,tokenize=False, continue_final_message=True),
      "target": extract_hash_answer(sample["answer"]),
  }

def get_questions(tokenizer):
  """ To train with GRPO, each data sample needs to have fields "prompt" and "answer"
      Given dataset: "question" and "answer"
  """
  ds = load_dataset("openai/gsm8k", "main")
  ds = ds.map(lambda x: format_prompt(tokenizer,x))
  # Optional: remove redundant columns
  # ds = ds.remove_columns(['question', 'answer'])
  return ds

# extract answer from the model to verify correctness
def extract_xml_answer(text):
  if "<answer>" not in text or "</answer>" not in text:
    return None
  answer=text.split("<answer>")[1].strip()    # strip white spaces (\ or \n) both front and back
  answer=answer.split("</answer>")[0].strip()
  return answer



In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")

dataset=get_questions(tokenizer)
train_dataset, test_dataset=dataset["train"], dataset["test"]

### 1.2 Reward functions

In [4]:
import re

def format_reward(completions, target, **kwargs):
    """
    Format: <think>...</think><answer>...</answer>
    Args:
        completions (list[str]): Generated outputs
        target (list[str]): Expected answers

    Returns:
        list[float]: Reward scores
    """
    rewards=[]
    for completion,gt_ans in zip(completions,target):
      try: # add synthetic <think> as it is part of the prompt
        completion = "<think>"+completion
        # define regular expression to check if the format is correct
        # allow nested tags like <think><other_tag>...</other_tag></think> but not nested <think> or </think> 
        regex = r"^<think>([^<]*(?:<(?!/?think>)[^<]*)*)<\/think>\n<answer>([\s\S]*?)<\/answer>$"

        match=re.search(regex,completion,re.DOTALL)
        if match is None or len(match.groups())!=2:
          rewards.append(0.0)
        else: 
          # (Optional) print to monitor model's response
          # print('-'*100, f"\nModel Response:\n{completion}", f"\n#### Answer: {gt_ans}")
          rewards.append(1.0)
      except Exception:
        rewards.append(0.0)

    return rewards


def correctness_reward(completions,target,**kwargs) -> list[float]:
  """
    Args:
        completions (list[str]): Generated outputs
        target (list[str]): Expected answers, each is an integer, but read a string

    Returns:
        list[float]: Reward scores
  """
  rewards=[]
  for completion, gt_ans in zip(completions,target):
    model_ans=extract_xml_answer(completion)
    # Check if model_ans is None (i.e., no <answer> tag found)
    if model_ans is None:
        rewards.append(0.0)  # No answer found, so reward is 0
        continue
    try:
        # Compare the model's answer to the ground truth
        if abs(float(model_ans) - float(gt_ans)) < 1e-5:
            rewards.append(1.0)
        else:
            rewards.append(0.0)
    except Exception: # Handle cases where model_ans or gt_ans cannot be converted to float
        rewards.append(0.0)
  return rewards

### 1.3 Load model with unsloth

In [5]:
from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import GRPOConfig, GRPOTrainer
import torch


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen2.5-3B-Instruct",
    max_seq_length = 1024, # Can increase for longer reasoning traces
    load_in_4bit = True,   # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    gpu_memory_utilization = 0.6, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16,      # Larger rank = smarter, but slower. Suggested values 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = 32,
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)




Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel, is_bfloat16_supported


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-13 04:14:10 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.9: Fast Qwen2 patching. Transformers: 4.49.0. vLLM: 0.7.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.168 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit with actual GPU utilization = 59.39%
Unsloth: Your GPU has CUDA compute capability 8.9 with VRAM = 22.17 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 1024. Num Sequences = 224.
Unsloth: vLLM's KV Cache can use up to 10.75 GB. Al



INFO 03-13 04:14:18 weight_utils.py:254] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-13 04:14:20 model_runner.py:1115] Loading model weights took 2.2160 GB
INFO 03-13 04:14:20 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-13 04:14:22 worker.py:267] Memory profiling takes 2.25 seconds
INFO 03-13 04:14:22 worker.py:267] the current vLLM instance can use total_gpu_memory (22.17GiB) x gpu_memory_utilization (0.59) = 13.17GiB
INFO 03-13 04:14:22 worker.py:267] model weights take 2.22GiB; non_torch_memory takes 0.04GiB; PyTorch activation peak memory takes 1.23GiB; the rest of the memory reserved for KV Cache is 9.68GiB.
INFO 03-13 04:14:22 executor_base.py:111] # cuda blocks: 17617, # CPU blocks: 10922
INFO 03-13 04:14:22 executor_base.py:116] Maximum concurrency for 1024 tokens per request: 275.27x
INFO 03-13 04:14:28 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error

Capturing CUDA graph shapes: 100%|██████████| 31/31 [00:26<00:00,  1.16it/s]

INFO 03-13 04:14:55 model_runner.py:1562] Graph capturing finished in 27 secs, took 4.41 GiB
INFO 03-13 04:14:55 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 35.04 seconds



Unsloth 2025.3.9 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


### 1.4 Train

In [6]:
training_args = GRPOConfig(
    use_vllm = True, # use vLLM for fast inference!
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "paged_adamw_8bit",
    logging_steps = 4,                  # log information every 4 steps
    bf16 = is_bfloat16_supported(),     # determine the data type used for training
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,  # Increase to 4 for smoother training
    num_generations = 2,              # Decrease if out of memory
    max_prompt_length = 256,
    max_completion_length = 256,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 400,
    save_steps = 400,
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    output_dir = "grpo_lora",
)

trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        correctness_reward,
        format_reward,
    ],
    args = training_args,
    train_dataset = train_dataset,
)
trainer.train()

# save model and tokenizer
trainer.model.save_lora(training_args.output_dir)
trainer.tokenizer.save_pretrained(training_args.output_dir)

# delete model, trainer and free cache
del model, trainer
torch.cuda.empty_cache()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1 | Total steps = 400
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 29,933,568/1,830,055,936 (1.64% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,reward,reward_std,completion_length,kl,rewards / correctness_reward,rewards / format_reward
4,0.0,0.75,0.618718,212.5,0.000245,0.34375,0.40625
8,0.0,1.25,0.441942,184.625,0.000461,0.5625,0.6875
12,0.0,1.125,0.53033,202.125,0.00046,0.5625,0.5625
16,0.0,1.15625,0.839689,202.1875,0.000456,0.5,0.65625
20,0.0,1.0625,0.53033,196.5625,0.00048,0.53125,0.53125
24,0.0,1.4375,0.353553,175.125,0.000553,0.53125,0.90625
28,0.0,1.5,0.441942,159.25,0.000518,0.71875,0.78125
32,0.0,1.125,0.53033,183.875,0.000526,0.5,0.625
36,0.0,1.0,0.265165,181.0625,0.000593,0.375,0.625
40,0.0,0.875,0.53033,218.5625,0.000526,0.4375,0.4375


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


### 1.5 Inference

In [2]:
import torch
import unsloth

from transformers import AutoModelForCausalLM, AutoTokenizer
from unsloth import FastLanguageModel, is_bfloat16_supported

# Load the fine-tuned model and tokenizer

def load_model_tokenizer_ft(base_model_id, adapter_path):
  """ base_model_id = id of the based model on HF
      adapter_path = saved lora weights
  """
  # Load the base model
  model, _ = FastLanguageModel.from_pretrained(
      model_name=base_model_id,
      max_seq_length=1024,         # Same as during training
      load_in_4bit=True,           # Load in 4-bit for memory efficiency
      fast_inference=True,         # Enable fast inference
      gpu_memory_utilization=0.8,  # Adjust based on your GPU memory
  )

  # Apply LoRA weights (need to be the same as before)
  model = FastLanguageModel.get_peft_model(
      model,
      r=16,  # Same rank as during training
      target_modules=[
          "q_proj", "k_proj", "v_proj", "o_proj",
          "gate_proj", "up_proj", "down_proj",
      ],
      lora_alpha=32,  # Same alpha as during training
      use_gradient_checkpointing="unsloth",  # Enable gradient checkpointing if needed
      random_state=3407,  # Same random state as during training
  )
  # Load the saved LoRA weights
  model.load_adapter(adapter_path,adapter_name="default")
  # Set the model to evaluation mode
  model.eval()

  # Load the tokenizer
  tokenizer = AutoTokenizer.from_pretrained(adapter_path)
  return model, tokenizer


# load model and tokenizer
# base_model_id = "Qwen/Qwen2.5-3B-Instruct"
# adapter_path = "grpo_lora"  # Directory where LoRA weights and tokenizer are saved
# model,tokenizer=load_model_tokenizer_ft(base_model_id, adapter_path)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [10]:
def inference(data, i):
  """ Inference on the i-th sample of the test data"""

  # load model and tokenizer

  # prompt from test data
  prompt=data[i]["prompt"]
  target=data[i]["target"]

  # tokenize the prompt
  input_ids=tokenizer(prompt,return_tensors="pt").input_ids
  input_ids=input_ids.to(model.device)
  input_length=input_ids.shape[1]

  with torch.no_grad():
    output_ids = model.generate(
        input_ids,
        max_length=512,  # Adjust based on your needs
        temperature=0.1,  # Controls randomness (lower = more deterministic)
        top_p=0.9,        # Take the top logits with probs summing up to 0.9
        do_sample=True
    )

  output_ids_generate = output_ids[:, input_length:]
  outputs_text=tokenizer.decode(output_ids_generate[0], skip_special_tokens=True)
  full_text=prompt+outputs_text
  print(full_text)
  print("Target: ",target)


In [14]:
inference(test_dataset, 200)

<|im_start|>system
You are an expert in solving high school math word problems. You first think through the reasoning process step-by-step in your mind and then provide the answer.<|im_end|>
<|im_start|>user
Solve the following question:
Baldur gets water from a well. He gets 5 pails of water every morning and 6 pails of water every afternoon. If each pail contains 5 liters of water, how many liters of water does he get every day?.
Show your work in <think> </think> tags. And return the final answer as an integer in <answer> </answer> tags.<|im_end|>
<|im_start|>assistant
Let me solve this step by step.
<think> Baldur gets 5 pails of water in the morning and 6 pails in the afternoon. Each pail contains 5 liters of water. To find out how many liters of water he gets every day, I need to first calculate the total number of pails he gets in a day, and then multiply that by the number of liters each pail contains. </think>
To find the total number of pails Baldur gets in a day, I add the p

## 2. Further instruction finetuning

We will further finetune the model train with GRPO on a dataset by deepseek.

### 2.1 Dataset

In [1]:

import torch
import unsloth

from datasets import load_dataset
from datasets.arrow_dataset import Dataset
from trl import SFTConfig, SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer
from unsloth import FastLanguageModel, is_bfloat16_supported

def load_model_tokenizer_ft(base_model_id, adapter_path):
  """ base_model_id = id of the based model on HF
      adapter_path = saved lora weights
  """
  # Load the base model
  model, _ = FastLanguageModel.from_pretrained(
      model_name=base_model_id,
      max_seq_length=1024,         # Same as during training
      load_in_4bit=True,           # Load in 4-bit for memory efficiency
      fast_inference=True,         # Enable fast inference
      gpu_memory_utilization=0.4,  # Adjust based on your GPU memory
  )

  # Apply LoRA weights (need to be the same as before)
  model = FastLanguageModel.get_peft_model(
      model,
      r=16,  # Same rank as during training
      target_modules=[
          "q_proj", "k_proj", "v_proj", "o_proj",
          "gate_proj", "up_proj", "down_proj",
      ],
      lora_alpha=32,  # Same alpha as during training
      use_gradient_checkpointing="unsloth",  # Enable gradient checkpointing if needed
      random_state=3407,  # Same random state as during training
  )
  # Load the saved LoRA weights
  model.load_adapter(adapter_path,adapter_name="default")
  # Set the model to evaluation mode
  model.eval()

  # Load the tokenizer
  tokenizer = AutoTokenizer.from_pretrained(adapter_path)
  return model, tokenizer
  
# load model and tokenizer
base_model_id = "Qwen/Qwen2.5-3B-Instruct"
adapter_path = "grpo_lora"  # Directory where LoRA weights and tokenizer are saved
model,tokenizer=load_model_tokenizer_ft(base_model_id, adapter_path)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-13 10:23:51 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.9: Fast Qwen2 patching. Transformers: 4.49.0. vLLM: 0.7.3.
   \\   /|    NVIDIA L40S. Num GPUs = 1. Max memory: 44.527 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit with actual GPU utilization = 39.59%
Unsloth: Your GPU has CUDA compute capability 8.9 with VRAM = 44.53 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 1024. Num Sequences = 256.
Unsloth: vLLM's KV Cache can use up to 15.21 GB. 



INFO 03-13 10:24:00 weight_utils.py:254] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-13 10:24:01 model_runner.py:1115] Loading model weights took 2.2160 GB
INFO 03-13 10:24:02 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-13 10:24:04 worker.py:267] Memory profiling takes 2.15 seconds
INFO 03-13 10:24:04 worker.py:267] the current vLLM instance can use total_gpu_memory (44.53GiB) x gpu_memory_utilization (0.40) = 17.63GiB
INFO 03-13 10:24:04 worker.py:267] model weights take 2.22GiB; non_torch_memory takes 0.08GiB; PyTorch activation peak memory takes 1.41GiB; the rest of the memory reserved for KV Cache is 13.92GiB.
INFO 03-13 10:24:04 executor_base.py:111] # cuda blocks: 25346, # CPU blocks: 10922
INFO 03-13 10:24:04 executor_base.py:116] Maximum concurrency for 1024 tokens per request: 396.03x
INFO 03-13 10:24:10 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory erro

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:32<00:00,  1.07it/s]

INFO 03-13 10:24:43 model_runner.py:1562] Graph capturing finished in 33 secs, took 5.08 GiB
INFO 03-13 10:24:43 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 41.22 seconds



Unsloth 2025.3.9 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [2]:
system_prompt = (
    "You a medical expert with advanced knowledge in clinical reasoning, diagonstics, and treatment planning."
    "You first think through the reasoning process step-by-step in your mind and then provide the user with the answer."
)

user_prompt= (
    "Below is a question that describes the condition of a patient. Write a response that appropriately answers the question."
    "Show your reasoning in <think> </think> tags. And return the final response in <answer> </answer> tags.\n"
    "###Question###:\n{question}\n"
    "###Response###:\n<think>"
)

def format_sample(sample):
    question = sample["Question"]
    cot = sample["Complex_CoT"]
    response = sample["Response"]

    # Format the prompt using the tokenizer's chat template
    formatted_prompt = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt.format(question=question)},
            {"role": "assistant", "content": f"{cot}\n</think>\n<answer>\n{response}\n</answer>\n"},
        ],
        tokenize=False,  # Do not tokenize yet
        continue_final_message=True
    )

    return {"prompt": formatted_prompt} 


def gen_dataset(num_samples=10000):
    """ Return:
            A generator on train data "train_gen"
    """
    dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split = "train",trust_remote_code=True)

    # take 2000 data, shuffle with a fixed seed for reproducibility
    dataset = dataset.shuffle(seed=42).select(range(num_samples))
    dataset=dataset.map(format_sample)

    return dataset


dataset=gen_dataset()
# split train and test data
dataset=dataset.train_test_split(test_size=0.1)
ds_train, ds_test=dataset["train"], dataset["test"]

In [3]:
# sanity check
sample=ds_train[0]
print("Keys: ",sample.keys())
print(sample)

Keys:  dict_keys(['Question', 'Complex_CoT', 'Response', 'prompt'])
{'Question': 'What drug is used specifically for the reversal of cerebral vasospasm and infarct following a subarachnoid hemorrhage?', 'Complex_CoT': "Okay, so let's think about subarachnoid hemorrhage first. It's when there's bleeding around the brain, and I know that it brings a set of complications along with it. One major issue is cerebral vasospasm. This sounds pretty serious because it can actually lead to something worse, like a stroke or permanent brain damage. Keeping that in mind, it makes sense that we need to find a way to stop this vasospasm from happening. \n\nMoving on, what kind of treatment do we have? I remember reading that maintaining blood pressure and hydration is important, but those are more like general support measures. What about medications? They should have something specific for vasospasm. Hmm, there must be drugs that target the blood vessels directly to prevent them from constricting. \n

### 2.2 Load trained model, tokenizer and fine tune further


In [4]:
# Load model and tokenizer (aleady loaded!)
# base_model_id = "Qwen/Qwen2.5-3B-Instruct"
# adapter_path = "grpo_lora"  # Directory where LoRA weights and tokenizer are saved
# model,tokenizer=load_model_tokenizer_ft(base_model_id, adapter_path)

import os

# put model to train mode
model.train()
print(f"{sum(p.numel() for p in model.parameters() if p.requires_grad):,} trainable parameters")

training_args=SFTConfig(
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4, # gradient accumulation
    per_device_eval_batch_size=16, # should set to 8 for faster speed!!!
    logging_steps=50,              # log train loss every 50 steps
    evaluation_strategy="steps",   # Evaluate at the end of each epoch
    eval_steps=50,                 # Evaluate every 50 steps    
    num_train_epochs = 2,          # or set max_steps=200
    warmup_ratio=0.03,             # follow QLoRA paper
    learning_rate=3e-5,
    bf16=True,        # train in bf16 for higher precision
    optim="adamw_8bit",
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    output_dir="qwen-grpo-deepseek",
    dataset_text_field="prompt",
    report_to="none",
    seed=42, # for reproducibility
)


trainer = SFTTrainer(
    model=model,
    processing_class =tokenizer,
    train_dataset=ds_train,
    eval_dataset=ds_test,
    args=training_args,
)
os.environ['UNSLOTH_RETURN_LOGITS'] = '1'
trainer.train()

# save model and tokenizer
trainer.model.save_lora(training_args.output_dir)
trainer.tokenizer.save_pretrained(training_args.output_dir)


29,933,568 trainable parameters


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 9,000 | Num Epochs = 2 | Total steps = 562
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 29,933,568/1,830,055,936 (1.64% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
50,1.8303,1.377714
100,1.3195,1.29979
150,1.286,1.282762
200,1.2824,1.272671
250,1.2669,1.265567
300,1.2564,1.260582
350,1.2534,1.257197
400,1.2471,1.254936
450,1.2488,1.253613
500,1.2462,1.253012


Unsloth: Not an error, but Qwen2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


('qwen-grpo-deepseek/tokenizer_config.json',
 'qwen-grpo-deepseek/special_tokens_map.json',
 'qwen-grpo-deepseek/vocab.json',
 'qwen-grpo-deepseek/merges.txt',
 'qwen-grpo-deepseek/added_tokens.json',
 'qwen-grpo-deepseek/tokenizer.json')

In [None]:

# delete model, trainer and free cache
del model, trainer
torch.cuda.empty_cache()

### 2.3 Inference

In [8]:
def load_model_tokenizer_ft(base_model_id, adapter_path):
  """ base_model_id = id of the based model on HF
      adapter_path = saved lora weights
  """
  # Load the base model
  model, _ = FastLanguageModel.from_pretrained(
      model_name=base_model_id,
      max_seq_length=1024,         # Same as during training
      load_in_4bit=True,           # Load in 4-bit for memory efficiency
      fast_inference=True,         # Enable fast inference
      gpu_memory_utilization=0.6,  # Adjust based on your GPU memory
  )

  # Apply LoRA weights (need to be the same as before)
  model = FastLanguageModel.get_peft_model(
      model,
      r=16,  # Same rank as during training
      target_modules=[
          "q_proj", "k_proj", "v_proj", "o_proj",
          "gate_proj", "up_proj", "down_proj",
      ],
      lora_alpha=32,  # Same alpha as during training
      use_gradient_checkpointing="unsloth",  # Enable gradient checkpointing if needed
      random_state=3407,  # Same random state as during training
  )
  # Load the saved LoRA weights
  model.load_adapter(adapter_path,adapter_name="default")
  # Set the model to evaluation mode
  model.eval()

  # Load the tokenizer
  tokenizer = AutoTokenizer.from_pretrained(adapter_path)
  return model, tokenizer

# Load fine-tuned model and tokenizer 
base_model_id = "Qwen/Qwen2.5-3B-Instruct"
adapter_path = "qwen-grpo-deepseek"  # Directory where LoRA weights and tokenizer are saved
model,tokenizer=load_model_tokenizer_ft(base_model_id, adapter_path)


==((====))==  Unsloth 2025.3.9: Fast Qwen2 patching. Transformers: 4.49.0. vLLM: 0.7.3.
   \\   /|    NVIDIA L40S. Num GPUs = 1. Max memory: 44.527 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit with actual GPU utilization = 63.13%
Unsloth: Your GPU has CUDA compute capability 8.9 with VRAM = 44.53 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 1024. Num Sequences = 320.
Unsloth: vLLM's KV Cache can use up to 25.69 GB. Also swap space = 6 GB.
INFO 03-13 09:36:38 config.py:549] This model supports multiple tasks: {'classify', 'score', 'embed', 'reward', 'generate'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbytes config usin

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-13 09:36:40 model_runner.py:1115] Loading model weights took 2.2082 GB
INFO 03-13 09:36:41 worker.py:267] Memory profiling takes 0.59 seconds
INFO 03-13 09:36:41 worker.py:267] the current vLLM instance can use total_gpu_memory (44.53GiB) x gpu_memory_utilization (0.63) = 28.11GiB
INFO 03-13 09:36:41 worker.py:267] model weights take 2.21GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 1.74GiB; the rest of the memory reserved for KV Cache is 24.17GiB.
INFO 03-13 09:36:41 executor_base.py:111] # cuda blocks: 43991, # CPU blocks: 10922
INFO 03-13 09:36:41 executor_base.py:116] Maximum concurrency for 1024 tokens per request: 687.36x
INFO 03-13 09:36:42 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_

Capturing CUDA graph shapes: 100%|██████████| 43/43 [00:41<00:00,  1.03it/s]

INFO 03-13 09:37:23 model_runner.py:1562] Graph capturing finished in 42 secs, took 0.12 GiB
INFO 03-13 09:37:23 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 43.27 seconds





In [15]:
print(ds_test[0]['prompt'])

"<|im_start|>system\nYou a medical expert with advanced knowledge in clinical reasoning, diagonstics, and treatment planning.You first think through the reasoning process step-by-step in your mind and then provide the user with the answer.<|im_end|>\n<|im_start|>user\nBelow is a question that describes the condition of a patient. Write a response that appropriately answers the question.Show your reasoning in <think> </think> tags. And return the final response in <answer> </answer> tags.\n###Question###:\nA 40-year old man presented with acute onset pain and swelling of left great toe. On X-ray, punched out lytic lesion seen on phalanx with sclerotic margins and overhanging bony edges. Diagnosis is:(AIIMS November 2014, November 2013)\nA. Gout\nB. Rheumatoid arthritis\nC. Psoriatic arthritis\nD. Reiter's syndrome\n###Response###:\n<think><|im_end|>\n<|im_start|>assistant\nAlright, let's see what's going on here.\n\nSo, we've got a 40-year-old guy who suddenly starts feeling pain and no

In [7]:
# inference function used earlier
def inference(data, i):
  """ Inference on the i-th sample of the test data"""

  # prompt from test data: separate the answer
  sample=data[i]['prompt']
  question=sample.split("###Question###:")[1].strip()
  question=question.split("###Response###")[0].strip()

  # correct response (for comparison)
  correct_response="<think>"+sample.split("<|im_start|>assistant")[1]

  # add back <|im_start|>assistant for the model to generate
  prompt=sample.split("<|im_start|>assistant")[0]+"<|im_start|>assistant"

  # tokenize the prompt: set add_special_tokens=False because we already did it
  input_ids=tokenizer(prompt,return_tensors="pt", add_special_tokens=False).input_ids
  input_ids=input_ids.to(model.device)
  input_length=input_ids.shape[1]

  with torch.no_grad():
    output_ids = model.generate(
        input_ids,
        max_length=1024,  # Adjust based on your needs
        temperature=0.7,  # Controls randomness (lower = more deterministic)
        top_p=0.9,        # Take the top logits with probs summing up to 0.9
        do_sample=True
    )

  output_ids_generate = output_ids[:, input_length:]
  outputs_text=tokenizer.decode(output_ids_generate[0], skip_special_tokens=True)

  # prepend <think> token
  model_response="<think>"+outputs_text

  print("Question:\n", question)
  print("-"*100)
  print("Model response:\n", model_response)
  print("-"*100)
  print("Target response:\n", correct_response)
  print("-"*100)



inference(ds_test, 0)

Question:
 A 60-year-old man with a 1-year history of recurrent aspiration pneumonia, unconscious and gasping for air, dies despite resuscitative efforts. His autopsy shows degeneration of the corticospinal tracts, anterior horn cells of the upper cervical cord, and asymmetrical atrophy of limb muscles, diaphragm, and intercostal muscles. Based on this clinical presentation and autopsy findings, which drug would have most likely slowed the progression of his neurodegenerative disease?
----------------------------------------------------------------------------------------------------
Model response:
 <think>
Okay, so we have this 60-year-old guy who's been dealing with a lot of trouble with his lungs lately. He's been having these episodes of aspiration pneumonia, and he's been unconscious and gasping for air. Despite trying to help him, he died. Hmm, that's really sad. So, let's think about what's going on here. He has this degeneration in the corticospinal tracts, which are those pat