# Imports

In [None]:
# Keep unsloth first: it patches the training stack on import (see its startup banner),
# so it should be loaded before trl.
from unsloth import FastLanguageModel
import torch
from trl import GRPOConfig, GRPOTrainer

# Notebook cells execute as `__main__` with no parent package, so a relative import
# (`from .grpo import ...`) raises "attempted relative import with no known parent
# package". Import the sibling module absolutely instead.
# NOTE(review): assumes grpo.py sits in the notebook's working directory / on sys.path.
from grpo import (
    get_training_dataset,
    optimal_solution_reward_func,
    improvement_reward_func,
    valid_response_reward_func,
    strict_format_reward_func,
    soft_format_reward_func,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-03 10:26:13 [__init__.py:239] Automatically detected platform cuda.


2025-04-03 10:26:13,668	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


# Load Model

In [None]:
# Token budget the model is loaded with. The GRPO config in this notebook allows
# 2048 prompt tokens + 2048 completion tokens, so the model must accept 4096 tokens;
# the previous value (4048) was smaller than that sum.
MAX_SEQ_LENGTH = 4096
# Single source of truth for the LoRA rank: used for both `max_lora_rank` at load
# time and `r` when attaching the adapter, so the two values cannot drift apart.
LORA_RANK = 64

# Load model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-14B-Instruct-bnb-4bit",
    max_seq_length = MAX_SEQ_LENGTH, # Can increase for longer reasoning traces
    load_in_4bit = True,             # False for 16-bit LoRA
    fast_inference = True,           # Enable vLLM fast inference
    max_lora_rank = LORA_RANK,       # Larger rank = smarter, but slower
    gpu_memory_utilization = 0.5,    # Reduce if out of memory
    dtype = torch.bfloat16,
)

# Apply PEFT (attach LoRA adapters to the attention and MLP projections)
model = FastLanguageModel.get_peft_model(
    model,
    r = LORA_RANK,                              # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],                                          # Remove QKVO if out of memory
    lora_alpha = 64,
    use_gradient_checkpointing = "unsloth",     # Enable long context finetuning
    random_state = 3407,                        # Fixed seed for reproducibility
)

==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.50.3. vLLM: 0.8.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.381 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-14b-instruct-bnb-4bit with actual GPU utilization = 49.42%
Unsloth: Your GPU has CUDA compute capability 8.0 with VRAM = 39.38 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 224.
Unsloth: vLLM's KV Cache can use up to 8.36 GB. Also swap space = 6 GB.
INFO 04-03 10:26:40 [config.py:585] This model supports multiple tasks: {'classify', 'generate', 'score', 'reward', 'embed'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbytes config 

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.16it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.20it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.19it/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.09it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.13it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.12it/s]


INFO 04-03 10:26:46 [punica_selector.py:18] Using PunicaWrapperGPU.





INFO 04-03 10:26:46 [model_runner.py:1146] Model loading took 9.8082 GB and 4.456978 seconds
INFO 04-03 10:26:48 [worker.py:267] Memory profiling takes 1.44 seconds
INFO 04-03 10:26:48 [worker.py:267] the current vLLM instance can use total_gpu_memory (39.38GiB) x gpu_memory_utilization (0.49) = 19.46GiB
INFO 04-03 10:26:48 [worker.py:267] model weights take 9.81GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.24GiB; the rest of the memory reserved for KV Cache is 8.32GiB.
INFO 04-03 10:26:48 [executor_base.py:111] # cuda blocks: 2840, # CPU blocks: 2048
INFO 04-03 10:26:48 [executor_base.py:116] Maximum concurrency for 2048 tokens per request: 22.19x
INFO 04-03 10:26:51 [model_runner.py:1442] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreas

Capturing CUDA graph shapes: 100%|██████████| 31/31 [00:25<00:00,  1.24it/s]

INFO 04-03 10:27:16 [model_runner.py:1570] Graph capturing finished in 25 secs, took 0.94 GiB
INFO 04-03 10:27:16 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 30.24 seconds



Unsloth 2025.3.19 patched 48 layers with 48 QKV layers, 48 O layers and 48 MLP layers.


# Load Dataset

In [None]:
# Read the training prompts from the JSON file and report how many examples were loaded
training_dataset_path = "tsp_training_prompt_dataset.json"
dataset = get_training_dataset(training_dataset_path)
num_examples = len(dataset)
print(num_examples)

# Reward Functions

In [4]:
# Reward functions handed to the GRPO trainer; each one is logged as its own
# "rewards / <name>" column in the training output.
reward_funcs = [
    optimal_solution_reward_func,
    improvement_reward_func,
    valid_response_reward_func,
    strict_format_reward_func,
    soft_format_reward_func,
]

# GRPO

In [None]:
# GRPO training configuration, grouped by concern
training_args = GRPOConfig(
    # --- Generation / sampling ---
    use_vllm = True,                 # Generate completions with vLLM
    num_generations = 8,             # Decrease if out of memory
    temperature = 0.7,
    max_prompt_length = 2048,
    max_completion_length = 2048,
    beta = 0.0,

    # --- Optimizer and schedule ---
    optim = "adamw_8bit",
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    max_grad_norm = 0.1,
    lr_scheduler_type = "cosine",
    warmup_ratio = 0.1,

    # --- Precision ---
    bf16 = True,
    fp16 = False,

    # --- Batching and run length ---
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_train_epochs = 1,
    max_steps = len(dataset),        # A positive max_steps takes precedence over num_train_epochs

    # --- Logging and checkpoints ---
    logging_steps = 1,
    save_steps = 250,
    report_to = "wandb",             # Log metrics to Weights & Biases
    output_dir = "outputs/Qwen2.5-14B-Instruct-GRPO",
)

# Assemble the GRPO trainer from the model, tokenizer, rewards, config and dataset
trainer = GRPOTrainer(
    args = training_args,
    model = model,
    processing_class = tokenizer,
    train_dataset = dataset,
    reward_funcs = reward_funcs,
)

# Kick off training
trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 810 | Num Epochs = 1 | Total steps = 810
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 275,251,200/14,000,000,000 (1.97% trained)
Unsloth: Input IDs of length 2049 > the model's max sequence length of 2048.
We shall truncate it ourselves. It's imperative if you correct this issue first.


Step,Training Loss,reward,reward_std,completion_length,kl,rewards / optimal_solution_reward_func,rewards / improvement_reward_func,rewards / valid_response_reward_func,rewards / strict_format_reward_func,rewards / soft_format_reward_func
1,0.0,3.75,3.105295,888.0,0.0,1.25,1.25,0.625,0.3125,0.3125
2,-0.0,4.125,1.552648,815.125,0.0,0.5,1.75,0.875,0.5,0.5
3,0.0,0.0,0.0,565.0,0.020477,0.0,0.0,0.0,0.0,0.0
4,-0.0,1.75,1.908627,941.5,0.919408,0.0,0.75,0.375,0.3125,0.3125
5,0.0,2.875,1.246423,769.125,0.000187,0.0,1.0,0.875,0.5,0.5
6,0.0,0.0,0.0,570.0,0.014254,0.0,0.0,0.0,0.0,0.0
7,0.0,6.0,0.0,597.375,0.000233,2.0,2.0,1.0,0.5,0.5
8,0.0,4.5,0.92582,924.125,0.000228,0.5,2.0,1.0,0.5,0.5
9,-0.0,2.0,2.13809,917.625,0.027948,0.0,1.0,0.5,0.25,0.25
10,0.0,0.0,0.0,388.0,0.013833,0.0,0.0,0.0,0.0,0.0


Unsloth: Will smartly offload gradients to save VRAM!
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/u/jfahy/.conda/envs/unsloth_env/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_519205/2628383760.py", line 39, in <module>
    trainer.train()
  File "/u/jfahy/.conda/envs/unsloth_env/lib/python3.10/site-packages/transformers/trainer.py", line 2245, in train
    return inner_training_loop(
  File "<string>", line 310, in _fast_inner_training_loop
  File "<string>", line 25, in _unsloth_training_step
  File "/scratch/bchk/jfahy/final_project/unsloth_compiled_cache/UnslothGRPOTrainer.py", line 958, in _prepare_inputs
    outputs = self.llm.generate(all_prompts_text, sampling_params=self.sampling_params, use_tqdm=False, lora_request = self.model.load_lora('grpo_trainer_lora_model', load_tensors = True))
  File "/u/jfahy/.conda/envs/unsloth_env/lib/python3.10/site-packages/vllm/utils.py", line 1072, in inner
    return fn(*args, **

# Save Model

In [None]:
# Save the LoRA adapter under this name/path
lora_save_path = "Qwen2.5-14B-Instruct-GRPO"
model.save_lora(lora_save_path)