### Import our useful libraries

In [1]:
# Load necessary libraries for GRPO
import unsloth

from trl import GRPOConfig, GRPOTrainer
from transformers import TrainingArguments,GenerationConfig
from unsloth import is_bfloat16_supported
from unsloth import FastLanguageModel, FastModel

import torch

from evaluate import load
from tqdm import tqdm
import json
import yaml

print("GRPO libraries loaded successfully!")

  from .autonotebook import tqdm as notebook_tqdm


ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
GRPO libraries loaded successfully!


### Define useful functions

In [2]:
# Define some useful functions

def load_secrets(path="secrets.yaml"):
    with open(path, "r") as f:
        return yaml.safe_load(f)


### Let's define some model that we want to test!
Keep an eye on newer models that might be interesting to try! hehe

In [3]:
# Those can be specified if we use the transformers library

model_names_transformers = {
    "gemma-2": "google/gemma-2-2b-it",
    "gemma-3-270m": "google/gemma-3-270m-it",
    "gemma-3-1B": "google/gemma-3-1b-it",
    "qwen": "Qwen/Qwen3-0.6B",
    "falcon": "tiiuae/Falcon-H1-1B-Base", # mmh dunno if you should try it!
}

model_names_unsloth = {
    "gemma-2": "unsloth/gemma-2-2b-it",
    "gemma-3-270m": "unsloth/gemma-3-270m-it",
    "gemma-3-1B": "unsloth/gemma-3-1b-it",
    "qwen": "unsloth/Qwen3-0.6B",
    "falcon": "unsloth/Falcon-H1-0.5B-Instruct",
}

In [4]:
try:
    secrets = load_secrets()
    hf_token = secrets["HF_TOKEN"]
    print("Your token has been correctly loaded!")

except [KeyError, FileNotFoundError]:
    print("Error: Have you created a secrets.yaml file yet??")

Your token has been correctly loaded!


## Let's use unsloth library to load our model!

In [63]:
MAX_SEQ_LENGTH = 2048
SEED = 42

We load it the model with quantization

In [None]:
model, tokenizer = FastModel.from_pretrained(
    model_name = model_names_unsloth["gemma-3-270m"],
    max_seq_length = MAX_SEQ_LENGTH, # Choose any for long context!
    load_in_4bit = False,  # 4 bit quantization to reduce memory
    load_in_8bit = True, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)

==((====))==  Unsloth 2025.10.1: Fast Gemma3 patching. Transformers: 4.56.2.
   \\   /|    NVIDIA GeForce RTX 4070 Ti SUPER. Num GPUs = 1. Max memory: 15.591 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Gemma3 does not support SDPA - switching to fast eager.


We prepare the model for LoRa fine-tuning

In [None]:
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules       = True,  # SHould leave on always!

    r = 8,           # Larger = higher accuracy, but might overfit
    lora_alpha = 8,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = SEED,
)

Unsloth: Making `model.base_model.model.model` require gradients


In [45]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

In [46]:
messages = [
    {
        "role" :  "user",   # <--- user query
        "content" : "You are playing a game against another user A. You must choose either action1 or action2. Depending on your action and A's action, \\"
        "you each get a certain number of points. The goal is to score the highest number of points. The points are awarded as follows: \n \\"
        "If you play action1 and A plays action1, you get 3 and A gets 3. \\"
        "If you play action1 and A plays action2, you get 0 and A gets 5. \\"
        "If you play action2 and A plays action1, you get 5 and A gets 0. \\"
        "If you play action2 and A plays action2, you get 1 and A gets 1. \n \\"
        "You have played with this opponent before. Last time, you played action1 and A played action2, so you got 0 points and A got 5 points. "
        "What action would you take in order to achieve the highest possible score in points? \\"
        "Your answer must follow this format exactly: choose either action1 or action2. Do not explain your reasoning. \\"
        "Your answer:"
        # "content" : prompt
    }
]

In [62]:
inputs = tokenizer.apply_chat_template(messages,          # or this sequence if you want inputs to have the attention mask
                                       tokenize=False,
                                       add_generation_prompt=True
                                       )

inputs = tokenizer(inputs, return_tensors="pt").to(model.device)

input_ids, attention_mask = inputs.input_ids, inputs.attention_mask

output_ids = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=256,
    pad_token_id=tokenizer.pad_token_id
)
output = tokenizer.decode(output_ids[0][input_ids.shape[-1]:],
                          skip_special_tokens=True)

# print("\nPROMPT:", prompt)
print("\nOUTPUT:", output)


OUTPUT: Choose action1.



In [64]:
MAX_PROMPT_LENGTH = 256

from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_torch_fused",
    logging_steps = 1,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_generations = 4, # Decrease if out of memory
    max_prompt_length = MAX_PROMPT_LENGTH,
    max_completion_length = MAX_SEQ_LENGTH - MAX_PROMPT_LENGTH,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 50,
    save_steps = 50,
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    output_dir = "outputs",
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 4


In [75]:
trainer = GRPOTrainer(
    model = model,
    reward_funcs=lambda x:x,
    processing_class = tokenizer,
    args = training_args,
)

In [99]:
def get_output(trainer):

    inputs = tokenizer.apply_chat_template(messages,          # or this sequence if you want inputs to have the attention mask
                                       tokenize=False,
                                       add_generation_prompt=True
                                       )
    
    raw_inputs = inputs

    inputs = tokenizer(inputs, return_tensors="pt").to(model.device)

    input_ids, attention_mask = inputs.input_ids, inputs.attention_mask

    output_ids = trainer.model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=256,
        pad_token_id=tokenizer.pad_token_id
    )
    output = tokenizer.decode(output_ids[0][input_ids.shape[-1]:],
                            skip_special_tokens=True)
    
    return raw_inputs, output

def get_reward(output):
    return -1

In [109]:
for episode in range(1):
    batch_prompt = []
    batch_output = []
    batch_reward = []
    for t in range(5):

        prompt, output = get_output(trainer)
        batch_prompt.append(prompt)
        batch_output.append(output)

        reward = get_reward(output)
        batch_reward.append(reward)
    
    batch = {
        "prompt" : batch_prompt,
        "completion" : batch_output,
        "reward" : batch_reward
    }

    trainer.step(batch)



AttributeError: 'UnslothGRPOTrainer' object has no attribute 'step'

# Unlike PPO, GRPO doesn't have the step method implemented! this might be a problem for us!