In [1]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer
# export WANDB_API_KEY="5a69225ea1d050c9c21f67c2db85febf61fa8fb1"

import wandb

wandb.login(key="5a69225ea1d050c9c21f67c2db85febf61fa8fb1")

2025-05-16 12:31:51.057686: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-16 12:31:51.179405: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-16 12:31:51.179437: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-16 12:31:51.180735: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-16 12:31:51.189160: I tensorflow/core/platform/cpu_feature_guar

True

In [2]:
dataset = load_dataset("mlabonne/smoltldr")

model_id = "HuggingFaceTB/SmolLM-135M-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    attn_implementation="eager"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

lora_config = LoraConfig(
    task_type = "CAUSAL_LM",
    r = 16, # rank of the surrogate matrices!
    lora_alpha = 32, # scale factor controlling the impact of the modifications
    target_modules = "all-linear" # applied to all linear transformations in the model
)
model = get_peft_model(model, lora_config)

In [15]:
dataset['test']

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 200
})

In [6]:
tokenizer

GPT2TokenizerFast(name_or_path='HuggingFaceTB/SmolLM-135M-Instruct', vocab_size=49152, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|im_start|>', 'eos_token': '<|im_end|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|im_end|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<repo_name>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<reponame>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	5: AddedToken("<file_sep>",

In [17]:
ideal_length = 50
def reward_function(completions, **kwargs):
    return [-abs(ideal_length - len(completion)) for completion in completions]

In [18]:
reward_function([dataset['test']['completion'][0]])

[-50]

In [11]:
training_args = GRPOConfig(
    output_dir = "GRPO",
    learning_rate = 2e-5,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 2,
    max_prompt_length = 512, 
    max_completion_length = 96,
    num_generations = 4, # i.e. number of competitive completions considered during optimization
    optim = "adamw_8bit",
    num_train_epochs = 1,
    bf16 = True, 
    report_to = ["wandb"],
    remove_unused_columns = False, 
    logging_steps = 1
)

In [19]:
trainer = GRPOTrainer(
    model=model,
    reward_funcs=[reward_function],
    args=training_args,
    train_dataset=dataset["test"],
)

wandb.init(project="GRPO")
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
1,0.3993
2,0.1197
3,0.157
4,0.1915
5,0.0
6,0.1276
7,0.1669
8,0.2683
9,0.0041
10,0.0156


TrainOutput(global_step=100, training_loss=0.1613956183724804, metrics={'train_runtime': 953.7072, 'train_samples_per_second': 0.21, 'train_steps_per_second': 0.105, 'total_flos': 0.0, 'train_loss': 0.1613956183724804})

In [20]:
merged_model = trainer.model.merge_and_unload()

In [21]:
merged_model.push_to_hub(
    "SmolGRPO-135M", private=False, tags=["GRPO", "Reasoning-Course"]
)

model.safetensors:   0%|          | 0.00/538M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/saracandu/SmolGRPO-135M/commit/57fc93bbb852288354dbf2838bc1bb43e7d0aa57', commit_message='Upload LlamaForCausalLM', commit_description='', oid='57fc93bbb852288354dbf2838bc1bb43e7d0aa57', pr_url=None, repo_url=RepoUrl('https://huggingface.co/saracandu/SmolGRPO-135M', endpoint='https://huggingface.co', repo_type='model', repo_id='saracandu/SmolGRPO-135M'), pr_revision=None, pr_num=None)

In [22]:
tokenizer.push_to_hub("SmolGRPO-135M")


CommitInfo(commit_url='https://huggingface.co/saracandu/SmolGRPO-135M/commit/7e04a45579afd6106bfa6a61fe1b2550e4a60cbe', commit_message='Upload tokenizer', commit_description='', oid='7e04a45579afd6106bfa6a61fe1b2550e4a60cbe', pr_url=None, repo_url=RepoUrl('https://huggingface.co/saracandu/SmolGRPO-135M', endpoint='https://huggingface.co', repo_type='model', repo_id='saracandu/SmolGRPO-135M'), pr_revision=None, pr_num=None)