In [1]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer
# SECURITY: never hardcode the Weights & Biases API key in the notebook.
# Supply it out of band instead, e.g. `export WANDB_API_KEY=...` in the shell
# that launches Jupyter, or run `wandb login` once so it is stored in ~/.netrc.
# The key that used to be pasted in this cell has been exposed and should be
# revoked from the W&B account settings.

import wandb

# With no explicit `key`, wandb resolves credentials from WANDB_API_KEY or the
# existing netrc entry (and prompts interactively if neither is available).
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /u/dssc/scandu00/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msaracandussio[0m ([33msaracandussio-universit-degli-studi-di-trieste[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
# Checkpoint identifier shared by the model and its tokenizer.
model_id = "HuggingFaceTB/SmolLM-135M-Instruct"

# Dataset used for GRPO training; splits are selected later by name.
dataset = load_dataset("mlabonne/smoltldr")

# Tokenizer and base causal-LM are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", attn_implementation="eager"
)

# LoRA adapter settings:
#   r              - rank of the low-rank (surrogate) update matrices
#   lora_alpha     - scale factor controlling the impact of the modifications
#   target_modules - "all-linear" applies adapters to every linear layer
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    target_modules="all-linear",
)

# Wrap the base model so that training goes through the LoRA adapters.
model = get_peft_model(model, lora_config)

In [6]:
# Display the tokenizer's repr to inspect its configuration (special tokens,
# vocabulary size, maximum length, padding/truncation side).
tokenizer

GPT2TokenizerFast(name_or_path='HuggingFaceTB/SmolLM-135M-Instruct', vocab_size=49152, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|im_start|>', 'eos_token': '<|im_end|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|im_end|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<repo_name>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<reponame>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	5: AddedToken("<file_sep>",

In [3]:
# Target completion length, in characters, that earns the maximum reward (0).
ideal_length = 50


def _completion_text(completion):
    """Return the generated text of a single completion as a plain string.

    A completion is either a string (standard dataset format) or a list of
    chat messages, each a dict with a "content" entry (conversational format).
    """
    if isinstance(completion, str):
        return completion
    # Conversational format: concatenate the content of every message.
    return "".join(message["content"] for message in completion)


def reward_function(completions, **kwargs):
    """Score completions by how close their length is to ``ideal_length``.

    Args:
        completions: Generated completions, one per sample. Each item is a
            string or a list of chat-message dicts.
        **kwargs: Extra columns passed by the trainer; accepted and ignored.

    Returns:
        A list of floats, one per completion, equal to
        ``-abs(ideal_length - number_of_characters)``. The best possible
        reward is 0.0, reached when the text is exactly ``ideal_length``
        characters long.
    """
    return [
        float(-abs(ideal_length - len(_completion_text(completion))))
        for completion in completions
    ]

In [4]:
# Hyper-parameters for the GRPO run, grouped by concern.
training_args = GRPOConfig(
    # Where checkpoints and trainer state are written.
    output_dir="GRPO",
    # Optimisation.
    optim="adamw_8bit",
    learning_rate=2e-5,
    num_train_epochs=1,
    bf16=True,
    # Batching.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    # Generation: prompt/completion length limits and the number of
    # competing completions considered during optimisation.
    max_prompt_length=512,
    max_completion_length=96,
    num_generations=8,
    # Keep every dataset column instead of dropping unused ones.
    remove_unused_columns=False,
    # Logging: report to Weights & Biases with logging_steps set to 1.
    report_to=["wandb"],
    logging_steps=1,
)

In [5]:
# Assemble the GRPO trainer from the LoRA-wrapped model, the length-based
# reward and the training configuration.
trainer = GRPOTrainer(
    model=model,
    reward_funcs=[reward_function],
    args=training_args,
    # Train on the training split. This cell previously optimised on
    # dataset["test"], which contaminates the held-out data and makes any
    # later evaluation on that split meaningless.
    train_dataset=dataset["train"],
)

# Start the W&B run under the "GRPO" project before training begins, so the
# metrics reported by the trainer are attached to it.
wandb.init(project="GRPO")
trainer.train()



Step,Training Loss
1,-0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0
10,0.0


KeyboardInterrupt: 

In [None]:
# Merge the trained LoRA adapter into the underlying model and keep the
# result as a standalone model (no adapter wrapper) for publishing.
# NOTE(review): requires `trainer` from the training cell to still be alive.
merged_model = trainer.model.merge_and_unload()

In [None]:
# Upload the merged model to the Hub as a public repository named
# "SmolGRPO-135M", tagged for discovery.
# NOTE(review): only the model is pushed here; the tokenizer is not uploaded
# by this cell - confirm whether it should be published alongside.
merged_model.push_to_hub(
    "SmolGRPO-135M", private=False, tags=["GRPO", "Reasoning-Course"]
)