In [20]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer
# export WANDB_API_KEY="5a69225ea1d050c9c21f67c2db85febf61fa8fb1"

import wandb

wandb.login(key="5a69225ea1d050c9c21f67c2db85febf61fa8fb1")



True

In [21]:
dataset = load_dataset("mlabonne/smoltldr")

model_id = "HuggingFaceTB/SmolLM-135M-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    attn_implementation="eager"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

lora_config = LoraConfig(
    task_type = "CAUSAL_LM",
    r = 16, # rank of the surrogate matrices!
    lora_alpha = 32, # scale factor controlling the impact of the modifications
    target_modules = "all-linear" # applied to all linear transformations in the model
)
model = get_peft_model(model, lora_config)

In [22]:
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 200
    })
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 200
    })
})

In [23]:
tokenizer

GPT2TokenizerFast(name_or_path='HuggingFaceTB/SmolLM-135M-Instruct', vocab_size=49152, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|im_start|>', 'eos_token': '<|im_end|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|im_end|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<repo_name>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<reponame>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	5: AddedToken("<file_sep>",

In [30]:
ideal_length = 50
def reward_function(prompt, answer, completions, **kwargs):
    print("PROMPT:", prompt)
    print("GOLDEN:", answer)
    print("HAND-MADE:", completions)
    return [-abs(ideal_length - len(completion)) for completion in completions]

In [31]:
reward_function(dataset['test']['prompt'][0], dataset['test']['completion'][0], "")

PROMPT: SUBREDDIT: r/relationship_advice

TITLE: I am a girl (18) that wants to let a guy (18) know that im hurt.

POST: OK so theres this guy I had sex with earlier this year twice and he was really nice to me and everything . But then he rooted and booted me. Fast forward maybe 5 months and he calls me up one night when, drunk, wanting to see me. I saw him at a party the week before so maybe I got in his head again. Either way he rooted and booted me again and hasn't talk to me since and when I tried it was just him and his mates being silly and saying stuff. Like it's pretty much impossible to see him when I want to as he's a player and is always with the boys.

So what I want to know is, next time he calls me up wanting a booty call or whatever I need to know what to say to let him know that he really really hurt me and it hurts to be had sex with and then him not care? Do any guys know a way that will make him actually CARE and get the picture.

TL;DR:
GOLDEN:  had sex with a play

[]

In [25]:
training_args = GRPOConfig(
    output_dir = "GRPO",
    learning_rate = 2e-5,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 2,
    max_prompt_length = 512, 
    max_completion_length = 96,
    num_generations = 4, # i.e. number of competitive completions considered during optimization
    optim = "adamw_8bit",
    num_train_epochs = 1,
    bf16 = True, 
    report_to = ["wandb"],
    remove_unused_columns = False, 
    logging_steps = 1
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [32]:
trainer = GRPOTrainer(
    model=model,
    reward_funcs=[reward_function],
    args=training_args,
    train_dataset=dataset["test"],
)

wandb.init(project="GRPO")
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


TypeError: reward_function() missing 2 required positional arguments: 'prompt' and 'answer'

In [None]:
merged_model = trainer.model.merge_and_unload()

In [None]:
merged_model.push_to_hub(
    "SmolGRPO-135M", private=False, tags=["GRPO", "Reasoning-Course"]
)