In [1]:
!pip install transformers datasets huggingface-hub trl[vllm] wandb weave openai accelerate emoji -U

Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting datasets
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting huggingface-hub
  Downloading huggingface_hub-1.0.1-py3-none-any.whl.metadata (13 kB)
Collecting wandb
  Downloading wandb-0.22.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting weave
  Downloading weave-0.52.11-py3-none-any.whl.metadata (27 kB)
Collecting openai
  Downloading openai-2.6.1-py3-none-any.whl.metadata (29 kB)
Collecting accelerate
  Downloading accelerate-1.11.0-py3-none-any.whl.metadata (19 kB)
Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Collecting trl[vllm]
  Downloading trl-0.24.0-py3-none-any.whl.metadata (11 kB)
Collecting huggingface-hub
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.10.23-cp312-cp312-manylinux2014_x8

In [2]:
%env VLLM_CONFIGURE_LOGGING=0

env: VLLM_CONFIGURE_LOGGING=0


In [3]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

In [5]:
dataset = load_dataset("parquet", data_files="data/rl_df.parquet", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
dataset[1]

{'prompt': "Generate the funniest possible joke that contains these two words: 'hammer', 'banana'. Return only the joke. "}

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from concurrent.futures import ThreadPoolExecutor
import gc
import torch
import json
import random
import re


model_name = "meta-llama/Llama-3.2-3B-Instruct"

scoring_tokenizer = AutoTokenizer.from_pretrained(model_name)
scoring_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.float16,
    device_map="auto"
)
scoring_model.generation_config.pad_token_id = scoring_tokenizer.pad_token_id

def generate_response(prompt, temperature=0.6, top_p=0.9, max_new_tokens=512):
    with torch.no_grad():
        inputs = scoring_tokenizer(prompt, return_tensors="pt").to(scoring_model.device)
        output = scoring_model.generate(
            **inputs,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_new_tokens,
            do_sample=True
        )
    return scoring_tokenizer.decode(output[0], skip_special_tokens=True)

def construct_prompt(persona, joke):
    return f"""
You are a person who enjoys {persona} humour. 
Do you think that the following joke is funny: \n{joke}\n
Reply with a valid JSON object that contains `final_answer` (either "yes" or "no") and `reason`, justifying your answer.
Only choose "yes" if the joke is really hilarious. Incomplete jokes or nonsense should never receive a "yes." Be honest and strict.
"""

def extract_json(text):
    text = text.strip().removeprefix("```json").removesuffix("```")
    try:
        return json.loads(text)
    except:
        # crude fallback
        text = text[text.find("{"):text.rfind("}")+1]
        return json.loads(text)

def get_crowd_score(joke):
    personas = [
        "self-defeating", 
        "affiliative", 
        "self-enhancing", 
        "aggressive", 
       # "wordplay", 
       # "anecdotal", 
       # "observational", 
    ]
    
    def ask(persona):
        prompt = construct_prompt(persona, joke)
        resp = generate_response(prompt)
        try:
            data = extract_json(resp)
            return 1.0 if data["final_answer"].lower() == "yes" else 0.0
        except Exception:
            return 0.0

    with ThreadPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(ask, personas))
    torch.cuda.empty_cache()
    return sum(results)

def crowd_score(completions, **kwargs):
    with ThreadPoolExecutor(max_workers=4) as executor:
        return list(executor.map(get_crowd_score, completions))

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [8]:
def word_pair_prompt_adherence(completions, prompts, **kwargs):
    scores = []
    pattern = r"contains\s+these\s+two\s+words:\s*'([^']+)'\s*,\s*'([^']+)'"
    for i in range(len(completions)):
        p = prompts[i]
        if "two words" not in p:
            scores.append(None)
            continue
        c = completions[i].lower()
        w1, w2 = re.findall(pattern, p, flags=re.IGNORECASE)[0]
        if w1 not in c:
            scores.append(0.0)
        elif w2 not in c:
            scores.append(0.0)
        else:
            scores.append(1.0)

    return scores

def headline_adherence(completions, prompts, **kwargs):
    scores = []
    for i, completion in enumerate(completions):
        
        if "two words" in prompts[i]: # it is a word pair task sample
            scores.append(None)
            continue
        
        if len(completion.split()) <= 25: # roughly the max length of the task
            if "headline" in completion:
                scores.append(0.0)
            else:
                scores.append(1.0)
        else:
            scores.append(0.0)
    return scores

In [9]:
import emoji

def contains_emoji(text):
    return any(char in emoji.EMOJI_DATA for char in text)

def formatting(completions, **kwargs):
    scores = []
    for completion in completions:
        if ("#" in completion
            or "How about: " in completion
            or "This joke" in completion
            or "Let me know" in completion
            or "Note: " in completion
            or contains_emoji(completion)
           ):
            scores.append(0.0)
        else:
            scores.append(1.0)
    return scores

def temperature(completions, **kwargs):
    scores = []
    for c in completions:
        random_reward = random.randint(0, 2)
        scores.append(random_reward)
    return scores

In [10]:
training_args = GRPOConfig(
    output_dir="Qwen/Qwen2.5-0.5B-Instruct-GRPO", 
    report_to="wandb",
    num_train_epochs=1,
    use_vllm=True,
    vllm_mode="colocate",
    save_strategy="no",
    max_completion_length=64,
    temperature=0.7,
)

trainer = GRPOTrainer(
    model="Qwen/Qwen2.5-0.5B-Instruct",
    reward_funcs=[crowd_score, headline_adherence, word_pair_prompt_adherence, formatting, temperature],
    args=training_args,
    train_dataset=dataset,
)

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 5/5 [00:00<00:00, 51.87it/s]


In [None]:
import weave
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

  ········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/jovyan/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkonrad-brg[0m ([33mkonrad-brg-university-of-t-bingen[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Initializing weave.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pa

Step,Training Loss
10,0.0019
20,0.028
30,0.0354
40,0.0053
50,0.0186
60,0.0157
70,0.012
80,0.0152
90,0.016
100,-0.0006


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [None]:
model = trainer.model

In [None]:
tokenizer = trainer.processing_class

In [None]:
prompt = "Generate a funny joke that contains the words 'microwave' and 'shoes'."
with torch.no_grad():
    inputs = tokenizer(prompt, return_tensors="pt").to(scoring_model.device)
    output = model.generate(
        **inputs,
        do_sample=True
    )
print(tokenizer.decode(output[0], skip_special_tokens=True))