In [None]:
!pip install datasets huggingface-hub trl[vllm] wandb weave accelerate emoji -U

In [None]:
!pip install transformers -U

In [None]:
#!pip install sentence-transformers

In [1]:
# Set before any vLLM-backed code is imported in later cells.
# Presumably this stops vLLM from installing its own logging configuration — TODO confirm against the vLLM docs.
%env VLLM_CONFIGURE_LOGGING=0

env: VLLM_CONFIGURE_LOGGING=0


In [None]:
# Authenticate against the Hugging Face Hub. login() is called without
# arguments, so no token is hard-coded in the notebook.
from huggingface_hub import login
login()

In [2]:
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

In [3]:
# Load the RL prompt sets from local parquet files (paths are relative to the
# notebook's working directory).
# NOTE(review): split="train" is used for the test file as well — presumably
# because a single data_files entry is exposed under the default "train" split;
# confirm against the datasets documentation.
train_dataset = load_dataset("parquet", data_files="data/rl_df_train.parquet", split="train")
test_dataset = load_dataset("parquet", data_files="data/rl_df_test.parquet", split="train")

In [4]:
# Peek at one training example to check the prompt format the reward functions parse.
train_dataset[1]

{'prompt': "Generate the funniest possible joke that contains these two words: 'learn', 'flower'. Return only the joke. "}

In [5]:
import re

# Matches a "Q:" / "A:" dialogue marker in any letter case. The word boundary
# keeps ordinary words that merely end in "a" or "q" before a colon
# (e.g. "idea:") from being mistaken for a marker.
_QA_MARKER = re.compile(r"\b[qa]:", re.IGNORECASE)


def is_valid_single_joke(text):
    """
    Combat joke stacking.

    Heuristically decides whether ``text`` looks like one joke rather than
    several jokes (or a Q/A transcript) glued together.

    Args:
        text: The raw completion string.

    Returns:
        ``False`` if any of the following holds, ``True`` otherwise:
          * more than one question mark,
          * more than one occurrence of "why " (case-insensitive),
          * a "Q:" or "A:" marker in any letter case,
          * more than three lines after stripping surrounding whitespace.
    """
    if text.count("?") > 1:
        return False
    if text.lower().count("why ") > 1:
        return False
    # The previous check searched for the upper-case literals "Q:" / "A:" inside
    # text.lower(), which can never match; use a case-insensitive pattern instead.
    if _QA_MARKER.search(text):
        return False
    if len(text.strip().split('\n')) > 3:
        return False

    return True

In [6]:
import numpy as np
from collections import deque
import re

# Rolling window holding the 30 most recent joke-structure labels. It is
# module-level state that structure_diversity_reward reads and appends to on
# every call; re-running this cell resets the history.
recent_structures = deque(maxlen=30)

def extract_joke_structure(joke: str) -> str:
    """Classify a joke into a coarse structural template label.

    Checks run in a fixed order and the first hit wins:
      1. a known opener ("why did ...", "where do ...", "what do you call",
         "knock knock"), matched case-insensitively anywhere in the text;
      2. exactly one "?" plus at least one "!" or "." -> "qa-punchline";
      3. the phrase " is like " or " is when " -> "observation";
      4. anything else -> "one-liner".

    Args:
        joke: The joke text.

    Returns:
        One of "why-did", "where-did", "what-do-you-call", "knock-knock",
        "qa-punchline", "observation" or "one-liner".
    """
    lowered = joke.lower()

    # Ordered (pattern, label) pairs; the order is significant.
    opener_patterns = (
        (r"why\s+(?:did|didn't|do|dont|don't|were|weren't|was|wasn't)\s+\w+", "why-did"),
        (r"where\s+(?:did|do|were|was)\s+\w+", "where-did"),
        (r"what\s+do\s+you\s+call", "what-do-you-call"),
        (r"knock\s+knock", "knock-knock"),
    )
    for pattern, label in opener_patterns:
        if re.search(pattern, lowered):
            return label

    # A single question followed by some sentence-ending punctuation.
    has_closing_punctuation = joke.count("!") >= 1 or joke.count(".") >= 1
    if joke.count("?") == 1 and has_closing_punctuation:
        return "qa-punchline"

    if " is like " in lowered or " is when " in lowered:
        return "observation"

    return "one-liner"

def structure_diversity_reward(completions, **kwargs):
    """Reward completions whose joke structure is under-represented recently.

    For every completion the structure label is computed with
    ``extract_joke_structure`` and compared against the rolling window
    ``recent_structures``:

        reward = target_share - actual_share

    where ``target_share`` is ``1 / (number of distinct labels in the window)``
    and ``actual_share`` is the fraction of the window that carries the same
    label. The label is appended to the window afterwards, so later
    completions in the same batch see earlier ones.

    Both shares are taken from the live window for every completion. The
    previous version incremented per-label counts during the batch while
    keeping the denominator and the target frozen, and never accounted for
    entries evicted from the bounded deque, so the "share" could exceed 1 and
    the penalty was unbounded. Rewards now always lie in (-1, 1].

    Args:
        completions: List of completion strings.
        **kwargs: Extra arguments supplied by the trainer; ignored.

    Returns:
        A list of floats, one per completion, in input order.

    Side effects:
        Appends one label per completion to the module-level
        ``recent_structures`` deque (mutated in place, hence no ``global``).
    """
    scores = []

    for joke in completions:
        structure = extract_joke_structure(joke)

        window_size = len(recent_structures)
        distinct_structures = len(set(recent_structures))

        # With an empty window there is a single (hypothetical) structure and
        # nothing has been seen yet, so the first completion gets the full reward.
        target = 1 / max(distinct_structures, 1)
        actual = recent_structures.count(structure) / window_size if window_size else 0.0

        scores.append(target - actual)
        # Append after scoring so a completion is not compared against itself.
        recent_structures.append(structure)

    return scores

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline

# Text-classification pipeline used by roberta_score as the learned humor reward.
# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to run when the pipeline is loaded — confirm the repository is
# trusted (ideally pin a revision).
scoring_pipe = pipeline("text-classification", model="KonradBRG/RoBERTA-Joke-Rater", trust_remote_code=True)
        
def roberta_score(completions, **kwargs):
    """
    Scores humor using RoBERTA, but only for valid outputs.
    Invalid outputs get 0.0 regardless of what the model thinks.

    Args:
        completions: List of completion strings.
        **kwargs: Extra arguments supplied by the trainer; ignored.

    Returns:
        A list of floats, one per completion, in input order. Completions
        rejected by ``is_valid_single_joke`` score 0.0; the rest get the
        rating parsed from the classifier's label.
    """
    scores = [0.0] * len(completions)

    # Only valid jokes are sent to the classifier: invalid ones are scored 0.0
    # anyway, so running inference on them would be wasted GPU work.
    valid_indices = [
        index for index, completion in enumerate(completions)
        if is_valid_single_joke(completion)
    ]
    if not valid_indices:
        return scores

    predictions = scoring_pipe([completions[index] for index in valid_indices])
    for index, prediction in zip(valid_indices, predictions):
        # The rating is read from the last character of the label string.
        # NOTE(review): assumes labels end in a single digit — TODO confirm
        # against the model's label set.
        scores[index] = float(prediction["label"][-1])

    return scores

Device set to use cuda:0


In [9]:
import re

def word_pair_prompt_adherence(completions, prompts, **kwargs):
    """
    Enforces the word pair constraint.

    The two required words are parsed from prompts of the form
    "... contains these two words: 'w1', 'w2' ...". Matching is
    case-insensitive and substring-based, so inflected forms such as
    "flowers" satisfy "flower".

    Whitespace is collapsed on both the completion and the targets before
    matching, so a multi-word target (e.g. 'ice cream') can be found even
    across a line break. The previous token-by-token comparison could never
    match a target containing whitespace. For single-word targets the result
    is unchanged.

    Args:
        completions: List of completion strings.
        prompts: List of prompt strings, index-aligned with ``completions``.
        **kwargs: Extra arguments supplied by the trainer; ignored.

    Returns:
        One entry per completion:
          * ``None`` if the prompt is not a word-pair prompt (no "two words"
            in it, or the word pair cannot be parsed),
          * ``2.0`` if both words appear,
          * ``-1.0`` if exactly one appears,
          * ``-2.0`` if neither appears.
    """
    pattern = r"contains\s+these\s+two\s+words:\s*'([^']+)'\s*,\s*'([^']+)'"

    def _mentions(target, normalized_completion):
        # Normalise the target the same way as the completion; a target that
        # is empty after normalisation never counts as found.
        needle = " ".join(target.lower().split())
        return bool(needle) and needle in normalized_completion

    scores = []
    for i, completion in enumerate(completions):
        prompt = prompts[i]
        if "two words" not in prompt:
            scores.append(None)
            continue

        match = re.search(pattern, prompt, flags=re.IGNORECASE)
        if match is None:
            scores.append(None)
            continue
        w1, w2 = match.groups()

        normalized_completion = " ".join(completion.lower().split())
        found = sum(_mentions(word, normalized_completion) for word in (w1, w2))

        if found == 2:
            scores.append(2.0)
        elif found == 1:
            scores.append(-1.0)
        else:
            scores.append(-2.0)

    return scores

def headline_adherence(completions, prompts, **kwargs):
    """Reward short completions that do not talk about the headline itself.

    Only applies to prompts that are not word-pair prompts (those containing
    "two words" are scored by ``word_pair_prompt_adherence`` and get ``None``
    here).

    The "headline" check is case-insensitive: the previous case-sensitive
    test let outputs such as "Headline: ..." escape the penalty.

    Args:
        completions: List of completion strings.
        prompts: List of prompt strings, index-aligned with ``completions``.
        **kwargs: Extra arguments supplied by the trainer; ignored.

    Returns:
        One entry per completion: ``None`` for word-pair prompts, ``1.0`` when
        the completion has at most 25 whitespace-separated words and does not
        mention "headline", ``-1.0`` otherwise.
    """
    scores = []
    for i, completion in enumerate(completions):

        if "two words" in prompts[i]:  # it is a word pair task sample
            scores.append(None)
            continue

        within_length = len(completion.split()) <= 25  # roughly the max length of the task
        mentions_headline = "headline" in completion.lower()

        scores.append(1.0 if within_length and not mentions_headline else -1.0)
    return scores

In [10]:
import emoji

def contains_emoji(text):
    """Return True if ``text`` contains at least one emoji.

    Detection is delegated to ``emoji.emoji_count``, which matches whole emoji
    sequences. The previous per-character lookup in ``emoji.EMOJI_DATA`` missed
    emoji that consist of several code points none of which is an emoji on its
    own (for example country flags and keycap sequences).

    Args:
        text: The string to inspect.

    Returns:
        bool: Whether any emoji was found.
    """
    return emoji.emoji_count(text) > 0

def formatting(completions, **kwargs):
    """
    Validates output formatting and penalizes hacking patterns.

    A completion is considered clean when it contains none of the banned
    fragments below, contains no emoji (``contains_emoji``) and passes
    ``is_valid_single_joke``.

    Args:
        completions: List of completion strings.
        **kwargs: Extra arguments supplied by the trainer; ignored.

    Returns:
        A list with ``1.0`` for each clean completion and ``-1.0`` otherwise.
    """
    # Markdown markers, meta-commentary openers and runs of three spaces.
    banned_fragments = (
        "#",
        "How about: ",
        "This joke",
        "Let me know",
        "Note: ",
        "   ",
    )

    def _is_clean(text):
        if any(fragment in text for fragment in banned_fragments):
            return False
        if contains_emoji(text):
            return False
        return is_valid_single_joke(text)

    return [1.0 if _is_clean(text) else -1.0 for text in completions]

def length_penalty(completions, **kwargs):
    """
    Penalizes longer outputs to discourage joke stacking.

    Length is measured in whitespace-separated words:
      * fewer than 5 or more than 24 words -> flat -2.0,
      * otherwise -0.2 for every word beyond 16 (no penalty up to 16).

    Args:
        completions: List of completion strings.
        **kwargs: Extra arguments supplied by the trainer; ignored.

    Returns:
        A list of non-positive floats, one per completion.
    """
    optimal_length = 16
    max_allowed = 24

    def _score(text):
        n_words = len(text.split())
        if n_words > max_allowed or n_words < 5:
            return -2.0
        # Linear penalty on the overshoot past the optimal length.
        excess = max(0, n_words - optimal_length)
        return -0.2 * excess

    return [_score(text) for text in completions]

def compute_coherence_penalty(joke: str, penalty_weight: float = 0.5) -> float:
    """Penalize jokes with a high share of capitalised words.

    Capitalised words (an upper-case letter followed by lower-case letters,
    optionally with hyphenated lower-case parts) are used as a proxy for
    rare/technical terms. Their count is divided by the number of
    whitespace-separated words; only the part of that ratio above 0.2 is
    penalised.

    Args:
        joke: The joke text.
        penalty_weight: Multiplier applied to the ratio excess over 0.2.

    Returns:
        ``0.0`` for empty text or a ratio of at most 0.2, otherwise
        ``-penalty_weight * (ratio - 0.2)``.
    """
    tokens = joke.split()
    if not tokens:
        return 0.0

    capitalised_words = re.findall(r'\b[A-Z][a-z]*(?:-[a-z]+)*\b', joke)
    rare_word_ratio = len(capitalised_words) / len(tokens)

    # >20% rare words = suspicious
    if rare_word_ratio <= 0.2:
        return 0.0
    return -penalty_weight * (rare_word_ratio - 0.2)

def coherence_penalty(completions, **kwargs):
    """Reward function: apply ``compute_coherence_penalty`` (default weight) to each completion.

    Args:
        completions: List of completion strings.
        **kwargs: Extra arguments supplied by the trainer; ignored.

    Returns:
        A list of floats, one per completion, in input order.
    """
    return list(map(compute_coherence_penalty, completions))

In [11]:
# Base model that is fine-tuned with GRPO; also reused later to load the tokenizer.
model_id = "Qwen/Qwen2.5-1.5B-Instruct"
# Reward functions defined in the cells above. reward_weights below is
# index-aligned with this list, so keep both in the same order when editing.
reward_fns = [
    roberta_score,
    structure_diversity_reward,
    word_pair_prompt_adherence,
    formatting,
    length_penalty,
    headline_adherence,
    coherence_penalty
]
# One weight per reward function, by position:
# roberta 1.0, structure diversity 1.5, word pair 2.0, formatting 0.5,
# length 0.5, headline 2.0, coherence 0.5.
reward_weights = [1.0, 1.5, 2.0, 0.5, 0.5, 2.0, 0.5]

# NOTE(review): no seed is set here, so runs are presumably not reproducible — confirm.
# NOTE(review): save_strategy="no" — presumably no checkpoints are written; the
# later evaluation cells read the trained weights from trainer.model in memory.
training_args = GRPOConfig(
    output_dir="Qwen2.5-1.5B-Instruct-GRPO", 
    report_to="wandb",
    num_train_epochs=3,
    use_vllm=True,
    vllm_mode="colocate",
    eval_strategy="epoch",
    save_strategy="no",
    max_completion_length=64,
    temperature=0.7,
    reward_weights=reward_weights,
)

# The model is passed by Hub id, so the trainer is responsible for loading it.
trainer = GRPOTrainer(
    model=model_id,
    reward_funcs=reward_fns,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

`torch_dtype` is deprecated! Use `dtype` instead!


[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 5/5 [00:00<00:00, 53.07it/s]


In [None]:
# NOTE(review): weave is imported but never referenced in this cell —
# presumably needed only for its W&B tracing side effects; confirm it is required.
import weave
# Launch GRPO training using the trainer built in the previous cell.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkonrad-brg[0m ([33mkonrad-brg-university-of-t-bingen[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Initializing weave.


Epoch,Training Loss,Validation Loss


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
import torch

# Reuse the policy held in memory by the trainer and switch it to eval mode
# before generating.
model = trainer.model
model.eval()
# The tokenizer is loaded from the base model id, not from a saved checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Manual smoke test of the trained policy.
# NOTE(review): the training prompts are phrased "Generate the funniest possible
# joke that contains these two words: 'a', 'b'. Return only the joke. " — this
# prompt uses a different wording, so it probes generalisation rather than the
# exact training format.
prompt = "Generate a funny joke that contains the words 'microwave' and 'shoes'."

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(
    **inputs,
    # Bound the number of generated tokens only. max_length also counts the
    # prompt tokens, so the completion budget shrank with longer prompts.
    # 64 matches max_completion_length used during training.
    max_new_tokens=64,
    # temperature only has an effect when sampling is enabled.
    do_sample=True,
    temperature=0.8,
)
# Prints prompt and completion together (the prompt tokens are part of output[0]).
print(tokenizer.decode(output[0], skip_special_tokens=True))