In [None]:
!pip install transformers datasets trl[vllm] wandb weave openai accelerate -U

In [1]:
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer



INFO 10-27 20:35:05 [__init__.py:216] Automatically detected platform cuda.


In [2]:
# Read the training split from the local parquet file (path is relative to the
# notebook's working directory).
dataset = load_dataset(
    "parquet",
    data_files="data/rl_df.parquet",
    split="train",
)

In [3]:
# Spot-check a single record; the row displayed below carries one "prompt" field.
dataset[1100]

{'prompt': "Generate a funny joke related to this headline: 'Electricity costs jolt New Jersey's race for governor — and preview next year's midterms' by either modifying it or responding to it."}

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from concurrent.futures import ThreadPoolExecutor
import gc
import torch
import json


# Checkpoint used as the judge that votes on whether a joke is funny.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Judge weights are loaded in float16; device placement is delegated to
# `device_map="auto"`.
scoring_model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", dtype=torch.float16
)
# Tokenizer matching the judge checkpoint.
scoring_tokenizer = AutoTokenizer.from_pretrained(model_name)

def generate_response(prompt, temperature=0.6, top_p=0.9, max_new_tokens=512):
    """Sample a reply to ``prompt`` from the scoring model.

    Args:
        prompt: Text passed to ``scoring_tokenizer`` as-is.
        temperature: Sampling temperature forwarded to ``generate``.
        top_p: Nucleus-sampling threshold forwarded to ``generate``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded text of the newly generated tokens only, with special
        tokens removed. The prompt is not echoed back.
    """
    # NOTE(review): the prompt is tokenized as raw text; no chat template is applied.
    inputs = scoring_tokenizer(prompt, return_tensors="pt").to(scoring_model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output = scoring_model.generate(
            **inputs,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_new_tokens,
            do_sample=True,
        )

    # A causal LM returns prompt tokens followed by the continuation. Drop the
    # prompt so callers that parse the reply (e.g. for JSON) never see the joke
    # or the instructions mixed into the text they parse.
    new_tokens = output[0][prompt_len:]
    return scoring_tokenizer.decode(new_tokens, skip_special_tokens=True)

def construct_prompt(persona, joke):
    """Build the judge prompt asking a ``persona``-humour fan to rate ``joke``.

    The result starts and ends with a newline and asks for a JSON reply holding
    `final_answer` ("yes"/"no") and `reason`.
    """
    pieces = (
        "",
        f"You are a person who enjoys {persona} humour. ",
        "Do you think the following joke is funny: ",
        f"{joke}",
        "",
        'Reply with a valid JSON object that contains `final_answer` (either "yes" or "no") and `reason`.',
        "",
    )
    return "\n".join(pieces)

def extract_json(text):
    """Parse a JSON value out of a model reply.

    The reply is stripped of surrounding whitespace and of a leading
    "```json" / trailing "```" fence, then parsed as a whole. If that fails,
    the text is scanned for the first "{" from which a complete JSON object
    can be decoded; anything after that object is ignored.

    Args:
        text: Raw reply text.

    Returns:
        The decoded JSON value (a dict when the fallback scan is used).

    Raises:
        json.JSONDecodeError: If no JSON value can be recovered from ``text``.
    """
    text = text.strip().removeprefix("```json").removesuffix("```")
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Fall through to the tolerant scan; only parse failures are swallowed,
        # never KeyboardInterrupt/SystemExit.
        pass

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the first complete value, so trailing
            # chatter (even with stray braces) does not break parsing.
            parsed, _ = decoder.raw_decode(text, start)
            return parsed
        except json.JSONDecodeError:
            start = text.find("{", start + 1)

    raise json.JSONDecodeError("No JSON object found", text, 0)

def get_crowd_score(joke):
    """Count how many humour personas judge ``joke`` to be funny.

    Each of the four personas is asked once via the scoring model; a persona
    contributes 1.0 when its parsed reply has ``final_answer`` equal to "yes"
    (case-insensitive) and 0.0 otherwise, including when the reply cannot be
    parsed. Returns the sum of the votes.
    """

    def vote(persona):
        reply = generate_response(construct_prompt(persona, joke))
        try:
            verdict = extract_json(reply)["final_answer"].lower()
        except Exception:
            # Unparseable or malformed replies count as a "no" vote.
            return 0.0
        return 1.0 if verdict == "yes" else 0.0

    audience = ["self-defeating", "affiliative", "self-enhancing", "aggressive"]

    # One worker thread per persona.
    with ThreadPoolExecutor(max_workers=4) as pool:
        votes = list(pool.map(vote, audience))

    torch.cuda.empty_cache()
    return sum(votes)

def _completion_text(completion):
    """Return the text to score for one completion (string or chat-message list)."""
    if isinstance(completion, str):
        return completion
    # Conversational datasets yield a list of message dicts; score the message text
    # rather than the Python repr of the list.
    if (
        isinstance(completion, (list, tuple))
        and completion
        and isinstance(completion[0], dict)
        and "content" in completion[0]
    ):
        return completion[0]["content"]
    return completion


def crowd_score_rewards(completions, **kwargs):
    """Reward function for the trainer: one crowd score per completion.

    Args:
        completions: Generated completions, either plain strings or
            conversational completions (lists of ``{"role", "content"}`` dicts).
        **kwargs: Extra fields supplied by the trainer; unused here.

    Returns:
        A list of scores from ``get_crowd_score``, in the same order as
        ``completions``.
    """
    jokes = [_completion_text(completion) for completion in completions]
    # Up to four completions are scored concurrently; ``map`` preserves input order.
    with ThreadPoolExecutor(max_workers=4) as executor:
        return list(executor.map(get_crowd_score, jokes))

In [5]:
# GRPO run settings: a single epoch, no checkpoints written during training,
# metrics reported to Weights & Biases, generation served by vLLM in
# "colocate" mode.
training_args = GRPOConfig(
    output_dir="Qwen/Qwen2.5-0.5B-Instruct-GRPO",
    num_train_epochs=1,
    save_strategy="no",
    report_to="wandb",
    use_vllm=True,
    vllm_mode="colocate",
)

# The policy is given by checkpoint name and is rewarded by the persona-based
# crowd score.
trainer = GRPOTrainer(
    model="Qwen/Qwen2.5-0.5B-Instruct",
    args=training_args,
    train_dataset=dataset,
    reward_funcs=crowd_score_rewards,
)

INFO 10-27 20:35:12 [utils.py:328] non-default args: {'seed': 0, 'max_model_len': 768, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.3, 'max_num_batched_tokens': 4096, 'max_num_seqs': 8, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'Qwen/Qwen2.5-0.5B-Instruct'}
INFO 10-27 20:35:17 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 10-27 20:35:17 [__init__.py:1815] Using max model len 768
INFO 10-27 20:35:17 [parallel.py:348] Disabling V1 multiprocessing for external launcher.
INFO 10-27 20:35:18 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
INFO 10-27 20:35:18 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='Qwen/Qwen2.5-0.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-0.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=Obse



INFO 10-27 20:35:20 [gpu_model_runner.py:2370] Loading model from scratch...
INFO 10-27 20:35:20 [cuda.py:362] Using Flash Attention backend on V1 engine.
INFO 10-27 20:35:20 [weight_utils.py:348] Using model weights format ['*.safetensors']
INFO 10-27 20:35:21 [weight_utils.py:406] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 10-27 20:35:21 [default_loader.py:268] Loading weights took 0.18 seconds
INFO 10-27 20:35:21 [gpu_model_runner.py:2392] Model loading took 0.9286 GiB and 0.750990 seconds
INFO 10-27 20:35:24 [backends.py:539] Using cache directory: /home/jovyan/.cache/vllm/torch_compile_cache/29155805d5/rank_0_0/backbone for vLLM's torch.compile
INFO 10-27 20:35:24 [backends.py:550] Dynamo bytecode transform time: 2.62 s
INFO 10-27 20:35:25 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 0.879 s
INFO 10-27 20:35:26 [monitor.py:34] torch.compile takes 2.62 s in total
INFO 10-27 20:35:26 [gpu_worker.py:298] Available KV cache memory: 12.27 GiB
INFO 10-27 20:35:26 [kv_cache_utils.py:864] GPU KV cache size: 1,071,760 tokens
INFO 10-27 20:35:26 [kv_cache_utils.py:868] Maximum concurrency for 768 tokens per request: 1395.52x


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 5/5 [00:00<00:00, 61.42it/s]


INFO 10-27 20:35:27 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.09 GiB
INFO 10-27 20:35:27 [gpu_worker.py:391] Free memory on device (41.15/44.52 GiB) on startup. Desired GPU memory utilization is (0.3, 13.36 GiB). Actual usage is 0.93 GiB for weight, 0.15 GiB for peak activation, 0.02 GiB for non-torch memory, and 0.09 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=12920272896` to fit into requested memory, or `--kv-cache-memory=42762254336` to fully utilize gpu memory. Current kv cache memory in use is 13169833984 bytes.
INFO 10-27 20:35:27 [core.py:218] init engine (profile, create kv cache, warmup model) took 5.85 seconds
INFO 10-27 20:35:28 [llm.py:295] Supported_tasks: ('generate',)
INFO 10-27 20:35:28 [__init__.py:36] No IOProcessor plugins requested by the model


In [None]:
# NOTE(review): `weave` is imported here but not referenced in the visible cells;
# presumably it is wanted for import-time effects (tracing) — confirm, and
# consider moving it to the top import cell.
import weave
# Run the GRPO training loop configured in the previous cell.
trainer.train()

Step,Training Loss


INFO 10-27 20:36:19 [block_pool.py:292] Successfully reset prefix cache
INFO 10-27 20:36:55 [block_pool.py:292] Successfully reset prefix cache
INFO 10-27 20:37:25 [block_pool.py:292] Successfully reset prefix cache
INFO 10-27 20:37:57 [block_pool.py:292] Successfully reset prefix cache
INFO 10-27 20:38:27 [block_pool.py:292] Successfully reset prefix cache
INFO 10-27 20:38:54 [block_pool.py:292] Successfully reset prefix cache


In [None]:
# Keep a handle on the model held by the trainer (the trained policy once
# `trainer.train()` has completed).
model = trainer.model

In [None]:
# Reuse the trainer's processing class as the tokenizer for the model above.
tokenizer = trainer.processing_class