In [1]:
# Imports

import os

os.environ["TRANSFORMERS_CACHE"] = "/hpc/home/bfa6/work/llms/.cache"
os.environ["HF_HOME"] = "/hpc/home/bfa6/work/llms/.cache"

import time
import json

from unsloth import FastLanguageModel
import torch
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from dotenv import dotenv_values
from tqdm import tqdm
import asyncio
from tqdm.asyncio import tqdm_asyncio
from google import genai
import random
from datasets import Dataset
from trl import SFTTrainer, SFTConfig
from transformers import TextStreamer
import vllm
import nltk

config = dotenv_values("../.env")

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.




INFO 11-13 18:10:17 [__init__.py:216] Automatically detected platform cuda.
ü¶• Unsloth Zoo will now patch everything to make training faster!


# Load the model

In [2]:
# Load the base Qwen3-0.6B model through Unsloth, then wrap it with LoRA adapters
# on the attention and MLP projection layers.
max_seq_length = 2048 # Can increase for longer reasoning traces
lora_rank = 32 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-0.6B",
    cache_dir="/hpc/home/bfa6/work/llms/.cache",
    max_seq_length = max_seq_length,
    load_in_4bit = False, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.9, # Reduce if out of memory
    # NOTE(review): force_download presumably re-fetches the weights on every run
    # even when cache_dir already holds them - confirm this is intended.
    force_download=True
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = lora_rank*2, # *2 speeds up training
    use_gradient_checkpointing = "unsloth", # Reduces memory usage
    random_state = 3407, # Fixed seed passed to the adapter setup
)

INFO 11-13 18:10:35 [vllm_utils.py:700] Unsloth: Patching vLLM v1 graph capture
==((====))==  Unsloth 2025.11.2: Fast Qwen3 patching. Transformers: 4.56.2. vLLM: 0.11.0.
   \\   /|    NVIDIA RTX 5000 Ada Generation. Num GPUs = 1. Max memory: 31.473 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/Qwen3-0.6B with actual GPU utilization = 89.01%
Unsloth: Your GPU has CUDA compute capability 8.9 with VRAM = 31.47 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 320.
Unsloth: vLLM's KV Cache can use up to 27.03 GB. Also swap space = 6 GB.
Unsloth: Not an error, but `device` is not supported in vLLM. Skipping.
INFO 11-13 18:10:38 [utils.py:233] non-def

`torch_dtype` is deprecated! Use `dtype` instead!


INFO 11-13 18:10:38 [model.py:1510] Using max model len 2048
INFO 11-13 18:10:41 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=2048.
INFO 11-13 18:10:42 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='unsloth/Qwen3-0.6B', speculative_config=None, tokenizer='unsloth/Qwen3-0.6B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 11-13 18:10:46 [default_loader.py:267] Loading weights took 1.99 seconds
INFO 11-13 18:10:46 [punica_selector.py:19] Using PunicaWrapperGPU.
INFO 11-13 18:10:47 [gpu_model_runner.py:2653] Model loading took 1.1649 GiB and 3.078017 seconds
INFO 11-13 18:10:55 [backends.py:548] Using cache directory: /hpc/home/bfa6/.cache/vllm/torch_compile_cache/3f1f7925d5/rank_0_0/backbone for vLLM's torch.compile
INFO 11-13 18:10:55 [backends.py:559] Dynamo bytecode transform time: 6.81 s


Unsloth: Compiling kernels: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:01<00:00,  4.61it/s, triton_poi_fused_view_6]                             


INFO 11-13 18:11:03 [backends.py:197] Cache the graph for dynamic shape for later use


Unsloth: Compiling kernels: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11/11 [00:02<00:00,  5.37it/s, triton_poi_fused_view_10]                         
Unsloth: Compiling kernels: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11/11 [00:00<00:00, 136.36it/s, triton_poi_fused_view_10]                         
Unsloth: Compiling kernels: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11/11 [00:00<00:00, 124.98it/s, triton_poi_fused_view_10]                         
Unsloth: Compiling kernels: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11/11 [00:00<00:00, 126.76it/s, triton_poi_fused_view_10]                         
Unsloth: Compiling kernels: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11/11 [00:00<00:00, 120.81it/s, triton_poi_fused_view_10]                         
Unsloth: Compiling kernels: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11/11 [00:00<00:00, 125.33it/s, triton_poi_fused_view_10]                         
Unsloth: Compiling kernels: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11/11 [00:00<00:00, 128.57it/s, triton_poi_fused

INFO 11-13 18:11:43 [backends.py:218] Compiling a graph for dynamic shape takes 47.70 s





INFO 11-13 18:12:14 [monitor.py:34] torch.compile takes 54.51 s in total
INFO 11-13 18:12:16 [gpu_worker.py:298] Available KV cache memory: 25.08 GiB
INFO 11-13 18:12:16 [kv_cache_utils.py:1087] GPU KV cache size: 234,784 tokens
INFO 11-13 18:12:16 [kv_cache_utils.py:1091] Maximum concurrency for 2,048 tokens per request: 114.64x
INFO 11-13 18:12:16 [vllm_utils.py:705] Unsloth: Running patched vLLM v1 `capture_model`.


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 67/67 [00:18<00:00,  3.58it/s]
Capturing CUDA graphs (decode, FULL): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 43/43 [00:03<00:00, 11.09it/s]

INFO 11-13 18:12:39 [gpu_model_runner.py:3480] Graph capturing finished in 23 secs, took 0.98 GiB
INFO 11-13 18:12:39 [vllm_utils.py:712] Unsloth: Patched vLLM v1 graph capture finished in 23 secs.





INFO 11-13 18:12:40 [core.py:210] init engine (profile, create kv cache, warmup model) took 113.02 seconds
INFO 11-13 18:12:41 [llm.py:306] Supported_tasks: ('generate',)
Unsloth: Just some info: will skip parsing ['ffn_norm', 'norm1', 'post_layernorm', 'q_norm', 'attention_norm', 'norm2', 'k_norm', 'input_layernorm', 'post_attention_layernorm', 'pre_feedforward_layernorm', 'post_feedforward_layernorm', 'layer_norm2', 'norm', 'layer_norm1']
Performing substitution for additional_keys=set()
Unsloth: Just some info: will skip parsing ['ffn_norm', 'norm1', 'post_layernorm', 'q_norm', 'attention_norm', 'cross_attn_post_attention_layernorm', 'norm2', 'k_norm', 'input_layernorm', 'post_attention_layernorm', 'pre_feedforward_layernorm', 'post_feedforward_layernorm', 'layer_norm2', 'norm', 'layer_norm1', 'cross_attn_input_layernorm']


Unsloth 2025.11.2 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


# Create a chat template

In [3]:
# Instruction given to the model when it acts as the compressor.
# One instruction per line; the lines are joined with newlines (no trailing newline).
system_prompt = "\n".join([
    "You are given some context.",
    "Your goal is to compress the information in the context and output the compressed version.",
    "Use as few tokens as possible while keeping all information.",
    "Do NOT produce internal chain-of-thought or step-by-step reasoning.",
    "Start immediately with the compressed content (no extra preface).",
])
system_prompt

'You are given some context.\nYour goal is to compress the information in the context and output the compressed version.\nUse as few tokens as possible while keeping all information.\nDo NOT produce internal chain-of-thought or step-by-step reasoning.\nStart immediately with the compressed content (no extra preface).'

In [4]:
# The custom chat template below is disabled; this cell only displays the
# template that ships with the loaded tokenizer.
#
# chat_template = \
#     "{% if messages[0]['role'] == 'system' %}"\
#         "{{ messages[0]['content'] + eos_token }}"\
#         "{% set loop_messages = messages[1:] %}"\
#     "{% else %}"\
#         "{{ '{system_prompt}' + eos_token }}"\
#         "{% set loop_messages = messages %}"\
#     "{% endif %}"\
#     "{% for message in loop_messages %}"\
#         "{% if message['role'] == 'user' %}"\
#             "{{ '<|user|>\\n' + message['content'] + eos_token }}"\
#         "{% elif message['role'] == 'assistant' %}"\
#             "{{ '<|assistant|>\\n' + message['content'] + eos_token }}"\
#         "{% endif %}"\
#     "{% endfor %}"

# # Replace with our specific template:
# chat_template = chat_template\
#     .replace("'{system_prompt}'",   f"'{system_prompt}'")

# from unsloth.chat_templates import qwen3_template
# tokenizer.chat_template = qwen3_template

# Show the chat template currently attached to the tokenizer.
tokenizer.chat_template


'{%- if tools %}\n    {{- \'<|im_start|>system\\n\' }}\n    {%- if messages[0].role == \'system\' %}\n        {{- messages[0].content + \'\\n\\n\' }}\n    {%- endif %}\n    {{- "# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}\n    {%- for tool in tools %}\n        {{- "\\n" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}\n{%- else %}\n    {%- if messages[0].role == \'system\' %}\n        {{- \'<|im_start|>system\\n\' + messages[0].content + \'<|im_end|>\\n\' }}\n    {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for forward_message 

# Prepare dataset

In [5]:
# Load the text chunks from disk.
# The encoding is given explicitly: the chunks contain non-ASCII characters
# (e.g. typographic quotes), and without it `open` falls back to the locale's
# default encoding, which may not be UTF-8 on every machine.
with open("/hpc/home/bfa6/work/github/yapper/dataset/chunks.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)


len(dataset)

18914

In [6]:
# Split the chunks into train / eval / test partitions.
#
# A *copy* is shuffled rather than `dataset` itself, so this cell is idempotent:
# re-running it always starts from the same order and yields the same split.
# (Shuffling `dataset` in place would re-shuffle already-shuffled data on each re-run.)
shuffled_dataset = list(dataset)
random.Random(42).shuffle(shuffled_dataset)  # reproducible shuffle

n_total = len(shuffled_dataset)
n_train = 15_131
n_eval  = 1_891
n_test  = 1_892

# The partition sizes are hard-coded; fail loudly if the dataset size changes
# instead of silently dropping samples or producing short partitions.
assert n_train + n_eval + n_test == n_total, (
    f"split sizes sum to {n_train + n_eval + n_test}, but the dataset has {n_total} samples"
)

train_dataset = shuffled_dataset[:n_train]
eval_dataset  = shuffled_dataset[n_train:n_train + n_eval]
test_dataset  = shuffled_dataset[n_train + n_eval:n_train + n_eval + n_test]

print(len(train_dataset), len(eval_dataset), len(test_dataset))
# -> 15131 1891 1892


15131 1891 1892


In [7]:
# Single-sample conversation (system instruction + first training chunk) used to smoke-test the model.
test = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": train_dataset[0]["chunk"]},
]

In [8]:
# Display the first training sample to inspect its structure.
train_dataset[0]

{'chunk': '‚ÄúIt would not be easy,‚Äù Bill thought, feeling that she did not know\nmuch about the subjects of greatest interest to the ladies present;\nbut then, as she soon found, Miss Dawson did not either, and so wisely\nconfined herself to entertaining the men. Bill did not feel very\nhopeful of her own powers in that direction, and before she could\nmake any definite plans her thoughts were interrupted by Mr. Dane‚Äôs\nentrance into the drawing-room to which everyone had now returned.\nMr. Dane never joined these parties till after tea, on the excuse of\nparish-work. After the little disturbance created by his entrance had\nsubsided, and he had shaken hands with everybody, Bill found that he\nhad taken the chair next to her. She knew that he wanted to hear if\nshe had been to Wood Hall, and she was quite ready to tell him. It was\neasy enough to do this unnoticed in the buzz of general conversation;\nand accordingly she told him how she and Polly had driven to Wood\nHall, how Pol

In [9]:
# Test model
# Render the test conversation with the chat template (thinking disabled),
# then stream the model's compressed output for it.

text = tokenizer.apply_chat_template(
    test,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)
inputs = tokenizer(text, return_tensors="pt").to("cuda")

# Print tokens as they are generated, omitting the prompt itself.
streamer = TextStreamer(tokenizer, skip_prompt=True)
generation_kwargs = dict(
    temperature=0.7,
    top_p=0.8,
    top_k=20,
    min_p=0.0,
    max_new_tokens=2048,
)
outputs = model.generate(**inputs, **generation_kwargs, streamer=streamer)

"Mr. Dane did not mean it."<|im_end|>


In [10]:
# `outputs` starts with the prompt ids; slice them off so only the newly
# generated ids remain, then decode those into text.
prompt_length = inputs["input_ids"].shape[1]
output = outputs[:, prompt_length:]

output_text = tokenizer.decode(output[0], skip_special_tokens=True)
output_text

'"Mr. Dane did not mean it."'

In [11]:
# Tokenize the first training chunk and decode it back to text as a round-trip check.
first_chunk = train_dataset[0]["chunk"]
chunk_tokens = tokenizer(first_chunk)

tokenizer.decode(chunk_tokens["input_ids"], skip_special_tokens=True)

'‚ÄúIt would not be easy,‚Äù Bill thought, feeling that she did not know\nmuch about the subjects of greatest interest to the ladies present;\nbut then, as she soon found, Miss Dawson did not either, and so wisely\nconfined herself to entertaining the men. Bill did not feel very\nhopeful of her own powers in that direction, and before she could\nmake any definite plans her thoughts were interrupted by Mr. Dane‚Äôs\nentrance into the drawing-room to which everyone had now returned.\nMr. Dane never joined these parties till after tea, on the excuse of\nparish-work. After the little disturbance created by his entrance had\nsubsided, and he had shaken hands with everybody, Bill found that he\nhad taken the chair next to her. She knew that he wanted to hear if\nshe had been to Wood Hall, and she was quite ready to tell him. It was\neasy enough to do this unnoticed in the buzz of general conversation;\nand accordingly she told him how she and Polly had driven to Wood\nHall, how Polly had wai

In [12]:
# Compare the size of the input chunk with the size of the generated compression.
n_input_tokens = len(chunk_tokens["input_ids"])
# One token is subtracted from the generated length - presumably the trailing
# end-of-turn token; TODO confirm.
n_output_tokens = len(output[0]) - 1
print(f"The number of tokens in the input was {n_input_tokens}")
print(f"The number of tokens in the output was {n_output_tokens}")

The number of tokens in the input was 556
The number of tokens in the output was 9


In [13]:
# Train Dataset
# Each raw chunk becomes a two-message conversation: the compression
# instruction as the system turn and the chunk itself as the user turn.
trainset = Dataset.from_list(
    [{"prompt": sample["chunk"]} for sample in train_dataset]
).map(
    lambda row: {
        "prompt": [
            dict(role="system", content=system_prompt),
            dict(role="user", content=row["prompt"]),
        ],
    }
)

Map:   0%|          | 0/15131 [00:00<?, ? examples/s]

In [14]:
# Eval Dataset
# Built the same way as the training set: system instruction + chunk as the user turn.
eval_rows = [{"prompt": sample["chunk"]} for sample in eval_dataset]
evalset = Dataset.from_list(eval_rows).map(
    lambda row: {
        "prompt": [
            dict(role="system", content=system_prompt),
            dict(role="user", content=row["prompt"]),
        ],
    }
)

Map:   0%|          | 0/1891 [00:00<?, ? examples/s]

# GRPO

## Reconstruction

In [15]:
# Instruction given to the model when it acts as the decoder, i.e. when it has
# to expand a compressed text back into the original content.
# The last line uses a real em dash (written as an escape so the source stays
# ASCII); the previous literal contained a mis-encoded byte sequence in its place.
decoding_system_prompt = (
    "You are given compressed context created by another model.\n"
    "Your goal is to accurately reconstruct the original uncompressed content.\n"
    "Expand all abbreviated, shortened, or implied information back to its full form.\n"
    "Ensure that no information is lost or altered from the original meaning.\n"
    "Do NOT include any reasoning or commentary \u2014 only output the reconstructed content."
)

In [16]:
def recontruct_input(output):
    """Ask the model to expand a compressed text back into full content.

    The compressed text is sent as the user turn of a conversation whose system
    turn is ``decoding_system_prompt``. Generation uses the module-level
    ``model`` and ``tokenizer`` with thinking disabled.

    Args:
        output: Compressed text to reconstruct.

    Returns:
        The decoded text of the newly generated tokens only (prompt tokens are
        removed and special tokens are skipped).
    """
    chat = [
        dict(role="system", content=decoding_system_prompt),
        dict(role="user", content=output),
    ]
    prompt_text = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    encoded = tokenizer(prompt_text, return_tensors="pt").to("cuda")

    sampling = dict(
        temperature=0.7,
        top_p=0.8,
        top_k=20,
        min_p=0.0,
        max_new_tokens=2048,
    )
    generated = model.generate(**encoded, **sampling)

    # The returned sequence begins with the prompt ids; keep only what follows them.
    prompt_length = encoded["input_ids"].shape[1]
    new_token_ids = generated[0, prompt_length:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True)






## Reward formulation

In [17]:
def get_num_tokens(text: str):
    """Return, as a float, how many token ids the module-level tokenizer produces for ``text``."""
    token_ids = tokenizer(text)["input_ids"]
    return float(len(token_ids))



def get_bleu_score(original, reconstructed):
    """Score how closely ``reconstructed`` matches ``original`` using sentence-level BLEU.

    Both texts are split on whitespace; ``original`` is the single reference and
    ``reconstructed`` is the hypothesis.

    Args:
        original: Reference text.
        reconstructed: Text produced by the reconstruction step.

    Returns:
        The BLEU score shifted down by one (``bleu - 1``), so a perfect BLEU of 1
        yields 0 and lower BLEU values yield negative numbers.
    """
    reference_tokens = original.split()
    hypothesis_tokens = reconstructed.split()

    bleu = nltk.translate.bleu_score.sentence_bleu([reference_tokens], hypothesis_tokens)
    return bleu - 1

def get_length_reward(original, compressed):
    """Reward shorter compressions: the fraction of the original's tokens that was saved.

    Args:
        original: The uncompressed text.
        compressed: The model's compressed version of ``original``.

    Returns:
        ``(len_original - len_compressed) / len_original`` measured in tokens.
        The value is 1.0 for an empty compression, 0.0 when nothing was saved,
        and negative when the "compressed" text is longer than the original.
        If ``original`` tokenizes to zero tokens there is nothing to compress,
        so 0.0 is returned instead of dividing by zero.
    """
    len_original = get_num_tokens(original)
    if len_original == 0:
        # Guard: an empty original would otherwise raise ZeroDivisionError and
        # abort the surrounding training loop.
        return 0.0

    len_compressed = get_num_tokens(compressed)
    return (len_original - len_compressed) / len_original
    

def calculate_rewards(prompts, completions, alpha: float = 0.8, **kwargs):
    """Reward function for GRPO: combine reconstruction quality with compression ratio.

    For every (prompt, completion) pair the completion is treated as a
    compressed version of the prompt's last message. The compression is fed
    back through ``recontruct_input`` and the reconstruction is compared with
    the original text.

    Each completion is scored against *its own* prompt. Using only
    ``prompts[0]`` would score completions against the wrong text whenever the
    batch contains more than one distinct prompt.

    Args:
        prompts: One conversation (list of message dicts) per completion; the
            text to compress is the ``"content"`` of the last message.
        completions: One entry per prompt; the compressed text is read from
            ``completion[0]["content"]``.
        alpha: Weight of the BLEU term; the length term gets ``1 - alpha``.
        **kwargs: Accepted and ignored.

    Returns:
        A list with one float reward per completion, in input order.

    Raises:
        ValueError: If ``prompts`` and ``completions`` differ in length.
    """
    if len(prompts) != len(completions):
        raise ValueError(
            f"expected one prompt per completion, got {len(prompts)} prompts "
            f"and {len(completions)} completions"
        )

    rewards = []

    for prompt, completion in zip(prompts, completions):
        chunk = prompt[-1]["content"]
        response = completion[0]["content"]

        # Compression term: share of the original's tokens that was saved.
        r_len = get_length_reward(chunk, response)

        # Fidelity term: expand the compression again and compare with the original.
        reconstructed = recontruct_input(response)
        r_bleu = get_bleu_score(chunk, reconstructed)

        rewards.append(alpha * r_bleu + (1 - alpha) * r_len)

    return rewards



## Train the model

In [18]:
# Token budget: max_prompt_length is passed to the trainer as the prompt limit;
# whatever remains of max_seq_length is used as the completion limit.
max_prompt_length = 1024
max_completion_length = max_seq_length - max_prompt_length

# NOTE(review): this import (and the trl one below) would be easier to find in the imports cell.
from vllm import SamplingParams
# Sampling settings for the vLLM-backed generation; generation stops at the
# tokenizer's EOS token, which is kept in the returned text.
vllm_sampling_params = SamplingParams(
    min_p = 0,
    top_p = 0.8,
    top_k = 20,
    seed = 3407,
    stop = [tokenizer.eos_token],
    include_stop_str_in_output = True,
)

from trl import GRPOConfig, GRPOTrainer
# GRPO hyper-parameters: optimisation, batching, generation limits, checkpointing and evaluation.
training_args = GRPOConfig(
    vllm_sampling_params = vllm_sampling_params,
    temperature = 0.7,
    learning_rate = 5e-6,
    weight_decay = 0.001,
    warmup_ratio = 0.1,
    lr_scheduler_type = "linear",
    optim = "adamw_8bit",
    logging_steps = 1,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4, # Raise for smoother training
    num_generations = 4, # Decrease if out of memory
    max_prompt_length = max_prompt_length,
    max_completion_length = max_completion_length,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 1500,
    save_steps = 300,
    report_to = "none", # Can use Weights & Biases
    output_dir = "/hpc/home/bfa6/work/github/yapper/results/test",

    # For optional training + evaluation
    # NOTE(review): fp16_full_eval is set although the load log reports bfloat16 - confirm intended.
    fp16_full_eval = True,
    per_device_eval_batch_size = 4,
    eval_accumulation_steps = 1,
    eval_strategy = "steps",
    eval_steps = 300,
)

In [None]:
# Build the GRPO trainer around the LoRA-wrapped model and start training.
# `calculate_rewards` is the only reward function.
# (Leftover from the template this cell was adapted from:)
# new_dataset = dataset.train_test_split(test_size = 0.01)

trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        calculate_rewards
    ],
    args = training_args,
    # train_dataset = dataset,

    # Training with periodic evaluation: chat-formatted train and eval sets.
    train_dataset = trainset,
    eval_dataset = evalset,
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 15,131 | Num Epochs = 1 | Total steps = 1,500
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 20,185,088 of 616,235,008 (3.28% trained)


