## Use this script to load from checkpoint and save the checkpoint to huggingface

In [None]:
from unsloth import FastLanguageModel

max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = (
    None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
)
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

checkpoint = "chriswhpang/Llama-3.2-1B-Instruct-OpenThought-SFT-GGUF/checkpoint-17500"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=checkpoint,  # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.




ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
INFO 02-17 21:39:56 __init__.py:190] Automatically detected platform cuda.
==((====))==  Unsloth 2025.2.5: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA A40. Max memory: 44.448 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.2.5 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [13]:
model = FastLanguageModel.get_peft_model(
    model,
    r=32,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

Unsloth: Already have LoRA adapters! We shall skip this step.


In [None]:
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

messages = [
    {
        "role": "user",
        "content": "If a box of apples costs $15 and contains 30 apples, what is the cost of each apple? Also, if I want to buy 7 apples, how much would I need to pay?",
    },
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add for generation
    return_tensors="pt",
).to("cuda")

from transformers import TextStreamer

text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=2048,
    use_cache=True,
    temperature=1.5,
    min_p=0.1,
)

In [11]:
import time

import torch


def load_model(model_path):
    """Load model and tokenizer"""

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=checkpoint,  # or choose "unsloth/Llama-3.2-1B-Instruct"
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )
    return model, tokenizer


def generate_response(model, tokenizer, prompt):
    """Generate response for a given prompt"""
    inputs = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    start_time = time.time()
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=2048,
            temperature=0.1,
            do_sample=True,
            use_cache=True,
        )
    end_time = time.time()

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generation_time = end_time - start_time
    return response, generation_time


def compare_models(base_path, finetuned_path, test_prompts):
    """Compare responses from base and finetuned models"""
    print("Loading models...")
    base_model, base_tokenizer = load_model(base_path)
    ft_model, ft_tokenizer = load_model(finetuned_path)

    FastLanguageModel.for_inference(base_model)
    FastLanguageModel.for_inference(ft_model)

    results = []
    for i, prompt in enumerate(test_prompts, 1):
        print(f"\nTest {i}/{len(test_prompts)}")
        print(f"Prompt: {prompt}\n")

        # Generate responses
        base_response, base_time = generate_response(base_model, base_tokenizer, prompt)
        ft_response, ft_time = generate_response(ft_model, ft_tokenizer, prompt)

        results.append(
            {
                "prompt": prompt,
                "base_response": base_response,
                "base_time": base_time,
                "ft_response": ft_response,
                "ft_time": ft_time,
            }
        )

        print(f"Base Model Response ({base_time:.2f}s):\n{base_response}\n")
        print(f"Finetuned Model Response ({ft_time:.2f}s):\n{ft_response}\n")
        print("-" * 80)

    return results


# Example usage
test_prompts = [
    "Janet works part-time at a grocery store. She works 4 hours a day, 3 days a week, and earns $12 per hour. How much does she earn in a month (4 weeks)?",
]

base_model_path = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
finetuned_model_path = (
    "chriswhpang/Llama-3.2-1B-Instruct-OpenThought-SFT-GGUF/checkpoint-17500"
)

results = compare_models(base_model_path, finetuned_model_path, test_prompts)

Loading models...
==((====))==  Unsloth 2025.2.5: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA A40. Max memory: 44.448 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


==((====))==  Unsloth 2025.2.5: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA A40. Max memory: 44.448 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

Test 1/1
Prompt: Janet works part-time at a grocery store. She works 4 hours a day, 3 days a week, and earns $12 per hour. How much does she earn in a month (4 weeks)?

Base Model Response (20.52s):
system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

Janet works part-time at a grocery store. She works 4 hours a day, 3 days a week, and earns $12 per hour. How much does she earn in a month (4 weeks)?assistant

To find out how much Janet earns in a month, I need to calculate her total earnings for 4 weeks. Let's bre

In [7]:
import os

from dotenv import load_dotenv

load_dotenv()

model.push_to_hub_gguf(
    "chriswhpang/Llama-3.2-1B-Instruct-OpenThought-SFT-GGUF",  # Change hf to your username!
    tokenizer,
    quantization_method=["not_quantized"],
    token=os.getenv("HF_TOKEN"),
)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 338.84 out of 503.51 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16/16 [00:00<00:00, 119.61it/s]

Unsloth: Saving tokenizer...




 Done.
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['bf16'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at chriswhpang/Llama-3.2-1B-Instruct-OpenThought-SFT-GGUF into bf16 GGUF format.
The output location will be /workspace/grpo_demo/chriswhpang/Llama-3.2-1B-Instruct-OpenThought-SFT-GGUF/unsloth.BF16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: Llama-3.2-1B-Instruct-OpenThought-SFT-GGUF
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {32}
INFO:hf-to-gguf:gguf: loading model part 'model.safetensors'
I

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.BF16.gguf:   0%|          | 0.00/7.83M [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/chriswhpang/Llama-3.2-1B-Instruct-OpenThought-SFT-GGUF


## GRPO

In [18]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048  # Can increase for longer reasoning traces
lora_rank = 64  # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="outputs/2025-02-17/23-27-47/Llama-3.2-1B-Instruct-OpenThought-SFT-GRPO-GGUF",
    max_seq_length=max_seq_length,
    load_in_4bit=True,  # False for LoRA 16bit
    fast_inference=True,  # Enable vLLM fast inference
    max_lora_rank=lora_rank,
    gpu_memory_utilization=0.4,  # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],  # Remove QKVO if out of memory
    lora_alpha=lora_rank,
    use_gradient_checkpointing="unsloth",  # Enable long context finetuning
    random_state=3407,
)

==((====))==  Unsloth 2025.2.5: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA A40. Max memory: 44.448 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: vLLM loading outputs/2025-02-17/23-27-47/Llama-3.2-1B-Instruct-OpenThought-SFT-GRPO-GGUF with actual GPU utilization = 24.25%
Unsloth: Your GPU has CUDA compute capability 8.6 with VRAM = 44.45 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 224.
Unsloth: vLLM's KV Cache can use up to 8.39 GB. Also swap space = 6 GB.
INFO 02-19 00:15:30 config.py:542] This model supports multiple tasks: {'generate', 'classify', 'reward', 'score', 'embed'}. Defaulting to 'generate'.
INFO 02-19 00:15:30 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.2) with config: model='outputs/2025-02-17/23-27-47/Llama-3.2-1B-Instruct-OpenThought-SFT-GRPO-GGUF', speculative_config=None, tokenizer='outputs/2025-02-17/23-27-47/Llama-3.2-1B-Instruct-OpenThought-SFT-GRPO-GGUF', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 02-19 00:15:39 model_runner.py:1115] Loading model weights took 2.3048 GB
INFO 02-19 00:15:40 worker.py:267] Memory profiling takes 0.62 seconds
INFO 02-19 00:15:40 worker.py:267] the current vLLM instance can use total_gpu_memory (44.45GiB) x gpu_memory_utilization (0.24) = 10.78GiB
INFO 02-19 00:15:40 worker.py:267] model weights take 2.30GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 1.03GiB; the rest of the memory reserved for KV Cache is 7.44GiB.
INFO 02-19 00:15:41 executor_base.py:110] # CUDA blocks: 15237, # CPU blocks: 12288
INFO 02-19 00:15:41 executor_base.py:115] Maximum concurrency for 2048 tokens per request: 119.04x
INFO 02-19 00:15:45 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_u

Capturing CUDA graph shapes: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 31/31 [00:28<00:00,  1.11it/s]

INFO 02-19 00:16:13 model_runner.py:1562] Graph capturing finished in 28 secs, took 0.20 GiB
INFO 02-19 00:16:13 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 34.22 seconds





In [19]:
from unsloth.chat_templates import get_chat_template

SYSTEM_PROMPT = """
Respond in the following format:
<|begin_of_thought|>
...
<|end_of_thought|>
<|begin_of_solution|>
...
<|end_of_solution|>
  """

text = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": "Calculate pi."},
    ],
    tokenize=False,
    add_generation_prompt=True,
)

from vllm import SamplingParams

sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=8192,
)
output = (
    model.fast_generate(
        [text],
        sampling_params=sampling_params,
        lora_request=None,
    )[0]
    .outputs[0]
    .text
)


print(output)

Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:07<00:00,  7.55s/it, est. speed input: 10.34 toks/s, output: 151.79 toks/s]

<|begin_of_thought|>

Okay, so I need to calculate pi, right? Pi is the ratio of a circle's circumference to its diameter. Let me remember how to do that. Hmm. I think the formula is pi equals the circumference divided by the diameter. So, circumference is around a circle, which is 2Ï€r, and diameter is just 2r. So pi is 2Ï€ divided by 2r. That would be Ï€ = 2Ï€rÂ². Wait, let me check that again. If the radius is r, then diameter is 2r. So circumference is 2Ï€r, right? So Ï€ = 2Ï€rÂ². 

But maybe I should make sure I understand the units. Circumference is in units of length, and diameter is in units of length as well. Since pi is a ratio, it's usually expressed in terms of mathematical terms, not just numbers. So Ï€ should be calculated as the ratio of circumference to diameter. Let me verify with a simple example. If the diameter is 5 cm, then the radius is 2.5 cm. So the circumference is 2Ï€ * 2.5 = 5Ï€ cm. Then dividing by the diameter, which is 5 cm, gives 5Ï€/5 = Ï€ cm. That works




In [16]:
text = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": "What is 5 + 3 - 1?"},
    ],
    tokenize=False,
    add_generation_prompt=True,
)

from vllm import SamplingParams

sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=8192,
)
output = (
    model.fast_generate(
        text,
        sampling_params=sampling_params,
        lora_request=model.load_lora(
            "chriswhpang/Llama-3.2-1B-Instruct-OpenThought-SFT-GRPO-GGUF"
        ),
    )[0]
    .outputs[0]
    .text
)

print(output)

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:03<00:00,  3.34s/it, est. speed input: 25.77 toks/s, output: 147.13 toks/s]

<|begin_of_thought|>

Okay, let's see. I need to figure out what 5 plus 3 minus 1 is. Hmm, let me break this down step by step. So, the problem is 5 plus 3 minus 1. Let me write it out: 5 + 3 - 1. 

First, I should probably add 5 and 3 first, then subtract 1. Let me add them first. 5 plus 3. That's straightforward, right? 5 plus 3 equals 8. So now the equation is 8 - 1. Then I need to subtract 1 from 8. Subtracting 1 from 8. Hmm, how do I do that? Let me think. 8 minus 1 is 7, so the answer should be 7. 

Wait, let me double-check to make sure I didn't make a mistake. If I add 5 and 3 first, that's 8. Then subtract 1 from 8, that's 7. Yeah, that seems right. I don't see any other way to approach this. Maybe breaking it down into smaller parts? Like, 5 + 3 would be 8, then 8 - 1 is 7. Yeah, that still gives the same result. 

I guess another way to think about it is to use order of operations, like PEMDAS. P is parentheses, E is exponents, M is multiplication and division, and D is addi




In [14]:
import os

base_model_path = "/workspace/grpo_demo/chriswhpang/Llama-3.2-1B-Instruct-OpenThought-SFT-GRPO-GGUF/unsloth.BF16.gguf"

if not os.path.exists(base_model_path):
    raise FileNotFoundError(f"Model file not found: {base_model_path}")

file_size = os.path.getsize(base_model_path)
print(f"Model file size: {file_size / (1024 * 1024):.2f} MB")

Model file size: 7.47 MB


In [26]:
import os
from dotenv import load_dotenv

load_dotenv()

model.push_to_hub_merged(
    "chriswhpang/Llama-3.2-1B-Instruct-OpenThought-SFT-GRPO-16bit",
    tokenizer,
    save_method="merged_16bit",
    token=os.getenv("HF_TOKEN"),
)

Unsloth: You are pushing to hub, but you passed your HF username = chriswhpang.
We shall truncate chriswhpang/Llama-3.2-1B-Instruct-OpenThought-SFT-GRPO-16bit to Llama-3.2-1B-Instruct-OpenThought-SFT-GRPO-16bit


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 270.14 out of 503.51 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 16/16 [00:00<00:00, 126.02it/s]


Unsloth: Saving tokenizer...

  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

 Done.


  0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/chriswhpang/Llama-3.2-1B-Instruct-OpenThought-SFT-GRPO-16bit
