## Use this script to load from checkpoint and save the checkpoint to huggingface

In [None]:
from unsloth import FastLanguageModel

max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = (
    None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
)
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

checkpoint = "chriswhpang/Llama-3.2-1B-Instruct-OpenThought-SFT-GGUF/checkpoint-17500"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=checkpoint,  # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.




🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 02-17 21:39:56 __init__.py:190] Automatically detected platform cuda.
==((====))==  Unsloth 2025.2.5: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA A40. Max memory: 44.448 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.2.5 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [13]:
model = FastLanguageModel.get_peft_model(
    model,
    r=32,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

Unsloth: Already have LoRA adapters! We shall skip this step.


In [None]:
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

messages = [
    {
        "role": "user",
        "content": "If a box of apples costs $15 and contains 30 apples, what is the cost of each apple? Also, if I want to buy 7 apples, how much would I need to pay?",
    },
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add for generation
    return_tensors="pt",
).to("cuda")

from transformers import TextStreamer

text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=2048,
    use_cache=True,
    temperature=1.5,
    min_p=0.1,
)

In [11]:
import time

import torch


def load_model(model_path):
    """Load model and tokenizer"""

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=checkpoint,  # or choose "unsloth/Llama-3.2-1B-Instruct"
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )
    return model, tokenizer


def generate_response(model, tokenizer, prompt):
    """Generate response for a given prompt"""
    inputs = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    start_time = time.time()
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=2048,
            temperature=0.1,
            do_sample=True,
            use_cache=True,
        )
    end_time = time.time()

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generation_time = end_time - start_time
    return response, generation_time


def compare_models(base_path, finetuned_path, test_prompts):
    """Compare responses from base and finetuned models"""
    print("Loading models...")
    base_model, base_tokenizer = load_model(base_path)
    ft_model, ft_tokenizer = load_model(finetuned_path)

    FastLanguageModel.for_inference(base_model)
    FastLanguageModel.for_inference(ft_model)

    results = []
    for i, prompt in enumerate(test_prompts, 1):
        print(f"\nTest {i}/{len(test_prompts)}")
        print(f"Prompt: {prompt}\n")

        # Generate responses
        base_response, base_time = generate_response(base_model, base_tokenizer, prompt)
        ft_response, ft_time = generate_response(ft_model, ft_tokenizer, prompt)

        results.append(
            {
                "prompt": prompt,
                "base_response": base_response,
                "base_time": base_time,
                "ft_response": ft_response,
                "ft_time": ft_time,
            }
        )

        print(f"Base Model Response ({base_time:.2f}s):\n{base_response}\n")
        print(f"Finetuned Model Response ({ft_time:.2f}s):\n{ft_response}\n")
        print("-" * 80)

    return results


# Example usage
test_prompts = [
    "Janet works part-time at a grocery store. She works 4 hours a day, 3 days a week, and earns $12 per hour. How much does she earn in a month (4 weeks)?",
]

base_model_path = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
finetuned_model_path = (
    "chriswhpang/Llama-3.2-1B-Instruct-OpenThought-SFT-GGUF/checkpoint-17500"
)

results = compare_models(base_model_path, finetuned_model_path, test_prompts)

Loading models...
==((====))==  Unsloth 2025.2.5: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA A40. Max memory: 44.448 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


==((====))==  Unsloth 2025.2.5: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA A40. Max memory: 44.448 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

Test 1/1
Prompt: Janet works part-time at a grocery store. She works 4 hours a day, 3 days a week, and earns $12 per hour. How much does she earn in a month (4 weeks)?

Base Model Response (20.52s):
system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

Janet works part-time at a grocery store. She works 4 hours a day, 3 days a week, and earns $12 per hour. How much does she earn in a month (4 weeks)?assistant

To find out how much Janet earns in a month, I need to calculate her total earnings for 4 weeks. Let's bre

In [7]:
import os

from dotenv import load_dotenv

load_dotenv()

model.push_to_hub_gguf(
    "chriswhpang/Llama-3.2-1B-Instruct-OpenThought-SFT-GGUF",  # Change hf to your username!
    tokenizer,
    quantization_method=["not_quantized"],
    token=os.getenv("HF_TOKEN"),
)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 338.84 out of 503.51 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 16/16 [00:00<00:00, 119.61it/s]

Unsloth: Saving tokenizer...




 Done.
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['bf16'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at chriswhpang/Llama-3.2-1B-Instruct-OpenThought-SFT-GGUF into bf16 GGUF format.
The output location will be /workspace/grpo_demo/chriswhpang/Llama-3.2-1B-Instruct-OpenThought-SFT-GGUF/unsloth.BF16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: Llama-3.2-1B-Instruct-OpenThought-SFT-GGUF
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {32}
INFO:hf-to-gguf:gguf: loading model part 'model.safetensors'
I

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.BF16.gguf:   0%|          | 0.00/7.83M [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/chriswhpang/Llama-3.2-1B-Instruct-OpenThought-SFT-GGUF
