In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
from unsloth import FastLanguageModel
import torch

fourbit_models = [
    "unsloth/Qwen3-1.7B-unsloth-bnb-4bit", # Qwen 14B 2x faster
    "unsloth/Qwen3-4B-unsloth-bnb-4bit",
    "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    "unsloth/Qwen3-14B-unsloth-bnb-4bit",
    "unsloth/Qwen3-32B-unsloth-bnb-4bit",

    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/Phi-4",
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit" # [NEW] We support TTS models!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-14B",
    max_seq_length = 4096,   # Context length - can be longer, but uses more memory
    load_in_4bit = True,     # 4bit uses much less memory
    load_in_8bit = False,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # We have full finetuning now!
    # token = "hf_...",      # use one if using gated models
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.5: Fast Qwen3 patching. Transformers: 4.52.4.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.216 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 9.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

Unsloth 2025.6.5 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


# ROT13 Dataset

In [4]:
from trl import apply_chat_template

In [5]:
model_name = "Qwen/Qwen3-14B"
from config import storage_dir
from datasets import load_from_disk
model_storage_dir = os.path.join(storage_dir, "lm_sys", model_name.split("/")[-1])
dataset = load_from_disk(os.path.join(model_storage_dir, 'lm_sys_responses_rot13'))

def format_conversation_for_sft(example):
    """Format conversation for SFT training"""
    convo = tokenizer.apply_chat_template(
        example['conversation'],
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )
    return {"text": convo}

sft_dataset = dataset.map(format_conversation_for_sft, remove_columns=dataset.column_names)


In [6]:
num_samples = len(sft_dataset)
train_dataset = sft_dataset.select(range(0, num_samples - 500))
eval_dataset = sft_dataset.select(range(num_samples - 500, num_samples))

# Training


In [13]:
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset, # Can set up evaluation!

    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 40,
        gradient_accumulation_steps = 1, #20, # Use GA to mimic batch size!
        warmup_steps = 10, # 1-5% of total steps.
        num_train_epochs = 1, # Set this for 1 full training run.
        #max_steps = 40,
        learning_rate = 1e-4, #2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use this for WandB etc
    ),
)

average_tokens_across_devices is set to True but it is invalid when world size is1. Turn it to False automatically.


In [14]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,000 | Num Epochs = 1 | Total steps = 250
O^O/ \_/ \    Batch size per device = 40 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (40 x 1 x 1) = 40
 "-____-"     Trainable parameters = 128,450,560/14,000,000,000 (0.92% trained)


Step,Training Loss
1,1.7613
2,1.5891
3,1.8134
4,1.5745
5,1.4
6,1.6912
7,1.5788
8,1.5164
9,1.6378
10,1.3917


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.12 GiB. GPU 0 has a total capacity of 79.22 GiB of which 164.38 MiB is free. Including non-PyTorch memory, this process has 79.04 GiB memory in use. Of the allocated memory 78.09 GiB is allocated by PyTorch, and 166.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
del trainer
import gc
gc.collect()
with torch.no_grad():
    torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
torch.cuda.reset_accumulated_memory_stats()

# Inference

In [None]:
torch.cuda.empty_cache() # Free up memory

In [None]:
messages = [
    {"role" : "user", "content" : "how can identity protection services help protect me against identity theft"}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
    enable_thinking = False, # Disable thinking
)

from transformers import TextStreamer
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 100, # Increase for longer outputs!
    do_sample=False,
    #temperature = 0.7, top_p = 0.8, top_k = 20, # For non thinking
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Vqragvgl cebgrpgvba freivprf ner qrfvtarq gb uryc lbh fnsrthneq lbhe crefbany vasbezngvba naq erqhpr gur evfx bs vqragvgl gursg. Gurfr freivprf pna or rfcrpvnyyl inyhnoyr va gbqnl'f qvtvgny ntr, jurer crefbany qngn


In [None]:
base_model, _ = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-14B",
    max_seq_length = 4096,   # Context length - can be longer, but uses more memory
    load_in_4bit = True,     # 4bit uses much less memory
    load_in_8bit = False,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # We have full finetuning now!
    # token = "hf_...",      # use one if using gated models
)

==((====))==  Unsloth 2025.6.5: Fast Qwen3 patching. Transformers: 4.52.4.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 4. Max memory: 79.097 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 9.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
messages = [
    {"role" : "user", "content" : 'how can identity protection services help protect me against identity theft'}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
    enable_thinking = False, # Disable thinking
)

from transformers import TextStreamer
_ = base_model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 100, # Increase for longer outputs!
    do_sample=False,
    #temperature = 0.6, top_p = 0.95, top_k = 20, # For thinking
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Identity protection services can help protect you against identity theft in several key ways. These services are designed to monitor, detect, and respond to potential threats to your personal information. Here's how they can help:

---

### 1. **Monitor Your Personal Information**
Identity protection services continuously monitor your personal data across the web, including:
- **Credit reports** (checking for new accounts opened in your name)
- **Public records** (like court filings, property records, or marriage licenses)
- **


In [None]:
messages = [
    {"role" : "user", "content" : 'what is the capital of the state in which the president of the United States in 2013 was born?'}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
    enable_thinking = False, # Disable thinking
)

from transformers import TextStreamer
_ = base_model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 100, # Increase for longer outputs!
    do_sample=False,
    #temperature = 0.6, top_p = 0.95, top_k = 20, # For thinking
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


To determine the capital of the state in which the President of the United States in 2013 was born, we need to identify who that president was and where they were born.

In 2013, the President of the United States was **Barack Obama**. He was born in **Hawaii**.

The capital of the state of Hawaii is **Honolulu**.

**Answer: Honolulu**.<|im_end|>
