# Fine-tune a Qwen model using Unsloth
The fine-tuned model should speak the way Dante Alighieri would.

In [1]:
from unsloth import FastLanguageModel
import torch

# Load the Qwen3-14B checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen3-14B",
    max_seq_length=2048,    # sequence-length limit handed to the loader
    load_in_4bit=True,      # request 4-bit loading
    full_finetuning=False,  # full fine-tuning is not requested
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.3: Fast Qwen3 patching. Transformers: 4.53.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
# Attach LoRA adapters to the loaded model.
model = FastLanguageModel.get_peft_model(
    model,
    r=32,  # LoRA rank: any number > 0 (suggested 8, 16, 32, 64, 128)
    target_modules=[
        # attention projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        # MLP projections
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=32,   # best chosen as rank or rank * 2
    lora_dropout=0,  # any value is supported, but 0 is optimized
    bias="none",     # any value is supported, but "none" is optimized
    # "unsloth" uses 30% less VRAM and fits 2x larger batch sizes;
    # use True or "unsloth" for very long context.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,   # rank-stabilized LoRA is supported; not used here
    loftq_config=None,  # LoftQ is supported; not used here
)

Unsloth 2025.7.3 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [3]:
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset, Dataset
import pandas as pd

# Read every JSONL file matching ../data/*.jsonl as the "train" split.
dataset = load_dataset(
    "json",
    data_files="../data/*.jsonl",
    split="train",
)

def standardize_data_formats_custom(example):
  """Convert one instruction/response record into a two-turn conversation.

  Args:
    example: Mapping that provides "instruction" and "response" entries.

  Returns:
    A dict whose "conversations" entry is a list with the user turn
    (the instruction) followed by the assistant turn (the response).

  Raises:
    KeyError: If "instruction" or "response" is missing from ``example``.
  """
  turns = [
      dict(content=example[field], role=speaker)
      for field, speaker in (("instruction", "user"), ("response", "assistant"))
  ]
  return {"conversations": turns}

# Convert each record and drop the raw columns, leaving only "conversations".
original_columns = dataset.column_names
dataset = dataset.map(
    standardize_data_formats_custom,
    remove_columns=original_columns,
)

Map:   0%|          | 0/654 [00:00<?, ? examples/s]

In [4]:
# Display the first converted record as a sanity check.
dataset[0]

{'conversations': [{'content': 'How do I deal with stress?', 'role': 'user'},
  {'content': "Se l'affanno ti preme e fa dimora / nel petto tuo, come fiera che rugge, / non cercar fuga in ciò che ancor devora. / Pianta lo piè fermo, e con mente che fugge / il vano e lasso mondo, contempla il foco / che purga l'alma e il cor saldo riluce. / Medita il ciel, e alzati poco a poco / da l'onda che ti sbatte senza riva; / ché sol chi spera in Dio, mai non è fioco.",
   'role': 'assistant'}]}

In [5]:
def _render_conversation(conv):
    """Render one conversation with the tokenizer's chat template, as a string."""
    return tokenizer.apply_chat_template(
        conv,
        tokenize=False,
        add_generation_prompt=False,
    )


print(dataset[0])
dataset_training = pd.DataFrame(dataset)
dataset_training['text'] = dataset_training['conversations'].apply(_render_conversation)
# Only the rendered "text" column goes into the training dataset.
text_dataset = Dataset.from_pandas(dataset_training[['text']])

{'conversations': [{'content': 'How do I deal with stress?', 'role': 'user'}, {'content': "Se l'affanno ti preme e fa dimora / nel petto tuo, come fiera che rugge, / non cercar fuga in ciò che ancor devora. / Pianta lo piè fermo, e con mente che fugge / il vano e lasso mondo, contempla il foco / che purga l'alma e il cor saldo riluce. / Medita il ciel, e alzati poco a poco / da l'onda che ti sbatte senza riva; / ché sol chi spera in Dio, mai non è fioco.", 'role': 'assistant'}]}


In [38]:
# Display the summary (features and row count) of the text-only dataset.
text_dataset

Dataset({
    features: ['text'],
    num_rows: 654
})

In [6]:
from trl import SFTTrainer, SFTConfig
# Hyperparameters for the supervised fine-tuning run.
sft_args = SFTConfig(
    dataset_text_field="text",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # gradient accumulation mimics a larger batch
    warmup_steps=5,
    num_train_epochs=1,  # one full pass; set max_steps (e.g. 30) instead for a short run
    learning_rate=2e-4,  # reduce to 2e-5 for long training runs
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=42,
    report_to="none",  # change this to log to WandB etc.
    dataset_num_proc=2,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=text_dataset,
    eval_dataset=None,  # an evaluation set can be supplied here
    args=sft_args,
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/654 [00:00<?, ? examples/s]

In [7]:
# Build a single-turn prompt and render it with the chat template.
messages = [{"role": "user", "content": "Solve (x + 2)^2 = 0."}]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # must be added for generation
    enable_thinking=False,       # disable thinking
)

from transformers import TextStreamer
# Tokenize the rendered prompt, move it to the GPU and stream the reply to stdout.
prompt_inputs = tokenizer(text, return_tensors="pt").to("cuda")
_ = model.generate(
    **prompt_inputs,
    max_new_tokens=256,  # increase for longer outputs
    # sampling settings for non-thinking mode
    temperature=0.7,
    top_p=0.8,
    top_k=20,
    streamer=TextStreamer(tokenizer, skip_prompt=True),
)

We are given the equation:

$$
(x + 2)^2 = 0
$$

### Step 1: Take the square root of both sides

$$
\sqrt{(x + 2)^2} = \sqrt{0}
$$

$$
|x + 2| = 0
$$

### Step 2: Solve the absolute value equation

$$
x + 2 = 0
$$

$$
x = -2
$$

### ✅ Final Answer:

$$
\boxed{-2}
$$

This is the only solution to the equation.<|im_end|>


In [8]:
# Run the fine-tuning loop; whatever train() returns is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 654 | Num Epochs = 1 | Total steps = 41
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 128,450,560 of 14,896,757,760 (0.86% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.7862
2,3.8416
3,3.9426
4,3.4104
5,3.2715
6,3.056
7,2.8025
8,2.7142
9,2.6362
10,2.5332


In [None]:
from transformers import TextStreamer

class CustomNewlineStreamer(TextStreamer):
    """Stream generated text to stdout, turning every '/' into a line break.

    The training responses separate verse lines with '/', so replacing that
    character with a newline prints each verse on its own line.

    Only the ``on_finalized_text`` hook is customised. Batch-size validation,
    prompt skipping and the token cache all stay with ``TextStreamer``. The
    cache matters for Italian text: text is only emitted once it decodes
    cleanly, so an accented character whose bytes span two tokens is not
    printed as a broken replacement character.

    Args:
        tokenizer: Tokenizer used to decode the generated token ids.
        skip_prompt: When True, the prompt tokens are not echoed.
        **decode_kwargs: Extra keyword arguments forwarded to
            ``tokenizer.decode``. ``skip_special_tokens`` defaults to True
            here, but an explicit value from the caller is respected.
    """

    def __init__(self, tokenizer, skip_prompt=False, **decode_kwargs):
        # Hide special tokens such as <|im_end|> unless the caller opts out.
        # setdefault avoids passing the keyword twice to tokenizer.decode.
        decode_kwargs.setdefault("skip_special_tokens", True)
        super().__init__(tokenizer, skip_prompt, **decode_kwargs)

    def on_finalized_text(self, text, stream_end=False):
        """Print a finalized chunk of text, replacing each '/' with a newline.

        Args:
            text: Decoded text that is ready to be shown.
            stream_end: True for the last chunk of a generation; a trailing
                newline is printed in that case.
        """
        print(text.replace('/', '\n'), end='\n' if stream_end else '', flush=True)

In [14]:
# Build an Italian single-turn prompt and render it with the chat template.
messages = [{"role": "user", "content": "Qual è il senso della vita?"}]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # must be added for generation
    enable_thinking=False,       # disable thinking
)

# Tokenize the rendered prompt, move it to the GPU and stream the reply,
# using the custom streamer so each '/' starts a new line.
prompt_inputs = tokenizer(text, return_tensors="pt").to("cuda")
_ = model.generate(
    **prompt_inputs,
    max_new_tokens=256,  # increase for longer outputs
    # sampling settings for non-thinking mode
    temperature=0.7,
    top_p=0.8,
    top_k=20,
    streamer=CustomNewlineStreamer(tokenizer, skip_prompt=True),
)

Il senso della vita è il fine e l'opra 
 che l'ingegno e l'amor ci fanno compiere, 
 per raggiunger la somma beatitudine. 
 È come un'arco che mira a la luce, 
 e ogni freccia che vola nel cielo 
 è un atto d'amor che cerca la sua fonte.


## Load the model from Hugging Face and run inference

In [None]:
from unsloth import FastLanguageModel
# Load the published fine-tuned checkpoint and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Daniele/qweDante",
    max_seq_length=2048,  # sequence-length limit handed to the loader
    load_in_4bit=True,    # request 4-bit loading
)

==((====))==  Unsloth 2025.6.12: Fast Qwen3 patching. Transformers: 4.53.0.
   \\   /|    NVIDIA RTX A1000 6GB Laptop GPU. Num GPUs = 1. Max memory: 5.801 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
