In [None]:
# Environment setup: install Unsloth and the libraries this notebook uses.
# Do this only in Colab notebooks! Otherwise use pip install unsloth
# The first and last commands pass --no-deps, so pip installs only the listed
# packages and does not pull in their own dependencies; xformers and trl are
# pinned to exact versions.
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
!pip install --no-deps unsloth

# Author's note: ignore the errors printed by the install commands above.

In [None]:
# Import all of the libraries
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
import torch
from datasets import Dataset
from transformers import TextStreamer
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig


In [None]:
# Checkpoint and context window, named up front so they are easy to change.
MODEL_NAME = "Qwen/Qwen3-0.6B"
MAX_SEQ_LENGTH = 1024  # context length - can be longer, but uses more memory

# Fetch the model and its tokenizer from Hugging Face through FastLanguageModel.
# 4-bit loading (QLoRA) is on because it uses much less memory; 8-bit loading
# (a bit more accurate, 2x the memory) and full fine-tuning are both off.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=True,
    load_in_8bit=False,
    full_finetuning=False,
    # token="hf_...",  # pass one when using gated models
)

In [None]:
def generate_with_thinking_split(prompt, model, tokenizer, max_new_tokens=32768, think_end_token_id=151668):
    """
    Generate a response to ``prompt`` and split it into 'thinking' and final answer.

    The prompt is wrapped as a single user message, rendered with the
    tokenizer's chat template (``enable_thinking=True`` is passed to the
    template), and sent to ``model.generate``. The newly generated tokens are
    split at the LAST occurrence of ``think_end_token_id``: everything up to
    and including that token is decoded as the thinking part, the rest as the
    final answer. If the token never appears, the thinking part is empty and
    the whole output is treated as the answer.

    Both parts are printed and also returned. In a notebook, end the call with
    ``;`` if you do not want the returned tuple echoed as the cell output.

    Args:
        prompt (str): The user prompt.
        model: The language model.
        tokenizer: The tokenizer corresponding to the model.
        max_new_tokens (int): Maximum number of new tokens to generate.
        think_end_token_id (int): ID of the token that closes the thinking
            section (``</think>``). Defaults to 151668, the value this
            notebook uses for its tokenizer.

    Returns:
        tuple[str, str]: (thinking_content, final_content)
    """
    # Prepare messages for chat template
    messages = [{"role": "user", "content": prompt}]

    # Format input using the chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=True,
    )

    # Tokenize and move to the model's device
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Generate text
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
    )

    # The generated sequence starts with the prompt tokens; keep only what follows.
    prompt_length = len(model_inputs.input_ids[0])
    output_ids = generated_ids[0][prompt_length:].tolist()

    # Search the reversed list so the split lands just after the LAST end-of-thinking token.
    try:
        split_index = len(output_ids) - output_ids[::-1].index(think_end_token_id)
    except ValueError:
        # No end-of-thinking token was generated: everything counts as the answer.
        split_index = 0

    # Decode both parts
    thinking_content = tokenizer.decode(output_ids[:split_index], skip_special_tokens=True).strip("\n")
    content = tokenizer.decode(output_ids[split_index:], skip_special_tokens=True).strip("\n")

    print(f"Thinking:\n {thinking_content}")
    print(f"Content:\n {content}")

    # Return what the docstring promises (the original ended with a bare `return`).
    return thinking_content, content


In [None]:
# Example usage: this cell runs before the LoRA/training cells, so the reply
# comes from the model as loaded. The trailing semicolon keeps the notebook
# from echoing whatever the call returns.
generate_with_thinking_split(
    prompt="Give me a short introduction to large language model.",
    model=model,
    tokenizer=tokenizer,
);

In [None]:
# LoRA settings, gathered in one place and unpacked into get_peft_model below.
lora_settings = dict(
    # LoRA matrix size (rank) - any number > 0; suggested 8, 16, 32, 64, 128
    r=8,
    # modules to fine-tune
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    # scaling factor for updates - best to choose alpha = rank or rank*2;
    # the higher the value, the more aggressive the updates
    lora_alpha=16,
    lora_dropout=0,   # supports any, but 0 is optimized
    bias="none",      # supports any, but "none" is optimized
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,   # rank stabilized LoRA is supported
    loftq_config=None,  # and so is LoftQ
)

# Apply PEFT (Parameter-Efficient Fine-Tuning) with LoRA to the previously
# loaded model; `model` is rebound to the result.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

In [None]:
# Prepare the dataset for training
# - Load the dataset
# - Format the dataset using the chat template
# - Select a subset of the dataset for training and evaluation

# ShareGPT-style key/speaker names passed to get_chat_template as `mapping`.
SHAREGPT_MAPPING = {
    "role": "from",
    "content": "value",
    "user": "human",
    "assistant": "gpt",
}

# Set the tokenizer to use a specific chat template format (here ChatML).
# Supported templates: zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",
    mapping=SHAREGPT_MAPPING,
    map_eos_token=True,  # Maps <|im_end|> to </s> instead
)

# Batch formatter: turns each conversation into one text string via the chat template.
def formatting_prompts_func(examples):
    """Render every conversation in a batch with the tokenizer's chat template.

    Args:
        examples: A batch whose "conversations" entry is a list of conversations.

    Returns:
        dict: {"text": [...]} with one rendered string per conversation, in order.
        Rendering is done untokenized and without a trailing generation prompt.
    """
    rendered = []
    for conversation in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Load the ShareGPT-format dataset and add the formatted "text" column in one pass.
dataset_pre = load_dataset(
    "philschmid/guanaco-sharegpt-style",
    split="train",
).map(formatting_prompts_func, batched=True)

# Training split: the first 100 examples.
dataset = dataset_pre.select(range(100))

# Evaluation split: the next 20 examples (indices 100-119), so it does not overlap the training split.
dataset_eval = dataset_pre.select(range(100, 120))

print("Dataset dimension:", len(dataset))
print("Eval Dataset dimension:", len(dataset_eval))


In [None]:
# Print every example of the training dataset.
# Iterating over the dataset itself (instead of a hard-coded range(0, 100))
# keeps this cell in step with the subset selected earlier: a different subset
# size no longer raises IndexError or silently leaves rows out.
for example in dataset:
    print(example)

In [None]:
# Training configuration for the supervised fine-tuning (SFT) run.
training_args = SFTConfig(
    dataset_text_field="text",       # field in the dataset that contains the input text
    per_device_train_batch_size=2,   # batch size per device (GPU)
    gradient_accumulation_steps=4,   # gradient accumulation mimics a larger batch size
    warmup_steps=1,                  # number of warmup steps
    # num_train_epochs=1,            # set this for 1 full training run
    max_steps=20,                    # total number of training steps
    learning_rate=2e-4,              # reduce to 2e-5 for long training runs
    logging_steps=1,                 # log after every step
    optim="adamw_8bit",              # optimizer
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    report_to="none",                # use this for WandB etc
    eval_strategy="steps",           # evaluate every eval_steps
    eval_steps=2,
)

# Initialize the SFT trainer with the LoRA model, tokenizer, both data splits and the config above.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    eval_dataset=dataset_eval,
    args=training_args,
)

In [None]:
# @title Show current memory stats
# Report device 0's name and total memory, plus the value of
# torch.cuda.max_memory_reserved() at this point. Byte counts are divided by
# 1024**3 and rounded to 3 decimals before being shown as GB.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Execute the training and check the loss.
# The value returned by trainer.train() is kept in `trainer_stats` so it can be
# inspected after the run.
trainer_stats = trainer.train()

In [None]:
# Example prompt to test the final model (this cell comes after the training cell).
# The trailing semicolon keeps the notebook from echoing whatever the call returns.
generate_with_thinking_split(
    prompt="Make a short poetry on LLMs",
    model=model,
    tokenizer=tokenizer,
);
