In [1]:
# Import required libraries for fine-tuning
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
import torch
# import wandb  # Optional: for experiment tracking

# Initialize Weights & Biases (optional)
# wandb.init(project="smollm3-finetuning")

# Load SmolLM3 base model for fine-tuning
model_name = "HuggingFaceTB/SmolLM3-3B-Base"
new_model_name = "SmolLM3-Custom-SFT"

print(f"Loading {model_name}...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token
tokenizer.padding_side = "right"  # Padding on the right for generation

print(f"Model loaded! Parameters: {model.num_parameters():,}")

  from .autonotebook import tqdm as notebook_tqdm


Loading HuggingFaceTB/SmolLM3-3B-Base...


Loading checkpoint shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2/2 [00:17<00:00,  8.97s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


Model loaded! Parameters: 3,075,098,624


In [2]:
instruct_model_name = "HuggingFaceTB/SmolLM3-3B"

# Load tokenizers
instruct_tokenizer = AutoTokenizer.from_pretrained(instruct_model_name)

In [19]:
# Load and prepare training dataset
print("=== PREPARING DATASET ===\n")

# Option 1: Use SmolTalk2 (recommended for beginners)
dataset = load_dataset("HuggingFaceTB/smoltalk2", "SFT", streaming=True)
train_dataset = Dataset.from_list(list(dataset["smoltalk_everyday_convs_reasoning_Qwen3_32B_think"].take(1000)))  # Use subset for faster training

# Option 2: Use your own processed dataset from Exercise 2
# train_dataset = gsm8k_formatted.select(range(500))

print(f"Training examples: {len(train_dataset)}")
print(f"Example: {train_dataset[0]}")

# Prepare the dataset for SFT
def format_chat_template(example):
    """Format the messages using the chat template"""
    if "messages" in example:
        # SmolTalk2 format
        messages = example["messages"]
    else:
        # Custom format - adapt as needed
        messages = [
            {"role": "user", "content": example["instruction"]},
            {"role": "assistant", "content": example["response"]}
        ]
    
    # Apply chat template
    text = instruct_tokenizer.apply_chat_template(
        messages, 
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": text}

# Apply formatting
formatted_dataset = train_dataset.map(format_chat_template)
formatted_dataset = formatted_dataset.remove_columns(
    [col for col in formatted_dataset.column_names if col != "text"]
)
print(f"Formatted example: {formatted_dataset[0]['text'][:400]}...")

=== PREPARING DATASET ===

Training examples: 1000
Example: {'messages': [{'content': 'Hi there', 'role': 'user'}, {'content': '<think>\nOkay, the user sent "Hi there". That\'s a friendly greeting. I should respond in a welcoming way. Let me check the guidelines. I need to be helpful, keep the conversation going, and maybe ask how I can assist them. Let me make sure the tone is warm and approachable. Alright, something like "Hello! How can I assist you today?" That should work. Let me confirm there\'s no typo and it\'s in a natural, conversational style.\n</think>\n\nHello! How can I assist you today?', 'role': 'assistant'}, {'content': "I'm looking for a healthy breakfast idea. What's a good option?", 'role': 'user'}, {'content': "<think>\nOkay, the user is asking for a healthy breakfast idea. Let me think about what makes a breakfast healthy. It should be balanced, providing a mix of nutrients like protein, fiber, healthy fats, and some carbs. Let me brainstorm some options.\n\nMaybe

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1000/1000 [00:00<00:00, 2720.65 examples/s]

Formatted example: <|im_start|>system
## Metadata

Knowledge Cutoff Date: June 2025
Today Date: 09 November 2025
Reasoning Mode: /think

## Custom Instructions

You are a helpful AI assistant named SmolLM, trained by Hugging Face. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in...





In [11]:
dataset = load_dataset("HuggingFaceTB/smoltalk2", "SFT", streaming=True)
train_dataset = dataset["smoltalk_everyday_convs_reasoning_Qwen3_32B_think"].take(10)  # Use subset for faster training


In [None]:
from datasets import Dataset


Dataset({
    features: ['messages', 'chat_template_kwargs', 'source'],
    num_rows: 1000
})

In [22]:
# Configure training parameters
training_config = SFTConfig(
    # Model and data
    output_dir=f"./{new_model_name}",
    dataset_text_field="text",
    max_length=2048,
    
    # Training hyperparameters
    per_device_train_batch_size=2,  # Adjust based on your GPU memory
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    num_train_epochs=1,  # Start with 1 epoch
    max_steps=500,  # Limit steps for demo
    
    # Optimization
    warmup_steps=50,
    weight_decay=0.01,
    optim="adamw_torch",
    
    # Logging and saving
    logging_steps=10,
    save_steps=100,
    eval_steps=100,
    save_total_limit=2,
    
    # Memory optimization
    dataloader_num_workers=0,
    group_by_length=True,  # Group similar length sequences
    
    # Hugging Face Hub integration
    push_to_hub=False,  # Set to True to upload to Hub
    hub_model_id=f"your-username/{new_model_name}",
    
    # Experiment tracking
    report_to=["trackio"],  # Use trackio for experiment tracking
    run_name=f"{new_model_name}-training",
    completion_only_loss=True, # Compute loss only on generated tokens
)

print("Training configuration set!")
print(f"Effective batch size: {training_config.per_device_train_batch_size * training_config.gradient_accumulation_steps}")

Training configuration set!
Effective batch size: 4


In [24]:
model

SmolLM3ForCausalLM(
  (model): SmolLM3Model(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
    (layers): ModuleList(
      (0-35): 36 x SmolLM3DecoderLayer(
        (self_attn): SmolLM3Attention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): SmolLM3MLP(
          (gate_proj): Linear(in_features=2048, out_features=11008, bias=False)
          (up_proj): Linear(in_features=2048, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): SmolLM3RMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): SmolLM3RMSNorm((2048,), eps=1e-06)
      )
    )
    (norm): Smo

In [26]:
# LoRA configuration with PEFT
from peft import LoraConfig

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

# Create SFTTrainer with LoRA enabled
from trl import SFTTrainer

lora_trainer = SFTTrainer(
    model=model,
    train_dataset=formatted_dataset,  # dataset with a "text" field or messages + dataset_text_field in config
    args=training_config,
    peft_config=peft_config,  # << enable LoRA

)

print("Starting LoRA trainingâ€¦")
lora_trainer.train()

Adding EOS to train dataset: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1000/1000 [00:00<00:00, 1593.02 examples/s]
Tokenizing train dataset: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1000/1000 [00:04<00:00, 210.22 examples/s]
Truncating train dataset: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1000/1000 [00:00<00:00, 43679.29 examples/s]
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': None}.


Starting LoRA trainingâ€¦
* Trackio project initialized: huggingface
* Trackio metrics will be synced to Hugging Face Dataset: fmcurti/trackio-dataset
* Creating new space: https://huggingface.co/spaces/fmcurti/trackio
* View dashboard by going to: https://fmcurti-trackio.hf.space/


* Created new run: SmolLM3-Custom-SFT-training


RuntimeError: Function MmBackward0 returned an invalid gradient at index 1 - expected device meta but got cuda:0