In [1]:
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install datasets transformers

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

checkpoint = "HuggingFaceTB/SmolLM-135M-Instruct"

# Pick the best available device: CUDA GPU first, then Apple-silicon MPS, else CPU.
# torch.backends.mps.is_available() is the documented MPS availability probe.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# for multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
print(model)

messages = [{"role": "user", "content": "What is gravity?"}]
# add_generation_prompt=True appends the assistant-turn header to the rendered prompt,
# so generation starts a reply instead of continuing the user's message.
input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(input_text)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=50, temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0]))

# List every parameter/buffer name with its tensor shape.
for k, v in model.state_dict().items():
    print(k, v.shape)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 576, padding_idx=2)
    (layers): ModuleList(
      (0-29): 30 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=576, out_features=576, bias=False)
          (k_proj): Linear(in_features=576, out_features=192, bias=False)
          (v_proj): Linear(in_features=576, out_features=192, bias=False)
          (o_proj): Linear(in_features=576, out_features=576, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
          (up_proj): Linear(in_features=576, out_features=1536, bias=False)
          (down_proj): Linear(in_features=1536, out_features=576, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((576,), eps=1e-05)
    (rotary_emb)

In [3]:
# Report the special tokens the tokenizer defines (header line, then the mapping).
print("Special Tokens Map:", tokenizer.special_tokens_map, sep="\n")

# Report the model configuration; the leading "\n" in the header leaves a blank
# line between the two reports.
print("\nModel Configuration:", model.config, sep="\n")

Special Tokens Map:
{'bos_token': '<|im_start|>', 'eos_token': '<|im_end|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|im_end|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}

Model Configuration:
LlamaConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 576,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 9,
  "num_hidden_layers": 30,
  "num_key_value_heads": 3,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.50.3",
  "use_cache": true,
  "vocab_size": 49152
}



# Build MiniLM Model


## Build Model Using LlamaConfig
As there is no official SmolLM model architecture in Hugging Face yet, we can build it on top of the Hugging Face Llama implementation. The Hugging Face implementation is very modular, so we can easily plug in our own customized modules.

The example below shows how to customize one transformer layer by modifying its MLP component, replacing the default SwiGLU (gated SiLU activation) with a simpler ReLU-based MLP. This demonstrates how to alter Hugging Face models using `nn.Module`.

In [None]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import LlamaConfig, AutoConfig
from transformers.models.llama.modeling_llama import LlamaDecoderLayer
from transformers import LlamaForCausalLM

# For reference, print the configuration published with the pretrained checkpoint.
config = AutoConfig.from_pretrained(checkpoint)
print(config)

# Then rebind `config` to a hand-written LlamaConfig; this is the one the
# model below is built from.
minilm_hparams = {
    "vocab_size": 20000,
    "hidden_size": 576,
    "intermediate_size": 1536,
    "num_hidden_layers": 30,
    "num_attention_heads": 9,
    "num_key_value_heads": 3,
    "max_position_embeddings": 2048,
}
config = LlamaConfig(**minilm_hparams)
print(config)

# Custom MLP (unchanged)
class CustomMLP(nn.Module):
    """Plain two-layer feed-forward block: up-projection, ReLU, down-projection.

    Unlike the gated (SwiGLU) Llama MLP there is no gate projection; both
    linear layers are bias-free.

    Args:
        config: object exposing ``hidden_size`` and ``intermediate_size``.
    """

    def __init__(self, config):
        super().__init__()
        hidden, inner = config.hidden_size, config.intermediate_size
        self.up_proj = nn.Linear(hidden, inner, bias=False)
        self.down_proj = nn.Linear(inner, hidden, bias=False)
        self.act_fn = F.relu  # ReLU instead of SwiGLU

    def forward(self, x):
        """Map ``x`` (last dim ``hidden_size``) to a tensor of the same shape."""
        return self.down_proj(self.act_fn(self.up_proj(x)))

# Custom Decoder Layer (unchanged)
class CustomDecoderLayer(LlamaDecoderLayer):
    """Llama decoder layer whose feed-forward block is replaced by ``CustomMLP``.

    Args:
        config: the ``LlamaConfig`` the layer is built from.
        layer_idx: position of this layer in the decoder stack. The parent
            ``LlamaDecoderLayer`` requires it and forwards it to the attention
            module. The default of 0 only keeps ``CustomDecoderLayer(config)``
            callable; pass the real index when placing the layer in a model,
            e.g. ``model.model.layers[9] = CustomDecoderLayer(config, layer_idx=9)``.
    """

    def __init__(self, config, layer_idx=0):
        # The parent constructor takes (config, layer_idx); omitting layer_idx
        # raises a TypeError.
        super().__init__(config, layer_idx)
        self.mlp = CustomMLP(config)  # Replace default MLP

# MiniLM: LlamaForCausalLM subclass intended to carry one custom layer
class MiniLM(LlamaForCausalLM):
    """Causal language model derived from ``LlamaForCausalLM``.

    As written, the constructor only delegates to the parent class: the
    custom-layer replacement below is commented out, so no layer is swapped.
    """

    def __init__(self, config):
        super().__init__(config)
        # Disabled: replace one decoder layer with the custom (ReLU-MLP) variant.
        # self.model.layers[9] = CustomDecoderLayer(config)  # 10th layer (index 9)

# Instantiate the model from `config` alone (no checkpoint is loaded in this cell).
# NOTE: this rebinds the notebook-level name `model`.
model = MiniLM(config)
print(model)

LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 576,
  "initializer_range": 0.041666666666666664,
  "intermediate_size": 1536,
  "is_llama_config": true,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 9,
  "num_hidden_layers": 30,
  "num_key_value_heads": 3,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_interleaved": false,
  "rope_scaling": null,
  "rope_theta": 100000,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers.js_config": {
    "kv_cache_dtype": {
      "fp16": "float16",
      "q4f16": "float16"
    }
  },
  "transformers_version": "4.50.0",
  "use_cache": true,
  "vocab_size": 49152
}

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 

In [None]:
# You can also convert to MOE layer

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers.models.llama.modeling_llama import LlamaDecoderLayer


class MoELayer(nn.Module):
    """Dense (soft) mixture-of-experts layer.

    Every expert is a single ``nn.Linear(hidden_size, expert_size)``. All
    experts are evaluated for every input position and their outputs are
    blended with softmax gate weights (no top-k routing).

    Args:
        hidden_size: size of the last dimension of the input.
        num_experts: number of expert networks.
        expert_size: output width of each expert, and therefore of this layer.
    """

    def __init__(self, hidden_size, num_experts, expert_size):
        super().__init__()
        self.num_experts = num_experts
        # Define a list of expert networks
        self.experts = nn.ModuleList([nn.Linear(hidden_size, expert_size) for _ in range(num_experts)])
        # Gating network to assign weights to experts
        self.gate = nn.Linear(hidden_size, num_experts)

    def forward(self, x):
        """Blend the expert outputs for ``x``.

        Args:
            x: tensor of shape ``[..., hidden_size]``; any number of leading
                dimensions is accepted (e.g. ``[batch, hidden]`` or
                ``[batch, seq, hidden]``).

        Returns:
            Tensor of shape ``[..., expert_size]``.
        """
        # Gate weights: softmax over the expert dimension -> [..., num_experts]
        gate_scores = F.softmax(self.gate(x), dim=-1)

        # Stack the expert outputs on a new second-to-last axis so the layout is
        # independent of how many leading dims x has -> [..., num_experts, expert_size]
        expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=-2)

        # Weighted sum over the expert axis -> [..., expert_size]
        output = torch.einsum('...ne,...n->...e', expert_outputs, gate_scores)
        return output


class CustomMoEDecoderLayer(LlamaDecoderLayer):
    """Llama decoder layer whose feed-forward block is a mixture of experts.

    Args:
        config: the ``LlamaConfig`` the layer is built from.
        num_experts: number of experts in the MoE block.
        expert_size: output width of each expert.
        layer_idx: position of this layer in the decoder stack. The parent
            ``LlamaDecoderLayer`` requires it; the default of 0 only keeps the
            previous call signature working, so pass the real index when
            inserting the layer into a model.
    """

    def __init__(self, config, num_experts=4, expert_size=2048, layer_idx=0):
        # The parent constructor takes (config, layer_idx).
        super().__init__(config, layer_idx)
        # Replace the MLP (FFN) with MoE
        moe = MoELayer(config.hidden_size, num_experts, expert_size)
        if expert_size == config.hidden_size:
            self.mlp = moe
        else:
            # The decoder layer adds the MLP output to the residual stream, so
            # the output must be hidden_size wide; project expert_size back down.
            self.mlp = nn.Sequential(
                moe,
                nn.Linear(expert_size, config.hidden_size, bias=False),
            )

## Build Model from Scratch
Is building on top of the existing LlamaConfig implementation not enough of a challenge for you? Let's build from scratch! There are already many very good resources for building a small LLM from scratch, for example [nanogpt](https://github.com/karpathy/build-nanogpt/tree/master?tab=readme-ov-file), [tinyllama](https://github.com/jzhang38/TinyLlama/tree/main), [minimind](https://github.com/jingyaogong/minimind/tree/master) and more. Here we will stand on their shoulders and build our own miniLM based on a modified smolLM structure. We will still build on Hugging Face, which allows us to leverage its Trainer to train our models.

### Pull the smolLM weights
We first pull the smolLM weights and copy them into our miniLM implementation to make sure the model structure is aligned. See the details in `smol_model_copy_wrights.py`. For model experiments we will use `smol_model.py`, which has the same model architecture but does not copy the weights.

In [5]:
from smol_model import initialize_model, run_test

# initialize_model() returns the model, tokenizer and device used from here on.
# NOTE: this rebinds the notebook-level names `model`, `tokenizer` and `device`.
model, tokenizer, device = initialize_model()

# Run one test prompt through the model. The second argument looks like a
# display label for the run -- TODO confirm against smol_model.run_test.
run_test(model, "SmolLM-135M-Instruct", tokenizer, device, "Hello, how are you?")

PyTorch Version: 2.6.0+cu124
Python Version: 3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0]
--- Loading HuggingFaceTB/SmolLM-135M-Instruct and preparing custom model for comparison ---
--- Starting Comparison Setup for HuggingFaceTB/SmolLM-135M-Instruct ---
Loading tokenizer from HuggingFaceTB/SmolLM-135M-Instruct
Target device: cpu

STEP 1: Loading official model with accelerate optimizations...
STEP 1 SUCCESS: Official model loaded.
  Official Model Device Map: {'': 'cpu'}

STEP 2: Initializing custom SmolLM model on CPU...
Initialized SmolLMConfig: GQA with 3 groups.
Tied input and output embedding weights.
STEP 2 SUCCESS: Custom model initialized successfully on CPU.

STEP 3: Transferring weights to custom model (on CPU)...
  Copying embed_tokens...
  Copying layer 29/29...
  Layer copies finished.
  Copying final norm...
  Ensuring weights are tied post-copy...
Tied input and output embedding weights.
STEP 3 SUCCESS: Weight transfer complete (on CPU).

STEP 4: Moving custom mo

In [None]:
!pip install datasets
!pip install safetensors



In [None]:
from datasets import load_dataset
# These three names are used below but were never imported anywhere in the notebook.
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

dataset = load_dataset("tiny_shakespeare")  # Example dataset
# Tokenize every split; sequences longer than 2048 tokens are truncated.
tokenized_dataset = dataset.map(lambda x: tokenizer(x["text"], truncation=True, max_length=2048), batched=True)
train_dataset = tokenized_dataset["train"]
eval_dataset = tokenized_dataset["test"]

# for i in train_dataset:
#   print(i.keys(), len(i["input_ids"]))
#   break

# mlm=False selects causal-LM collation (no masked-language-model objective).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
training_args = TrainingArguments(
    output_dir="./minillm_output",
    # `evaluation_strategy` was renamed to `eval_strategy`; recent transformers
    # releases reject the old keyword.
    eval_strategy="epoch",
    logging_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    save_steps=500,
    save_safetensors=False,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)
trainer.train()

# # Save the trained model
# model.save_pretrained("./minillm_output")



Epoch,Training Loss,Validation Loss
1,No log,3.04029


TrainOutput(global_step=1, training_loss=3.022372007369995, metrics={'train_runtime': 14.0757, 'train_samples_per_second': 0.071, 'train_steps_per_second': 0.071, 'total_flos': 0.0, 'train_loss': 3.022372007369995, 'epoch': 1.0})

In [None]:
# Next: better init.
# mixed precision training
# torch.compile
# flash attention. Done
# change ugly numbers

# Optimization:
# adam init
# norm clipping
# cosine decay learning rate with warm up
# weight decay (not applied to 1-D tensors); fused Adam
# batch size = 0.5M via gradient accumulation