In [None]:
# Step 1: install the extra dependencies (uncomment the line below to run it)
# pip install -r https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/refs/heads/main/ch05/07_gpt_to_llama/requirements-extra.txt

In [None]:
from importlib.metadata import version
# Report the installed version of each library this notebook relies on.
pkgs = ["huggingface_hub", "tokenizers", "torch"]
for pkg_name in pkgs:
    installed = version(pkg_name)
    print(f"{pkg_name} version:{installed}")

huggingface_hub version:0.34.4
tokenizers version:0.22.0
torch version:2.8.0+cu126


In [None]:
# True selects the "Qwen/Qwen3-<size>" checkpoint, False the "Qwen/Qwen3-<size>-Base" one.
# The flag is read again when the tokenizer is configured (generation prompt / thinking).
USE_REASONING_MODEL = True

In [None]:
import torch
import torch.nn as nn
class FeedForward(nn.Module):
    """Gated feed-forward block computing ``fc3(silu(fc1(x)) * fc2(x))``.

    ``cfg`` must provide ``emb_dim`` (input and output width), ``hidden_dim``
    (inner width) and ``dtype`` (parameter dtype). No layer has a bias.
    """

    def __init__(self, cfg):
        super().__init__()
        width, inner = cfg["emb_dim"], cfg["hidden_dim"]
        linear_kwargs = {"dtype": cfg["dtype"], "bias": False}
        # Layers are created in the order fc1, fc2, fc3 so that a seeded
        # initialization consumes random numbers in the same sequence.
        self.fc1 = nn.Linear(width, inner, **linear_kwargs)
        self.fc2 = nn.Linear(width, inner, **linear_kwargs)
        self.fc3 = nn.Linear(inner, width, **linear_kwargs)

    def forward(self, x):
        """Apply the gated projection to ``x`` (last dimension ``emb_dim``)."""
        gate = nn.functional.silu(self.fc1(x))
        return self.fc3(gate * self.fc2(x))

In [None]:
class RMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension.

    Each vector is divided by ``sqrt(mean(x**2) + eps)`` and then multiplied
    element-wise by the learnable ``scale``. If ``bias`` is true a learnable
    ``shift`` is added afterwards.

    Args:
        emb_dim: Size of the last dimension of the input.
        eps: Constant added to the mean square before the reciprocal square root.
        bias: Whether to create the additive ``shift`` parameter.
        qwen3_compatible: If true, the computation is carried out in float32
            and the result is cast back to the input dtype.
    """

    def __init__(self, emb_dim, eps=1e-6, bias=False, qwen3_compatible=True):
        super().__init__()
        self.eps = eps
        self.qwen3_compatible = qwen3_compatible
        self.scale = nn.Parameter(torch.ones(emb_dim))
        # The shift starts at zero so a freshly built layer is a pure RMS norm.
        self.shift = nn.Parameter(torch.zeros(emb_dim)) if bias else None

    def forward(self, x):
        """Return the normalized tensor with the same shape and dtype as ``x``."""
        input_dtype = x.dtype
        if self.qwen3_compatible:
            x = x.to(torch.float32)

        variance = x.pow(2).mean(dim=-1, keepdim=True)
        norm_x = x * torch.rsqrt(variance + self.eps)
        # Scale the *normalized* values; scaling ``x`` here would discard the
        # normalization computed on the previous line.
        norm_x = norm_x * self.scale

        if self.shift is not None:
            norm_x = norm_x + self.shift
        return norm_x.to(input_dtype)

In [None]:
# RoPE - rotary position embeddings - RoPE captures relative position information
# Position interpolation increases the inference context length by applying a scaling factor
# NTK - adaptive frequency scaling method - uses 1 as the scaling factor for high-frequency components and 1/n for low-frequency components
# YaRN - another adaptive frequency scaling method
# Adaptive frequency methods extend the usable inference context window of a transformer, even when it was trained with a shorter context length
# NTK helps with a 64K context length and also helps in the needle-in-a-haystack evaluation for LLMs

def compute_rope_params(head_dim, theta_base=10_000, context_length=4096, dtype=torch.float32):
    """Precompute the cosine and sine tables used by rotary position embeddings.

    Args:
        head_dim: Per-head embedding size; must be even.
        theta_base: Base of the geometric frequency progression.
        context_length: Number of positions to tabulate.
        dtype: dtype used to build the index and position ranges.

    Returns:
        ``(cos, sin)``, each of shape ``(context_length, head_dim)``.
    """
    assert head_dim % 2 == 0, "Embedding dimnesion must be even"

    # One inverse frequency per channel pair: theta_base ** -(2i / head_dim).
    pair_index = torch.arange(0, head_dim, 2, dtype=dtype)[: head_dim // 2].float()
    inv_freq = 1.0 / (theta_base ** (pair_index / head_dim))

    positions = torch.arange(context_length, dtype=dtype)

    # Outer product of positions and frequencies -> (context_length, head_dim // 2).
    half_angles = positions.unsqueeze(1) * inv_freq.unsqueeze(0)

    # Repeat the table so it spans the full head -> (context_length, head_dim).
    angles = torch.cat((half_angles, half_angles), dim=1)

    return torch.cos(angles), torch.sin(angles)

def apply_rope(x, cos, sin):
    """Rotate ``x`` with precomputed rotary-embedding tables.

    Args:
        x: Tensor of shape ``(batch_size, num_heads, seq_len, head_dim)``.
        cos, sin: Tables with at least ``seq_len`` rows and ``head_dim`` columns.

    Returns:
        ``x * cos + rotate_half(x) * sin`` cast back to ``x.dtype``.
    """
    _, _, seq_len, head_dim = x.shape
    assert head_dim %2 ==0 , "head_dim must be even"

    half = head_dim // 2
    first_half = x[..., :half]
    second_half = x[..., half:]

    # (seq_len, head_dim) -> (1, 1, seq_len, head_dim) so the tables broadcast
    # across the batch and head dimensions.
    cos_b = cos[:seq_len, :][None, None, :, :]
    sin_b = sin[:seq_len, :][None, None, :, :]

    rotated = torch.cat((-second_half, first_half), dim=-1)
    result = (x * cos_b) + (rotated * sin_b)

    # The product may have been promoted to the tables' dtype; return to x's dtype.
    return result.to(dtype=x.dtype)

In [None]:
#GQA
class GroupedQueryAttention(nn.Module):
    """Masked self-attention in which groups of query heads share key/value heads.

    Args:
        d_in: Width of the input (and output) embeddings.
        num_heads: Number of query heads.
        num_kv_groups: Number of key/value heads; must divide ``num_heads``.
        head_dim: Per-head width. Defaults to ``d_in // num_heads``.
        qk_norm: If true, queries and keys pass through an ``RMSNorm`` per head.
        dtype: dtype of the projection weights.
    """

    def __init__(self, d_in, num_heads, num_kv_groups, head_dim=None, qk_norm=False, dtype=None):
        super().__init__()
        assert num_heads % num_kv_groups == 0, "num_heads must be divisible by num_kv_groups"
        if head_dim is None:
            assert d_in % num_heads == 0, "d_in must be divisible by num_heads if head_dim is not specified"
            head_dim = d_in // num_heads

        self.num_heads = num_heads
        self.num_kv_groups = num_kv_groups
        # Number of query heads served by each key/value head.
        self.group_size = num_heads // num_kv_groups
        self.head_dim = head_dim
        self.d_out = num_heads * head_dim

        kv_width = num_kv_groups * head_dim
        # Creation order (query, key, value, output) is kept for seeded init.
        self.W_query = nn.Linear(d_in, self.d_out, bias=False, dtype=dtype)
        self.W_key = nn.Linear(d_in, kv_width, bias=False, dtype=dtype)
        self.W_value = nn.Linear(d_in, kv_width, bias=False, dtype=dtype)
        self.out_proj = nn.Linear(self.d_out, d_in, bias=False, dtype=dtype)

        self.q_norm = RMSNorm(head_dim, eps=1e-6) if qk_norm else None
        self.k_norm = RMSNorm(head_dim, eps=1e-6) if qk_norm else None

    def forward(self, x, mask, cos, sin):
        """Attend over ``x`` of shape ``(batch, num_tokens, d_in)``.

        ``mask`` is a boolean tensor; positions where it is true are filled
        with ``-inf`` before the softmax. ``cos`` and ``sin`` are the rotary
        tables handed to ``apply_rope``.
        """
        b, num_tokens, _ = x.shape

        def split_heads(projected, n_heads):
            # (b, tokens, n_heads * head_dim) -> (b, n_heads, tokens, head_dim)
            return projected.view(b, num_tokens, n_heads, self.head_dim).transpose(1, 2)

        queries = split_heads(self.W_query(x), self.num_heads)
        keys = split_heads(self.W_key(x), self.num_kv_groups)
        values = split_heads(self.W_value(x), self.num_kv_groups)

        if self.q_norm is not None:
            queries = self.q_norm(queries)
        if self.k_norm is not None:
            keys = self.k_norm(keys)

        queries = apply_rope(queries, cos, sin)
        keys = apply_rope(keys, cos, sin)

        # Repeat every key/value head so there is one per query head.
        keys = keys.repeat_interleave(self.group_size, dim=1)
        values = values.repeat_interleave(self.group_size, dim=1)

        scores = (queries @ keys.transpose(2, 3)).masked_fill(mask, -torch.inf)
        weights = torch.softmax(scores / self.head_dim**0.5, dim=-1)

        # Merge the heads back: (b, heads, tokens, head_dim) -> (b, tokens, d_out).
        context = (weights @ values).transpose(1, 2).reshape(b, num_tokens, self.d_out)
        return self.out_proj(context)

In [None]:
#class Transformer
class TransformerBlock(nn.Module):
    """One pre-norm transformer layer: attention and feed-forward sub-layers,
    each preceded by an ``RMSNorm`` and wrapped in a residual connection.

    ``cfg`` must provide ``emb_dim``, ``n_heads``, ``head_dim``,
    ``n_kv_groups``, ``qk_norm`` and ``dtype`` (plus whatever ``FeedForward``
    reads from it).
    """

    def __init__(self, cfg):
        super().__init__()
        attention_kwargs = dict(
            d_in=cfg["emb_dim"],
            num_heads=cfg["n_heads"],
            head_dim=cfg["head_dim"],
            num_kv_groups=cfg["n_kv_groups"],
            qk_norm=cfg["qk_norm"],
            dtype=cfg["dtype"],
        )
        self.att = GroupedQueryAttention(**attention_kwargs)
        self.ff = FeedForward(cfg)
        self.norm1 = RMSNorm(cfg["emb_dim"], eps=1e-6)
        self.norm2 = RMSNorm(cfg["emb_dim"], eps=1e-6)

    def forward(self, x, mask, cos, sin):
        """Run the layer on ``x``; ``mask``, ``cos`` and ``sin`` go to attention."""
        # Attention sub-layer with residual connection.
        x = self.att(self.norm1(x), mask, cos, sin) + x
        # Feed-forward sub-layer with residual connection.
        x = self.ff(self.norm2(x)) + x
        return x

In [None]:
#Qwen3.0 Model
class Qwen3Model(nn.Module):
    """Decoder-only model: token embedding, a stack of ``TransformerBlock``s,
    a final ``RMSNorm`` and a linear head producing vocabulary logits.

    ``cfg`` must provide ``vocab_size``, ``emb_dim``, ``n_layers``,
    ``n_heads``, ``head_dim``, ``rope_base``, ``context_length`` and ``dtype``
    (plus the keys the sub-modules read).
    """

    def __init__(self, cfg):
        super().__init__()
        # Trainable components, created in this order for seeded initialization.
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"], dtype=cfg["dtype"])
        self.trf_blocks = nn.ModuleList([TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
        self.final_norm = RMSNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False, dtype=cfg["dtype"])

        # Rotary tables are shared by all blocks; they are stored as
        # non-persistent buffers, so they are not written to the state dict.
        head_dim = cfg["emb_dim"] // cfg["n_heads"] if cfg["head_dim"] is None else cfg["head_dim"]
        cos, sin = compute_rope_params(
            head_dim=head_dim,
            theta_base=cfg["rope_base"],
            context_length=cfg["context_length"],
        )
        self.register_buffer("cos", cos, persistent=False)
        self.register_buffer("sin", sin, persistent=False)
        self.cfg = cfg

    def forward(self, in_idx):
        """Return logits for the token ids in ``in_idx`` (indexed as batch x tokens)."""
        x = self.tok_emb(in_idx)
        num_tokens = x.shape[1]

        # True strictly above the diagonal: a token may not attend to later tokens.
        all_true = torch.ones(num_tokens, num_tokens, device=x.device, dtype=torch.bool)
        mask = torch.triu(all_true, diagonal=1)

        for block in self.trf_blocks:
            x = block(x, mask, self.cos, self.sin)

        x = self.final_norm(x)
        return self.out_head(x.to(self.cfg["dtype"]))

In [None]:
# Select the model size and build its configuration
CHOOSE_MODEL = "0.6B"

# Size-specific settings: (emb_dim, n_heads, n_layers, hidden_dim).
# All other settings are identical across the supported sizes.
_QWEN3_SIZES = {
    "0.6B": (1024, 16, 28, 3072),
    "1.7B": (2048, 16, 28, 6144),
    "4B": (2560, 32, 36, 9728),
    "8B": (4096, 32, 36, 12288),
    "14B": (5120, 40, 40, 17408),
    "32B": (5120, 64, 64, 25600),
}

if CHOOSE_MODEL not in _QWEN3_SIZES:
    raise ValueError(f"{CHOOSE_MODEL} is not supported.")

_emb_dim, _n_heads, _n_layers, _hidden_dim = _QWEN3_SIZES[CHOOSE_MODEL]

QWEN3_CONFIG = {
    "vocab_size": 151_936,           # Vocabulary size
    "context_length": 40_960,        # Context length that was used to train the model
    "emb_dim": _emb_dim,             # Embedding dimension
    "n_heads": _n_heads,             # Number of attention heads
    "n_layers": _n_layers,           # Number of layers
    "hidden_dim": _hidden_dim,       # Size of the intermediate dimension in FeedForward
    "head_dim": 128,                 # Size of the heads in GQA
    "qk_norm": True,                 # Whether to normalize queries and keys in GQA
    "n_kv_groups": 8,                # Key-Value groups for grouped-query attention
    "rope_base": 1_000_000.0,        # The base in RoPE's "theta"
    "dtype": torch.bfloat16,         # Lower-precision dtype to reduce memory usage
}

In [None]:
# Initialize the model. The seed is set first so that the random weight
# initialization is the same on every run.
torch.manual_seed(123)
model = Qwen3Model(QWEN3_CONFIG)

In [None]:
# Bare expression: the notebook renders the model's repr (its module tree).
model

Qwen3Model(
  (tok_emb): Embedding(151936, 1024)
  (trf_blocks): ModuleList(
    (0-27): 28 x TransformerBlock(
      (att): GroupedQueryAttention(
        (W_query): Linear(in_features=1024, out_features=2048, bias=False)
        (W_key): Linear(in_features=1024, out_features=1024, bias=False)
        (W_value): Linear(in_features=1024, out_features=1024, bias=False)
        (out_proj): Linear(in_features=2048, out_features=1024, bias=False)
        (q_norm): RMSNorm()
        (k_norm): RMSNorm()
      )
      (ff): FeedForward(
        (fc1): Linear(in_features=1024, out_features=3072, bias=False)
        (fc2): Linear(in_features=1024, out_features=3072, bias=False)
        (fc3): Linear(in_features=3072, out_features=1024, bias=False)
      )
      (norm1): RMSNorm()
      (norm2): RMSNorm()
    )
  )
  (final_norm): RMSNorm()
  (out_head): Linear(in_features=1024, out_features=151936, bias=False)
)

In [None]:
# Smoke test: forward pass on a batch holding one 3-token sequence.
# NOTE(review): the output recorded below is all NaN, which points to a
# numerical problem somewhere in the forward pass — investigate before
# relying on this model.
model(torch.tensor([1,2,3]).unsqueeze(0))

tensor([[[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]]], dtype=torch.bfloat16,
       grad_fn=<UnsafeViewBackward0>)

In [None]:
# Sum the element counts of every parameter tensor registered on the model.
total_params = 0
for param_tensor in model.parameters():
    total_params += param_tensor.numel()
print(f"Total number of parameters:{total_params:,}")

# Account for weight tying: the count below leaves out one copy of the
# embedding-sized matrix (the token embedding's element count is subtracted).
embedding_params = model.tok_emb.weight.numel()
total_params_normalized = total_params - embedding_params
print(f"\nTotal number of unique parameters:{total_params_normalized:,}")

Total number of parameters:751,632,384

Total number of unique parameters:596,049,920


In [None]:
#
def model_memory_size(model, input_dtype=torch.float32):
    """Estimate the memory needed for a model's parameters, gradients and buffers.

    Every element is assumed to occupy the size of ``input_dtype``; the
    tensors' actual dtypes are not consulted. A gradient of equal size is
    counted for each parameter whose ``requires_grad`` is true.

    Returns:
        The estimate in gigabytes (bytes divided by ``1024**3``).
    """
    sizes = [(tensor.numel(), tensor.requires_grad) for tensor in model.parameters()]
    n_params = sum(count for count, _ in sizes)
    n_grads = sum(count for count, needs_grad in sizes if needs_grad)

    # Buffers are non-parameter tensors that still take up memory.
    n_buffers = sum(buffer.numel() for buffer in model.buffers())

    bytes_per_element = torch.empty((), dtype=input_dtype).element_size()
    total_bytes = (n_params + n_grads + n_buffers) * bytes_per_element

    return total_bytes / (1024**3)

# Report the estimate for two element sizes. model_memory_size sizes every
# element by the dtype passed here, not by the dtype the weights are stored in.
print(f"float32(PyTorch Default):{model_memory_size(model,input_dtype=torch.float32):.2f} GB")
print(f"bfloat16:{model_memory_size(model,input_dtype=torch.bfloat16):.2f} GB")

float32(PyTorch Default):5.64 GB
bfloat16:2.82 GB


In [None]:
# Prefer CUDA, then Apple MPS, and fall back to the CPU. The MPS check is
# only evaluated when CUDA is unavailable.
device_name = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
device = torch.device(device_name)

model.to(device);

In [None]:
#Load Pretrained weights
def load_weights_into_qwen(model, param_config, params):
    """Copy pretrained Qwen3 tensors into ``model``.

    Args:
        model: Object exposing ``tok_emb``, ``trf_blocks``, ``final_norm`` and
            ``out_head``, laid out like ``Qwen3Model``.
        param_config: Mapping that provides ``n_layers``.
        params: Mapping from checkpoint tensor names (``model.embed_tokens.weight``,
            ``model.layers.{i}...``, ``model.norm.weight`` and optionally
            ``lm_head.weight``) to tensors.

    Raises:
        ValueError: If a checkpoint tensor's shape differs from its destination.
        KeyError: If a required tensor name is missing from ``params``.
    """

    def assign(left, right, tensor_name="unknown"):
        # Convert first so the shape check also works for non-tensor inputs.
        if not isinstance(right, torch.Tensor):
            right = torch.tensor(right)
        if left.shape != right.shape:
            raise ValueError(f"Shape mismatch in tensor '{tensor_name}'.Left :{left.shape},Right :{right.shape}")
        return torch.nn.Parameter(right.clone().detach())

    def copy_into(owner, attr, tensor_name):
        # Replace ``owner.attr`` with the checkpoint tensor called ``tensor_name``.
        setattr(owner, attr, assign(getattr(owner, attr), params[tensor_name], tensor_name))

    copy_into(model.tok_emb, "weight", "model.embed_tokens.weight")

    for l in range(param_config["n_layers"]):
        block = model.trf_blocks[l]
        att = block.att
        prefix = f"model.layers.{l}"

        # Q, K, V and output projections
        copy_into(att.W_query, "weight", f"{prefix}.self_attn.q_proj.weight")
        copy_into(att.W_key, "weight", f"{prefix}.self_attn.k_proj.weight")
        copy_into(att.W_value, "weight", f"{prefix}.self_attn.v_proj.weight")
        copy_into(att.out_proj, "weight", f"{prefix}.self_attn.o_proj.weight")

        # Optional per-head query/key norms
        if getattr(att, "q_norm", None) is not None:
            copy_into(att.q_norm, "scale", f"{prefix}.self_attn.q_norm.weight")
        if getattr(att, "k_norm", None) is not None:
            copy_into(att.k_norm, "scale", f"{prefix}.self_attn.k_norm.weight")

        # Norm applied before attention
        copy_into(block.norm1, "scale", f"{prefix}.input_layernorm.weight")

        # Feed-forward: fc1 is the gate (goes through SiLU), fc2 the up
        # projection it multiplies, fc3 the down projection back to emb_dim.
        copy_into(block.ff.fc1, "weight", f"{prefix}.mlp.gate_proj.weight")
        copy_into(block.ff.fc2, "weight", f"{prefix}.mlp.up_proj.weight")
        copy_into(block.ff.fc3, "weight", f"{prefix}.mlp.down_proj.weight")

        # Norm applied before the feed-forward block
        copy_into(block.norm2, "scale", f"{prefix}.post_attention_layernorm.weight")

    # Final normalization and output head
    copy_into(model.final_norm, "scale", "model.norm.weight")
    if "lm_head.weight" in params:
        copy_into(model.out_head, "weight", "lm_head.weight")
    else:
        # No separate head in the checkpoint: reuse the embedding matrix.
        print("Model uses weight tying")
        copy_into(model.out_head, "weight", "model.embed_tokens.weight")

In [None]:
#
import json
import os
from pathlib import Path
from safetensors.torch import load_file
from huggingface_hub import hf_hub_download , snapshot_download

# The "-Base" suffix selects the base checkpoint; without it the reasoning
# checkpoint is used.
repo_suffix = "" if USE_REASONING_MODEL else "-Base"
repo_id = f"Qwen/Qwen3-{CHOOSE_MODEL}{repo_suffix}"
# Files are downloaded into a local folder named after the repository.
local_dir = Path(repo_id).parts[-1]

if CHOOSE_MODEL != "0.6B":
    # Other sizes are read shard by shard, using the index file to find the shards.
    repo_dir = snapshot_download(repo_id=repo_id, local_dir=local_dir)
    index_path = os.path.join(repo_dir, "model.safetensors.index.json")
    with open(index_path, "r") as f:
        index = json.load(f)

    weight_dic = {}
    shard_names = set(index["weight_map"].values())
    for filename in shard_names:
        shard_path = os.path.join(repo_dir, filename)
        shard = load_file(shard_path)
        weight_dic.update(shard)
else:
    # The 0.6B checkpoint is read from a single safetensors file.
    weights_file = hf_hub_download(repo_id=repo_id, filename="model.safetensors", local_dir=local_dir)
    weight_dic = load_file(weights_file)

load_weights_into_qwen(model, QWEN3_CONFIG, weight_dic)
model.to(device)
# Drop the reference to the raw checkpoint tensors once they have been copied.
del weight_dic

ValueError: Shape mismatch in tensor 'model.layers.27.mlp.down_proj.weight'.Left :torch.Size([3072, 1024]),Right :torch.Size([1024, 3072])

In [None]:
#load tokenizer
import re
from tokenizers import Tokenizer

class Qwen3Tokenizer:
    """Wrapper around a ``tokenizers.Tokenizer`` that handles ``<|...|>`` special
    tokens explicitly and can wrap a prompt in the chat template.

    Args:
        tokenizer_file_path: Path to the ``tokenizer.json`` file to load.
        repo_id: Repository name; when given and it does not contain ``"Base"``,
            ``<|im_end|>`` is used as the end-of-sequence token, otherwise
            ``<|endoftext|>``.
        apply_chat_template: Default for ``encode``'s ``chat_wrapped`` argument.
        add_generation_prompt: Append the assistant header to wrapped prompts.
        add_thinking: With a generation prompt, leave the thinking section open
            instead of appending an empty ``<think>`` block.
    """

    _SPECIALS = [
        "<|endoftext|>",
        "<|im_start|>", "<|im_end|>",
        "<|object_ref_start|>", "<|object_ref_end|>",
        "<|box_start|>", "<|box_end|>",
        "<|quad_start|>", "<|quad_end|>",
        "<|vision_start|>", "<|vision_end|>",
        "<|vision_pad|>", "<|image_pad|>", "<|video_pad|>",
    ]
    # Matches one "<|...|>" marker. The capture group makes re.split keep the
    # markers in its output. Both pipes must be escaped: an unescaped "|"
    # would turn the pattern into an alternation.
    _SPLIT_RE = re.compile(r"(<\|[^>]+?\|>)")

    def __init__(self, tokenizer_file_path="tokenizer.json", repo_id=None, apply_chat_template=True,
                 add_generation_prompt=False, add_thinking=False):
        self.apply_chat_template = apply_chat_template
        self.add_generation_prompt = add_generation_prompt
        self.add_thinking = add_thinking

        tok_file = Path(tokenizer_file_path)
        self._tok = Tokenizer.from_file(str(tok_file))
        self._special_to_id = {t: self._tok.token_to_id(t) for t in self._SPECIALS}

        self.pad_token_id = self._special_to_id["<|endoftext|>"]
        self.eos_token_id = self.pad_token_id

        if repo_id and "Base" not in repo_id:
            eos_token = "<|im_end|>"
        else:
            eos_token = "<|endoftext|>"
        if eos_token in self._special_to_id:
            self.eos_token_id = self._special_to_id[eos_token]

    def encode(self, text, chat_wrapped=None):
        """Return the token ids for ``text``.

        ``chat_wrapped`` overrides ``apply_chat_template`` for this call; when
        it is ``None`` the instance default is used. A string that is exactly
        one special token (ignoring surrounding whitespace) maps to that
        token's id and is never wrapped.
        """
        if chat_wrapped is None:
            chat_wrapped = self.apply_chat_template

        stripped = text.strip()
        if stripped in self._special_to_id and "\n" not in stripped:
            return [self._special_to_id[stripped]]

        if chat_wrapped:
            text = self._wrap_chat(text)

        ids = []
        # Special markers are mapped directly; the text between them goes
        # through the underlying tokenizer. Empty pieces are skipped.
        for part in filter(None, self._SPLIT_RE.split(text)):
            if part in self._special_to_id:
                ids.append(self._special_to_id[part])
            else:
                ids.extend(self._tok.encode(part).ids)
        return ids

    def decode(self, ids):
        """Convert token ids back to text, skipping special tokens."""
        return self._tok.decode(ids, skip_special_tokens=True)

    def _wrap_chat(self, user_msg):
        """Wrap ``user_msg`` as a user turn, optionally followed by the
        assistant header."""
        s = f"<|im_start|>user\n{user_msg}<|im_end|>\n"
        if self.add_generation_prompt:
            s += "<|im_start|>assistant"
            if self.add_thinking:
                s += "\n"
            else:
                # An empty think block precedes the answer when thinking is off.
                s += "\n<think>\n\n</think>\n\n"
        return s


In [None]:
# The tokenizer file is looked up in a folder named after the selected
# checkpoint ("-Base" suffix for the base model).
tokenizer_dir = f"Qwen3-{CHOOSE_MODEL}" if USE_REASONING_MODEL else f"Qwen3-{CHOOSE_MODEL}-Base"
tokenizer_file_path = f"{tokenizer_dir}/tokenizer.json"

# Fetch tokenizer.json into the local checkpoint folder.
hf_hub_download(repo_id=repo_id, filename="tokenizer.json", local_dir=local_dir)

# For the reasoning model, prompts get the assistant header and the thinking
# section is left open.
tokenizer_kwargs = dict(
    tokenizer_file_path=tokenizer_file_path,
    repo_id=repo_id,
    add_generation_prompt=USE_REASONING_MODEL,
    add_thinking=USE_REASONING_MODEL,
)
tokenizer = Qwen3Tokenizer(**tokenizer_kwargs)