<a href="https://colab.research.google.com/github/goelnikhils-lgtm/languagemodels/blob/main/Small_Language_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#developing Small Language Model from Scratch
#Scaling laws: model performance improves as the number of model parameters increases.
#https://www.youtube.com/watch?v=pOFcwcwtv3k&t=24s

In [None]:
#Step 1
#Import the dataset
!pip install datasets

In [None]:
from datasets import load_dataset
# Load the TinyStories dataset by its Hugging Face identifier. The result is
# iterated per split later in the notebook (`tokenized.items()`).
dataset = load_dataset("roneneldan/TinyStories")

In [None]:
#Step 2
#Data preprocessing for training a LLM
#Tokenize the dataset
#Tokenize the dataset into tokenIDs
#Create a file called "train.bin" and "validation.bin" where we store all the tokenIDs from the entire dataset
#Ensure that tokenIDs are stored on a disk , rather than on the RAM for efficient computations
!pip install tiktoken
import tiktoken
import os
import numpy as np
from tqdm.auto import tqdm

enc = tiktoken.get_encoding("gpt2") #bpe tokenizer
def process(example):
  """Tokenize one dataset record.

  Encodes ``example["text"]`` with the module-level ``enc`` tokenizer
  (``encode_ordinary``) and returns a dict holding the token ids under
  ``'ids'`` and their count under ``'len'``.
  """
  token_ids = enc.encode_ordinary(example["text"])
  return {'ids': token_ids, 'len': len(token_ids)}

# Tokenize the dataset and dump every split to "<split>.bin" on disk.
# The whole pipeline is skipped when "train.bin" already exists; the writing
# loop lives INSIDE this guard because it depends on `tokenized`, which is only
# defined when tokenization actually runs (otherwise a re-run of this cell
# would fail with NameError).
if not os.path.exists("train.bin"):
  tokenized = dataset.map(process, remove_columns=['text'], desc="tokenizing the splits", num_proc=8)
  #concatenate all the ids in each dataset into one large file for training
  for split, dset in tokenized.items():
    # total number of tokens in this split = required length of the on-disk array
    arr_len = int(np.sum(dset['len'], dtype=np.uint64))
    filename = f"{split}.bin"
    dtype = np.uint16 #using 2 bytes per token id
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,)) #memory mapped array
    total_batches = 1024
    idx = 0 # write cursor into `arr`
    for batch_idx in tqdm(range(total_batches), desc=f"writing {filename}"):
      # take one contiguous shard of the split and flatten its token lists
      batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True)
      arr_batch = np.concatenate(batch['ids'])
      #write into map
      arr[idx : idx + len(arr_batch)] = arr_batch
      idx += len(arr_batch)
    arr.flush() # make sure everything is written to disk
    #one word is not one token

In [None]:
#step 3: create input-output batches from dataset
def get_batch(split):
  """Sample one random batch of (input, target) token windows.

  Args:
    split: ``'train'`` reads ``train.bin``; any other value reads
      ``validation.bin``.

  Returns:
    Tuple ``(x, y)`` of int64 tensors of shape ``(batch_size, block_size)``
    placed on ``device``; ``y`` is ``x`` shifted one token to the right.

  Relies on the module-level ``batch_size``, ``block_size``, ``device`` and
  ``device_type`` settings.
  """
  #re-create the np.memmap on every call to avoid a memory leak (as suggested on Stack Overflow)
  filename = 'train.bin' if split == 'train' else 'validation.bin'
  data = np.memmap(filename, dtype=np.uint16, mode='r')
  # random start offsets; converted to plain ints for numpy slicing
  starts = torch.randint(len(data) - block_size, (batch_size,)).tolist()
  x = torch.stack([torch.from_numpy(data[i:i + block_size].astype(np.int64)) for i in starts]) #block_size is the context size
  y = torch.stack([torch.from_numpy(data[i + 1:i + 1 + block_size].astype(np.int64)) for i in starts]) #target shifted by 1 token for prediction
  if device_type == 'cuda':
    # pinned host memory allows the host-to-device copy to be non-blocking
    x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
  else:
    x, y = x.to(device), y.to(device)
  return x, y

In [None]:
#step 4: define the SLM model architecture
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from dataclasses import dataclass
import numpy as np
from tqdm.auto import tqdm
from contextlib import nullcontext
import os

class LayerNorm(nn.Module):
  """Layer normalization over the last dimension with an optional bias term.

  Args:
    ndim: size of the normalized (last) dimension.
    bias: when falsy, no learnable bias is created and ``self.bias`` is None.
  """
  def __init__(self, ndim, bias):
    super().__init__()
    self.weight = nn.Parameter(torch.ones(ndim))
    if bias:
      self.bias = nn.Parameter(torch.zeros(ndim))
    else:
      self.bias = None

  def forward(self, x):
    # normalize over a trailing dimension matching the weight's shape, eps = 1e-5
    normalized_shape = self.weight.shape
    return F.layer_norm(x, normalized_shape, weight=self.weight, bias=self.bias, eps=1e-5)

class CausalSelfAttention(nn.Module):
  """Multi-head causal self-attention.

  Reads ``n_embd``, ``n_head``, ``dropout`` and ``block_size`` from ``config``.
  Uses ``F.scaled_dot_product_attention`` when the installed PyTorch provides
  it, otherwise falls back to an explicit masked-softmax implementation.
  """
  def __init__(self, config):
    super().__init__()
    assert config.n_embd % config.n_head == 0
    # single projection producing query, key and value for all heads at once
    self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
    # output projection applied after the heads are merged
    self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
    self.attn_dropout = nn.Dropout(config.dropout)
    self.resid_dropout = nn.Dropout(config.dropout)
    self.n_head = config.n_head
    self.n_embd = config.n_embd
    # keep the raw probability: scaled_dot_product_attention needs a float, not a module
    self.dropout = config.dropout
    self.flash = hasattr(F, 'scaled_dot_product_attention')
    if not self.flash:
      # lower-triangular mask used by the manual attention path
      self.register_buffer('bias', torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

  def forward(self, x):
    """Apply causal self-attention to ``x`` of shape (B, T, C); returns (B, T, C)."""
    B, T, C = x.size()
    q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
    # (B, T, C) -> (B, n_head, T, head_dim)
    head_dim = C // self.n_head
    k = k.view(B, T, self.n_head, head_dim).transpose(1, 2)
    q = q.view(B, T, self.n_head, head_dim).transpose(1, 2)
    v = v.view(B, T, self.n_head, head_dim).transpose(1, 2)
    if self.flash:
      # the functional API does not know about train/eval mode, so dropout
      # must be switched off explicitly outside of training
      y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0.0, is_causal=True)
    else:
      att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
      att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
      att = F.softmax(att, dim=-1)
      att = self.attn_dropout(att)
      y = att @ v
    # merge heads back: (B, n_head, T, head_dim) -> (B, T, C)
    y = y.transpose(1, 2).contiguous().view(B, T, C)
    y = self.resid_dropout(self.c_proj(y))
    return y

class MLP(nn.Module):
  """Position-wise feed-forward network: expand to 4x width, GELU, project back, dropout."""
  def __init__(self, config):
    super().__init__()
    width = config.n_embd
    hidden = 4 * width
    self.c_fc = nn.Linear(width, hidden, bias=False) #expansion
    self.gelu = nn.GELU()
    self.c_proj = nn.Linear(hidden, width, bias=False) #contraction
    self.dropout = nn.Dropout(config.dropout)

  def forward(self, x):
    expanded = self.c_fc(x)
    activated = self.gelu(expanded)
    projected = self.c_proj(activated)
    return self.dropout(projected)

class Block(nn.Module): #transformer block
  """Pre-norm transformer block: LayerNorm -> attention and LayerNorm -> MLP, each wrapped in a residual connection."""
  def __init__(self, config):
    super().__init__()
    use_bias = config.bias
    self.ln_1 = LayerNorm(config.n_embd, bias=use_bias) #normalization before attention
    self.attn = CausalSelfAttention(config) #causal attention
    self.ln_2 = LayerNorm(config.n_embd, bias=use_bias) #normalization before the MLP
    self.mlp = MLP(config) #feed-forward sub-layer

  def forward(self, x):
    attn_out = self.attn(self.ln_1(x))
    x = x + attn_out #shortcut connection around attention
    mlp_out = self.mlp(self.ln_2(x))
    return x + mlp_out #shortcut connection around the MLP

@dataclass
class GPTConfig:
  """Hyper-parameters consumed by the model classes in this file."""
  # maximum sequence length: size of the positional embedding table and the
  # limit asserted in GPT.forward
  block_size: int
  # number of token ids: rows of the token embedding / output logits
  vocab_size: int
  # number of stacked transformer Blocks
  n_layer: int
  # number of attention heads; must divide n_embd
  n_head: int
  # embedding / hidden width
  n_embd: int
  # dropout probability passed to every nn.Dropout
  dropout: float = 0.0
  # whether the LayerNorm modules get a bias term (the Linear layers in this
  # file are created with bias=False regardless)
  bias: bool = True

class GPT(nn.Module):
  """Decoder-only transformer language model.

  Built from token + positional embeddings, ``config.n_layer`` transformer
  Blocks, a final LayerNorm and a linear output head whose weight is shared
  with the token embedding.
  """
  def __init__(self, config):
    super().__init__()
    self.config = config
    self.transformer = nn.ModuleDict(dict(
      wte = nn.Embedding(config.vocab_size, config.n_embd), #token embeddings
      wpe = nn.Embedding(config.block_size, config.n_embd), #positional embeddings
      drop = nn.Dropout(config.dropout),
      h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
      ln_f = LayerNorm(config.n_embd, bias=config.bias),
    ))
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) #output head producing logits
    self.transformer.wte.weight = self.lm_head.weight #weight tying

    self.apply(self._init_weights)
    # projections that feed a residual connection get a smaller std, scaled by depth
    for pn, p in self.named_parameters():
      if pn.endswith('c_proj.weight'):
        torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer)) #parameter initialization using a normal distribution

  def _init_weights(self, module):
    """Initialise Linear / Embedding weights from N(0, 0.02) and zero Linear biases.

    Named with a single underscore so that ``self.apply(self._init_weights)``
    in ``__init__`` resolves (a double-underscore name would be mangled).
    """
    if isinstance(module, nn.Linear):
      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
      if module.bias is not None:
        torch.nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

  def forward(self, idx, targets=None):
    """Run the model on token ids ``idx`` of shape (B, T).

    Returns ``(logits, loss)``. With ``targets`` given, logits cover every
    position and ``loss`` is the cross-entropy against ``targets``. Without
    targets only the last position is projected (logits of shape (B, 1, vocab))
    and ``loss`` is None.
    """
    device = idx.device
    b, t = idx.size()
    assert t <= self.config.block_size, "context length exceeded"
    pos = torch.arange(0, t, dtype=torch.long, device=device)
    tok_emb = self.transformer.wte(idx)
    pos_embd = self.transformer.wpe(pos)
    x = self.transformer.drop(tok_emb + pos_embd)
    for block in self.transformer.h:
      x = block(x)
    x = self.transformer.ln_f(x)
    if targets is not None:
      logits = self.lm_head(x)
      loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
      return logits, loss
    else:
      # inference: only the final position is needed to predict the next token
      logits = self.lm_head(x[:, [-1], :])
      return logits, None

  @torch.no_grad()
  def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None): #top_k - decoding strategy, temperature - softmax sharpness
    """
    Generate tokens given a conditioning sequence.

    idx: Tensor of shape (B, T) holding the prompt token ids.
    max_new_tokens: number of tokens to append.
    temperature: logits are divided by this value before the softmax.
    top_k: when set, only the ``top_k`` most likely tokens can be sampled.
    Returns a Tensor of shape (B, T + max_new_tokens).
    """
    for _ in range(max_new_tokens):
      # crop the context to the last block_size tokens if it grew too long
      idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
      logits, _ = self(idx_cond)
      logits = logits[:, -1, :] / temperature
      if top_k is not None:
        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
        # everything below the k-th largest logit gets zero probability
        logits[logits < v[:, [-1]]] = -float('Inf')
      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1)
    return idx

In [None]:
# Model hyper-parameters for the small language model.
config = GPTConfig(
  vocab_size = 50257, #use the tokenizer's vocab size
  block_size = 128,   #context size
  n_layer = 6, #no of transformer blocks
  n_head = 6, #no of attention heads - MHA
  n_embd = 384,
  dropout = 0.1,
  bias = True
  )
# The optimizer and training cells refer to the network as `model`, so bind it
# under that name; `gpt` is kept as an alias for the original name.
model = GPT(config)
gpt = model

In [None]:
#Step 5: Define the loss function
def estimate_loss(model):
  """Estimate the mean loss on the train and validation splits.

  Puts ``model`` in eval mode, draws ``eval_iters`` random batches per split
  via ``get_batch`` and averages their losses under the autocast context
  ``ctx``; the model is switched back to train mode before returning.

  Returns:
    dict mapping ``'train'`` and ``'val'`` to a 0-dim tensor with the mean loss.
  """
  out = {}
  model.eval()
  with torch.inference_mode():
    for split in ['train', 'val']:
      losses = torch.zeros(eval_iters)
      for k in range(eval_iters):
        X, Y = get_batch(split)
        with ctx:
          logits, loss = model(X, Y)
        losses[k] = loss.item()
      # average once per split, after all batches have been evaluated
      out[split] = losses.mean()
  model.train()
  return out

In [None]:
#Step 6: define the SLM training configuration
#Training uses Automatic Mixed Precision (torch.amp.autocast) on CUDA: operations run in the
#reduced-precision dtype selected below; on CPU no autocast context is used.
max_iters = 20000 #number of training-loop iterations
warm_up_steps = 1000 #length of the linear learning-rate warm-up (earlier 100) for a smoother initial phase
min_lr = 5e-4 #floor of the cosine learning-rate decay
eval_iters = 500 #used both as the evaluation interval and as the number of batches averaged per evaluation
batch_size = 32 #sequences per batch
block_size = 128 #tokens per sequence (context length)
gradient_accumulation_steps = 32 #gradients are accumulated over this many iterations before each optimizer step
learning_rate = 1e-3 #peak learning rate
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device_type = 'cuda' if 'cuda' in device else 'cpu' #for later use in torch.autocast
#note: the GradScaler created later is enabled only when dtype is 'float16'

#prefer bfloat16 when the GPU supports it; otherwise the name falls back to 'float16' (also on CPU-only machines)
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
#autocast context for forward passes; a no-op context on CPU
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
torch.set_default_device(device) #newly created tensors default to this device
torch.manual_seed(42) #reproducible initialisation and sampling

In [None]:
#step 7: define SLM Training Configuration Part 2
from torch.optim.lr_scheduler import LinearLR,SequentialLR, CosineAnnealingLR
#weight decay of 0.1 is applied and beta2 is set to 0.95
#AdamW is an adaptive-learning-rate optimizer with decoupled weight decay
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.9, 0.95), weight_decay=0.1)
#linear warm-up from 0.0001 * lr up to lr over warm_up_steps scheduler steps
scheduler_warmup = LinearLR(optimizer, start_factor=0.0001, end_factor=1, total_iters=warm_up_steps)
#cosine decay from lr down to min_lr over the remaining steps
scheduler_decay = CosineAnnealingLR(optimizer, T_max=max_iters-warm_up_steps, eta_min=min_lr)
#chain the two schedules: warm-up first, then cosine decay starting at step warm_up_steps
scheduler = SequentialLR(optimizer, schedulers=[scheduler_warmup, scheduler_decay], milestones=[warm_up_steps])
#loss scaling is only active for float16; with any other dtype the scaler is created disabled
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

In [None]:
#Step 8: Pre-Train SLM
best_val_loss = float('inf')
best_model_params_path = "best_model_params.pt"
train_loss_list , validation_loss_list = [] , []

#ensure model is on correct device
model = model.to(device)

#training loop: `epoch` counts iterations (one micro-batch each), not passes over the dataset
for epoch in tqdm(range(max_iters)):
  #periodic evaluation and checkpointing of the best model so far
  if epoch % eval_iters == 0 and epoch != 0:
    losses = estimate_loss(model)
    print(f"step {epoch}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    print(f"the current learning rate:{optimizer.param_groups[0]['lr']:.5f}")
    train_loss_list.append(losses['train'])
    validation_loss_list.append(losses['val'])
    if losses['val'] < best_val_loss:
      best_val_loss = losses['val']
      torch.save(model.state_dict(), best_model_params_path)

  #the training step must run on EVERY iteration, so it lives outside the
  #evaluation branch above
  #ensure X and y are on the correct device
  X, y = get_batch("train")
  X, y = X.to(device), y.to(device)
  with ctx:
    logits, loss = model(X, y)
    #scale the loss so the accumulated gradient is an average over the micro-batches
    loss = loss / gradient_accumulation_steps
  #backward pass outside autocast, as recommended for AMP
  scaler.scale(loss).backward()
  if ((epoch + 1) % gradient_accumulation_steps) == 0 or (epoch + 1 == max_iters):
    #gradients must be unscaled before clipping, otherwise the threshold is
    #compared against loss-scaled values (no-op when the scaler is disabled)
    scaler.unscale_(optimizer)
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
    scaler.step(optimizer) #update the parameters
    scaler.update() #adjust the loss scale for the next step
    optimizer.zero_grad(set_to_none=True)
  scheduler.step()

In [None]:
#Step 9: Plot the SLM Loss Function


In [None]:
#step 10 model inference
#load the model
model = GPT(config)
device = "cuda" if torch.cuda.is_available() else "cpu"
best_model_params_path = "best_model_params.pt"
model.load_state_dict(torch.load(best_model_params_path, map_location=torch.device(device))) # load best model states
model = model.to(device)
model.eval() # disable dropout so that it does not perturb generation

sentence = "Once upon a time there was a pumpkin."
# encode the prompt and add a batch dimension: shape (1, T)
context = torch.tensor(enc.encode_ordinary(sentence), device=device).unsqueeze(dim=0)
y = model.generate(context, 200) # append 200 sampled tokens
print(enc.decode(y.squeeze().tolist()))

In [None]:
import torch
import torch.nn as nn

class MyModel(nn.Module):
    """Toy module (Linear + BatchNorm1d) used to illustrate ``state_dict`` keys."""

    def __init__(self):
        super().__init__()
        # registration order determines the order of the state_dict keys
        self.linear = nn.Linear(10, 1)
        self.bn = nn.BatchNorm1d(10)

    def forward(self, x):
        normalized = self.bn(x)
        return self.linear(normalized)

# NOTE(review): this rebinds the global name `model` to the toy module,
# replacing whatever `model` referred to before this cell was run.
model = MyModel()
# state_dict() maps parameter and buffer names to their tensors
state_dict = model.state_dict()

print(state_dict.keys())
# Example output: odict_keys(['linear.weight', 'linear.bias', 'bn.weight', 'bn.bias', 'bn.running_mean', 'bn.running_var', 'bn.num_batches_tracked'])

odict_keys(['linear.weight', 'linear.bias', 'bn.weight', 'bn.bias', 'bn.running_mean', 'bn.running_var', 'bn.num_batches_tracked'])
