In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import seaborn
import inspect
import torch.nn as nn
from dataclasses import  dataclass
import math
from torch.distributed import init_process_group
import os

In [27]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [28]:
with  open('/content/input (2).txt', mode = 'r', encoding = 'utf-8') as f:
  text =  f.read()

In [29]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [30]:
chars = sorted(list(set(text[:])))

In [31]:
print(f'unique chars:{len(chars)}')
print(f'----------------------')
print(f'lenght of text:{len(text)}')

unique chars:65
----------------------
lenght of text:1115394


In [32]:
!pip install tiktoken



In [33]:
import tiktoken

In [34]:
enc =  tiktoken.encoding_for_model("gpt2")

In [35]:
def encode(text):
  return enc.encode(text)

In [36]:
def decode(ids):
  return enc.decode(ids)

In [37]:
stoi = {i:v for v, i in enumerate(chars)}

In [38]:
class DataLoaderLite:

  def __init__(self , B, T):
    self.B    =  B
    self.T    =  T
    self.path = "/content/input (2).txt"
    self.tokenizer =  tiktoken.encoding_for_model("gpt2")

    with open(self.path , mode =  'r', encoding = 'utf-8') as f:
      text  =  f.read()
    self.tokens =  self.tokenizer.encode(text[:])
    self.tokens =  torch.tensor(self.tokens,dtype  = torch.long)
    print(f'Loading :{len(self.tokens)} tokens')
    print(f'1 Epoch = {len(self.tokens) // self.B * self.T}')
    self.current_batch =  0

  def next_batch(self):
    B, T = self.B, self.T
    buf  = self.tokens[self.current_batch:B*T+1+self.current_batch]
    x    = buf[:-1].view(B, T)
    y    = buf[1:].view(B, T)
    self.current_batch+=B*T
    if len(self.tokens) < (B*T+1) + self.current_batch:
      self.current_batch =  0

    return x, y

In [39]:
if torch.cuda.is_available():
  device =  "cuda"
else:
  device = "cpu"
print(f"Using device:{device}")

Using device:cpu


In [40]:
enc =  tiktoken.encoding_for_model('gpt2')

In [41]:
@dataclass
class GPTConfig:
  vocab_size:int = enc.n_vocab
  n_embd:int = 64
  head_size:int = 64
  n_layers:int= 8
  n_heads:int=8
  block_size:int = 8

In [42]:
class Attention(nn.Module):

  def __init__(self,head_size):

    super().__init__()
    self.config =  GPTConfig()
    self.head_size =  head_size
    self.query     =  nn.Linear(self.config.n_embd, self.head_size, bias  =  False)
    self.key       =  nn.Linear(self.config.n_embd, self.head_size, bias  =  False)
    self.value     =  nn.Linear(self.config.n_embd, self.head_size, bias  =  False)
    self.register_buffer('tril', torch.tril(torch.ones(self.config.block_size, self.config.block_size)))
    self.wei =  0

  def forward(self , x):

    B,T,C =  x.shape
    q = self.query(x) #INPUT-x-->(B,T,C) @ (C,H) ----> (B*T,C) @ (C,H)--->(B,T,H)
    k = self.key(x)   #INPUT-x-->(B,T,C) @ (C,H) ----> (B*T,C) @ (C,H)--->(B,T,H)
    v = self.value(x) #INPUT-x-->(B,T,C) @ (C,H) ----> (B*T,C) @ (C,H)--->(B,T,H)

    self.wei =  q @ k.transpose(-2,-1)*C**-0.5 #(B,T,H ) @(B,T,H)--->(B,T,H) @ (B,H,T)---->(B,T,T)
    wei =  self.wei.masked_fill(self.tril[:T,:T]==0 , float('-inf')) # (B,T ,T)
    wei =  F.softmax(wei , dim=-1)
    out =  wei @ v  #(B, T, T) @ (B, T, H)---->(B,T,H)
    #out =  F.scaled_dot_product_attention(q, k, v, is_causal =  True) #(B, T, H) #  FLASH_ATTENTION
    return out

In [43]:
class MultiHeadAttention(nn.Module):

  def __init__(self):

    super().__init__()
    self.config    = GPTConfig()
    self.head_size = self.config.n_embd // self.config.n_heads
    self.heads     = nn.ModuleList([Attention(self.head_size) for _ in range(self.config.n_heads)])
    self.proj      = nn.Linear(self.head_size * self.config.n_heads ,self.config.n_embd)
    self.proj.NANOGPT_SCALE_INIT =  1

  def  forward(self, x):
    out =  torch.cat([h(x) for h in self.heads], dim =  -1)
    return self.proj(out) # (B*T,H) @ (C,C)---> (B,T,H)

In [44]:
class MLP(nn.Module):

  def __init__(self):

    super().__init__()
    self.config = GPTConfig()
    self.mlp   = nn.Linear(self.config.n_embd,4 * self.config.n_embd)
    self.gelu  = nn.GELU()
    self.proj  = nn.Linear(4 *  self.config.n_embd, self.config.n_embd)
    self.proj.NANOGPT_SCALE_INIT=1

  def forward(self  ,x):
    x = self.mlp(x)  # (B,T,C) @ (C,C*4)---->(B,T,C*4)
    x = self.gelu(x) # (B,T,C) ----> (B,T,C)
    x = self.proj(x) # (B,T,C*4) @ (C*4,C) ----- >(B,T,C)

    return x

In [45]:
class Block(nn.Module):

  def __init__(self):
    super().__init__()
    self.config  = GPTConfig()
    self.mlp =  MLP()
    self.mha = MultiHeadAttention()
    self.ln1 = nn.LayerNorm(self.config.n_embd)
    self.ln2 = nn.LayerNorm(self.config.n_embd)

  def forward(self , x):
    x =  x +  self.mha(self.ln1(x)) #INPUT---> (B,T,C/n_heads)--->OUTPUT->(B,T,C*n_heads) + (B,T,C)
    x =  x +  self.mlp(self.ln2(x)) #INPUT(B,T,C)---->OUTPUT(B,T,C)

    return x

In [46]:
class GPT2(nn.Module):

  def __init__(self, config):
    super().__init__()
    self.config =  config

    self.transformer = nn.ModuleDict(dict(
        wte = nn.Embedding(config.vocab_size, config.n_embd),
        wpe = nn.Embedding(config.block_size, config.n_embd),
        bl  = nn.ModuleList([Block() for _ in range(self.config.n_layers)]),
        lnu = nn.LayerNorm(self.config.n_embd)
    ))

    self.lin  = nn.Linear(self.config.n_embd, self.config.vocab_size)
    self.transformer.wte.weight =  self.lin.weight
    self.apply(self._init_weights)


  def forward(self , idx, targets =  None):
    B, T       =  idx.shape
    tokens_emb =  self.transformer.wte(idx) # INPUT (B,T)----> OUTPUT (B,T,C)
    pos_emb    =  self.transformer.wpe(torch.arange(T, device = device)) #INPUT(T)--->OUTPUT(B,T)
    x          =  tokens_emb +  pos_emb #(B, T) + (B, T, C)----> (B,T ,C)
    for h in self.transformer.bl:
      x = h(x)
    logits     =  self.lin(x) #INPUT(B, T, C)--->(B*T,C)----->(B,T,C)

    if targets is None:
      loss = None

    else:
      B, T ,C =  logits.shape
      logits  = logits.view(B*T, C) #   (B,T ,C)---->(B*T,C)
      targets = targets.view(B*T)   #   (B, T)--->(B)
      loss    =  F.cross_entropy(logits, targets)

    return logits , loss

  def _init_weights(self, module):
    if isinstance(module , nn.Linear):
      std=0.02
      if hasattr(module ,"NANOGPT_SCALE_INIT"):
        std*=(2*self.config.n_layers)**-0.5
      torch.nn.init.normal_(module.weight, mean =  0.0 , std = std)
      if module.bias is not None:
        torch.nn.init.zeros_(module.bias)
    elif isinstance(module , nn.Embedding):
      torch.nn.init.normal_(module.weight, mean = 0.0 ,std =  0.02)

  def configure_optimizers(self, weight_decay , device, learning_rate):
    param_dict = {pn:p for pn,p  in self.named_parameters()}
    param_dict = {pn:p for pn,p in param_dict.items() if p.requires_grad}

    decayed_params   = [p for n,p in param_dict.items() if p.ndim >=2 ]
    nodecayed_params = [p for n,p in param_dict.items() if p.ndim <=1 ]

    options_groups = [
        {"params":decayed_params,"weight_decay":weight_decay},
        {"params":nodecayed_params,"weight_decay":0.0}
    ]

    num_decayed_params   = sum(p.numel() for p  in decayed_params)
    num_nodecayed_params = sum(p.numel() for p  in nodecayed_params)

    print(f'num tensor decayed_params:{len(decayed_params)} with:{num_decayed_params}')
    print(f'------------------------------------------------')
    print(f'num no tensor decayed_params:{len(nodecayed_params)} with:{num_nodecayed_params}')

    fused_available =  "fused"  in inspect.signature(torch.optim.AdamW).parameters
    using_fused           = fused_available and "cuda" in device
    print(f"Using fused AdamW:{using_fused}")

    optim = torch.optim.AdamW(options_groups,lr = learning_rate, betas = (0.9,0.95),eps = 1e-8,fused = using_fused)
    return optim


  def generate(self, idx, max_new_tokens):
    for i in range(max_new_tokens):
      idx_cond      = idx[:,-GPTConfig.block_size:]
      logits,  loss =  self(idx_cond)
      logits        =  logits[:,-1,:] #(B,T,C) ---->(B,C)
      probs         =  F.softmax(logits, dim =-1 ) #(B, C)------>(B,C)
      next_idx      =  torch.multinomial(probs, num_samples =  1) #(B,C)--->(B,1)
      idx           =  torch.cat([idx,next_idx], dim=1)
    return idx

In [47]:
gpt2 =  GPT2(GPTConfig)
gpt2.to(device)
gpt2 = torch.compile(gpt2)

In [48]:
@dataclass
class CosineDecayParamaters:
  max_steps:int=1000
  max_lr:float=3e-4
  min_lr:float=max_lr * 0.1
  warmup_steps:int=10

In [49]:
op    =  CosineDecayParamaters()
optim = gpt2.configure_optimizers(0.01,device, learning_rate = 6e-4)

num tensor decayed_params:218 with:3610176
------------------------------------------------
num no tensor decayed_params:59 with:55505
Using fused AdamW:False


In [50]:
torch.manual_seed(1337)
D = DataLoaderLite(4,8)

Loading :338025 tokens
1 Epoch = 676048


In [51]:
ddp = int(os.environ.get('RANK',-1))!=-1

In [52]:
if ddp :
  assert torch.cuda.is_available()
  init_process_group(backend='nccl')
  ddp_rank = int(os.environ['RANK'])
  ddp_local_rank = int(os.environ['LOCAL_RANK'])
  ddp_world_size = int(os.environ['WORLD_SIZE'])
  device         = f"cuda:{ddp_local_rank}"
  torch.cuda.set_device(device)
  master_process = ddp_rank==0
else:
  ddp_rank =  0
  ddp_local_rank = 0
  master_process =  True
  ddp_world_size = 1

  device =  "cpu"
  if torch.cuda.is_available():
    device = "cuda"
  elif  hasattr(torch.backends,"mps") and torch.backends.mps.is_available():
    device ="mps"
  print(f'using device:{device}')

using device:cpu


In [53]:
TOTAL_BATCHES = 512
assert TOTAL_BATCHES % (D.B*D.T*ddp_world_size) == 0
grad_acc = TOTAL_BATCHES //  (D.B * D.T * ddp_world_size)
print(f'total batches:{TOTAL_BATCHES}')
print(f'gradient accumulation steps:{grad_acc}')

total batches:512
gradient accumulation steps:16


In [54]:
def get_lr(it):
  if op.warmup_steps > it:
    return op.max_lr * (it + 1) / op.warmup_steps

  if it >  op.max_steps:
    return op.min_lr

  decay_ratio = (it - op.warmup_steps) / (op.max_steps -  op.warmup_steps)
  assert 0<=decay_ratio<=1
  coeff =  0.5*(1.0 +  math.cos(math.pi * decay_ratio))
  return op.min_lr + coeff*(op.max_lr - op.min_lr)

In [None]:
for steps in range(op.max_steps):
  loss_acc = 0.0
  optim.zero_grad(set_to_none = True)
  for grad_steps in range(grad_acc):
    xb,   yb = D.next_batch()
    xb,  yb  =  xb.to(device), yb.to(device)
    with torch.autocast(device_type =  device, dtype = torch.bfloat16):
        logits , loss = gpt2(xb, yb)
        loss =  loss / grad_acc
        loss_acc+= loss.detach()
    loss.backward()
  norm =  torch.nn.utils.clip_grad_norm_(gpt2.parameters(), 0.1)
  lr   = get_lr(steps)
  for p in optim.param_groups:
    p['lr'] = lr
  optim.step()
  print(f'loss {loss_acc.item():.4f} | norm:{norm:.4f} | lr {lr:.4f}')

loss 10.8239 | norm:0.2267 | lr 0.0000
loss 10.8239 | norm:0.2025 | lr 0.0001
loss 10.8221 | norm:0.2272 | lr 0.0001
loss 10.8204 | norm:0.2170 | lr 0.0001
loss 10.8184 | norm:0.2259 | lr 0.0001
loss 10.8135 | norm:0.2599 | lr 0.0002
loss 10.8049 | norm:0.3125 | lr 0.0002
loss 10.7993 | norm:0.2950 | lr 0.0002
loss 10.7907 | norm:0.3120 | lr 0.0003
loss 10.7734 | norm:0.3912 | lr 0.0003
loss 10.7501 | norm:0.4920 | lr 0.0003
loss 10.7175 | norm:0.6236 | lr 0.0003
loss 10.7023 | norm:0.6095 | lr 0.0003
loss 10.6462 | norm:0.8406 | lr 0.0003
loss 10.6121 | norm:0.8943 | lr 0.0003
loss 10.5466 | norm:1.0890 | lr 0.0003
loss 10.5114 | norm:1.1314 | lr 0.0003
loss 10.4447 | norm:1.2600 | lr 0.0003
loss 10.3609 | norm:1.4479 | lr 0.0003
loss 10.2377 | norm:1.8688 | lr 0.0003
loss 10.1311 | norm:1.9408 | lr 0.0003
loss 9.9658 | norm:2.4852 | lr 0.0003
loss 9.9991 | norm:2.0644 | lr 0.0003
loss 9.7494 | norm:2.5889 | lr 0.0003
loss 9.5276 | norm:3.0951 | lr 0.0003
loss 9.3523 | norm:3.4839 | l

In [None]:
print(decode(gpt2.generate(torch.zeros(1,device=device ,dtype =  torch.long).repeat(2).unsqueeze(1),max_new_tokens=1000)[0].tolist()))

In [None]:
param_dict    =   {pn:p for pn,p in gpt2.named_parameters()}
param_dict    =   {pn:p for pn,p in param_dict.items() if p.requires_grad}

In [None]:
att = gpt2.transformer.bl[0].mha.heads[0].wei[0]

In [None]:
xb , yb = D.next_batch()

In [None]:
xb[0]

In [None]:
tokens =  [i.item() for i in xb[0]]

In [None]:
tokens

In [None]:
T = []
for t in tokens:
  x = decode([t])
  T.append(x)

In [None]:
T

In [None]:
att_transpose =  att.T

In [None]:
att

In [None]:
att_transpose = att

In [None]:
att_matrix =  pd.DataFrame({
    f"{T[0]}":[i.item() for i  in att_transpose[:,0]],
    f"{T[1]}":[i.item() for i  in att_transpose[:,1]],
    f"{T[2]}":[i.item() for i  in att_transpose[:,2]],
    f"{T[3]}":[i.item() for i  in att_transpose[:,3]],
    f"{T[4]}":[i.item() for i  in att_transpose[:,4]],
    f"{T[5]}":[i.item() for i  in att_transpose[:,5]],
    f"{T[6]}":[i.item() for i  in att_transpose[:,6]],
    f"{T[7]}":[i.item() for i  in att_transpose[:,7]],
})

In [None]:
att_matrix

In [None]:
att_matrix_transpose = {}
for p in range(8):
  att_matrix_transpose[T[p]] = [i.item() for i in att_matrix[T,:]]

In [None]:
att_matrix

In [None]:
import seaborn as sns
import numpy as np

In [None]:
A =  np.asarray(att_matrix.iloc[0,:])

In [None]:
A

In [None]:
np.sum(A,axis =  0)

In [None]:
corr_matrix =  att_matrix.corr()

In [None]:
corr_matrix

In [None]:
sns.heatmap(att_matrix,xticklabels=corr_matrix.columns.values , yticklabels=corr_matrix.columns.values,annot=True)