<a href="https://colab.research.google.com/github/dev-chaitanya-dewangan/CHATGPT_TRANSFORM_FROM_SCRATCH/blob/main/LLM_FROM_SCRATCH.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DATA PREP

In [1]:
import requests
import re
url = "https://www.gutenberg.org/cache/epub/76137/pg76137.txt"
response = requests.get(url)

# Get the text content
full_text = response.text
first_index = full_text.find("PREFACE")
second_index = full_text.find("PREFACE", first_index + 1)
# Preview first 500 characters
book= full_text[second_index:]

In [2]:
preproccesed_cleaned_text = re.split(r'([,.:;?_!"()\']|--|\s)',book)
preporcessed_result=[item.strip() for item in preproccesed_cleaned_text if item.strip()]

# **EXTENED WITH UNK AND ENDOF TEXT**

In [3]:
preporcessed_sorted_result=sorted(set(preporcessed_result))
preporcessed_sorted_result.extend(["<|endoftext|>","<|unk|>"])

In [4]:

vocab={token:id for id,token in enumerate(preporcessed_sorted_result)}


# TOKENIZER


In [5]:
class Tokenizer:
  def __init__(self,vocab):
    self.encoded_vocab=vocab
    self.decoded_vocab={id:word for word,id in vocab.items()}

  def encode(self,text):
    text_cleaning=re.split(r'([,.:;?_!"()\']|--|\s)',text)
    preporccesed_text=[item.strip() for item in text_cleaning if item.strip()]
    tokenized_text = [self.encoded_vocab[i] for i in preporccesed_text]
    return tokenized_text
  def decode(self,tokens):
    text=" ".join([self.decoded_vocab[i] for i in tokens])
    text=re.sub('\s+([,.?!"()\'])',r'\1',text)
    return text

# NEW TOKENIZER EXTENDED WITH THE UNKOWN TEXT AND END OF TEXT\

In [54]:

class TokenizerV2:
  def __init__(self,vocab):
    self.encoded_vocab=vocab
    self.decoded_vocab={id:word for word,id in vocab.items()}

  def encode(self,text):
    text_cleaning=re.split(r'([,.:;?_!"()\']|--|\s)',text)
    preporccesed_text=[item.strip() for item in text_cleaning if item.strip()]
    # tokenized_text = [self.encoded_vocab[i] for i in preporccesed_text] CHANGED HERE
    tokenized_text = [i if i in self.encoded_vocab else "<|unk|>" for i in preporccesed_text]
    return tokenized_text
  def decode(self,tokens):
    text=" ".join([self.decoded_vocab[i] for i in tokens])
    text=re.sub('\s+([,.?!"()\'])',r'\1',text)
    return text


In [55]:
tokenizer=TokenizerV2(vocab)


In [56]:
import tiktoken
import torch
from torch.utils.data import Dataset,DataLoader
class GPTEncoder(Dataset):
  def __init__(self,txt,tokenizer,max_length,stride):
    self.input=[]
    self.target=[]
    tokenized_text=tokenizer.encode(txt,allowed_special={"<|endoftext|>"})
    for i in range(0,len(tokenized_text)-max_length,stride):
      input=tokenized_text[i:i+max_length]
      target=tokenized_text[i+1:1+i+max_length]
      self.input.append(torch.tensor(input))
      self.target.append(torch.tensor(target))
  def __len__(self):
    return len(self.input)
  def __getitem__(self,pos):
    return self.input[pos],self.target[pos]

def create_dataloader(txt,batch_size=4,max_length=256
                      ,stride=128,shuffle=True
                      ,drop_last=True,num_workers=0):
  tokenizer=tiktoken.get_encoding("gpt2")
  dataset = GPTEncoder(txt,tokenizer,max_length,stride)
  dataloader=DataLoader(
      dataset=dataset,
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers
  )
  return dataloader
book= full_text[second_index:]
batch_dataloader=create_dataloader(book,batch_size=1,max_length=12,stride=120,shuffle=False)
data_iter=iter(batch_dataloader)
# first=next(batch_dataloader)
print(next(data_iter))

[tensor([[   47, 31688, 11598,    13,   201,   198,   201,   198,   201,   198,
           464,  1204]]), tensor([[31688, 11598,    13,   201,   198,   201,   198,   201,   198,   464,
          1204,   286]])]


# **SIMPLIFIED SELF ATTENTION**

In [11]:
inputs = torch.tensor(
  [[3,5,9], # Your     (x^1)
   [5,7,6], # journey  (x^2)
   [7,5,4], # starts   (x^3)
   [2,8,3], # with     (x^4)
   [7,5,0], # one      (x^5)
   [5,0,5]] # step     (x^6)
,dtype=torch.float32)

atten_scores=inputs @ inputs.T
attention_weights=torch.softmax(atten_scores,dim=-1)
context=attention_weights @ atten_scores




# **GPT _CONFIG**

In [50]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

# **MULTIHEAD ATTENTION**

In [78]:
import torch.nn as nn
class MHAttention(nn.Module):
  def __init__(self,d_in,d_out,cfg):
    super().__init__()


    context_length=cfg['context_length']
    emb_size=cfg['emb_dim']
    qkv_bias=cfg['qkv_bias']

    self.d_out=d_out
    self.num_head=cfg['n_heads']
    self.head_dim=d_out//self.num_head
    assert d_out % self.num_head == 0, "d_out must be divisible by number of heads"

    self.W_key=nn.Linear(d_in,d_out,bias=qkv_bias)
    self.W_query=nn.Linear(d_in,d_out,bias=qkv_bias)
    self.W_value=nn.Linear(d_in,d_out,bias=qkv_bias)

    self.out_projection=nn.Linear(d_out,d_out)
    self.dropout=nn.Dropout(cfg['drop_rate'])
    self.register_buffer("mask",torch.triu(torch.ones(context_length,context_length)))

  def forward(self,x):
    b,num_token,d_in=x.shape

    key    =  self.W_key(x)
    query  =  self.W_query(x)
    value  =  self.W_value(x)

    key     = key.view(     b , num_token ,self.num_head , self.head_dim)
    query   = query.view(   b , num_token ,self.num_head , self.head_dim)
    value   = value.view(   b , num_token ,self.num_head , self.head_dim)

    key     = key.transpose(1,2)
    query   = query.transpose(1,2)
    value   = value.transpose(1,2)

    attention_score  = query@ key.transpose(2,3)
    mask_bool=self.mask.bool()[:num_token,:num_token]
    attention_score.masked_fill(mask_bool,-torch.inf)

    attention_weights     =torch.softmax(attention_score/key.shape[-1] * 0.5,dim=-1)
    attention_weights  =self.dropout(attention_weights)

    context_vec = (attention_weights @ value).transpose(1,2)

    context_vec = context_vec.contiguous().view(b,num_token,self.d_out)
    context_vec = self.out_projection(context_vec)

    return context_vec




mha = MHAttention(768,768,GPT_CONFIG_124M)
x = torch.randn(2, 1024, 768)
mha(x)

tensor([[[ 3.0145e-02, -1.1210e-02, -9.6214e-03,  ..., -4.1526e-02,
           1.5352e-02, -5.8658e-03],
         [ 2.6093e-02, -1.4181e-02, -5.6811e-03,  ..., -4.4064e-02,
           1.7080e-02,  5.4527e-03],
         [ 2.9279e-02, -1.5545e-02, -9.7792e-03,  ..., -3.8536e-02,
           1.3967e-02, -8.0223e-03],
         ...,
         [ 2.4948e-02, -1.2493e-02, -8.1560e-03,  ..., -4.9425e-02,
           1.5062e-02, -8.5777e-03],
         [ 2.3182e-02, -1.1238e-02, -9.4325e-03,  ..., -4.4405e-02,
           2.0213e-02, -1.3341e-03],
         [ 3.0325e-02, -1.1326e-02, -1.2362e-02,  ..., -3.8602e-02,
           2.1890e-02,  1.0033e-03]],

        [[ 2.0443e-02, -4.6763e-03,  3.0779e-03,  ..., -4.2905e-02,
           3.6219e-02,  6.9410e-04],
         [ 2.0991e-02,  2.3195e-03,  3.8737e-03,  ..., -5.1898e-02,
           3.1167e-02,  5.1967e-03],
         [ 2.0206e-02, -1.5335e-03,  9.7019e-03,  ..., -4.6209e-02,
           3.3122e-02, -3.6337e-05],
         ...,
         [ 1.3722e-02, -6

In [None]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

RuntimeError: shape '[2, 3, 12, 0]' is invalid for input of size 36

# **GELU FUNCTION**

In [29]:

class Gelu(nn.Module):
  def __init__(self):
    super().__init__()
  def forward(self,x):
    return 0.5 * x * (torch.tanh(1+torch.sqrt(torch.tensor(2.0/torch.pi)) *(x +0.044715 * torch.pow(x,3))))

In [33]:
class FeedForward(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.model = nn.Sequential(
        nn.Linear(cfg['emb_dim'],4*cfg['emb_dim']),
        Gelu(),
        nn.Linear(4*cfg['emb_dim'],cfg['emb_dim'])

    )
  def forward(self,x):
    return self.model(x)

In [45]:
x = torch.rand(12, 31, 768) #A
out=FeedForward(GPT_CONFIG_124M)
out=out(x)


# **SHORTCUT**

# **FULL CHATGPT MODEL IMPLEMENTED**

In [12]:
# #CONFIG FOR GPT2 144M
# GPT2_CONFIG_144M = {

# 	"vocab_size":50257,
# 	"n_heads":12,
# 	"n_layers":12,
# 	"emb_dim":768,
# 	"context_length":1024,
# 	"drop_rate":0.1,
# 	"ff_dim" :3072
# }

# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# class LayerNorm(nn.module):
# 	def __init__(self,dim,eps=1e-5):
# 		super().__init__()
# 		self.wieghts =nn.Parameter(torch.ones(dim))
# 		self.bias =nn.Parameter(torch.zeros(dim))
# 		self.eps=eps
# 	def forward(self,x):
# 		mean   =x.mean(-1,keepdim=True)
# 		var    =x.var(-1,keepdim=True,unbiased=False)
# 		x_norm =(x-mean)/torch.sqrt(var+self.eps)
# 		return self.weights*x_norm+self.bias
# class FeedForward(nn.Module):

# 	def __init__(self,cfg):
# 		super().__init__()
# 		self.net=nn.Sequential(
# 			nn.Linear(cfg['emb_dim'],cfg['ff_dim']),
# 			nn.GELU(),
# 			nn.Linear(cfg['ff_dim'],cfg['emb_dim']),
# 			nn.Dropout(cfg['drop_rate'])
# 			)
# 	def forward(self,x):
# 		return self.net(x)
# class MultiHeadAttention(nn.Module):
# 	def __init__(self,cfg):
# 		self.n_heads=cfg['n_heads']
# 		self.emb_dim=cfg['emb_dim']
# 		self.head_dim=self.emb_dim/self.n_heads
# 		assert self.head_dim*self.n_heads == self.emb_dim

# 		self.context_input=nn.Linear(self.emb_dim,3*self.emb_dim)
# 		self.output       =nn.Linear(self.emb_dim,self.emb_dim)
# 		self.dropout      =nn.Droptout(cfg['dropt_rate'])

# 		self.register_buffer("mask",torch.tril(torch.ones(cfg["context_length"],cfg["context_length"])).unsqueeze(0).unsqueeze(0))
# 	def forward(self,x):
# 		B,T,C=x.size()

# 		context_input=self.context_input(x)
# 		context_input=context_input.reshape(B,T,3,self.n_head,self.head_dim)
# 		context_input=context_input.permute(2,0,3,1,4)
# 		q,k,v=context_input[0],context_input[1],context_input[2]

# 		attn_scores=(q@k.transpose(-2,-1) / (self.head_dim**o.5))
# 		attn_probs =F.softmax(attn_scores,dim=-1)
# 		attn_probs =self.dropout(attn_probs)

# 		attn_output=attn_probs @ v
# 		attn_output=attn_output.transpose(1,2).reshape(B,T,C)

# 		output=self.output(attn_output)
# 		output=self.dropout(output)
# 		return output
# class TransformerBlock(nn.Module):
# 	def __init__(self,cfg):
# 		super().__init__()
# 		self.ln1=LayerNorm(cfg['emb_dim'])
# 		self.attn=MultiHeadAttention(cfg)
# 		self.ln2=LayerNorm(cfg['emb_dim'])
# 		self.ff=FeedForward(cfg)
# 	def forward(self,x):
# 		x=x+self.attn(self.ln1(x))
# 		x=x+self.ff(self.ln2(x))
# 		return x
# class GPT2Model(nn.Module):
# 	def __init__(self,cfg):
# 		super().__init__()
# 		self.tok_emb=nn.Embedding(cfg['vocab_size'],cfg['emb_dim'])
# 		self.pos_emb =nn.Embedding(cfg['context_length'],cfg['emb_dim'])
# 		self.drop    =nn.Dropout(cfg['drop_rate'])

# 		self.blocks=nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg['n_layers'])])
# 		self.ln_f=LayerNorm(cfg['emb_dim'])
# 		self.head =nn.Linear(cfg['emb_dim'])
# 	def forward(self,idx):
# 		B,T=idx.size()

# 		tok_emb=self.tok_emb(idx)
# 		pos=torch.arrange(T,device=idx.device)
# 		pos_emb=self.pos_emb(pos)

# 		x=tok_emb+pos_emb
# 		x=self.drop(x)
# 		x-self.blocks(x)
# 		x=self.ln_f(x)
# 		logits=self.head(x)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (<ipython-input-12-957674d0f928>, line 59)