<a href="https://colab.research.google.com/github/dev-chaitanya-dewangan/CHATGPT_TRANSFORM_FROM_SCRATCH/blob/main/LLM_FROM_SCRATCH.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DATA PREP

In [33]:
import requests
import re
# Source text: a Project Gutenberg plain-text e-book.
url = "https://www.gutenberg.org/cache/epub/76137/pg76137.txt"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being tokenized as the book.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Get the text content
full_text = response.text

# The book body is taken from the SECOND occurrence of "PREFACE"
# (presumably the first hit is in the front matter -- verify against the file).
first_index = full_text.find("PREFACE")
second_index = full_text.find("PREFACE", first_index + 1)
if first_index == -1 or second_index == -1:
  # str.find returns -1 when nothing matches; slicing with -1 would silently
  # reduce the book to its last character.
  raise ValueError('Expected at least two occurrences of "PREFACE" in the downloaded text')

# Everything from the second "PREFACE" to the end of the file.
book = full_text[second_index:]

In [34]:
# Split on punctuation, double dashes and whitespace. The capture group makes
# re.split keep the delimiters, so each punctuation mark becomes its own token.
preproccesed_cleaned_text = re.split(r'([,.:;?_!"()\']|--|\s)', book)
# Trim every piece and discard the ones that are empty after trimming.
preporcessed_result = [
    piece for piece in map(str.strip, preproccesed_cleaned_text) if piece
]

# **EXTENDED WITH UNK AND END-OF-TEXT TOKENS**

In [35]:
# Unique tokens in sorted order, followed by the two special tokens so that
# they receive the two highest ids.
preporcessed_sorted_result = [
    *sorted(set(preporcessed_result)),
    "<|endoftext|>",
    "<|unk|>",
]

In [36]:

# Token string -> integer id, where the id is the token's position in the list.
vocab = {
    token: token_id
    for token_id, token in enumerate(preporcessed_sorted_result)
}


# TOKENIZER


In [37]:
class Tokenizer:
  """Word-level tokenizer backed by a fixed token -> id vocabulary.

  Text is split on punctuation, double dashes and whitespace. Every resulting
  token must already exist in the vocabulary.
  """

  def __init__(self,vocab):
    """Keep the token -> id mapping and build the inverse id -> token mapping.

    Args:
      vocab: dict mapping token strings to integer ids.
    """
    self.encoded_vocab=vocab
    self.decoded_vocab={token_id:token for token,token_id in vocab.items()}

  def encode(self,text):
    """Split ``text`` into tokens and return the list of their ids.

    Raises:
      KeyError: if any token is not in the vocabulary.
    """
    pieces=re.split(r'([,.:;?_!"()\']|--|\s)',text)
    # Trim each piece and drop the ones that are empty after trimming.
    tokens=[piece.strip() for piece in pieces if piece.strip()]
    return [self.encoded_vocab[token] for token in tokens]

  def decode(self,tokens):
    """Turn a sequence of ids back into text.

    Tokens are joined with single spaces, then the space in front of the
    punctuation marks matched by the pattern below is removed.

    Raises:
      KeyError: if an id is not in the vocabulary.
    """
    text=" ".join([self.decoded_vocab[token_id] for token_id in tokens])
    # Raw string: in a normal literal "\s" is an invalid escape sequence, which
    # warns on current Python versions. The regex itself is unchanged.
    text=re.sub(r'\s+([,.?!"()\'])',r'\1',text)
    return text

# NEW TOKENIZER EXTENDED WITH THE UNKNOWN AND END-OF-TEXT TOKENS

In [38]:

class TokenizerV2:
  """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

  Text is split on punctuation, double dashes and whitespace. Tokens missing
  from the vocabulary are encoded with the id of the ``<|unk|>`` entry.
  """

  def __init__(self,vocab):
    """Keep the token -> id mapping and build the inverse id -> token mapping.

    Args:
      vocab: dict mapping token strings to integer ids. It needs an
        ``<|unk|>`` entry for unknown tokens to be encodable.
    """
    self.encoded_vocab=vocab
    self.decoded_vocab={token_id:token for token,token_id in vocab.items()}

  def encode(self,text):
    """Split ``text`` into tokens and return the list of their ids.

    Unknown tokens are replaced by ``<|unk|>`` before the id lookup, so the
    result always contains ids that ``decode`` can consume.

    Raises:
      KeyError: if an unknown token is met and the vocabulary has no
        ``<|unk|>`` entry.
    """
    pieces=re.split(r'([,.:;?_!"()\']|--|\s)',text)
    # Trim each piece and drop the ones that are empty after trimming.
    tokens=[piece.strip() for piece in pieces if piece.strip()]
    token_ids=[]
    for token in tokens:
      if token not in self.encoded_vocab:
        token="<|unk|>"
      # Look up the id here; the previous version returned the token strings
      # themselves, which decode() could not handle.
      token_ids.append(self.encoded_vocab[token])
    return token_ids

  def decode(self,tokens):
    """Turn a sequence of ids back into text.

    Tokens are joined with single spaces, then the space in front of the
    punctuation marks matched by the pattern below is removed.

    Raises:
      KeyError: if an id is not in the vocabulary.
    """
    text=" ".join([self.decoded_vocab[token_id] for token_id in tokens])
    # Raw string: in a normal literal "\s" is an invalid escape sequence, which
    # warns on current Python versions. The regex itself is unchanged.
    text=re.sub(r'\s+([,.?!"()\'])',r'\1',text)
    return text


In [39]:
# Word-level tokenizer built on the vocabulary assembled above.
tokenizer = TokenizerV2(vocab=vocab)


In [40]:
import tiktoken
import torch
from torch.utils.data import Dataset,DataLoader
class GPTEncoder(Dataset):
  """Sliding-window dataset of (input, target) token-id tensors.

  The text is tokenized once. Each sample is a window of ``max_length`` token
  ids, paired with the same window shifted one position to the right.
  """

  def __init__(self,txt,tokenizer,max_length,stride):
    """Tokenize ``txt`` and cut it into windows.

    Args:
      txt: raw text to tokenize.
      tokenizer: object whose ``encode(text, allowed_special=...)`` returns a
        sequence of token ids.
      max_length: number of token ids per window.
      stride: distance in tokens between the starts of consecutive windows.
    """
    token_ids=tokenizer.encode(txt,allowed_special={"<|endoftext|>"})
    # Stop early enough that the shifted target window still fits in the data.
    window_starts=range(0,len(token_ids)-max_length,stride)
    self.input=[
        torch.tensor(token_ids[start:start+max_length])
        for start in window_starts
    ]
    self.target=[
        torch.tensor(token_ids[start+1:start+1+max_length])
        for start in window_starts
    ]

  def __len__(self):
    """Number of windows in the dataset."""
    return len(self.input)

  def __getitem__(self,pos):
    """Return the ``(input, target)`` tensor pair stored at index ``pos``."""
    sample=(self.input[pos],self.target[pos])
    return sample

def create_dataloader(txt, batch_size=4, max_length=256,
                      stride=128, shuffle=True,
                      drop_last=True, num_workers=0):
  """Build a DataLoader of sliding-window (input, target) batches over ``txt``.

  The text is tokenized with tiktoken's "gpt2" encoding and windowed by
  ``GPTEncoder``.

  Args:
    txt: raw text to tokenize.
    batch_size: number of windows per batch.
    max_length: number of token ids per window.
    stride: distance in tokens between the starts of consecutive windows.
    shuffle, drop_last, num_workers: forwarded unchanged to ``DataLoader``.

  Returns:
    The configured ``torch.utils.data.DataLoader``.
  """
  windows = GPTEncoder(txt, tiktoken.get_encoding("gpt2"), max_length, stride)
  return DataLoader(
      windows,
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers,
  )
# Book body: everything from `second_index` to the end of the downloaded text.
book = full_text[second_index:]

# Demo loader: one 12-token window per batch, window starts 120 tokens apart,
# and no shuffling so the first batch is the start of the text.
batch_dataloader = create_dataloader(
    book,
    batch_size=1,
    max_length=12,
    stride=120,
    shuffle=False,
)
data_iter = iter(batch_dataloader)

# Show the first (input, target) batch.
print(next(data_iter))

[tensor([[   47, 31688, 11598,    13,   201,   198,   201,   198,   201,   198,
           464,  1204]]), tensor([[31688, 11598,    13,   201,   198,   201,   198,   201,   198,   464,
          1204,   286]])]


# **SIMPLIFIED SELF ATTENTION**

In [48]:
# Toy embeddings: one 3-dimensional vector per token of the example sentence.
inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

# The second token ("journey") is the query.
query = inputs[1]

# Attention score of each token = dot product of its vector with the query.
attenS = torch.empty(inputs.shape[0])
for row_idx in range(inputs.shape[0]):
  attenS[row_idx] = torch.dot(inputs[row_idx], query)

print(attenS)



tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


In [52]:
# Normalise the scores into weights with softmax, then print their sum.
# NOTE: this overwrites `attenS`, so re-running the cell applies softmax again.
attenS = torch.softmax(attenS, dim=0)
print(attenS.sum())

tensor(1.0000)


In [76]:
# Toy embeddings: one 3-dimensional vector per token of the example sentence.
inputs = torch.tensor(
  [[3,5,9], # Your     (x^1)
   [5,7,6], # journey  (x^2)
   [7,5,4], # starts   (x^3)
   [2,8,3], # with     (x^4)
   [7,5,0], # one      (x^5)
   [5,0,5]] # step     (x^6)
,dtype=torch.float32)

# Dot product of every token vector with every other one -> shape (6, 6).
atten_scores=inputs @ inputs.T
# Softmax over the last dimension so that each row of weights sums to 1.
attention_weights=torch.softmax(atten_scores,dim=-1)
# Each context vector is the weighted sum of the INPUT vectors -> shape (6, 3).
# Multiplying the weights by the raw scores instead gives a (6, 6) matrix that
# is not a combination of the token embeddings.
context=attention_weights @ inputs




torch.Size([6, 6])