In [1]:
'''
Hyperparameters
'''

batch_size = 128 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum content length for predictions?

n_embd=64 # embedding dimension (vector size of each token)
dropout=0.2 # dropout rate

n_layer=2 # number of decoder blocks
n_head=4 # Number of attention heads
# head_size = n_embd / n_head = 64/4 = 16

# no epochs are there for the model (since we're sampling random batches)
max_iters = 5000 # number of iterations 
eval_interval = 300 # After how many iterations train loss and val loss are calculated
eval_iters = 200 # how many random batches are sampled to calculate the train and val loss

learning_rate = 2e-3 # with baby networks can afford to go a bit higher

In [4]:
import urllib.request

url = "https://raw.githubusercontent.com/dinakar17/GPT-implemention-from-scratch-in-Pytorch-and-Tensorflow/main/data/HPBooks%20Dataset/HPBooks(1-7).txt"

# Using with context manager to read the file contents
with urllib.request.urlopen(url) as response:
    text = response.read().decode('utf-8')

print("Length of dataset in characters: ", len(text)) # ~ 6.6 million tokens

Length of dataset in characters:  6622027


In [5]:
# First 1000 characters
print(text[:1000])

/




THE BOY WHO LIVED

Mr. and Mrs. Dursley, of number four, Privet Drive,
were proud to say that they were perfectly normal,
thank you very much. They were the last people you’d
expect to be involved in anything strange or
mysterious, because they just didn’t hold with such
nonsense.

Mr. Dursley was the director of a firm called
Grunnings, which made drills. He was a big, beefy
man with hardly any neck, although he did have a
very large mustache. Mrs. Dursley was thin and
blonde and had nearly twice the usual amount of
neck, which came in very useful as she spent so
much of her time craning over garden fences, spying
on the neighbors. The Dursley s had a small son
called Dudley and in their opinion there was no finer
boy anywhere.

The Dursleys had everything they wanted, but they
also had a secret, and their greatest fear was that
somebody would discover it. They didn’t think they
could bear it if anyone found out about the Potters.
Mrs. Potter was Mrs. Dursley’s sister, but they


In [6]:
# All unique characters that occurs in the text

chars : list = sorted(list(set(text))) 
# character-level vocabulary
vocab_size : int = len(chars) 
# Unique characters
print(''.join(chars))
# Total number of unique characters
print(vocab_size) # i.e.,for  ~ 6.6 millions of tokens vocabulary size is 93


 !"%&'()*,-./0123456789:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ\]abcdefghijklmnopqrstuvwxyz|~—‘’“”•■□﻿
93


In [7]:
# create a mapping from characters to integers

from typing import Dict

# Assigning a unique integer to each unique character 
stoi : Dict[str, int] = {ch: i for i, ch in enumerate(chars)}
itos : Dict[int, str] = {i: ch for i, ch in enumerate(chars)}

from typing import Callable, List

encode : Callable[[str], List[int]] = lambda s : [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode : Callable[[List[int]], str] = lambda l : ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode('hiii there'))
print(decode(encode('hii there')))

[63, 64, 64, 64, 1, 75, 63, 60, 73, 60]
hii there


In [9]:
# Encode the entire text dataset and store it into a torch.Tensor
import torch 

data : torch.Tensor = torch.tensor(encode(text), dtype=torch.long) # convert the list of integers to a torch tensor

print(data.shape, data.dtype)
print(data[:100]) # the 100 characters we looked at earlier will look to GPT like this

torch.Size([6622027]) torch.int64
tensor([92, 13,  0,  0,  0,  0,  0, 47, 35, 32,  1, 29, 42, 52,  1, 50, 35, 42,
         1, 39, 36, 49, 32, 31,  0,  0, 40, 73, 12,  1, 56, 69, 59,  1, 40, 73,
        74, 12,  1, 31, 76, 73, 74, 67, 60, 80, 10,  1, 70, 61,  1, 69, 76, 68,
        57, 60, 73,  1, 61, 70, 76, 73, 10,  1, 43, 73, 64, 77, 60, 75,  1, 31,
        73, 64, 77, 60, 10,  0, 78, 60, 73, 60,  1, 71, 73, 70, 76, 59,  1, 75,
        70,  1, 74, 56, 80,  1, 75, 63, 56, 75])


In [10]:
# Todo: Look for another way to split the data
# Split up the data into train and validation sets
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data : torch.Tensor = data[:n]
val_data : torch.Tensor = data[n:] # validation data helps us understand to what extent the model is overfitting

In [15]:
from typing import Callable, List, Tuple

# get_batch basically samples random batch
def get_batch(split : str) -> Tuple[torch.Tensor, torch.Tensor]:
  # generate a small batch of data of inputs x and targets y 
  data = train_data if split == 'train' else val_data
  # ix is a list of 64 (if batch_size = 64) random integers between 0 and len(data) - block_size (the last possible starting index for a block of size block_size)
  ix : torch.Tensor[int, torch.Size[batch_size]] = torch.randint(len(data) - block_size, (batch_size, )) 
  x : torch.Tensor[int, torch.Size[batch_size, block_size]] = torch.stack([data[i: i+block_size] for i in ix])
  y : torch.Tensor[int, torch.Size[batch_size, block_size]] = torch.stack([data[i+1: i+block_size+1] for i in ix])
  return x, y

# Get one batch of training data
xb, yb = get_batch('train')

In [16]:
import torch.nn as nn

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size: int):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

In [17]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [18]:
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [19]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd: int, n_head: int):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
# Todo: Residual connections
    def forward(self, x):
        x = x + self.sa(self.ln1(x)) # Applying layer normalization earlier
        x = x + self.ffwd(self.ln2(x))
        return x

In [20]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337) # for same initialization of weight parameters every time

class GPT(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    # each token directly reads off the logits for the next token from a lookup token
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd) # embedding dimension = vocab_size (it could be any)
    self.position_embedding_table = nn.Embedding(block_size, n_embd) 
    self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embd) # final layer norm
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape

    # idx and targets are both (B, T) tensors of integers
    tok_emb = self.token_embedding_table(idx) # (B, T, C)
    pos_emb = self.position_embedding_table(torch.arange(T)) # (T, C)
    x = tok_emb + pos_emb # (B, T, C)
    x = self.blocks(x) # (B, T, C)
    x = self.ln_f(x) # (B, T, C)
    logits = self.lm_head(x) # (B, T, vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits=  logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  
  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
        # crop idx to the last block_size tokens
        idx_cond = idx[:, -block_size:] # Doubt here
        # get the predictions
        logits, loss = self(idx_cond)
        # focus only on the last time step
        logits = logits[:, -1, :] # becomes (B, C)
        # apply softmax to get probabilities
        probs = F.softmax(logits, dim=-1) # (B, C)
        # sample from the distribution
        idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
        # append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx


m = GPT(vocab_size)
logits, loss = m(xb, yb) # Think of logits as output
print(logits.shape)
print(loss)

print(decode(m.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist())) # output from an untrained model

torch.Size([1024, 93])
tensor(4.6329, grad_fn=<NllLossBackward0>)

A~DeaQL*s\-!)5|j0-/E(crZ!’;□Uk■x”■vkbYhOXP"DU5a8L%pAF|S!P—z%LD vvWa?q6yz•|id•J“t%.1Ze6lTMX‘x0FHWH(“*


In [21]:
# create a Pytorch optimizer
optimizer = torch.optim.Adam(m.parameters(), lr=learning_rate)

In [22]:
eval_iters = 200

# here we're "sampling random 200 batches with replacement" meaning some batches
# may be repeated twice while other batches aren't even considered 
# this is a common practice for models which deal with gigantic datasets
@torch.no_grad()
def estimate_loss():
  out = {}
  m.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = m(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  m.train()
  return out

In [23]:
'''
note that here we cannot say that we're training the model on all the batches
rather we're sampling "max_iters" random batches 
this could lead to selecting same batches more than once or may be left out some batches completely 
but sampling batches with replacement is advantageous than without replacement since 
with replacement is that it allows the model to see different versions of the same examples in different batches. 
This can help prevent the model from overfitting to the training set and can improve generalization performance.
'''

for iter in range(max_iters):
  # every once in a while evaluate the loss on train and val sets
  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"Step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a batch of da45ta
  xb, yb = get_batch('train')

  # evaluate the loss
  logits, loss = m(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

'''
  if validation loss is less than training loss then don't panic
  since we're sampling 200(eval_iters) random batches with replacement for calculating the loss
  for both train(80%) and val(just 20%, so almost all are involved while calculating the loss) 
  data for saving computational resources at the expense of some
  deviation that could lead to higher value than the "actual train loss across all batches"
  scaling the model or increase the number of iterations will mitigate this issue since more 
  training batches are involved in calculating the loss
'''

Step 0: train loss 4.6295, val loss 4.6310
Step 300: train loss 2.2039, val loss 2.1790
Step 600: train loss 2.0755, val loss 2.0539
Step 900: train loss 2.0040, val loss 1.9970
Step 1200: train loss 1.9707, val loss 1.9596
Step 1500: train loss 1.9345, val loss 1.9481
Step 1800: train loss 1.9119, val loss 1.9187
Step 2100: train loss 1.8941, val loss 1.9042
Step 2400: train loss 1.8814, val loss 1.8828
Step 2700: train loss 1.8764, val loss 1.8872
Step 3000: train loss 1.8598, val loss 1.8739
Step 3300: train loss 1.8617, val loss 1.8688
Step 3600: train loss 1.8482, val loss 1.8603
Step 3900: train loss 1.8384, val loss 1.8450
Step 4200: train loss 1.8288, val loss 1.8426
Step 4500: train loss 1.8251, val loss 1.8447
Step 4800: train loss 1.8246, val loss 1.8310


"\n  if validation loss is less than training loss than don't panic\n  since we're sampling 200 batches (eval_iters) for calculating the loss\n  for both train and val data for computation purpose at the expense of some\n  deviation from the actual loss across all batches\n"

In [26]:
print(decode(m.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist())) 


rave you did poighone of Hermioned the stay then, was backen the room a nright that I was was ried to of then, and thick his
abrinistry was bulder of looked of fining of cou, Hearmbagge abre was after, 

“And that. “Wo’l said just stiusing as like have sood, leap of
a suddenordere gocid the,” bane. Duse mort boin,’ quing
witop out lookbook of ekes on he could
not, it know hie sto this that llike the coul
Harry
asmibly, Page | 927Hat servingly of her about caup holiso
litch maded, burry.”

“We- J
