# Building a GPT


Importing the dataset from Kaggle — I've used the Wikipedia Sentences dataset (`mikeortman/wikipedia-sentences`).

In [1]:
import kagglehub

# Download the latest version of the dataset through kagglehub.
# `path` is the local location of the dataset files; the next cell joins the
# file name 'wikisent2.txt' onto it.
path = kagglehub.dataset_download("mikeortman/wikipedia-sentences")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\dccha\.cache\kagglehub\datasets\mikeortman\wikipedia-sentences\versions\3


In [2]:
# Build the full path of the sentences file inside the downloaded dataset location
import os
base_path = os.path.join(path, 'wikisent2.txt')
print(base_path)
# Read the whole file into memory as one string (decoded as UTF-8) so it can be inspected.
# NOTE(review): the corpus is roughly 0.93 billion characters (see the length
# printed in the next cell), so `text` is a very large in-memory string.
with open(base_path,'r',encoding='utf-8') as f:
  text = f.read()

C:\Users\dccha\.cache\kagglehub\datasets\mikeortman\wikipedia-sentences\versions\3\wikisent2.txt


In [3]:
# Size of the corpus, counted in characters (len of the Python string)
print("Length of dataset in characters: ",len(text))

Length of dataset in characters:  934571982


In [4]:
# let us see the first 2000 characters
print(text[:2000])

0.000123, which corresponds to a distance of 705 Mly, or 216 Mpc.
000webhost is a free web hosting service, operated by Hostinger.
0010x0010 is a Dutch-born audiovisual artist, currently living in Los Angeles.
0-0-1-3 is an alcohol abuse prevention program developed in 2004 at Francis E. Warren Air Force Base based on research by the National Institute on Alcohol Abuse and Alcoholism regarding binge drinking in college students.
0.01 is the debut studio album of H3llb3nt, released on February 20, 1996 by Fifth Colvmn Records.
001 of 3 February 1997, which was signed between the Government of the Republic of Rwanda, and FAPADER.
003230 is a South Korean food manufacturer.
0.04%Gas molecules in soil are in continuous thermal motion according to the kinetic theory of gasses, there is also collision between molecules - a random walk.
0.04% of the votes were invalid.
005.1999.06 is the fifth studio album by the South Korean singer and actress Uhm Jung-hwa.
005 is a 1981 arcade game by Sega.

In [5]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print("Vocabulary size : ",vocab_size)


 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
Vocabulary size :  96


In [6]:
# Spot-check one vocabulary entry: the character stored at index 42
chars[42]

'I'

In [7]:
# Lookup tables between characters and integer ids; a character's id is its
# position in the sorted `chars` list.
stoi = dict(zip(chars, range(len(chars))))
itos = dict(enumerate(chars))
# encoder: string -> list of integer ids (raises KeyError for an unknown character)
encode = lambda s: list(map(stoi.__getitem__, s))
# decoder: list of integer ids -> string
decode = lambda l: ''.join(map(itos.__getitem__, l))

# Round-trip check: encoding then decoding must give back the original string
print(encode('I am Anirban Chakraborty'))
print(decode(encode('I am Anirban Chakraborty')))

[42, 1, 66, 78, 1, 34, 79, 74, 83, 67, 66, 79, 1, 36, 73, 66, 76, 83, 66, 67, 80, 83, 85, 90]
I am Anirban Chakraborty


In [8]:
# Now let us encode the entire text dataset and store it in a Tensor
# for this operation we will use torch.Tensor
# NOTE(review): encode(text) first builds a Python list with one int per
# character, and the resulting int64 tensor stores 8 bytes per character, so
# this cell is very memory-hungry on the full corpus.
import torch
data = torch.tensor(encode(text),dtype=torch.long)
print(data.shape, data.dtype)
print(data[:2000]) # first 2000 token ids (the printed repr is abbreviated with '...')

torch.Size([934571982]) torch.int64
tensor([17, 15, 17,  ..., 90,  1, 72])


In [9]:
# Show the first 100 characters next to their integer ids to see the 1:1 mapping
print(text[:100])
print(data[:100])

0.000123, which corresponds to a distance of 705 Mly, or 216 Mpc.
000webhost is a free web hosting s
tensor([17, 15, 17, 17, 17, 18, 19, 20, 13,  1, 88, 73, 74, 68, 73,  1, 68, 80,
        83, 83, 70, 84, 81, 80, 79, 69, 84,  1, 85, 80,  1, 66,  1, 69, 74, 84,
        85, 66, 79, 68, 70,  1, 80, 71,  1, 24, 17, 22,  1, 46, 77, 90, 13,  1,
        80, 83,  1, 19, 18, 23,  1, 46, 81, 68, 15,  0, 17, 17, 17, 88, 70, 67,
        73, 80, 84, 85,  1, 74, 84,  1, 66,  1, 71, 83, 70, 70,  1, 88, 70, 67,
         1, 73, 80, 84, 85, 74, 79, 72,  1, 84])


In [10]:
# let us now split the data up into train and validation sets
# The split is positional (no shuffling): the leading 90% of the token stream
# is used for training and the trailing 10% for validation.
n = int(0.9*len(data)) # training on the first 90% , rest val
train_data = data[:n]
val_data = data[n:]

In [11]:
# Context length used for the examples below.
block_size = 8
# block_size + 1 tokens: every position needs a following token to serve as its target
train_data[:block_size+1]

tensor([17, 15, 17, 17, 17, 18, 19, 20, 13])

In [12]:
# One chunk of block_size tokens yields block_size training examples:
# for every t, the context x[:t+1] should predict y[t] (the token right after it).
x = train_data[:block_size]
y = train_data[1:block_size+1]  # same window shifted one position to the right
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"When input is {context}, the target is {target}")

When input is tensor([17]), the target is 15
When input is tensor([17, 15]), the target is 17
When input is tensor([17, 15, 17]), the target is 17
When input is tensor([17, 15, 17, 17]), the target is 17
When input is tensor([17, 15, 17, 17, 17]), the target is 18
When input is tensor([17, 15, 17, 17, 17, 18]), the target is 19
When input is tensor([17, 15, 17, 17, 17, 18, 19]), the target is 20
When input is tensor([17, 15, 17, 17, 17, 18, 19, 20]), the target is 13


In [13]:
# Seed torch's RNG before the batch is sampled below.
torch.manual_seed(1000)
# Both settings are read as globals by get_batch at call time.
batch_size = 4 # how many sequences will we process in parallel?
block_size = 8 # what is the maximum context length for prediction?

def get_batch(split):
    """Sample a random mini-batch of (input, target) windows.

    Reads the module-level `train_data` / `val_data` tensors and the
    `block_size` / `batch_size` settings at call time.

    split: 'train' selects `train_data`; any other value selects `val_data`.
    Returns (x, y), each stacked from `batch_size` windows of `block_size`
    tokens; y is the same window as x shifted one position to the right.
    """
    source = val_data if split != 'train' else train_data
    # the largest start index still leaves room for the shifted target window
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start+block_size])
        targets.append(source[start+1:start+block_size+1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and print the inputs and their shifted targets
xb, yb = get_batch('train')
print('inputs: ')
print(xb.shape)
print(xb)
print('targets: ')
print(yb.shape)
print(yb)

print()

# Spell out every (context -> target) example contained in the batch:
# each row contributes block_size examples, one per prefix length.
for b in range(batch_size): #batch dimn
    for t in range(block_size): #time dimn
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"When input is {context.tolist()}, the target is {target}")

inputs: 
torch.Size([4, 8])
tensor([[69, 90,  1, 83, 80, 86, 72, 73],
        [ 1, 85, 73, 70,  1, 35, 83, 74],
        [ 1, 51, 70, 85, 86, 83, 79,  1],
        [83, 70, 66, 69, 74, 79, 72,  1]])
targets: 
torch.Size([4, 8])
tensor([[90,  1, 83, 80, 86, 72, 73, 77],
        [85, 73, 70,  1, 35, 83, 74, 85],
        [51, 70, 85, 86, 83, 79,  1, 80],
        [70, 66, 69, 74, 79, 72,  1, 87]])

When input is [69], the target is 90
When input is [69, 90], the target is 1
When input is [69, 90, 1], the target is 83
When input is [69, 90, 1, 83], the target is 80
When input is [69, 90, 1, 83, 80], the target is 86
When input is [69, 90, 1, 83, 80, 86], the target is 72
When input is [69, 90, 1, 83, 80, 86, 72], the target is 73
When input is [69, 90, 1, 83, 80, 86, 72, 73], the target is 77
When input is [1], the target is 85
When input is [1, 85], the target is 73
When input is [1, 85, 73], the target is 70
When input is [1, 85, 73, 70], the target is 1
When input is [1, 85, 73, 70, 1], th

### Simplest Model

In [14]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Re-seed torch's RNG before the model is created and sampled from in this cell
torch.manual_seed(1000)

class BigramLanguageModel(nn.Module):
    """Bigram model: the logits for the next token depend only on the current token.

    The only parameters are a (vocab_size, vocab_size) embedding table whose
    row i holds the next-token logits for token id i.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        idx: (B,T) tensor of integer token ids.
        targets: optional (B,T) tensor of integer token ids.

        Returns (logits, loss):
          * targets is None -> logits has shape (B,T,C) and loss is None;
          * otherwise       -> logits is flattened to (B*T,C) and loss is the
            mean cross-entropy over all B*T positions.
        """
        logits = self.token_embedding_table(idx)  # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # F.cross_entropy expects (N,C) logits with (N,) class indices
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never backpropagates; skip building an autograd graph per step
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after the context `idx`.

        idx: (B,T) tensor of integer token ids used as the starting context.
        Returns a (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # get prediction
            logits, _ = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes B,C
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # B,C
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # B,1
            # append the sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # B,T+1
        return idx


# Build the untrained model and run one forward pass on the sample batch
model = BigramLanguageModel(vocab_size)
logits, loss = model(xb,yb)
print(logits.shape)
print(loss)

# Sample 100 tokens from the untrained model, starting from a single token with id 0
print(decode(model.generate(idx=torch.zeros((1,1), dtype=torch.long), max_new_tokens = 100)[0].tolist()))

torch.Size([32, 96])
tensor(5.0479, grad_fn=<NllLossBackward0>)

ky1x[Xfq2H}#_+@#x4PFcgY>%>$@w{Ih4>ZBpAvn`5em:glZ:O^bfU'BIVXUT$ X.>x+h`oB@mmaoK;,cPl|q_UM(xdbW[N}v^["


In [15]:
# Optimizer: AdamW over all model parameters with learning rate 1e-3
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [16]:
# Rebinding the global batch_size changes what get_batch returns from here on.
batch_size = 32
for steps in range(10000): 

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    # clear gradients left over from the previous step before backpropagating
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the final mini-batch only (not an average over the run or a validation loss)
print(loss.item())

2.505908250808716


In [17]:
# Sample 500 tokens from the trained model, starting from a single token with id 0
print(decode(model.generate(idx=torch.zeros((1,1), dtype=torch.long), max_new_tokens = 500)[0].tolist()))


Govy ans wsh-t ss Eilm:Pan but jond inrn herq|Ed M-Bar igqsLy as subormused tentiche.
Thendqus.
Son icha d tendis om semerocopondust id ahiond abld Cuo DItaowithes.
Hon 1997-Wadetasess thorofof a Upe creras Ade Je s rtenthen flo%piisprorewayepe t mencen is tha che thesheror by Resce, GCherabuedecl, 1, chanow Je bowh Theareanal, ttal co inguded eredamilenuly finas preserialsian 195Thiseckor, J aneem thile Upraucomech is asphr.
Pshemais bean ics Bour, itt le Capr acothes alm fireraras owinden ond 


## Self-Attention Implementation

In [18]:
# Example: a small random (B,T,C) tensor used to illustrate averaging over previous positions
torch.manual_seed(0)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [19]:
# bag of words: xbow[b,t] is the mean of x[b,0..t], i.e. position t only sees itself and earlier positions
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] #(t+1,C)
        xbow[b,t] = torch.mean(xprev, 0) #xbow[b,t] = mean(i<=t) x[b,i]

In [20]:
# Display the raw input tensor for comparison with its running mean
x

tensor([[[-1.1258, -1.1524],
         [-0.2506, -0.4339],
         [ 0.8487,  0.6920],
         [-0.3160, -2.1152],
         [ 0.3223, -1.2633],
         [ 0.3500,  0.3081],
         [ 0.1198,  1.2377],
         [ 1.1168, -0.2473]],

        [[-1.3527, -1.6959],
         [ 0.5667,  0.7935],
         [ 0.5988, -1.5551],
         [-0.3414,  1.8530],
         [ 0.7502, -0.5855],
         [-0.1734,  0.1835],
         [ 1.3894,  1.5863],
         [ 0.9463, -0.8437]],

        [[-0.6136,  0.0316],
         [-0.4927,  0.2484],
         [ 0.4397,  0.1124],
         [ 0.6408,  0.4412],
         [-0.1023,  0.7924],
         [-0.2897,  0.0525],
         [ 0.5229,  2.3022],
         [-1.4689, -1.5867]],

        [[-0.6731,  0.8728],
         [ 1.0554,  0.1778],
         [-0.2303, -0.3918],
         [ 0.5433, -0.3952],
         [-0.4462,  0.7440],
         [ 1.5210,  3.4105],
         [-1.5312, -1.2341],
         [ 1.8197, -0.5515]]])

In [21]:
# Display the running means; row 0 of each batch equals row 0 of x (mean of a single element)
xbow

tensor([[[-1.1258, -1.1524],
         [-0.6882, -0.7931],
         [-0.1759, -0.2981],
         [-0.2109, -0.7524],
         [-0.1043, -0.8546],
         [-0.0286, -0.6608],
         [-0.0074, -0.3896],
         [ 0.1331, -0.3718]],

        [[-1.3527, -1.6959],
         [-0.3930, -0.4512],
         [-0.0624, -0.8192],
         [-0.1321, -0.1511],
         [ 0.0443, -0.2380],
         [ 0.0080, -0.1678],
         [ 0.2054,  0.0828],
         [ 0.2980, -0.0330]],

        [[-0.6136,  0.0316],
         [-0.5531,  0.1400],
         [-0.2222,  0.1308],
         [-0.0064,  0.2084],
         [-0.0256,  0.3252],
         [-0.0696,  0.2798],
         [ 0.0150,  0.5687],
         [-0.1705,  0.2993]],

        [[-0.6731,  0.8728],
         [ 0.1911,  0.5253],
         [ 0.0506,  0.2196],
         [ 0.1738,  0.0659],
         [ 0.0498,  0.2016],
         [ 0.2950,  0.7364],
         [ 0.0341,  0.4549],
         [ 0.2573,  0.3291]]])

In [22]:
# Toy demo: multiplying by a row-normalised triangular matrix averages rows of b.
# NOTE(review): torch.triu makes row i of `a` average rows i..end of b (the
# current and *later* rows), as the printed `c` shows. The averaging cells that
# follow use torch.tril (current and *earlier* rows) — confirm triu is intended here.
torch.manual_seed(42)
a = torch.triu(torch.ones(3, 3))
a = a / torch.sum(a,1,keepdim=True)  # normalise each row so it sums to 1
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
tensor([[0.3333, 0.3333, 0.3333],
        [0.0000, 0.5000, 0.5000],
        [0.0000, 0.0000, 1.0000]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[4.6667, 5.3333],
        [6.0000, 4.5000],
        [6.0000, 5.0000]])


In [23]:
# version 2: the same running mean as the xbow loop, done with one matrix multiply
wei = torch.tril(torch.ones(T,T))
wei = wei / wei.sum(1,keepdim=True)  # each row sums to 1: row t weights positions 0..t equally
xbow2 = wei @ x # (T,T) broadcasts over the batch: (B,T,T) @ (B,T,C) ---> (B,T,C)

In [24]:
# The matrix-multiply version must match the explicit loop (up to float tolerance)
torch.allclose(xbow2,xbow)

True

In [25]:
# Show the first batch element of both versions side by side
xbow[0], xbow2[0]

(tensor([[-1.1258, -1.1524],
         [-0.6882, -0.7931],
         [-0.1759, -0.2981],
         [-0.2109, -0.7524],
         [-0.1043, -0.8546],
         [-0.0286, -0.6608],
         [-0.0074, -0.3896],
         [ 0.1331, -0.3718]]),
 tensor([[-1.1258, -1.1524],
         [-0.6882, -0.7931],
         [-0.1759, -0.2981],
         [-0.2109, -0.7524],
         [-0.1043, -0.8546],
         [-0.0286, -0.6608],
         [-0.0074, -0.3896],
         [ 0.1331, -0.3718]]))

In [26]:
# version 3: softmax
# Start from all-zero scores, mask out future positions with -inf, then softmax
# each row: equal scores over the unmasked positions give a uniform average.
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril==0,float('-inf'))  # positions above the diagonal get zero weight after softmax
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow3, xbow)

True

In [40]:
# version 4 : self - attention
torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)


# query vector : What am I looking for ?
# key vector : What do I contain ?
# wei = queryvector * keyvector
# Single head perform self-attention
head_size = 16
# Three bias-free linear projections from C channels down to head_size
key = nn.Linear(C,head_size,bias=False)
query = nn.Linear(C,head_size,bias=False)
value = nn.Linear(C,head_size,bias=False)
k = key(x) #(B,T,head_size)
q = query(x) #(B,T,head_size)

# Data-dependent scores: dot product of every query with every key.
# NOTE(review): the scores are not divided by sqrt(head_size) here, as scaled
# dot-product attention normally does — confirm that is intentional for this demo.
wei = q @ k.transpose(-2, -1) # (B,T,16) @ (B,16,T) -> (B,T,T)


tril = torch.tril(torch.ones(T,T))
# wei = torch.zeros((T,T))
wei = wei.masked_fill(tril==0,float('-inf'))  # causal mask: no weight on future positions
wei = F.softmax(wei,dim=-1)  # each row becomes a distribution over positions 0..t

# Aggregate the value projections (not the raw x) with the attention weights
v =  value(x)
out = wei @ v

out.shape

torch.Size([4, 8, 16])

In [41]:
# Attention weights of the first batch element: lower-triangular, each row sums to 1
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

In [2]:
# Tiny self-contained tokenizer example.
# NOTE: this rebinds text, chars, stoi, itos, encode and decode.
text = "hello world"
chars = sorted(set(text))  # unique chars, in sorted order
stoi = dict(zip(chars, range(len(chars))))
itos = dict(enumerate(chars))

# Encoding / decoding between strings and lists of integer ids
encode = lambda s: list(map(stoi.__getitem__, s))
decode = lambda l: ''.join(map(itos.__getitem__, l))

encoded = encode("hello world")
print(encoded)  # [3, 2, 4, 4, 5, 0, 7, 5, 6, 4, 1]

[3, 2, 4, 4, 5, 0, 7, 5, 6, 4, 1]


In [17]:
import torch
import torch.nn as nn
# NOTE: rebinds vocab_size to the size of the current `chars` list.
vocab_size = len(chars)
n_embd = 11  # vector dimension for embeddings

# Learnable lookup table: one n_embd-dimensional vector per vocabulary entry.
# No seed is set in this cell, so the printed values differ from run to run.
token_embedding_table = nn.Embedding(vocab_size, n_embd)

# Convert to tensor
encoded_tensor = torch.tensor(encoded)  # (T,)
tok_emb = token_embedding_table(encoded_tensor)  # (T, n_embd)
print(tok_emb.shape)  # (T, n_embd) = (11, 11) for "hello world" with n_embd = 11
print(tok_emb[0])
print()

torch.Size([11, 11])
tensor([ 1.1859,  0.2931, -1.4211,  1.6499, -1.7478,  0.5847,  0.0357, -1.5534,
         0.6134, -0.0026, -0.1501], grad_fn=<SelectBackward0>)

