# Generative Pretrained Transformer (GPT)
- Based on the paper "Attention Is All You Need" - https://arxiv.org/abs/1706.03762
- Built on a toy dataset - tiny Shakespeare

### GETTING TO KNOW THE DATA

In [1]:
## Load the whole corpus from disk into memory as a single string
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [2]:
# Size of the corpus, measured in characters
print("Length of the dataset (in characters) :", len(text))

Length of the dataset (in characters) : 1115394


In [3]:
# Preview: the opening 1000 characters of the corpus
print(text[0:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



### PREPROCESSING

In [4]:
## Build the vocabulary: every distinct character that occurs in the text, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print("Unique characters: ", chars, sep="\n")
print("The number of unique characters: ", vocab_size)

Unique characters: 
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
The number of unique characters:  65


In [5]:
# Character-level tokenizer: a character's integer id is its position in the sorted `chars` list
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Map a string to a list of integer ids, one per character."""
    return [stoi[c] for c in s]


def decode(l):
    """Map a sequence of integer ids back to the string they spell."""
    return ''.join(itos[i] for i in l)


## WE CAN ALSO USE THE TIKTOKEN LIBRARY TO ENCODE
# import tiktoken
# enc = tiktoken.get_encoding('gpt2')
# enc.n_vocab # Vocab size of this encoding
# enc.encode("Hi there")
# enc.decode([71, 4178, .....])

In [6]:
# Tokenize the full corpus and keep the ids in a single 1-D integer tensor
import torch

data = torch.tensor(encode(text), dtype=torch.int64)
print("Shape of the encoded dataset: ", data.shape, data.dtype)
print("The first 100 characters: ")
print(data[0:100])

Shape of the encoded dataset:  torch.Size([1115394]) torch.int64
The first 100 characters: 
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [7]:
# Train on the first 90% of the tokens and hold out the remaining 10% for validation
n = int(0.9 * len(data))  # index of the first validation token
train_data, val_data = data[:n], data[n:]

In [8]:
# The model is trained on fixed-size chunks: from every prefix of a chunk it must predict the next token
block_size = 8

# One chunk of block_size + 1 tokens yields block_size (context -> target) examples:
x = train_data[:block_size]
y = train_data[1:block_size+1]  # the same chunk shifted one token to the right
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f"When input is {context}, the target is {target}")

When input is tensor([18]), the target is 47
When input is tensor([18, 47]), the target is 56
When input is tensor([18, 47, 56]), the target is 57
When input is tensor([18, 47, 56, 57]), the target is 58
When input is tensor([18, 47, 56, 57, 58]), the target is 1
When input is tensor([18, 47, 56, 57, 58,  1]), the target is 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is 58


In [9]:
# Settings for the real training batches
# Mini-batch: each batch is made of several chunks of the data

torch.manual_seed(1337)  # fixed seed so the random draws below are reproducible
batch_size, block_size = 4, 8  # chunks per batch, tokens per chunk

def get_batch(split):
    """Sample a random mini-batch of input chunks and their next-token targets.

    `split` selects the source tensor: 'train' reads from `train_data`, any
    other value reads from `val_data`. Returns `(x, y)`, each stacked from
    `batch_size` chunks of `block_size` tokens; `y` holds the same chunks
    shifted one position to the right, so `y[b, t]` is the token that
    follows `x[b, t]` in the source.
    """
    source = val_data if split != 'train' else train_data
    # Random start offsets; the upper bound leaves room for the shifted target chunk
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show its inputs and targets
xb, yb = get_batch('train')
print("Inputs:", xb.shape, xb, sep="\n")
print("Targets:", yb.shape, yb, sep="\n")

print("---------------")

# Spell out every (context -> target) example packed into the batch
for b in range(batch_size):
    for t in range(block_size):
        context, target = xb[b, :t+1], yb[b, t]
        print(f"When input is {context.tolist()}, the target is {target}")

Inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
---------------
When input is [24], the target is 43
When input is [24, 43], the target is 58
When input is [24, 43, 58], the target is 5
When input is [24, 43, 58, 5], the target is 57
When input is [24, 43, 58, 5, 57], the target is 1
When input is [24, 43, 58, 5, 57, 1], the target is 46
When input is [24, 43, 58, 5, 57, 1, 46], the target is 43
When input is [24, 43, 58, 5, 57, 1, 46, 43], the target is 39
When input is [44], the target is 53
When input is [44, 53], the target is 56
When input is [44, 53, 56], the target is 1
When input is [44, 53, 56, 1], the target is 58
When input is [44, 53, 

### THE MODEL - BIGRAM CHARACTER LEVEL LANGUAGE MODEL

In [10]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class BLM(nn.Module):
    """Bigram character-level language model.

    The only parameters are a (vocab_size, vocab_size) embedding table:
    looking up token id `i` returns the logits over the token that follows `i`.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Return next-token logits and, when targets are given, the loss.

        Args:
            idx: integer token ids of shape (B, T).
            targets: optional integer token ids of shape (B, T).

        Returns:
            A `(logits, loss)` tuple. Without targets, `logits` has shape
            (B, T, C) and `loss` is None. With targets, `logits` is flattened
            to (B*T, C) and `loss` is the cross-entropy against the flattened
            targets.
        """
        logits = self.token_embedding_table(idx)
        if targets is None:
            loss = None
        else:
            # F.cross_entropy wants (N, C) logits and (N,) targets, so fold batch and time together
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates; avoid building an autograd graph per token
    def generate(self, idx, max_new_tokens):
        """Extend `idx` by sampling `max_new_tokens` tokens one at a time.

        Args:
            idx: integer token ids of shape (B, T) used as the starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)
            # Only the last time step is needed to predict the next token
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
            
        

# Smoke-test the untrained model: one forward pass with targets, then sample from it
m = BLM(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape, loss)

# Generation starts from a single sequence holding one token with id 0
test_input = torch.zeros(1, 1, dtype=torch.long)
output_sampled = m.generate(test_input, max_new_tokens=100)[0].tolist()
print(decode(output_sampled))

torch.Size([32, 65]) tensor(5.0364, grad_fn=<NllLossBackward0>)

l-QYjt'CL?jLDuQcLzy'RIo;'KdhpV
vLixa,nswYZwLEPS'ptIZqOZJ$CA$zy-QTkeMk x.gQSFCLg!iW3fO!3DGXAqTsq3pdgq


In [11]:
# Adam optimizer over all model parameters
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)

# Training loop: sample a batch, compute the loss, backpropagate, update the weights
# NOTE(review): this prints one line per step (10000 lines); consider logging every N steps.
batch_size = 32  # rebinds the global that get_batch reads (it was 4 for the demo batch)
for steps in range(10000):
    optimizer.zero_grad(set_to_none=True)  # discard gradients left over from the previous step
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()
    print(loss.item())

4.658271312713623
4.728822231292725
4.675827503204346
4.676178455352783
4.511743545532227
4.51875114440918
4.621933460235596
4.682680130004883
4.587071895599365
4.555554389953613
4.606744766235352
4.68364143371582
4.680752754211426
4.636767387390137
4.591298580169678
4.671515464782715
4.601475238800049
4.680617809295654
4.641451835632324
4.6342363357543945
4.649167537689209
4.661771297454834
4.575148105621338
4.587265968322754
4.603781700134277
4.621764183044434
4.591093063354492
4.524297714233398
4.544114112854004
4.688445091247559
4.699739933013916
4.555056095123291
4.5871052742004395
4.694080829620361
4.517195701599121
4.561908721923828
4.520933151245117
4.63019323348999
4.543490886688232
4.466386318206787
4.4868950843811035
4.5466156005859375
4.562035083770752
4.654576778411865
4.707533359527588
4.67865514755249
4.630898475646973
4.667072296142578
4.565168380737305
4.573022842407227
4.588326454162598
4.586295127868652
4.684608459472656
4.653557777404785
4.547926425933838
4.54403734

In [12]:
# Sample 100 new tokens from the trained model, starting from a single token with id 0
test_input = torch.zeros(1, 1, dtype=torch.long)
output_sampled = m.generate(test_input, max_new_tokens=100)[0].tolist()
print(decode(output_sampled))


Ong h hasbe pave pirance
RDe hicomyonthar's
PES:
AKEd ith henourzincenonthioneir thondy, y heltieien


### THE MATHEMATICS OF SELF-ATTENTION

In [13]:
# Toy example: a random tensor with B sequences, T positions each, and C channels per position
B = 4
T = 8
C = 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [14]:
# Replace every position by the average of itself and all earlier positions, so that tokens
# are related to their past -> a weak form of aggregation of context
# i.e., x_bow[b, t] = mean_{i<=t} x[b, i]
x_bow = torch.zeros(B, T, C) # Bag of words (averaged)
for b in range(B):
    for t in range(T):
        x_prev = x[b, :t+1]  # (t+1, C): positions 0..t of sequence b
        x_bow[b, t] = x_prev.mean(dim=0)
(x[0], x_bow[0])

(tensor([[-0.4278, -1.4516],
         [ 0.5260, -0.0256],
         [-0.7148,  1.2601],
         [-1.6537,  0.7832],
         [-1.1372, -1.1862],
         [-0.9035, -1.1036],
         [-0.6181,  0.4654],
         [ 0.2016, -0.5653]]),
 tensor([[-0.4278, -1.4516],
         [ 0.0491, -0.7386],
         [-0.2055, -0.0724],
         [-0.5676,  0.1415],
         [-0.6815, -0.1240],
         [-0.7185, -0.2873],
         [-0.7042, -0.1798],
         [-0.5909, -0.2280]]))

In [15]:
# Trick = Matrix Multiplication
# Row t of the row-normalised lower-triangular matrix puts equal weight on positions 0..t
wei = torch.ones(T, T).tril()
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = torch.matmul(wei, x)  # (T, T), broadcast over the batch, @ (B, T, C) --> (B, T, C)
torch.allclose(x_bow, xbow2)

True

In [16]:
## Another method - use softmax for normalization !
# Entries above the diagonal are filled with -inf, so softmax gives them zero weight
# and spreads the remaining weight evenly over the (all-zero) allowed entries.
tril = torch.ones(T, T).tril()
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = torch.matmul(wei, x)
torch.allclose(x_bow, xbow3)

True

In [17]:
## Self-attention with a single head
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

head_size = 16
# Bias-free projections of every token. The creation order is kept as key, query, value
# because each nn.Linear draws its initial weights from the seeded generator.
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)    # (B, T, head_size)
q = query(x)  # (B, T, head_size)
v = value(x)  # (B, T, head_size)

# Affinity of every query with every key: (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
# NOTE(review): the scores are not scaled by 1/sqrt(head_size) here, unlike the
# scaled dot-product attention of the referenced paper.
wei = torch.matmul(q, k.transpose(-2, -1))

# Mask the entries above the diagonal so a position only attends to itself and earlier
# positions, then normalise each row into attention weights
tril = torch.ones(T, T).tril()
wei = F.softmax(wei.masked_fill(tril == 0, float('-inf')), dim=-1)

# Weighted sum of the value vectors: (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
out = torch.matmul(wei, v)

out.shape

torch.Size([4, 8, 16])