## GPT : Generative Pretrained Transformer, Self Attention Mechanism
##### It completes the sequence, so it is a langugage model in that sense
##### Its core is based on paper--Attention is all you need--that proposed Transformer Architecture
##### Here we are focusing on traning transformer char level based language model

In [9]:
# read it in to inspect it
with open('shakespear.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [10]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1125396


In [11]:
print(text[:400])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it 


In [12]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [13]:
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

In [14]:
print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [None]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1125396]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [18]:
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

#### We work with chunks of dataset, and for training transformer we sample random chunks from trainset and train it chunks at a time. Chunks have maximum length referred to as block size

In [None]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

##### Each chunk has multiple examples packed into it: [18] -> [47];  [18, 47] -> [56]. Part of input text beyond block_size is truncated.

In [20]:
x =  train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [None]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) # random offset into train set
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[ 1, 46, 47, 51, 11,  1, 21,  1],
        [ 1, 57, 39, 47, 52, 58,  1, 61],
        [43,  1, 51, 59, 57, 58,  1, 40],
        [ 1, 57, 39, 63,  1, 58, 46, 43]])
targets:
torch.Size([4, 8])
tensor([[46, 47, 51, 11,  1, 21,  1, 46],
        [57, 39, 47, 52, 58,  1, 61, 46],
        [ 1, 51, 59, 57, 58,  1, 40, 43],
        [57, 39, 63,  1, 58, 46, 43, 43]])
----
when input is [1] the target: 46
when input is [1, 46] the target: 47
when input is [1, 46, 47] the target: 51
when input is [1, 46, 47, 51] the target: 11
when input is [1, 46, 47, 51, 11] the target: 1
when input is [1, 46, 47, 51, 11, 1] the target: 21
when input is [1, 46, 47, 51, 11, 1, 21] the target: 1
when input is [1, 46, 47, 51, 11, 1, 21, 1] the target: 46
when input is [1] the target: 57
when input is [1, 57] the target: 39
when input is [1, 57, 39] the target: 47
when input is [1, 57, 39, 47] the target: 52
when input is [1, 57, 39, 47, 52] the target: 58
when input is [1, 57, 39, 4

In [38]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)
        if targets == None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        # take this (B,T) and extend it to (B,T+1) ... (B, T+new_tokens)
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [39]:
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb,yb)
print(logits.shape)
print(loss.item())

torch.Size([32, 65])
4.7820000648498535


In [40]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [42]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [47]:
batch_size = 32
for steps in range(10000):
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.4941189289093018


In [63]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


O aranthar h hy knty
Bou h ckeim tisathind ig s wend a!
Cangerusbyoteretais?
por poraslemas, TESh! I d!
Haligur!
So det
AMBatrbrveioaing nll bers when pato on
Frcouce, popatt trin ancas;
e pitthe is ifr, n, in, tthanolshetrese wo!

Myor dos o he d bangn isire wo h, t bed s, d It pu watindh,

Thohepadicoendend
AUn bellorore:
WIs knd! ou herot, re the wes tarckir Ben t fand su win, h 'dim fay 's'sy POfr.

G'de hevilliforo f spr mary ak,
Win, llith aby s g ttoou sth dwsiaunderingl t'd pico nowid pf


#### So far we are only looking at last character to predict next character, our context length is too small

In [86]:
torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [87]:
# summarizing context by just averaging
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t,C)
        xbow[b,t] = torch.mean(xprev, 0)

In [94]:
wei = torch.tril(torch.ones(T,T))
wei = wei / wei.sum(1, keepdim=True)
# (T,T) @ (B,T,C) 
# broadcasted (B,T,T) @ (B,T,C) ---> (B,T,C), batche matrix multiply
xbow2 = wei @ x
torch.allclose(xbow[3], xbow2[3])

True

##### Weighted aggregation of past tokens implemented using matrix multiplication of lower triangular matrix
##### It is used for developing self attention block

In [None]:
tril = torch.tril(torch.ones(T,T))

# Interaction strength, it tells us how much of the past token we want to average up
# This is updated based on data, so tokens will start looking at each other and some 
# tokens will find other tokens more or less intereting to different amounts dependig 
# on the value present
# It is initialized to be zero.
wei = torch.zeros(T,T)

# Don't aggregate anything from future, we only look at previous tokens
wei = wei.masked_fill(tril == 0, float('-inf'))

wei = F.softmax(wei, dim=-1)

xbow3 = wei @ x

### Self Attention (Single Head)
#### Different tokens will find other tokens more or less interesting, and we want that to be data dependent e.g. vowels looking for consonants. So we want info to flow from previous tokens but is a data dependent manner.
#### Every single token emits two vectors: 1. Key 2. Query
#### Query : What am I looking for (content i am looking for). 
#### Key : What do I contain (token content)
#### We get affinities between tokens is through dot product of Key and Query.
#### For this case, Query of current token dot products with Key of previous tokens. We get Wei matrix.
#### Instead of using actual input tokens we use calculate Value(x), we can think of x as some king of pvt info to the token.
#### So, info for given token is kept in x, and for single head attention Query q indicates what it is interested in, Key k is what it has, and it communicates Value v to anyone finding it interesting. 
#### So v = Value(x) is what gets aggregated between different tokens for the purpose of the single head attention

Notes:
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Each example across batch dimension is of course processed completely independently and never "talk" to each other
- In an "encoder" attention block just delete the single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder block which encodes some context that we like to condition on) 
- "Scaled" attention additional divides `wei` by 1/sqrt(head_size). This makes it so when input Q,K are unit variance, wei will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below

In [None]:
# Self Attention (Decoder block)
torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

# Single head of self attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

q = query(x) # (B,T,head_size)
k = key(x) # (B,T,head_size)

wei = q @ k.transpose(-2,-1) # (B,T,16) @ (B,16,T) --> (B,T,T)

# Interaction strength, it tells us how much of the past token we want to average up
# This is updated based on data, so tokens will start looking at each other and some 
# tokens will find other tokens more or less intereting to different amounts dependig 
# on the value present.
# It is initialized to be zero.
# wei = torch.zeros(T,T)

# Don't aggregate anything from future, we only look at previous tokens
tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril == 0, float('-inf')) # removed in enoder block

# normalize rows
wei = F.softmax(wei, dim=-1)

v = value(x) # (B,T,head_size)
out = wei @ v # (B,T,T) @ (B,T,head_size)
#out = wei @ x # simple averaging

out.shape

torch.Size([4, 8, 16])

In [100]:
wei[0]

tensor([[-1.7629, -1.3011,  0.5652,  2.1616, -1.0674,  1.9632,  1.0765, -0.4530],
        [-3.3334, -1.6556,  0.1040,  3.3782, -2.1825,  1.0415, -0.0557,  0.2927],
        [-1.0226, -1.2606,  0.0762, -0.3813, -0.9843, -1.4303,  0.0749, -0.9547],
        [ 0.7836, -0.8014, -0.3368, -0.8496, -0.5602, -1.1701, -1.2927, -1.0260],
        [-1.2566,  0.0187, -0.7880, -1.3204,  2.0363,  0.8638,  0.3719,  0.9258],
        [-0.3126,  2.4152, -0.1106, -0.9931,  3.3449, -2.5229,  1.4187,  1.2196],
        [ 1.0876,  1.9652, -0.2621, -0.3158,  0.6091,  1.2616, -0.5484,  0.8048],
        [-1.8044, -0.4126, -0.8306,  0.5898, -0.7987, -0.5856,  0.6433,  0.6303]],
       grad_fn=<SelectBackward0>)

## Implementataion (GPT)

##### This is decoder only transformer with no cross attention and encoder block present, it only has self attention and feed forward and layer norm
##### It is so because we are just generating text unconditioned on anything, what makes it a decoder is that we are using triangular mask in transformer self attention, hence it has auto regressive nature.
##### Encoder would be needed in setting like language translation. The generation of translated text in target language is conditioned on input text in source language. We would create tokens from it and put a transformer on it without triangular masking, essentially sncoding the input text.
##### Keys and Values come from what we condition on, input source text in this case, and feed into cross attention block of decoder 

In [194]:
import torch
import torch.nn as nn
from torch.nn import functional as F 

torch.manual_seed(1337)

# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

In [195]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute scaled attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        
        # Don't aggregate anything from future, we only look at previous tokens
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)

        # randomly prevent some nodes from communicating
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

In [196]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd) # projection layer going back into residual pathway
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # concatenating over chanel dimension
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out

In [197]:
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        # multiplied by 4 as mentioned in paper
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd), # projection layer going back into residual pathway
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

### Skip Connections (residual connections)
#### With addition distributing gradient equally to all its branches, we have a gradient super highway that goes all the way to the input and the residual blocks are usually initialized in the beginning in a way that they contribute very little to residual pathway so they are almost absent in the begining.
#### During optimization residual blocks come online over time and start to contribute, but at least at the initialization gradient can go directly to input unimpeded and just flows.

### Dropout
#### A regularization technique, in which certain nodes are randomly dropped every train loop and trained without them. One can think of if like training ensemble of NNs. At test/inference time they are all merged.
#### Added right before the connection back into residual pathway

In [198]:
class TransformerBlock(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        # Layer norm applied first here before x goes into self attention and feed forward
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [199]:
class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        
        # 4 heads of n_embd//4 dimensional self attention
        # Each concatenated over channel dimensioin to produce 
        # (batch_size,block_size,n_embb) dimentional output
        # self.sa_heads = MultiHeadAttention(4, n_embd//4)

        self.transformer_blocks = nn.Sequential(*[TransformerBlock(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)

        # Encoding position 0 to T-1 with t'th row containning embedding for that,
        # hence broadcasting across batches makes sense as embedding is added to 
        # corresponding token at correct position index in the sample text
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb
        #x = self.sa_heads(x)
        x = self.transformer_blocks(x) # (B,T,C)
        x = self.ln_f(x)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets == None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        # take this (B,T) and extend it to (B,T+1) ... (B, T+new_tokens)
        for _ in range(max_new_tokens):
            idx_cond =  idx[:,-block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [200]:
model = BigramLanguageModel()
m = model.to(device)

In [201]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [202]:
batch_size = 32
for steps in range(5000):
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

1.677055835723877


In [203]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))



COMINIUS:
Would. Go:
Within much my veauous glove! neceiNntal:
O, thee you; nom togue
his away though I lream at a people; nor dee!
Wrongived by Cate, you as see town the burninen,
As thalk''d hrhow some up you do stange:
Is am with Watch of need was be leet mare I she,
Wrants is to tex'd; I hear mean out huse pray of thring me my amon to meun him, though.

GLOUCESTER:
Where sglave for Dusure: this glasse inded,
The comes nothines affectiblew rekelone's recan envour eye the sturs hone
as him ac
