In [11]:
# Download the tinyshakespeare input.txt from the char-rnn repository and save it locally as input.txt
!curl https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt --output input.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1089k  100 1089k    0     0   415k      0  0:00:02  0:00:02 --:--:--  415k


In [12]:
# Read the whole corpus into memory. The encoding is pinned so the decoded
# text (and therefore the vocabulary built from it) does not depend on the
# platform's default encoding.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [13]:
# Report the corpus size in characters
num_chars = len(text)
print("length of chars in input: ", num_chars)

length of chars in input:  1115394


In [15]:
# Peek at the first 1000 characters of the corpus
preview = text[:1000]
print(preview)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [16]:
# Vocabulary: every distinct character in the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [17]:
# character encoding and decoding
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = dict(enumerate(chars))                 # integer id -> character


def encode(x):
    """Map a string to the list of integer ids of its characters.

    Raises:
        KeyError: if a character of x is not in the vocabulary.
    """
    return [stoi[s] for s in x]


def decode(x):
    """Map an iterable of integer ids back to the string they spell.

    Raises:
        KeyError: if an id of x is not in the vocabulary.
    """
    return ''.join(itos[i] for i in x)


print(encode("hello world."))
print(decode(encode("hello world.")))

[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42, 8]
hello world.


In [19]:
import torch

# Encode the whole corpus once and keep it as a tensor of integer (torch.long) token ids
token_ids = encode(text)
data = torch.tensor(token_ids, dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [20]:
# split into train and val: the first 90% of the tokens train the model, the rest validate it
n = int(0.9*len(data))
train_data = data[:n]
# The name must be lowercase `val_data`: that is what get_batch() looks up
# for any split other than "train".
val_data = data[n:]

In [21]:
# context length: the longest input the model conditions on
block_size = 8
# A window of block_size + 1 tokens packs block_size training examples:
# given [18] the model has to predict 47, given [18, 47] it has to predict 56,
# given [18, 47, 56] it has to predict 57, and so on.
train_data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [25]:
# Spell out every (context, target) pair contained in one window
x = train_data[:block_size]         # inputs
y = train_data[1:block_size + 1]    # the same window shifted one step ahead
for end in range(1, block_size + 1):
    context, target = x[:end], y[end - 1]
    print(f"when input is {context}, the target is {target}")

when input is tensor([18]), the target is 47
when input is tensor([18, 47]), the target is 56
when input is tensor([18, 47, 56]), the target is 57
when input is tensor([18, 47, 56, 57]), the target is 58
when input is tensor([18, 47, 56, 57, 58]), the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]), the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is 58


This way of feeding all possible context lengths up to block_size allows the model to learn how to behave at any context length up to block_size during inference.

In [67]:
# creating batches of input and targets
torch.manual_seed(1337)          # fixed seed so the sampled batch is repeatable
batch_size, block_size = 4, 8    # sequences per batch, tokens per sequence

def get_batch(split, device="cpu"):
    """Sample a random batch of input/target windows.

    Args:
        split: "train" draws from train_data; any other value draws from val_data.
        device: device the returned tensors are moved to (default "cpu").

    Returns:
        (x, y), both of shape (batch_size, block_size). y is x shifted one
        position ahead, i.e. y[b, t] is the token that follows x[b, t].
    """
    data = train_data if split=="train" else val_data
    # The latest start index still leaves room for block_size inputs plus one extra target token
    ix = torch.randint(len(data)-block_size, (batch_size, ))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    # Honour `device`; when the tensors already live on that device, .to() returns them unchanged
    return x.to(device), y.to(device)

# Draw one training batch and show both halves
xb, yb = get_batch('train')
for label, batch in (('inputs:', xb), ('targets:', yb)):
    print(label)
    print(batch.shape)
    print(batch)


inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


## Simple Bigram Model

In [68]:
# Check whether PyTorch's MPS backend is available on this machine
torch.backends.mps.is_available()

True

In [88]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Seed torch's RNG so the random steps below (parameter init, batch sampling, generation) are repeatable
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The only parameter is a (vocab_size, vocab_size) embedding table; row i
    holds the logits over the next token given that the current token is i.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # a simple lookup table of vocab_size (keys) and vocab_size (value dim)
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids (batch, time).
            targets: optional (B, T) tensor of integer token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, C) and loss is
            None. With targets, logits is flattened to (B*T, C) and loss is
            the cross-entropy between those logits and the flattened targets.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        # Identity check on purpose: `==` on a tensor is an element-wise operator
        if targets is None:
            return logits, None

        # F.cross_entropy takes (N, C) logits and (N,) targets, so fold B and T together
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients, so skip building the autograd graph
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample max_new_tokens tokens after the context idx.

        Args:
            idx: (B, T) tensor of integer token ids for the current context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            (B, T + max_new_tokens) tensor: idx followed by the sampled tokens.
        """
        for _step in range(max_new_tokens):
            # get predictions; no targets are passed, so logits stays (B, T, C)
            logits, _ = self(idx)
            # focus only on the last token (timestep)
            logits = logits[:, -1, :]  # (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx
    
    
# Build the model and run one batch through it
m = BigramLanguageModel(vocab_size)
xb, yb = get_batch("train")
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Sample 100 new tokens from a context holding the single token id 0, then decode them
start_idx = torch.zeros((1, 1), dtype=torch.long)
sampled = m.generate(idx=start_idx, max_new_tokens=100)
print(decode(sampled[0].tolist()))


torch.Size([256, 65])
tensor(4.6485, grad_fn=<NllLossBackward0>)

SP MgD&GM .3YCKf fRwaX$V&tt3s!muDn-oivDTV?s!!q.
pTQ3!uLT;ehcL.PJgOwW
RlyE$k!MXIBL;;FGZOrc!
jHA;Rq.?,


In [89]:
# optimizer for training: AdamW over all of the model's parameters
optimizer = torch.optim.AdamW(params=m.parameters(), lr=1e-3)

In [98]:
# get_batch() reads this global, so every batch drawn from here on has 32 sequences
batch_size = 32

for step in range(10000):
    # draw a fresh random training batch
    xb, yb = get_batch("train")

    # forward pass, then clear old gradients, backpropagate and update
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# loss of the last batch only
print(loss.item())

2.3848743438720703


In [99]:
# Sample 1000 new tokens from a context holding the single token id 0, then decode them
start_idx = torch.zeros((1, 1), dtype=torch.long)
sampled = m.generate(idx=start_idx, max_new_tokens=1000)
print(decode(sampled[0].tolist()))


Wr

WAy s mee do mpoweraitass and

NCEve ser.
'sh ldy m.

GIONo ou gh f, f my o hoteato hit ypefowe the sthenafr Mourcuss, tur ich thaknthe leis fe thin anay
DUCambe thene h nd ld t thitr nothuso s phey?

Wenfasarin
BRYondfitho, grey.
TAUCourer oueangredw
BUCHextibl my, le ove d ispaprry fed iss
G che sshewhesthot br couse meagur inomasthayo he;
Ands'd, brourd d's fthece, co bes ithitinfivinod s owhtowe med ior hiss Gadortod ses bu k, d'thiloo ft my oorischotl d wheeyodsdust w If; r areshee I ncat.

The be awis. bu kne?
F ke Of f my bl t oard t I s, sutourd baint w
ICETheseno otos
NVINENGBu.
And cin?
lan oned'd
BETyold w, l amay,
So'dw,
KI:
Ifo th, thag w sto

Yororr, cke ewe ano suthertrmakigheitherbucl br pt t ase n s ate icod gal!--ar MONREL:


on tooury swno th heere o he:
tswot my; je iveepre tasent, d dod m, inthooriprellingnche; d F pelerindn---s sunthadealf d, yomarof mol s'le,

Yof, fad
ILe, angasurisund pe tus hy bedusoudesy bezer sar g pave.

My bugnoowamy d.
Asespunqu t we

# GPT

### mathematical trick for self attention that increases efficiency using vectorization

In [123]:
torch.manual_seed(1337)
# toy example: batch, time, channels
B, T, C = 4, 8, 2
x = torch.randn((B, T, C))
x.shape

torch.Size([4, 8, 2])

In [130]:
# xbow[b, t] is the mean of x[b, 0..t]: each position averages itself and everything before it
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xbow[b, t] = x[b, :t + 1].mean(dim=0)

In [133]:
# Lower-triangular mask: row t keeps columns 0..t (the present and the past)
tril = torch.tril(torch.ones((T, T)))
# Start from zero (i.e. equal) affinities everywhere and push the future
# positions to -inf so they receive zero weight after the softmax
wei = torch.zeros((T, T)).masked_fill(tril == 0, float("-inf"))
# Normalise each row so the weights over the visible positions sum to 1
wei = F.softmax(wei, dim=-1)
# Weighted aggregation of the past tokens:
# (T, T) is broadcast to (B, T, T), then (B, T, T) @ (B, T, C) -> (B, T, C)
xbow2 = torch.matmul(wei, x)

In [134]:
# The explicit loop and the matrix-multiply version should agree up to floating-point tolerance
xbow.allclose(xbow2)

True

### self-attention head

In [138]:
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn((B, T, C))

# head params
head_size = 16  # H
# Bias-free linear projections from C channels to head_size. They are built in
# key, query, value order; each initialisation draws from the seeded RNG.
key = nn.Linear(C, head_size, bias=False)    # key: the information a token possesses
query = nn.Linear(C, head_size, bias=False)  # query: the information a token is looking for
value = nn.Linear(C, head_size, bias=False)  # value: the information a token communicates
k, q, v = key(x), query(x), value(x)         # each (B, T, H)

# Raw affinities: every query dotted with every key
wei = torch.matmul(q, k.transpose(-2, -1))   # (B, T, H) @ (B, H, T) -> (B, T, T)

# Masking the future tokens
tril = torch.tril(torch.ones((T, T)))
wei = wei.masked_fill(tril == 0, float('-inf'))  # (B, T, T)
wei = F.softmax(wei, dim=-1)                     # (B, T, T)

# Attention-weighted sum of the values
out = torch.matmul(wei, v)                   # (B, T, T) @ (B, T, H) -> (B, T, H)

out.shape

torch.Size([4, 8, 16])

My Notes:

- This masking is done in the Decoder Attention block meaning we only look at previous tokens in an autoregressive manner. 

- In an Encoder Attention block, information flows from both past and future tokens, which lets the encoder learn a knowledge base of the data. The Decoder block refers to this knowledge base through "Cross-Attention" ([query(x)@key(y)]value(y)), as opposed to "Self-Attention" ([query(x)@key(x)]value(x)).

- Encoder Attention block -> unmasked Self-Attention ; Past+Future context
- Decoder Attention block -> masked Self-Attention ; Past context only (plus Cross-Attention to the encoder output when an encoder is present)

Notes:

- Attention is a communication mechanism. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.

- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.

- Each example across the batch dimension is of course processed completely independently; examples never "talk" to each other.

- In an "encoder" attention block just delete the single line that does masking with tril, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.

- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)

- "Scaled" attention additionally multiplies wei by 1/sqrt(head_size), i.e. divides it by sqrt(head_size). This makes it so that when the inputs Q, K have unit variance, wei has unit variance too, and Softmax stays diffuse instead of saturating too much. Illustration below.

In [152]:
# Random keys and queries drawn from a standard normal (k is drawn before q)
k = torch.randn((B, T, head_size))
q = torch.randn((B, T, head_size))
# Scaled dot product: multiply the raw affinities by head_size ** -0.5
scale = head_size**-0.5
wei = torch.matmul(q, k.transpose(-2, -1)) * scale

In [153]:
# Variance of the keys
torch.var(k)

tensor(1.0632)

In [154]:
# Variance of the queries
torch.var(q)

tensor(0.9891)

In [155]:
# Variance of the scaled affinities
torch.var(wei)

tensor(0.9755)

- Adding the scaling factor for "scaled attention" keeps softmax from becoming too peaky. Softmax is very sensitive to scale: two inputs with the same relative pattern but different scales give very different outputs, and the larger-scaled input produces a much pointier/peakier distribution.

- As the scale of its input grows, softmax converges towards a one-hot vector, so it is important to keep the scale of the attention scores under control.

In [156]:
# Softmax of small-magnitude inputs: the output stays spread over all entries
torch.tensor([0.1, -0.2, 0.3, 0.5, 0.2]).softmax(dim=-1)

tensor([0.1799, 0.1333, 0.2197, 0.2684, 0.1988])

In [161]:
# The same inputs scaled by 90: almost all of the probability lands on the largest entry
(torch.tensor([0.1, -0.2, 0.3, 0.5, 0.2]) * 90).softmax(dim=-1)

tensor([2.3195e-16, 4.3596e-28, 1.5230e-08, 1.0000e+00, 1.8795e-12])