In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1555)

<torch._C.Generator at 0x10d5576d0>

In [11]:
torch.cuda.is_available()

False

In [10]:
embedding = nn.Embedding(10, 3)
input = torch.LongTensor([[1, 2, 3, 4], [5, 6, 7, 8]])

print(embedding(input))

tensor([[[ 0.7425,  0.3908, -0.7371],
         [-0.0140, -1.7875, -1.1936],
         [-1.5519,  0.9916,  0.0361],
         [ 0.1350, -1.7223,  0.4365]],

        [[-1.4478,  0.0525, -0.3893],
         [ 2.7292, -0.3402, -0.7701],
         [ 0.3766, -1.1537, -0.3654],
         [-0.5021, -0.8938,  0.8260]]], grad_fn=<EmbeddingBackward0>)


In [14]:
with open('data/hp/01 Harry Potter and the Sorcerers Stone.txt', 'r', encoding='utf-8') as f:
    text = f.read()


In [15]:
vocab = sorted(list(set(text)))
vocab_size = len(vocab)

In [16]:
print( len(vocab) )
print( vocab )
print(''.join(vocab))

82
['\n', ' ', '!', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '–', '—', '‘', '’', '“', '”', '…']

 !'()*,-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz–—‘’“”…


In [17]:
char2token = {c : i for i, c in enumerate(vocab)}
token2char = {i : c for i, c in enumerate(vocab)}
encode = lambda s: [char2token[c] for c in s]
decode = lambda s: ''.join([token2char[int(t)] for t in s])


In [18]:
data = torch.tensor(encode(text), dtype=torch.long)
n = len(data)

train_data = data[:int(0.9 * n)]
val_data = data[int(0.9 * n) :]

In [19]:
block_size = 8
x = train_data[: block_size]
y = train_data[1: block_size + 1]

print(train_data[: block_size + 1])
for i in range(block_size):
    context = x[: i + 1]
    target  = y[i]
    print(f'When context: {context} target: {target}')

tensor([35,  1, 66,  9,  1, 49, 62, 52,  1])
When context: tensor([35]) target: 1
When context: tensor([35,  1]) target: 66
When context: tensor([35,  1, 66]) target: 9
When context: tensor([35,  1, 66,  9]) target: 1
When context: tensor([35,  1, 66,  9,  1]) target: 49
When context: tensor([35,  1, 66,  9,  1, 49]) target: 62
When context: tensor([35,  1, 66,  9,  1, 49, 62]) target: 52
When context: tensor([35,  1, 66,  9,  1, 49, 62, 52]) target: 1


In [20]:
def get_batch(data, batch_size, block_size):
    """
    Args:
        data: (torch.tensor) dataset
        batch_size: (int) size of the batch
        block_size: (int) lenght of the context
    """
    indx = torch.randint(len(data) - block_size, (batch_size, ))
    x_batch = torch.stack([data[i : i + block_size] for i in indx])
    y_batch = torch.stack([data[i + 1 : i + block_size + 1] for i in indx])
    return x_batch, y_batch

In [23]:
batch_size = 4
x_batch, y_batch = get_batch(data, batch_size, block_size)

print('input:')
print(x_batch)
print('output:')
print(y_batch)

input:
tensor([[ 1, 71, 49, 67,  1, 71, 56, 49],
        [53, 52,  1, 57, 68, 78, 67,  1],
        [63, 69,  8, 33, 62, 63, 71,  8],
        [57, 62, 68, 60, 73,  9,  0,  0]])
output:
tensor([[71, 49, 67,  1, 71, 56, 49, 68],
        [52,  1, 57, 68, 78, 67,  1, 49],
        [69,  8, 33, 62, 63, 71,  8, 45],
        [62, 68, 60, 73,  9,  0,  0, 79]])


In [28]:
context = torch.zeros((1, 1), dtype=torch.long)

print(context[:, -8:])
print()

tensor([[0]])



In [59]:
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        print(self.token_embedding_table)
    
    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)
        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, idx,  max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx


m = BigramLanguageModel(vocab_size)
logits, loss = m(x_batch, y_batch)
print(logits.shape)
print(loss)
print(decode(m.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

Embedding(82, 82)
torch.Size([32, 82])
tensor(4.7846, grad_fn=<NllLossBackward0>)

j!F2')8gnL0—3brlA”8wDhy:‘DSKrHtq3”pq7TWmVbh…G….hDuQ4QBP2G’)–AO-E'H*g—5 :xnY…Q.HCyuDf5gn“qNXluhZa.ulp


# Mathematical representation of self attention mechanism

In [73]:
B, T, C = 4, 8, 2 # batch size, context depth, number of chanels
x = torch.randn((B, T, C))
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
# wightened aggregation of the previous context (i.e. previos tokens)
x_agr = wei @ x

print(tril)
print(wei)

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


# Self attention mechanism

Each token emits two vectors: query and key vectors.

- Query what am I looking for 
- Key what do I contain
 

In [5]:
B, T, C = 4, 8, 32 # batch, time, channels
x = torch.randn(B, T, C)

# Single head self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)   # (B, T, 16)
q = query(x) # (B, T, 16)
wei = q @ k.transpose(-2, -1) * head_size**-0.5 # (B, T, 16) @ (B, 16, T) ---> (B, T, T)
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ x

print(out.shape)
print(out[0])

torch.Size([4, 8, 32])
tensor([[ 2.0432,  0.3536, -1.2293, -0.7704, -0.9040, -0.7010,  0.6719,  0.5030,
         -0.8689,  0.1290,  0.6432, -1.4434, -0.8084, -0.0971,  0.4020, -0.0624,
         -0.5644, -0.8798, -1.0318, -0.3393, -0.3511, -0.6505, -1.8884,  0.3673,
          0.2802,  1.2485,  2.0518, -1.4728,  1.6619, -1.2283, -0.1357,  0.5455],
        [ 2.0141,  0.3463, -1.2036, -0.7369, -0.8787, -0.7117,  0.6694,  0.4901,
         -0.8402,  0.1212,  0.6370, -1.4216, -0.8078, -0.1061,  0.3866, -0.0469,
         -0.5625, -0.8481, -1.0406, -0.3185, -0.3498, -0.6262, -1.8810,  0.3752,
          0.2908,  1.2451,  2.0329, -1.4735,  1.6322, -1.1828, -0.1164,  0.5229],
        [ 0.9951,  0.3647, -0.8058, -0.9011,  0.4741, -0.5256,  0.0523,  0.5753,
         -1.8102, -0.1692, -0.0524, -0.6945,  0.0931,  0.1007,  0.6411,  0.4191,
         -1.4607, -0.5826, -0.1995,  0.3849, -0.5290,  0.0901, -1.1662,  0.6040,
         -0.1256, -0.3199,  0.3977, -0.2008,  0.8341,  0.9137,  0.7768,  0.7315],
  