In [44]:
import torch
import torch.nn as nn

In [64]:
# Hyperparameters and runtime configuration shared by the cells below.
n_embd = 3        # size of the last (feature) dimension fed to the attention layers
block_size = 6    # length of every sampled sequence
batch_size = 16   # number of sequences drawn per batch

# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [7]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open until
# garbage collection.
with open('../../data/shakespeare.txt', 'r', encoding="utf-8") as corpus_file:
    text = corpus_file.read()

text[:100]  # preview the first 100 characters

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [11]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = list(set(text))
chars.sort()
chars[:15]  # peek at the first few symbols

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B']

In [18]:
# Lookup tables between vocabulary characters and their integer ids
# (an id is the character's position in `chars`).
itoc = {i: c for i, c in enumerate(chars)}
ctoi = {c: i for i, c in enumerate(chars)}


def encode(s):
    """Return the list of integer ids for the characters of ``s``.

    Raises:
        KeyError: if ``s`` contains a character that is not in ``ctoi``.
    """
    return [ctoi[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids ``l``.

    Raises:
        KeyError: if an id in ``l`` is not in ``itoc``.
    """
    return ''.join(itoc[i] for i in l)

In [28]:
# Encode the full corpus as a 1-D tensor of character ids, then hold out
# the trailing 10% as the test split.
data = torch.tensor(encode(text))
n = int(0.9*len(data))  # index of the first held-out element
train_data, test_data = data[:n], data[n:]

In [41]:
def get_batch(split):
    """Sample a random batch of input/target sequences.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``test_data``.

    Returns:
        A pair ``(X, y)`` of tensors, each of shape
        ``(batch_size, block_size)`` and moved to ``device``. ``y`` holds the
        same windows as ``X`` shifted one position to the right.
    """
    if split == 'train':
        source = train_data
    else:
        source = test_data

    # `high` is exclusive, so the shifted target window never runs past the end.
    starts = torch.randint(0, len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + 1 + block_size])

    return torch.stack(inputs).to(device), torch.stack(targets).to(device)


# Draw one training batch as a quick smoke check.
X, y = get_batch('train')

In [70]:
class attention(nn.Module):
    """Single head of unmasked scaled dot-product self-attention.

    Args:
        embd_dim: Size of the last dimension of the input. When omitted, the
            notebook-level ``n_embd`` hyperparameter is used.
        head_size: Output width of the key, query and value projections.
    """

    def __init__(self, embd_dim=None, head_size=5):
        super().__init__()
        if embd_dim is None:
            # Fall back to the notebook hyperparameter so `attention()` keeps working.
            embd_dim = n_embd
        self.key = nn.Linear(embd_dim, head_size, bias=False)
        self.query = nn.Linear(embd_dim, head_size, bias=False)
        self.value = nn.Linear(embd_dim, head_size, bias=False)

    def forward(self, x):
        """Apply self-attention over the second-to-last dimension of ``x``.

        Args:
            x: Tensor of shape ``(..., T, embd_dim)``, e.g. ``(B, T, C)``.

        Returns:
            Tensor of shape ``(..., T, head_size)``.
        """
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)

        # Scale the logits by 1/sqrt(d_k), where d_k is the key/query width
        # (head_size), not the input embedding width. That keeps the variance
        # of the dot products near one whatever the projection size.
        scale = k.shape[-1] ** -0.5
        wei = q @ k.transpose(-2, -1) * scale

        # Normalise over the key positions so each row sums to one.
        wei = torch.softmax(wei, dim=-1)

        return wei @ v
        
# Smoke-test the attention head on one random input.
att = attention()

# Shape is (B, T, C) = (1, block_size, n_embd).
X = torch.rand(1, block_size, n_embd)

print(X, end="\n\n")
print(att(X))  # same leading dims as X; last dim is the head size

tensor([[[0.2209, 0.3995, 0.7796],
         [0.9525, 0.5422, 0.9862],
         [0.3222, 0.3460, 0.2547],
         [0.7180, 0.7180, 0.7628],
         [0.7975, 0.6919, 0.3358],
         [0.7224, 0.0723, 0.4874]]])

tensor([[[ 0.0724,  0.0988, -0.3838, -0.1094,  0.5333],
         [ 0.0737,  0.0981, -0.3884, -0.1087,  0.5379],
         [ 0.0751,  0.0937, -0.3854, -0.1037,  0.5294],
         [ 0.0733,  0.0972, -0.3864, -0.1076,  0.5351],
         [ 0.0756,  0.0930, -0.3877, -0.1029,  0.5318],
         [ 0.0764,  0.0940, -0.3888, -0.1041,  0.5329]]],
       grad_fn=<UnsafeViewBackward0>)


In [53]:
# Scratch check: evaluate the inverse-square-root factor for C = 3.
C = 3
C**-0.5

0.5773502691896257

In [43]:
# @torch.no_grad()
# def estimate_loss():
#     outs = {}
#     for split in ['train', 'val']:
#         losses = torch.zeros()

torch.Size([16, 128])