In [None]:
%%capture
!wget https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt

In [None]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import matplotlib.pyplot as plt

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

In [None]:
# Load the dataset: one name per line. The context manager closes the file
# handle as soon as the contents have been read.
with open('names.txt') as f:
    names = f.read().splitlines()

# tokenizer
# Character-level vocabulary: every character that occurs in the names, plus
# '.', which the notebook uses to mark the start and end of a name.
vocab = sorted(set(''.join(names) + '.'))
vocab_size = len(vocab)
stoi = {ch: i for i, ch in enumerate(vocab)}  # character -> token id
itos = {i: ch for ch, i in stoi.items()}      # token id -> character

def decode(seq: list[int]) -> str:
    """Turn a sequence of token ids back into text.

    Each id is looked up in the module-level ``itos`` table and the resulting
    characters are concatenated in order. An id that is not a key of ``itos``
    raises ``KeyError``.
    """
    chars = []
    for token_id in seq:
        chars.append(itos[token_id])
    return ''.join(chars)

def encode(name: str) -> list[int]:
    """Turn text into a list of token ids, one per character.

    Each character is looked up in the module-level ``stoi`` table. A character
    that is not a key of ``stoi`` raises ``KeyError``.
    """
    token_ids = []
    for ch in name:
        token_ids.append(stoi[ch])
    return token_ids

In [None]:
from torch.utils.data import Dataset, TensorDataset, DataLoader

# Build next-character training pairs. Each name is wrapped in '.' markers and
# tokenized; the input is every token except the last and the target is the
# same sequence shifted left by one, so Y[i][t] is the token that follows X[i][t].
# X and Y are plain Python lists whose rows have different lengths.
X = []
Y = []

for name in names:
    # A separate variable keeps `name` a str instead of rebinding it to a list of ids.
    tokens = encode('.' + name + '.')
    X.append(tokens[:-1])
    Y.append(tokens[1:])

In [None]:
# Peek at the first five input sequences (token ids of '.' + name, last token dropped).
X[:5]

[[0, 5, 13, 13, 1],
 [0, 15, 12, 9, 22, 9, 1],
 [0, 1, 22, 1],
 [0, 9, 19, 1, 2, 5, 12, 12, 1],
 [0, 19, 15, 16, 8, 9, 1]]

In [None]:
# Peek at the first five target sequences (the same token ids shifted left by one).
Y[:5]

[[5, 13, 13, 1, 0],
 [15, 12, 9, 22, 9, 1, 0],
 [1, 22, 1, 0],
 [9, 19, 1, 2, 5, 12, 12, 1, 0],
 [19, 15, 16, 8, 9, 1, 0]]

In [None]:
# Slice off the first t tokens of a sample token sequence.
x, t = [0, 5, 13, 13, 1], 4
x[:t]

[0, 5, 13, 13]

In [None]:
# Layer sizes, named once so the parameter shapes below stay consistent.
n_embd = 4     # embedding width
n_hidden = 10  # hidden-state width
n_out = 20     # per-step output width
# NOTE(review): if the outputs are meant to be next-character logits, n_out
# should probably be vocab_size rather than 20 — confirm.

# Model init
E = torch.randn(vocab_size, n_embd)  # embedding table, one row per token id
W1 = torch.randn(n_embd, n_hidden)   # embedding -> hidden
b1 = torch.randn(n_hidden)
W2 = torch.randn(n_hidden, n_out)    # hidden -> output
b2 = torch.randn(n_out)

In [None]:
# Model forward
x = [0, 5, 13, 13, 1]
output = []

h0 = torch.zeros(1, 10) # hidden state at time 0

h1 = torch.tanh(E[x[0]] @ W1 + b1) + h0 # hidden state at time 1
o1 = h1 @ W2 + b2 # output at time 1
output.append(o1[0])

In [None]:
# Step 2 of the unrolled recurrence: same weights, second token, carrying h1 forward.
# Re-running this cell on its own appends a duplicate entry to `output`.
h2 = (torch.matmul(E[x[1]], W1) + b1).tanh() + h1  # hidden state at time 2
o2 = torch.matmul(h2, W2) + b2                     # output at time 2
output.append(o2[0])

In [None]:
# Step 3 of the unrolled recurrence: same weights, third token, carrying h2 forward.
# Re-running this cell on its own appends a duplicate entry to `output`.
h3 = (torch.matmul(E[x[2]], W1) + b1).tanh() + h2  # hidden state at time 3
o3 = torch.matmul(h3, W2) + b2                     # output at time 3
output.append(o3[0])

In [None]:
# Show the outputs collected so far: one tensor per executed time step.
output

[tensor([ 0.4669,  0.6761, -3.0534,  4.0401, -2.0252, -0.6030, -4.7145, -1.1758,
         -2.9580, -1.3555, -0.4534,  0.7445,  0.2072,  1.1551,  2.4621,  2.8646,
          0.8427, -3.0760, -6.9223, -3.2064]),
 tensor([ -0.0721,   1.9245,  -4.9851,   1.5987,  -4.8473,  -1.1260, -12.2899,
           1.3603,  -2.3253,  -2.3572,  -2.7218,   0.1825,  -1.3522,   0.3771,
           5.8657,   3.4324,   0.3699,  -2.5818, -10.0213,  -3.8439]),
 tensor([  0.7521,   0.1791,  -4.7636,   1.3742,  -8.6670,   0.2180, -13.1483,
           2.4508,  -3.7205,  -0.3108,  -2.5391,  -0.2514,  -1.1757,  -0.9017,
           3.9833,   9.8831,  -2.7657,  -3.9826, -17.2085,  -8.0013])]

In [None]:
# Index the embedding table with the whole list of token ids at once:
# one row of E per id in x.
E[x]

tensor([[ 0.0868, -0.0398, -1.0034,  0.8291],
        [ 1.0975,  0.5847, -0.6764,  0.4758],
        [-0.3673, -0.9957,  0.8612,  0.9166]])

In [None]:
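# Notes (free-form text, not executable code):
#  -> One hot encoding
# 10: olma -> Embedding vector
#
# Bu olma -> Context vector   ("This is an apple": here "olma" is the noun "apple")
# Buni olma                   ("Don't take this": the same word "olma" is now a verb)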


In [None]:
# X and Y are Python lists of variable-length token lists, so they have no
# `.shape` and cannot be handed to TensorDataset directly. Right-pad them into
# rectangular (num_names, max_len) integer tensors first.
#   - inputs are padded with the '.' token id
#   - targets are padded with -100, the default `ignore_index` of
#     F.cross_entropy, so padded positions do not contribute to the loss
X_pad = nn.utils.rnn.pad_sequence(
    [torch.tensor(seq) for seq in X], batch_first=True, padding_value=stoi['.']
)
Y_pad = nn.utils.rnn.pad_sequence(
    [torch.tensor(seq) for seq in Y], batch_first=True, padding_value=-100
)

# 80/20 split in file order (no shuffling before the split).
train_size = int(.8 * X_pad.shape[0])
Xtr, Xts = X_pad[:train_size], X_pad[train_size:]
Ytr, Yts = Y_pad[:train_size], Y_pad[train_size:]

Dtr = TensorDataset(Xtr, Ytr)
Dts = TensorDataset(Xts, Yts)

In [None]:
# Mini-batch loaders over the train / test datasets.
DLtr = DataLoader(
    Dtr,
    batch_size=32,
    shuffle=True,    # new sample order every epoch
    drop_last=True,  # skip the final short batch so every training batch has 32 samples
)
DLts = DataLoader(
    Dts,
    batch_size=32,
    shuffle=False,    # fixed order for evaluation
    drop_last=False,  # keep every test sample, including the final short batch
)

In [None]:
class MLP1(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.emb = nn.Embedding(config['vocab_size'], config['n_embd'])
        self.fc = nn.Linear(config['n_embd'], config['vocab_size'])

    def forward(self, x):
        x = self.emb(x)
        x = self.fc(x)
        return x

In [None]:
class MLP2(nn.Module):
    """Fixed-context MLP: embed a window of tokens, concatenate, tanh layer, logits.

    ``config`` keys:
        vocab_size   -- number of distinct token ids (also the logit width)
        n_embd       -- embedding width
        context_size -- number of tokens per input window
        n_hidden     -- optional hidden width; defaults to ``vocab_size``

    Input is an integer tensor of shape ``(..., context_size)``; output has
    shape ``(..., vocab_size)``.
    """

    def __init__(self, config):
        super().__init__()
        n_hidden = config['n_hidden'] if 'n_hidden' in config else config['vocab_size']
        self.emb = nn.Embedding(config['vocab_size'], config['n_embd'])
        # Merge the trailing (context_size, n_embd) dims into one feature vector.
        # A negative start_dim works for any number of leading batch dims;
        # start_dim=3 was out of range for the (batch, context_size, n_embd)
        # tensor produced from a (batch, context_size) input.
        self.flat = nn.Flatten(start_dim=-2)
        self.fc = nn.Linear(config['context_size'] * config['n_embd'], n_hidden)
        self.tanh = nn.Tanh()
        self.out = nn.Linear(n_hidden, config['vocab_size'])

    def forward(self, x):
        """Return next-token logits for each context window in ``x``."""
        x = self.emb(x)   # (..., context_size, n_embd)
        x = self.flat(x)  # (..., context_size * n_embd)
        x = self.fc(x)
        x = self.tanh(x)
        x = self.out(x)   # (..., vocab_size)
        return x


In [None]:
block_size = 8
# MLP1 only reads "vocab_size" and "n_embd"; "context_size" is passed along but unused by it.
model1 = MLP1(dict(vocab_size=vocab_size, n_embd=16, context_size=block_size))
# model2 = MLP2({"vocab_size": vocab_size, "n_embd": 16, "context_size": block_size})

In [None]:
# Grab a single training batch for inspection. next(iter(...)) raises
# StopIteration when the loader is empty, instead of silently leaving whatever
# x / y earlier cells defined.
x, y = next(iter(DLtr))

In [None]:
# Smoke-test model1 on random token ids: a (1, 30) batch of ids goes in and one
# logit vector per position comes out. The upper bound is vocab_size (not a
# literal) so the ids always stay inside the model's embedding table.
xtest = torch.randint(0, vocab_size, (1, 30))
logits = model1(xtest)
print(xtest)
print(logits.shape)

tensor([[26, 24,  0, 15,  4, 24,  7, 22,  8, 10,  8, 23, 19, 17, 19, 20,  2, 18,
         11, 14, 18, 22,  7, 20,  9,  3, 25, 16,  3,  4]])
torch.Size([1, 30, 27])


In [None]:
# Recompute by hand the logits for the token at position 1 of xtest; the result
# should equal logits[0, 1]. Indexing with xtest[0, 1] instead of a literal id
# keeps the check valid for whatever ids the unseeded randint call produced.
model1.fc.weight @ model1.emb.weight[xtest[0, 1]] + model1.fc.bias

tensor([-0.5557, -0.2012,  0.0194,  0.6211,  0.5542,  0.0107,  0.5560, -0.8848,
         0.1130, -0.1375, -0.4019,  0.3113,  0.1812,  0.0145, -0.3206,  0.2724,
         0.4117,  0.2851, -0.3671,  0.4658, -0.1074, -0.2541, -0.1560, -0.1230,
        -0.5247,  0.1549, -0.4388], grad_fn=<AddBackward0>)

In [None]:
# Model logits for position 1 of the first (and only) sequence in xtest.
logits[0, 1]

tensor([-0.5557, -0.2012,  0.0194,  0.6211,  0.5542,  0.0107,  0.5560, -0.8848,
         0.1130, -0.1375, -0.4019,  0.3113,  0.1812,  0.0145, -0.3206,  0.2724,
         0.4117,  0.2851, -0.3671,  0.4658, -0.1074, -0.2541, -0.1560, -0.1230,
        -0.5247,  0.1549, -0.4388], grad_fn=<SelectBackward0>)

In [None]:
# Feed the token at position 1 of xtest through the model on its own, as a
# (1, 1) batch. model1 treats every position independently, so this reproduces
# logits[0, 1]. Slicing xtest instead of hard-coding an id keeps the comparison
# valid for any random draw.
model1(xtest[:, 1:2])

tensor([[[-0.5557, -0.2012,  0.0194,  0.6211,  0.5542,  0.0107,  0.5560,
          -0.8848,  0.1130, -0.1375, -0.4019,  0.3113,  0.1812,  0.0145,
          -0.3206,  0.2724,  0.4117,  0.2851, -0.3671,  0.4658, -0.1074,
          -0.2541, -0.1560, -0.1230, -0.5247,  0.1549, -0.4388]]],
       grad_fn=<ViewBackward0>)

In [None]:
emma.

a

In [None]:
# NOTE(review): block_size is not used by the loop below (the context grows
# without being cropped), and this assignment replaces the block_size = 8 set
# in an earlier cell — confirm whether it is still needed.
block_size = 3
# For the first four names, print every (context -> next token) pair: the
# context starts as [0] and grows by one token per step until the closing '.'.
for name in names[:4]:
    context = [0]
    for ix in (stoi[ch] for ch in name + '.'):
        print(f"{context} -> {[ix]}")
        context = [*context, ix]

[0] -> [5]
[0, 5] -> [13]
[0, 5, 13] -> [13]
[0, 5, 13, 13] -> [1]
[0, 5, 13, 13, 1] -> [0]
[0] -> [15]
[0, 15] -> [12]
[0, 15, 12] -> [9]
[0, 15, 12, 9] -> [22]
[0, 15, 12, 9, 22] -> [9]
[0, 15, 12, 9, 22, 9] -> [1]
[0, 15, 12, 9, 22, 9, 1] -> [0]
[0] -> [1]
[0, 1] -> [22]
[0, 1, 22] -> [1]
[0, 1, 22, 1] -> [0]
[0] -> [9]
[0, 9] -> [19]
[0, 9, 19] -> [1]
[0, 9, 19, 1] -> [2]
[0, 9, 19, 1, 2] -> [5]
[0, 9, 19, 1, 2, 5] -> [12]
[0, 9, 19, 1, 2, 5, 12] -> [12]
[0, 9, 19, 1, 2, 5, 12, 12] -> [1]
[0, 9, 19, 1, 2, 5, 12, 12, 1] -> [0]
