In [43]:
import torch
import torch.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [44]:
# Load the dataset: one name per line of names.txt.
# The context manager closes the file handle as soon as the read finishes,
# instead of leaving it open until the garbage collector gets to it.
with open("names.txt") as names_file:
    words = names_file.read().splitlines()

In [45]:
# Sanity check: how many names were loaded.
len(words)

32033

In [46]:
# Character-level vocabulary: every distinct character that occurs in the
# names, in sorted order, plus one extra slot for the '.' marker.
chars = sorted(set(''.join(words)))
vocab_size = 1 + len(chars)
# Lookup tables for encoding and decoding characters.
# Real characters get ids 1..len(chars); id 0 is assigned to '.'.
stoi = dict(zip(chars, range(1, vocab_size)))
stoi['.'] = 0
itos = dict((i, ch) for ch, i in stoi.items())

In [47]:
# Build the dataset: each input row holds the ids of the previous
# `block_size` characters, and each target is the id of the character
# that comes next.
block_size = 3  # context length: how many preceding characters feed one prediction
X, Y = [], []

for word in words[:5]:  # only the first five names are used here
    context = block_size * [0]  # each name starts from a window filled with id 0 ('.')
    for ch in f"{word}.":  # the trailing '.' is appended so the end of the name is a target too
        ix = stoi[ch]
        X += [context]
        Y += [ix]
        context = [*context[1:], ix]  # slide the window: drop the oldest id, append the newest

X, Y = torch.tensor(X), torch.tensor(Y)

In [48]:
# Inspect the first training pair: the context window and the id it should predict.
X[0], Y[0]

(tensor([0, 0, 0]), tensor(5))

In [49]:
# Embedding table: one randomly initialised 2-dimensional row per vocabulary id.
C = torch.randn((vocab_size, 2))

In [50]:
# Confirm the embedding table is (vocab_size, 2).
C.shape

torch.Size([27, 2])

In [51]:
# Context window of example 13 next to the embedding rows it selects.
# Indexing C with just that row of X gives the same rows as embedding all of X first.
X[13], C[X[13]]

(tensor([0, 0, 1]),
 tensor([[ 0.4622,  1.0643],
         [ 0.4622,  1.0643],
         [-0.0024,  1.0208]]))

In [52]:
# Embedding rows selected by the first context window.
C[X[0]]

tensor([[0.4622, 1.0643],
        [0.4622, 1.0643],
        [0.4622, 1.0643]])

In [55]:
# Look up an embedding row of C for every id in X in a single indexing operation.
tok_emb = C[X]
# The result keeps X's dimensions and appends the embedding dimension of C.
tok_emb.shape

torch.Size([32, 3, 2])

In [56]:
# first layer
# The input width is derived instead of hard-coded: each example is the
# concatenation of `block_size` embeddings of C.shape[1] values each
# (3 * 2 = 6 with the current settings), so it keeps matching if either changes.
W1 = torch.randn(block_size * C.shape[1], 100)
# One bias per hidden unit; the width follows W1 so the two cannot drift apart.
b = torch.randn(W1.shape[1])

In [57]:
# One way to flatten the embeddings to one row per example:
# split along the context dimension and concatenate the pieces side by side.
torch.cat(torch.unbind(tok_emb, 1), 1)
# A more efficient way: view() reinterprets the existing data with a new shape.
# Keeping the first dimension and passing -1 lets PyTorch infer the row width
# (6 here), so the width is not hard-coded.
tok_emb.view(tok_emb.shape[0], -1)

tensor([[ 0.4622,  1.0643,  0.4622,  1.0643,  0.4622,  1.0643],
        [ 0.4622,  1.0643,  0.4622,  1.0643,  0.1740, -1.1365],
        [ 0.4622,  1.0643,  0.1740, -1.1365, -0.8293,  1.7618],
        [ 0.1740, -1.1365, -0.8293,  1.7618, -0.8293,  1.7618],
        [-0.8293,  1.7618, -0.8293,  1.7618, -0.0024,  1.0208],
        [ 0.4622,  1.0643,  0.4622,  1.0643,  0.4622,  1.0643],
        [ 0.4622,  1.0643,  0.4622,  1.0643,  1.1308, -0.3119],
        [ 0.4622,  1.0643,  1.1308, -0.3119, -0.5596, -0.4138],
        [ 1.1308, -0.3119, -0.5596, -0.4138,  0.7133,  0.5133],
        [-0.5596, -0.4138,  0.7133,  0.5133,  0.5057, -2.2215],
        [ 0.7133,  0.5133,  0.5057, -2.2215,  0.7133,  0.5133],
        [ 0.5057, -2.2215,  0.7133,  0.5133, -0.0024,  1.0208],
        [ 0.4622,  1.0643,  0.4622,  1.0643,  0.4622,  1.0643],
        [ 0.4622,  1.0643,  0.4622,  1.0643, -0.0024,  1.0208],
        [ 0.4622,  1.0643, -0.0024,  1.0208,  0.5057, -2.2215],
        [-0.0024,  1.0208,  0.5057, -2.2

In [58]:
# Hidden activations: flatten each example's context embeddings into one row,
# apply the first linear layer, then squash with tanh.
# The -1 lets view() infer the flattened width instead of hard-coding 6;
# b is broadcast across the rows.
h = torch.tanh(tok_emb.view(tok_emb.shape[0], -1) @ W1 + b)

In [59]:
# output layer
# The input width is taken from W1's output width rather than repeating the
# literal 100, so changing the hidden size in one place keeps the layers compatible.
W2 = torch.randn(W1.shape[1], vocab_size)
# One bias per vocabulary entry.
b2 = torch.randn(vocab_size)

In [60]:
# calculate the logits and normalize to probabilities
logits = h @ W2 + b2
# Subtract each row's maximum before exponentiating. Softmax is invariant to
# a per-row shift, so `prob` is unchanged mathematically, but exp() can no
# longer overflow to inf (and produce NaN probabilities) for large logits.
counts = (logits - logits.max(-1, keepdim=True).values).exp()
# Normalise each row so it sums to 1.
prob = counts / counts.sum(-1, keepdim=True)

In [62]:
# calculate the loss
# Mean negative log-likelihood of the correct next character.
# cross_entropy works directly from the raw logits (log-softmax + NLL, mean
# reduction by default), so a probability that underflows to 0 cannot turn
# the loss into inf through log(0).
loss = torch.nn.functional.cross_entropy(logits, Y)

torch.Size([32, 27])