In [7]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Load the names file as a list with one entry per line. The context manager
# closes the file handle as soon as the read finishes instead of leaving it
# open until garbage collection.
with open('../names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [6]:
# Number of entries read from the names file (one per line).
len(words)

32033

In [19]:
# Character vocabulary: index 0 is the start/stop marker, followed by the
# lowercase ASCII letters. `stoi` maps character -> index and `itos` is the
# inverse mapping index -> character.
import string
START_STOP_TOKEN = '.'
tokens = [START_STOP_TOKEN] + list(string.ascii_lowercase)
stoi = dict(zip(tokens, range(len(tokens))))
itos = dict(enumerate(tokens))
num_tokens = len(tokens)
print(itos)

{0: '.', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}


In [364]:
BLOCK_SIZE = 3 # context length; how many chars do we use to predict the next one?
def build_dataset(words, block_size=None):
    """Turn a list of words into (context, next-character) tensors.

    Each word is terminated with START_STOP_TOKEN and scanned left to right.
    Every character becomes one target, paired with the `block_size` token
    indices that precede it. The window is left-padded with the index of
    START_STOP_TOKEN at the start of each word.

    Args:
        words: iterable of strings; every character must be a key of `stoi`.
        block_size: context length. Defaults to the module-level BLOCK_SIZE,
            looked up at call time.

    Returns:
        (X, Y): X is an int64 tensor of shape (N, block_size) holding context
        indices and Y is an int64 tensor of shape (N,) holding target indices,
        where N is the number of characters plus one terminator per word.
        Empty input yields shapes (0, block_size) and (0,).

    Raises:
        KeyError: if a word contains a character missing from `stoi`.

    Side effects:
        Prints the shapes of X and Y.
    """
    if block_size is None:
        block_size = BLOCK_SIZE
    # Pad with the start/stop token's index rather than a hard-coded 0 so the
    # padding stays correct if the vocabulary order ever changes.
    pad_ix = stoi[START_STOP_TOKEN]
    X, Y = [], []
    for w in words:
        context = [pad_ix] * block_size
        for ch in w + START_STOP_TOKEN:
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            # slide the window: drop the oldest index, append the newest
            context = context[1:] + [ix]
    # Explicit dtype and shape keep the result well-formed even when there
    # are no examples (torch.tensor([]) alone would be float32 of shape (0,)).
    X = torch.tensor(X, dtype=torch.long).reshape(len(Y), block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    print(X.shape, Y.shape)
    return X, Y

# -- single dataset, no split (kept for reference) --
#X, Y = build_dataset(words)

# -- 80% / 10% / 10% train / dev / test split over a seeded shuffle --
import random
random.seed(42)
shuffled_words = list(words)  # copy so the original order of `words` is kept
random.shuffle(shuffled_words)

num_words = len(shuffled_words)
n1 = int(0.8 * num_words)
n2 = int(0.9 * num_words)

# build_dataset prints the tensor shapes, so each label is emitted first
# without a trailing newline.
print('trainset:', end=' ')
Xtr, Ytr = build_dataset(shuffled_words[:n1])

print('devset  :', end=' ')
Xdev, Ydev = build_dataset(shuffled_words[n1:n2])

print('testset :', end=' ')
Xte, Yte = build_dataset(shuffled_words[n2:])

trainset: torch.Size([182625, 3]) torch.Size([182625])
devset  : torch.Size([22655, 3]) torch.Size([22655])
testset : torch.Size([22866, 3]) torch.Size([22866])


In [322]:
# Compare different ways of fetching the embedding row for the token stored
# at X[13, 2]: indexing C with that single entry, and indexing C with all of
# X and then selecting position [13, 2].
# NOTE(review): no cell above this one assigns X or C in the notebook as
# shown (the only `X, Y = ...` line is commented out) -- confirm this cell
# still runs after Restart & Run All.
row, col = 13, 2
token = X[row, col]
print('X[13,2]    =', token.item())
print('C[1]       =', C[1])
print('C[X[13,2]] =', C[token])
print('C[X][13,2] =', C[X][row, col])

X[13,2]    = 1
C[1]       = tensor([-1.4027, -0.6303])
C[X[13,2]] = tensor([-1.4027, -0.6303])
C[X][13,2] = tensor([-1.4027, -0.6303])


In [172]:
# Index C with the integer tensor X: every entry of X is replaced by the
# corresponding row of C, adding one trailing dimension to X's shape.
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [139]:
# Hidden-layer parameters drawn from a standard normal: a weight matrix with
# 6 input rows and one bias value per hidden unit.
hidden_layer_size = 100
W1 = torch.randn(6, hidden_layer_size)
b1 = torch.randn((hidden_layer_size,))

In [140]:
# Flatten each example's embeddings into a row of 6 values, apply the affine
# map (W1, b1) and squash the result with tanh.
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)

In [141]:
# Shape of the hidden-layer activations.
h.shape

torch.Size([32, 100])

In [142]:
# Output-layer parameters drawn from a standard normal: one column of weights
# and one bias value per token in the vocabulary.
W2 = torch.randn(hidden_layer_size, num_tokens)
b2 = torch.randn(num_tokens)

In [143]:
# Output layer: affine map of the hidden activations, one score per token.
logits = h @ W2 + b2

In [145]:
# Shape of the output scores.
logits.shape

torch.Size([32, 27])

In [171]:
# Softmax along dim 1, written out by hand. Subtracting each row's maximum
# before exponentiating leaves the resulting probabilities mathematically
# unchanged, but keeps exp() from overflowing to inf (which would turn the
# division into nan) when a logit is large.
counts = (logits - logits.max(1, keepdim=True).values).exp()
probs = counts / counts.sum(1, keepdim=True)
# Sanity check: shape of the result, and the first row should sum to 1.
probs.shape, probs[0].sum()

(torch.Size([32, 27]), tensor(1.))

In [170]:
# Mean negative log-likelihood: for each row of probs pick the probability
# assigned to the target index in Y, then average the negative logs. The row
# index is derived from probs itself instead of a hard-coded batch size of 32,
# so the cell keeps working when the number of examples changes.
loss = -probs[torch.arange(probs.shape[0]), Y].log().mean()
loss

tensor(19.2251)

In [173]:
# ------- cleanup -------

In [324]:
# Shapes of the training inputs and targets produced by the split cell.
Xtr.shape, Ytr.shape

(torch.Size([182625, 3]), torch.Size([182625]))

In [406]:
# Model hyperparameters and a seeded generator so initialisation is repeatable.
EMB_DIM_SIZE = 8
hidden_layer_size = 300
g = torch.Generator().manual_seed(2147483647)

# The draw order below determines which random numbers each tensor receives.
n_inputs = BLOCK_SIZE * EMB_DIM_SIZE  # width of one flattened context
C = torch.randn(num_tokens, EMB_DIM_SIZE, generator=g)       # embedding table
W1 = torch.randn(n_inputs, hidden_layer_size, generator=g)   # hidden weights
b1 = torch.randn(hidden_layer_size, generator=g)             # hidden bias
W2 = torch.randn(hidden_layer_size, num_tokens, generator=g) # output weights
b2 = torch.randn(num_tokens, generator=g)                    # output bias
parameters = [C, W1, b1, W2, b2]

# Turn on gradient tracking for every trainable tensor.
for p in parameters:
    p.requires_grad_(True)
print('num params:', sum(p.numel() for p in parameters))

num params: 15843


In [366]:
# Learning-rate candidates: 1000 exponents evenly spaced in [-3, 0] and the
# corresponding rates 10**exponent.
lre = torch.linspace(-3, 0, steps=1000)
lrs = torch.pow(10, lre)
#plt.plot(lrs)

In [400]:
# Fresh, independent accumulators for learning rate, loss and step index.
lri, lossi, stepi = [], [], []

In [407]:
# Minibatch SGD on the training split, with a step learning-rate schedule.
MINIBATCH_SIZE = 64
NUM_STEPS = 200_000             # total number of SGD updates
LR_DECAY_STEP = NUM_STEPS // 2  # step at which the learning rate drops
REPORT_EVERY = 10_000           # progress is printed every this many steps
print('Context length:', BLOCK_SIZE)
print('Number of embedding dims:', EMB_DIM_SIZE)
print('Minibatch size:', MINIBATCH_SIZE)
print('Hidden layer size:', hidden_layer_size)
print('Number of params:', sum(p.nelement() for p in parameters))
print('-------------------------------')
print()

# for i,lr in enumerate(lrs):
for i in range(NUM_STEPS):
    # -- get indices for minibatch --
    ix = torch.randint(0, Xtr.shape[0], (MINIBATCH_SIZE,))
    x, y = Xtr[ix], Ytr[ix]

    # -- forward pass --
    emb = C[x]
    h = torch.tanh(emb.view(-1, BLOCK_SIZE * EMB_DIM_SIZE) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, y)

    # -- backward pass --
    # reset gradients first; backward() accumulates into .grad
    for p in parameters:
        p.grad = None
    loss.backward()

    # -- update --
    lr = 0.1 if i < LR_DECAY_STEP else 0.01
    # Update in place under no_grad so the step itself is not tracked by
    # autograd (the supported replacement for writing through `.data`).
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

    # -- track stats --
    # lri.append(lr)
    # lossi.append(loss.log10().item())
    # stepi.append(i)

    if (i + 1) % REPORT_EVERY == 0:
        # i + 1 steps are complete here, so the last report reads 100%
        print(f'{100 * (i + 1) / NUM_STEPS:0.0f}% -- loss: {loss.item():0.4f}')

# loss of the final minibatch only, not of the whole training set
print(f'loss: {loss.item():.4f}')

Context length: 3
Number of embedding dims: 8
Minibatch size: 64
Hidden layer size: 300
Number of params: 15843
-------------------------------

0%
5%
10%
15%
20%
25%
30%
35%
40%
45%
50%
55%
60%
65%
70%
75%
80%
85%
90%
95%
loss: 2.0622


In [381]:
#plt.plot(stepi, lossi)

In [382]:
#plt.plot(lri, lossi)

In [415]:
# Loss over the entire training split. This is evaluation only, so run the
# forward pass under no_grad: otherwise autograd records a graph over every
# training example for a backward pass that never happens.
with torch.no_grad():
    emb = C[Xtr]
    h = torch.tanh(emb.view(-1, BLOCK_SIZE * EMB_DIM_SIZE) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ytr)
print(f'train loss: {loss.item():0.4f}')

train loss: 2.1134


In [416]:
# Loss over the entire dev split. This is evaluation only, so run the forward
# pass under no_grad: otherwise autograd records a graph over every dev
# example for a backward pass that never happens.
with torch.no_grad():
    emb = C[Xdev]
    h = torch.tanh(emb.view(-1, BLOCK_SIZE * EMB_DIM_SIZE) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ydev)
print(f'dev loss: {loss.item():0.4f}')

dev loss: 2.1572


In [412]:
# Scatter plots of embedding dimension 0 against every other dimension, with
# each point labelled by its character. The plots were previously disabled by
# an unconditional `continue` that left the loop body unreachable; the switch
# below keeps them off by default but makes that explicit and easy to flip.
SHOW_EMBEDDING_PLOTS = False

#c_dims = (1,4)
if SHOW_EMBEDDING_PLOTS:
    for dim in range(1, EMB_DIM_SIZE):
        c_dims = (0, dim)
        plt.figure(figsize=(4,4))
        plt.scatter(C[:,c_dims[0]].data, C[:,c_dims[1]].data, s=100)
        # separate index name so the outer loop variable is not shadowed
        for tok in range(C.shape[0]):
            plt.text(C[tok,c_dims[0]].item(), C[tok,c_dims[1]].item(), itos[tok], ha='center', va='center', color='white')
        # the original passed 'minor' as the (truthy) visibility argument,
        # which simply switched the grid on
        plt.grid(True)

In [420]:
# Sample 20 names from the model, using a dedicated seeded generator.
g = torch.Generator().manual_seed(2147483647 + 10)

for _ in range(20):
    out = []
    context = [0] * BLOCK_SIZE  # start from a window filled with index 0
    ix = None
    # keep drawing characters until index 0 has been emitted (it is still
    # appended to `out` before the loop ends)
    while ix != 0:
        emb = C[torch.tensor(context).unsqueeze(0)]  # (1, block_size, d)
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        logits = h @ W2 + b2
        probs = logits.softmax(dim=1)
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        context = [*context[1:], ix]  # slide the window forward by one
        out.append(ix)
    print(''.join(itos[i] for i in out))

carmahzati.
hari.
kimree.
thil.
halaysleer.
huth.
delynn.
jareei.
ner.
kiah.
maiif.
kaleigh.
ham.
joce.
quinn.
suline.
livabi.
waje.
ogiearyxin.
kaelynn.
