In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [2]:
# Load the corpus (one name per line). The context manager guarantees the
# file handle is closed even if reading fails.
with open('names.txt') as names_file:
    words = names_file.read().splitlines()

# Vocabulary: every distinct character in the corpus, sorted so that the
# character -> index assignment is stable across runs.
vocab = sorted(set(''.join(words)))
print(words[:7])
print(vocab)

['amit', 'eric', 'kevin', 'moxa', 'parneet', 'nico', 'xinyan']
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [3]:
# Lookup tables between characters and integer ids.
# Ids 1..len(vocab) follow the order of `vocab`; id 0 is reserved for '.',
# the token used to pad the start of a context and to mark the end of a name.
itos = dict(enumerate(vocab, start=1))
stoi = {ch: idx for idx, ch in itos.items()}
itos[0] = '.'
stoi['.'] = 0

print(itos)
print(stoi)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}


In [4]:
# construct the dataset
def create_dataset(words, block_size = 3, do_print=False):
    """Build (context, next-character) training pairs from a list of words.

    Every word is terminated with '.', and a window of the previous
    `block_size` character ids (initially all 0, the id of '.') slides over
    it. One example is emitted per character: the current window is the
    input and the id of the character itself is the target.

    Args:
        words: iterable of strings whose characters are keys of `stoi`.
        block_size: number of preceding character ids in each context.
        do_print: when True, print both tensor shapes and one
            "prediction for <context>: <target>" line per example.

    Returns:
        Tuple (X, Y) of tensors: X holds one context row per example,
        Y holds the matching target ids.
    """
    contexts, targets = [], []
    for name in words:
        window = [0] * block_size
        for ch in name + '.':
            contexts.append(window)
            idx = stoi[ch]
            targets.append(idx)
            # Slide the window: drop the oldest id, append the newest.
            # This builds a new list, so rows already stored are unaffected.
            window = window[1:] + [idx]
    X = torch.tensor(contexts)
    Y = torch.tensor(targets)
    if do_print:
        print(X.shape)
        print(Y.shape)
        for ctx_row, target in zip(X.tolist(), Y.tolist()):
            shown = ''.join(itos[k] for k in ctx_row)
            print(f"prediction for {shown}: {itos[target]}")
    return X,Y


In [5]:
# Sanity-check the dataset builder on the first seven names: do_print=True
# prints the tensor shapes and every (context -> target) pair. The returned
# (X, Y) tuple is the cell's last expression, so it is displayed as well.
create_dataset(words[:7], do_print=True)

torch.Size([41, 3])
torch.Size([41])
prediction for ...: a
prediction for ..a: m
prediction for .am: i
prediction for ami: t
prediction for mit: .
prediction for ...: e
prediction for ..e: r
prediction for .er: i
prediction for eri: c
prediction for ric: .
prediction for ...: k
prediction for ..k: e
prediction for .ke: v
prediction for kev: i
prediction for evi: n
prediction for vin: .
prediction for ...: m
prediction for ..m: o
prediction for .mo: x
prediction for mox: a
prediction for oxa: .
prediction for ...: p
prediction for ..p: a
prediction for .pa: r
prediction for par: n
prediction for arn: e
prediction for rne: e
prediction for nee: t
prediction for eet: .
prediction for ...: n
prediction for ..n: i
prediction for .ni: c
prediction for nic: o
prediction for ico: .
prediction for ...: x
prediction for ..x: i
prediction for .xi: n
prediction for xin: y
prediction for iny: a
prediction for nya: n
prediction for yan: .


(tensor([[ 0,  0,  0],
         [ 0,  0,  1],
         [ 0,  1, 13],
         [ 1, 13,  9],
         [13,  9, 20],
         [ 0,  0,  0],
         [ 0,  0,  5],
         [ 0,  5, 18],
         [ 5, 18,  9],
         [18,  9,  3],
         [ 0,  0,  0],
         [ 0,  0, 11],
         [ 0, 11,  5],
         [11,  5, 22],
         [ 5, 22,  9],
         [22,  9, 14],
         [ 0,  0,  0],
         [ 0,  0, 13],
         [ 0, 13, 15],
         [13, 15, 24],
         [15, 24,  1],
         [ 0,  0,  0],
         [ 0,  0, 16],
         [ 0, 16,  1],
         [16,  1, 18],
         [ 1, 18, 14],
         [18, 14,  5],
         [14,  5,  5],
         [ 5,  5, 20],
         [ 0,  0,  0],
         [ 0,  0, 14],
         [ 0, 14,  9],
         [14,  9,  3],
         [ 9,  3, 15],
         [ 0,  0,  0],
         [ 0,  0, 24],
         [ 0, 24,  9],
         [24,  9, 14],
         [ 9, 14, 25],
         [14, 25,  1],
         [25,  1, 14]]),
 tensor([ 1, 13,  9, 20,  0,  5, 18,  9,  3,  0, 11,  5

In [6]:
# for purpose of demo, don't bother with train, val, test

# Seed torch's global RNG so that parameter initialisation (and any later
# draws from the global generator, e.g. torch.randint / torch.multinomial)
# are reproducible when the notebook is re-run from a fresh kernel.
torch.manual_seed(42)

block_size = 3  # number of preceding characters used as context
vocab_size = len(vocab) + 1  # +1 for the '.' token stored at index 0
emb_size = 2  # dimensionality of each character's embedding vector
num_hidden = 200  # number of units in the hidden layer

# Pass block_size explicitly so the dataset's context length always matches
# the input width that W1 is sized for (emb_size * block_size).
X, Y = create_dataset(words, block_size)
C = torch.randn(vocab_size, emb_size, requires_grad=True)  # embedding table
W1 = torch.randn(emb_size * block_size, num_hidden, requires_grad=True)
b1 = torch.randn(num_hidden, requires_grad=True)
W2 = torch.randn(num_hidden, vocab_size, requires_grad=True)
b2 = torch.randn(vocab_size, requires_grad=True)

params = [C, W1, b1, W2, b2]
acc = sum(p.numel() for p in params)  # total number of trainable scalars

print(f"num params: {acc}")

num params: 6881


In [7]:
# Peek at the dataset: shape and first entry of the inputs, then the targets.
print(X.shape, X[0], Y.shape, Y[0], sep='\n')

# Draw 32 random example indices and look up their embeddings.
ix = torch.randint(0, X.shape[0], (32,))
# view(-1) flattens everything, batch dimension included, into one 1-D tensor.
embs = C[X[ix]].view(-1)
print(embs.shape)

torch.Size([228187, 3])
tensor([0, 0, 0])
torch.Size([228187])
tensor(1)
torch.Size([192])


In [8]:

def train(X, Y, batch_size=32, num_iters=10000, lr=0.1):
    """Train the MLP with minibatch SGD and return the per-step losses.

    Updates the notebook-level parameters listed in `params`
    (C, W1, b1, W2, b2) in place.

    Args:
        X: integer tensor of contexts, one row per example.
        Y: integer tensor of target ids, one per row of X.
        batch_size: number of examples sampled (with replacement) per step.
        num_iters: number of SGD steps to run.
        lr: learning rate applied to every parameter (default 0.1, the
            value that was previously hard-coded).

    Returns:
        List of Python floats: the minibatch loss at every step.
    """
    lossi = []
    for i in range(num_iters):
        # Sample a random minibatch of example indices.
        ix = torch.randint(0, X.shape[0], (batch_size,))

        # Forward pass: embed -> concatenate context -> tanh hidden -> logits.
        embs = C[X[ix]]
        preact = embs.view(batch_size, -1) @ W1 + b1
        h = torch.tanh(preact)
        logits = h @ W2 + b2
        loss = F.cross_entropy(logits, Y[ix])
        lossi.append(loss.item())
        if i % 1000 == 0:
            print(loss.item())

        # backprop: clear stale gradients first, otherwise they accumulate
        for p in params:
            p.grad = None
        loss.backward()

        # learn: plain SGD step. Done in place under no_grad so the update is
        # not tracked by autograd and no new tensor is allocated per step.
        with torch.no_grad():
            for p in params:
                p -= lr * p.grad
    return lossi

In [9]:
# Run training with the default batch size and iteration count, keeping the
# list of per-step losses that train() returns.
lossi = train(X, Y);

28.121305465698242
3.3761239051818848
3.3347816467285156
2.8541080951690674
2.5937540531158447
2.5225226879119873
2.5181028842926025
2.710533380508423
2.5936741828918457
2.8997180461883545


In [10]:
# now generate and show the power of the MLP

def generate_names(count, should_print = False):
    """Sample `count` names from the trained MLP and print each one.

    Generation starts from an all-'.' context (ids of 0) of length
    `block_size` and repeatedly samples the next character from the model's
    predicted distribution until the '.' token (id 0) is drawn. The printed
    name includes that terminating '.'.

    Args:
        count: number of names to generate.
        should_print: when True, also print the five most probable next
            characters (probability, character) at every sampling step.

    Returns:
        None. Names are written to stdout.
    """
    # Sampling needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        for _ in range(count):
            context = [0] * block_size
            newname = []
            while True:
                embs = C[context]
                preact = embs.view(1, -1) @ W1 + b1
                h = torch.tanh(preact)
                logits = h @ W2 + b2
                # torch.softmax is the numerically stable equivalent of
                # exp(logits) / exp(logits).sum(): it cannot overflow to inf
                # on large logits.
                probs = torch.softmax(logits, dim=1)
                if should_print:
                    top5 = torch.topk(probs, 5)
                    top5vals = top5.values.squeeze(0).tolist()
                    top5lets = [itos[j] for j in top5.indices.squeeze(0).tolist()]
                    for e in zip(top5vals, top5lets):
                        print(e)
                    print(f"------ top prob {top5lets[0]} ------")

                # we don't necessarily choose the top probability, we sample from the distribution
                nci = torch.multinomial(probs, 1, True).item()
                context = context[1:] + [nci]
                newname.append(itos[nci])
                if nci == 0:
                    break
            print(''.join(newname))

In [11]:
# Generate one name; should_print=True also prints the top-5 next-character
# probabilities at every sampling step.
generate_names(1, True)

(0.1292458474636078, 'k')
(0.0983208417892456, 'j')
(0.08199865370988846, 'e')
(0.07372533529996872, 'z')
(0.06550216674804688, 'd')
------ top prob k ------
(0.4711335003376007, 'e')
(0.2040032297372818, 'a')
(0.10118529945611954, 'i')
(0.0784405916929245, 'o')
(0.05000215768814087, 'y')
------ top prob e ------
(0.1711379885673523, 'l')
(0.15458017587661743, 'n')
(0.1257237046957016, 's')
(0.11900298297405243, 'e')
(0.05749349296092987, 'i')
------ top prob l ------
(0.32772204279899597, '.')
(0.1511790007352829, 'l')
(0.1402026116847992, 'n')
(0.06645282357931137, 's')
(0.06362863630056381, 'a')
------ top prob . ------
nae.
