In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
words = open('names.txt', 'r').read().splitlines()

In [3]:
# build the vocab of chars and mappings to/from ints
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)} # chars from 1-26 are alpha
stoi['.'] = 0 # special char to delineate begin/end of word
itos = {i:s for s,i in stoi.items()}
#print(itos)

In [4]:
block_size = 3 # context length, how many chars we take to predict the next one
LUT_dim = 10
batch_size = 32
training_loops = 200000
decay_at = 10000
hidden_layer_size = 200

In [5]:
# build the dataset (split into 3)

def build_dataset(words):
    X, Y = [], []
    for w in words:
        context = [0] * block_size # padded tokens
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix] # crop and append
            
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    #print(X.shape,Y.shape)
    return X,Y

import random
random.seed(42)
random.shuffle(words) # randomly shuffle words (so unsorted)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1]) # build training dataset (words 80% of dataset)
Xdev, Ydev = build_dataset(words[n1:n2]) # build dev dataset (words 80-90, so 10%)
Xte, Yte = build_dataset(words[n2:]) # build test dataset (words 90-100, so last 10% of dataset)

In [6]:
#block_size * LUT_dim

In [7]:
# params
g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((27,LUT_dim), generator=g) # puts 27 chars in 10 dims
W1 = torch.randn((block_size * LUT_dim ,hidden_layer_size), generator=g) 
b1 = torch.randn(hidden_layer_size, generator=g) # will be broadcasted when added to W1
W2 = torch.randn((hidden_layer_size,27), generator=g) # 200 outputs from hidden layer, 27 outputs (27 chars possible)
b2 = torch.randn(27, generator=g)
parameters = [C,W1,b1,W2,b2]

In [8]:
sum(p.nelement() for p in parameters)

11897

In [9]:
for p in parameters:
    p.requires_grad = True

In [10]:
#lre = torch.linspace(-3,0,1000)
#lrs = 10**lre

In [11]:
# collect learning rates as a list for diff values
#initialize vars to track lr
#lri = []
lossi = []
stepi = []

In [12]:
# TRAINING LOOP using minibatches

for i in range(training_loops):
    # minibatch construct
    ix = torch.randint(0,Xtr.shape[0], (batch_size,))
    
    # forward pass
    emb = C[Xtr[ix]] # UPDATED build LUT embedding matrix, (32, num of chars (3), dims of LUT (10) )
    h = torch.tanh(emb.view(-1,block_size * LUT_dim) @ W1 + b1) # (batch_size, hidden_layer_size) hidden layer
    logits = h @ W2 + b2 # (batch_size,27)
    loss = F.cross_entropy(logits,Ytr[ix]) # replaces counts, prob and loss lines above
    #print(loss.item())
    
    # backward pass
    for p in parameters:
        p.grad = None # initialize grad vals to 0
    loss.backward() # populates gradients

    # update parameters
    #lr = lrs[i]
    lr = 0.01
    #lr = 0.1 if i < decay_at else 0.01 # decay after 100,000 steps
    for p in parameters:
        p.data += -lr * p.grad 
        
    # track stats for learning rates
    #lri.append(lre[i])
    stepi.append(i)
    lossi.append(loss.log10().item()) # track log loss 
        
#print(loss.item()) # calculates loss on minibatch

In [13]:
#plt.plot(stepi,lossi)

In [14]:
# calculate loss for training dataset
emb = C[Xtr] 
h = torch.tanh(emb.view(-1,block_size * LUT_dim) @ W1 + b1) # hidden layer
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Ytr)
loss

tensor(2.2118, grad_fn=<NllLossBackward0>)

In [15]:
# calculate loss for dev dataset
emb = C[Xdev] # changed to using Xdev for evaluation
h = torch.tanh(emb.view(-1,block_size * LUT_dim) @ W1 + b1) # hidden layer
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Ydev)
loss

tensor(2.2363, grad_fn=<NllLossBackward0>)

In [16]:
# calculate loss for test dataset
emb = C[Xte] # changed to using Xdev for evaluation
h = torch.tanh(emb.view(-1,block_size * LUT_dim) @ W1 + b1) # hidden layer
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Yte)
loss

tensor(2.2366, grad_fn=<NllLossBackward0>)

In [17]:
# sample from model
g = torch.Generator().manual_seed(2147483647 + 10)

for _ in range(20):
    out = []
    context = [0] * block_size # init iwith all ..
    while True:
        emb = C[torch.tensor([context])] # (1, block_size, d) # dim 1 because we're making one example
        h = torch.tanh(emb.view(1,-1) @ W1 + b1)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1) # exponentiate logits to sum to 1
        ix = torch.multinomial(probs,num_samples=1, generator=g).item()
        context = context[1:] + [ix] # shift context window
        out.append(ix) # record context window
        if ix == 0:
            break
            
    print(''.join(itos[i] for i in out))

carpan.
amoriqui.
kemri.
revty.
skandan.
jazhien.
den.
rha.
kaeli.
nellara.
chaiivia.
leigph.
bman.
cateennthonor.
emi.
adbi.
watthon.
jarynix.
kaek.
dura.


Where I left off:
https://youtu.be/P6sfmUTpUmc?t=225

review the decorator `@torch.no_grad() # this decorator disables gradient tracking`