In [7]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from typing import List, Tuple
%matplotlib inline

In [8]:
# Read the corpus, one entry per line. The context manager closes the file
# handle as soon as the read finishes instead of leaving it to the garbage collector.
with open("names.txt") as f:
    words = f.read().splitlines()

In [9]:
# number of lines read from names.txt (each line is one entry of `words`)
len(words)

32033

In [10]:
# vocabulary: every distinct character that occurs in the corpus, sorted
chars = sorted(set(''.join(words)))
# character -> integer id; corpus characters get ids 1..len(chars) in sorted order
stoi = dict(zip(chars, range(1, len(chars) + 1)))
# id 0 is reserved for '.', which pads contexts and terminates each word
stoi['.'] = 0
# integer id -> character (inverse of stoi)
itos = {idx: ch for ch, idx in stoi.items()}
vocab_size = len(stoi)

In [11]:
# hyperparameters
# base SGD step size; the training loop divides it by 10 for the second half of training
learning_rate = 0.01
# number of minibatch updates the training loop performs (one batch per iteration,
# not full passes over the data)
max_epochs = 200000
# number of examples sampled for each update
batch_size = 32
block_size = 3  # number of preceding characters used as context to predict the next one

# build dataset
def build_dataset (words: List[str]) -> Tuple[torch.Tensor, torch.Tensor]:
    """Turn words into (context, next-character) training pairs.

    Every word is terminated with '.' and scanned with a sliding window of
    ``block_size`` character ids that starts out filled with id 0. For each
    position, the current window is stored as an input row and the id of the
    character at that position as its target.

    Returns:
        A pair ``(X, Y)``: ``X`` holds one window per row, ``Y`` holds the
        matching target ids.
    """
    contexts, targets = [], []
    for name in words:
        window = [0] * block_size
        for token in map(stoi.__getitem__, name + '.'):
            contexts.append(window)
            targets.append(token)
            # slide the window: drop the oldest id, append the newest
            window = window[1:] + [token]
    return torch.tensor(contexts), torch.tensor(targets)

import random

# Shuffle the word list in place with a fixed seed so the split is the same on every fresh run.
random.seed(42)
random.shuffle(words)

# 80% / 10% / 10% split (by word) into training, validation and test sets
n1, n2 = (int(len(words) * frac) for frac in (0.8, 0.9))
(X_train, Y_train), (X_valid, Y_valid), (X_test, Y_test) = (
    build_dataset(part) for part in (words[:n1], words[n1:n2], words[n2:])
)

In [23]:
from typing import List
from torch.nn.parameter import Parameter


class Linear:
    """Affine layer computing ``X @ weight`` plus an optional bias.

    The weight is sampled from a standard normal using the module-level
    generator ``g`` and divided by ``sqrt(fan_in)``; the bias, when enabled,
    starts at zero. The most recent output is kept on ``self.out``.
    """

    def __init__ (self, fan_in: int, fan_out: int, bias: bool = True) -> None:
        init_scale = fan_in**0.5
        self.weight = torch.randn((fan_in, fan_out), generator=g) / init_scale
        self.bias = None
        if bias:
            self.bias = torch.zeros(fan_out)

    def forward (self, X: torch.Tensor) -> torch.Tensor:
        """Apply the layer to ``X`` and remember the result on ``self.out``."""
        projected = X @ self.weight
        self.out = projected if self.bias is None else projected + self.bias
        return self.out

    def __call__(self, X: torch.Tensor) -> torch.Tensor:
        return self.forward(X)

    def parameters(self):
        """Return the trainable tensors: the weight, then the bias if present."""
        params = [self.weight]
        if self.bias is not None:
            params.append(self.bias)
        return params
    

class BatchNorm1d:
    """Batch normalisation over dim 0 with a learnable scale and shift.

    In training mode the input is normalised with the statistics of the
    current batch and exponential moving averages of those statistics are
    maintained. When ``self.training`` is False the stored running statistics
    are used instead and are left untouched.

    Args:
        dim: number of features (size of the normalised dimension).
        eps: constant added to the variance before the square root.
        momentum: weight given to the current batch in the running averages.
    """

    def __init__ (self, dim: int, eps: float = 1e-5, momentum: float = 0.1) -> None:
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # parameters trained with backprop
        self.gamma = torch.ones(dim) # scale
        self.beta = torch.zeros(dim) # bias
        # buffers (trained with a running momentum update)
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def forward (self, X: torch.Tensor) -> torch.Tensor:
        """Normalise ``X``, apply scale and shift, and store the result on ``self.out``."""
        if self.training:
            xmean = X.mean(dim=0)
            xvar = X.var(dim=0) # torch.var default, i.e. Bessel-corrected
        else:
            xmean = self.running_mean
            xvar = self.running_var

        xhat = (X - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta # scale and shift

        if self.training:
            # the running statistics are plain buffers: keep them out of the autograd graph
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                # this must be an assignment; the earlier `-` discarded the result, so the
                # running variance stayed at its initial value and eval mode used stale stats
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        return self.out

    def __call__(self, X: torch.Tensor) -> torch.Tensor:
        return self.forward(X)

    def parameters(self):
        """Return the tensors trained with backprop: scale, then shift."""
        return [self.gamma, self.beta]

class Tanh:
    """Element-wise tanh activation; it has no trainable parameters."""

    def __call__(self, X: torch.Tensor) -> torch.Tensor:
        activated = X.tanh()
        # keep the latest activation on the instance, like the other layers do
        self.out = activated
        return activated

    def parameters(self):
        return list()

# model dimensions and the seeded generator used for all random initialisation below
n_embd = 10
n_hidden = 100
g = torch.Generator().manual_seed(42)

# one n_embd-dimensional embedding row per vocabulary entry
token_embedding_table = torch.randn((vocab_size, n_embd), generator=g)

# five Linear+Tanh stages followed by a Linear projection to vocabulary logits;
# layers are created in network order so the generator is consumed in that order
layers = []
for fan_in in [n_embd * block_size] + [n_hidden] * 4:
    layers.extend([Linear(fan_in, n_hidden), Tanh()])
layers.append(Linear(n_hidden, vocab_size))

with torch.no_grad():
    # make last layer less confident
    output_layer = layers[-1]
    output_layer.weight *= 0.1
    # NOTE(review): the gain below is applied to every Linear, including the output
    # layer, which is not followed by a Tanh -- confirm this is intended.
    for layer in layers:
        if not isinstance(layer, Linear):
            continue
        layer.weight *= 5/3 # tanh gain

# collect everything that is trained: the embedding table plus each layer's parameters
parameters = [token_embedding_table]
for layer in layers:
    parameters.extend(layer.parameters())
print(sum(p.numel() for p in parameters))
for p in parameters:
    p.requires_grad_(True)

46497


In [24]:
# log10 of the minibatch loss at every step, kept for plotting
lossi = []
for i in range(max_epochs):
    # construct minibatch; draw indices from the seeded generator `g` so that a
    # fresh run of the notebook reproduces the same training trajectory
    ix = torch.randint(0, X_train.shape[0], (batch_size,), generator=g)
    Xb, Yb = X_train[ix], Y_train[ix]

    # forward pass
    tok_emb = token_embedding_table[Xb] # (batch_size, block_size, n_embd)
    x = tok_emb.view(tok_emb.shape[0], -1) # (batch_size, block_size * n_embd)
    for layer in layers:
        x = layer(x)
    loss = F.cross_entropy(x, Yb)

    # backward pass
    for layer in layers:
        layer.out.retain_grad() # keep activation gradients available for inspection
    for p in parameters: # zero out the gradients
        p.grad = None
    loss.backward()

    # update parameters: plain SGD, with the step size cut by 10x for the second half
    lr = learning_rate if i < max_epochs / 2 else learning_rate / 10
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

    if i % 10000 == 0:
        print(f'Step {i:7d} Loss: {loss.item():.4f}')
    lossi.append(loss.log10().item())


Step       0 Loss: 3.2884
Step   10000 Loss: 2.2360
Step   20000 Loss: 2.4046
Step   30000 Loss: 1.8028
Step   40000 Loss: 2.0053
Step   50000 Loss: 2.0628
Step   60000 Loss: 2.3371
Step   70000 Loss: 2.7050
Step   80000 Loss: 2.3509
Step   90000 Loss: 2.4679
Step  100000 Loss: 1.8734
Step  110000 Loss: 2.2973
Step  120000 Loss: 1.7442
Step  130000 Loss: 1.9195
Step  140000 Loss: 2.1413
Step  150000 Loss: 2.0007
Step  160000 Loss: 2.2581
Step  170000 Loss: 2.0947
Step  180000 Loss: 1.8420
Step  190000 Loss: 2.1369


In [25]:
@torch.no_grad()
def split_loss (split: str) -> Tuple[str, float]:
    """Compute the cross-entropy loss of the current model on a whole data split.

    Args:
        split: one of 'train', 'valid' or 'test'.

    Returns:
        The pair ``(split, loss)``, where ``loss`` is a Python float.

    Raises:
        KeyError: if ``split`` is not one of the three known names.
    """
    inputs, targets = {
        'train': (X_train, Y_train),
        'valid': (X_valid, Y_valid),
        'test': (X_test, Y_test)
    }[split]

    # run the full split through the network in a single forward pass
    tok_emb = token_embedding_table[inputs] # (N, block_size, n_embd)
    h = tok_emb.view(tok_emb.shape[0], -1) # flatten each context into one row
    for layer in layers:
        h = layer(h)
    loss = F.cross_entropy(h, targets)
    return split, loss.item()

# switch every layer to inference behaviour before measuring losses
for layer in layers:
    setattr(layer, 'training', False)
# report the loss on the data the model was fit on, then on held-out data
for split_name in ('train', 'valid'):
    print (split_loss(split_name))

('train', 2.0376107692718506)
('valid', 2.101020574569702)


In [26]:
# loss on the test split, evaluated in its own cell, separate from train/valid
print (split_loss('test'))

('test', 2.0978434085845947)


In [28]:
# generate some names
g = torch.Generator().manual_seed(42) # fresh seeded generator so the samples are repeatable
# sampling needs no gradients; without this every forward pass would build an
# autograd graph because the parameters have requires_grad=True
with torch.no_grad():
    for i in range(20):
        out = []
        context = [0] * block_size # start from a context made only of id 0 ('.')
        while True:
            # forward pass for a single context
            tok_emb = token_embedding_table[torch.tensor([context])] # (1, block_size, n_embd)
            x = tok_emb.view(tok_emb.shape[0], -1)
            for layer in layers:
                x = layer(x)

            # turn logits into a distribution and draw the next character id
            probs = F.softmax(x, dim=1)
            ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
            context = context[1:] + [ix] # slide the context window
            out.append(ix)
            if itos[ix] == '.': # '.' marks the end of a word
                break
        print(''.join(itos[token] for token in out))

anueden.
tia.
mari.
nehante.
naziel.
yana.
kemarce.
man.
epiah.
nasilmazi.
kend.
josey.
gor.
lor.
mari.
yana.
cvisia.
acen.
kaithan.
tiyan.
