In [51]:
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.utils.data.dataset import random_split

# Integer seed (2**31 - 1) shared across the notebook: it is passed to
# MLPNgramClassifier as `seed` and to StepByStep.set_seed. Despite the short
# name, this is a plain int, not a torch.Generator.
g = 2147483647

class MLPNgramClassifier(torch.nn.Module):
    """Character-level n-gram classifier built from a single hidden-layer MLP.

    Each of the ``block_size`` context indices is looked up in the embedding
    table ``C``; the embeddings are concatenated, passed through a tanh hidden
    layer (``W1``, ``b1``) and projected to ``vocab_size`` logits (``W2``, ``b2``).

    Args:
        seed: Integer seed for the generator used to draw the initial weights.
        block_size: Number of context indices per example.
        embedding_size: Width of each embedding vector.
        hidden_size: Number of units in the hidden layer.
        device: Device on which the parameters are created.
        vocab_size: Number of distinct token indices; used both as the number
            of embedding rows and as the number of output logits.
    """

    def __init__(self, seed=42, block_size=3, embedding_size=2, hidden_size=100, device='cpu', vocab_size=27):
        super().__init__()
        self.block_size = block_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.device = device
        # The generator has to live on the same device as the tensors it fills;
        # a default (CPU) generator is rejected by torch.randn for other devices.
        self.g = torch.Generator(device=device).manual_seed(seed)

        def _randn_param(*shape):
            # Standard-normal initialisation drawn from the seeded generator.
            # nn.Parameter already enables requires_grad.
            return torch.nn.Parameter(torch.randn(shape, generator=self.g, device=device))

        # Creation order is kept fixed (C, W1, b1, W2, b2) so that a given seed
        # always produces the same initial weights.
        self.C = _randn_param(vocab_size, embedding_size)
        self.W1 = _randn_param(block_size * embedding_size, hidden_size)
        self.b1 = _randn_param(hidden_size)
        self.W2 = _randn_param(hidden_size, vocab_size)
        self.b2 = _randn_param(vocab_size)

    def forward(self, x):
        """Return unnormalised logits for a batch of contexts.

        Args:
            x: Integer index tensor, expected shape ``(batch, block_size)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` holding the logits.
        """
        emb = self.C[x]
        # Concatenate the per-position embeddings into one row per example.
        flat = emb.view(-1, self.block_size * self.embedding_size)
        h = torch.tanh(flat @ self.W1 + self.b1)
        logits = h @ self.W2 + self.b2
        return logits

# Read one word per line. The context manager closes the file handle as soon
# as the contents have been read instead of leaving it open.
with open('names.txt') as names_file:
    words = names_file.read().splitlines()

# build the vocabulary: every distinct character gets an index starting at 1,
# and index 0 is reserved for the '.' boundary token
chars = sorted(set(''.join(words)))
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}

# build the dataset: each example is the previous `block_size` character
# indices (X) and the index of the character that follows them (Y)
block_size = 3
X, Y = [], []

for w in words:
    # every word starts from a context made entirely of the boundary token
    context = [0] * block_size
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        context = context[1:] + [ix]  # crop and append (builds a new list)
device = torch.device('cpu')
X = torch.as_tensor(X).to(device)
Y = torch.as_tensor(Y).to(device)


In [52]:
# Split the (X, Y) examples into train / validation / test subsets and wrap
# each subset in a DataLoader.
dataset = TensorDataset(X, Y)
train_ratio = .8
validation_ratio = .1

n_total = len(dataset)
n_train = int(n_total * train_ratio)
n_train_batch = 32
n_validation = int(n_total * validation_ratio)
n_validation_batch = 32
# the test split takes whatever is left so the three sizes sum to n_total
n_test = n_total - n_train - n_validation

# Seed the split with the notebook-wide seed `g` so the same examples land in
# the same subset on every run; without a generator the partition changes
# each time the cell is executed.
split_generator = torch.Generator().manual_seed(g)
train_data, validation_data, test_data = random_split(
    dataset, [n_train, n_validation, n_test], generator=split_generator
)

# NOTE(review): the training loader iterates in the same order every epoch
# (no shuffle=True) — confirm this is intended.
train_loader = DataLoader(train_data, batch_size=n_train_batch)
validation_loader = DataLoader(validation_data, batch_size=n_validation_batch)
# the whole test split is served as a single batch
test_loader = DataLoader(test_data, batch_size=n_test)

In [7]:
# Baseline configuration: one model, one SGD optimizer, cross-entropy loss.
lr = 0.01
embedding_size, hidden_size = 2, 100

# `g` is the notebook-wide integer seed; `block_size` and `device` come from
# the dataset-building cell.
model = MLPNgramClassifier(
    seed=g,
    block_size=block_size,
    embedding_size=embedding_size,
    hidden_size=hidden_size,
    device=device,
)
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
loss_fn = nn.functional.cross_entropy

In [73]:
# Execute StepByStep.py inside the notebook namespace; the cells below rely on
# the `StepByStep` name it is expected to define.
%run StepByStep.py

In [11]:
# Wrap the baseline model, loss function and optimizer in the StepByStep
# helper (the class is brought in by `%run StepByStep.py`).
sbs = StepByStep(model, loss_fn, optimizer)
# Seed the helper with the notebook-wide integer seed `g`.
sbs.set_seed(g)
# NOTE(review): presumably moves the model to `device` — confirm in StepByStep.py
sbs.to(device)

# Hand over the train / validation / test DataLoaders built earlier.
sbs.set_loaders(train_loader, validation_loader, test_loader)

# NOTE(review): presumably sets up TensorBoard logging under the name
# 'mlp-ngram' — confirm in StepByStep.py
sbs.set_tensorboard('mlp-ngram')

In [74]:
# Grid search over the embedding size (c_dim) and hidden-layer width (w_dim).
# Every configuration is trained in three stages with a step-wise decaying
# learning rate (lr, lr/10, lr/100), and the configuration with the lowest
# final `sbs.losses[-1]` seen so far is reported and remembered.

# Loop-invariant settings, set once instead of on every iteration.
lr = .1
loss_fn = nn.functional.cross_entropy

# Start from +inf so the first finished run is always recorded, whatever its
# loss value happens to be.
current_best_loss = float('inf')
# Best configuration found so far; kept in a variable so it survives an
# interrupted search instead of existing only in the printed output.
best_config = None

for c_dim in range(2, 20):
    for w_dim in range(100, 1000, 50):
        embedding_size = c_dim
        hidden_size = w_dim
        model = MLPNgramClassifier(g, block_size, embedding_size, hidden_size, device)
        # One optimizer per learning-rate stage, all over the same parameters.
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.5)
        reduced_optimizer = optim.SGD(model.parameters(), lr=lr/10, momentum=0.5)
        further_reduced_optimizer = optim.SGD(model.parameters(), lr=lr/100, momentum=0.5)

        sbs = StepByStep(model, loss_fn, optimizer)
        sbs.set_seed(g)
        sbs.to(device)

        sbs.set_loaders(train_loader, validation_loader, test_loader)

        sbs.set_tensorboard('mlp-ngram')

        # Stage 1: full learning rate; stages 2 and 3: lr/10 and lr/100.
        sbs.train(10)
        sbs.set_optimizer(reduced_optimizer)
        sbs.train(5)
        sbs.set_optimizer(further_reduced_optimizer)
        sbs.train(5)
        sbs.train_validation(1)

        # NOTE(review): selection uses sbs.losses[-1]; if that is the training
        # loss, selecting on sbs.val_losses[-1] would be the safer criterion —
        # confirm against StepByStep.py.
        final_loss = sbs.losses[-1]
        if final_loss < current_best_loss:
            current_best_loss = final_loss
            best_config = {
                'loss': current_best_loss,
                'embedding_size': embedding_size,
                'hidden_size': hidden_size,
                'lr': lr,
                'validation_loss': sbs.val_losses[-1],
            }
            print("Current best loss: ", current_best_loss)
            print("Current best embedding size: ", embedding_size)
            print("Current best hidden size: ", hidden_size)
            print("Current best learning rate: ", lr)
            print("Current validation loss: ", sbs.val_losses[-1])
            print("[][][][][][][][][][][]")
            
        
    

Current best loss:  2.239733041691245
Current best embedding size:  2
Current best hidden size:  100
Current best learning rate:  0.1
Current validation loss:  2.239867694414682
[][][][][][][][][][][]
Current best loss:  2.232458601555517
Current best embedding size:  2
Current best hidden size:  150
Current best learning rate:  0.1
Current validation loss:  2.231543315409945
[][][][][][][][][][][]
Current best loss:  2.231534537058559
Current best embedding size:  2
Current best hidden size:  250
Current best learning rate:  0.1
Current validation loss:  2.231130409708866
[][][][][][][][][][][]
Current best loss:  2.226823933077728
Current best embedding size:  2
Current best hidden size:  300
Current best learning rate:  0.1
Current validation loss:  2.2267742929418493
[][][][][][][][][][][]
Current best loss:  2.223254302628913
Current best embedding size:  2
Current best hidden size:  400
Current best learning rate:  0.1
Current validation loss:  2.2223362894245486
[][][][][][][][]

KeyboardInterrupt: 