Based on "A Neural Probabilistic Language Model" by Bengio et al. (2003)

In [83]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt

In [84]:
with open('data/names.txt', 'r') as f:
    words = [word.strip().lower() + '.' for word in f.readlines()]

train, valid, test = 0.8, 0.1, 0.1
train_words = words[:int(len(words) * train)]
valid_words = words[int(len(words) * train):int(len(words) * (train + valid))]
test_words = words[int(len(words) * (train + valid)):]

In [73]:
class WordDataSet(Dataset):
    def __init__(self, words, blck_size):
        self.words = words
        self.blck_size = blck_size

        # Create a dictionary that maps char to integers, and vice versa
        self.chars = sorted(list(set(''.join(self.words))))
        self.char_to_int = {c: i + 1 for i, c in enumerate(self.chars)}
        self.char_to_int['.'] = 0
        self.int_to_char = {i: c for c, i in self.char_to_int.items()}

        X, y = [], []
        for w in self.words:
            context = [0] * blck_size
            for i in range(len(w)):
                idx = self.char_to_int[w[i]]
                X.append(context)
                y.append(idx)
                context = context[1:] + [idx]

        self.X = torch.tensor(X)
        self.y = torch.tensor(y)

    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]    

In [80]:
blck_size = 3
train_ds = WordDataSet(train_words, blck_size)
valid_ds = WordDataSet(valid_words, blck_size)
test_ds = WordDataSet(test_words, blck_size)

train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=64, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=64, shuffle=True)

In [81]:
class NNLM(torch.nn.Module):
    # Neural Network Language Model
    def __init__(self, emb_size, hidden_size, vocab_size, blck_size):
        super(NNLM, self).__init__()
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.blck_size = blck_size

        self.emb = torch.nn.Parameter(torch.randn(vocab_size, emb_size))
        self.W1 = torch.nn.Parameter(torch.randn(emb_size * blck_size, hidden_size))
        self.b1 = torch.nn.Parameter(torch.randn(hidden_size))
        self.W2 = torch.nn.Parameter(torch.randn(hidden_size, vocab_size))
        self.b2 = torch.nn.Parameter(torch.randn(vocab_size))

    def forward(self, x):
        x = self.emb[x]
        x = x.view(-1, self.emb_size * self.blck_size)
        x = torch.tanh(torch.matmul(x, self.W1) + self.b1)
        x = torch.matmul(x, self.W2) + self.b2
        return x

Embedding size > 32 doesn't make sense, because the alphabet size is only 26.

In [82]:
model = NNLM(32, 200, len(train_ds.chars) + 1, blck_size)
print(f'Number of parameters: {sum([p.numel() for p in model.parameters()])}')

num_epochs = 100
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_loss, valid_loss = [], []
for epoch in range(num_epochs):
    model.train()
    running_loss = 0
    for x, y in train_dl:
        optimizer.zero_grad()
        y_hat = model(x)
        loss = criterion(y_hat, y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    train_loss.append(running_loss / len(train_dl))

    model.eval()
    running_loss = 0
    for x, y in valid_dl:
        y_hat = model(x)
        loss = criterion(y_hat, y)
        running_loss += loss.item()
    valid_loss.append(running_loss / len(valid_dl))

    print(f'Epoch: {epoch + 1}/{num_epochs} | Train loss: {train_loss[-1]:.4f} | Valid loss: {valid_loss[-1]:.4f}')

Number of parameters: 25924
Epoch: 1/100 | Train loss: 7.0049 | Valid loss: 5.2712
Epoch: 2/100 | Train loss: 3.7116 | Valid loss: 4.2324
Epoch: 3/100 | Train loss: 3.1267 | Valid loss: 3.7745
Epoch: 4/100 | Train loss: 2.8431 | Valid loss: 3.4917
Epoch: 5/100 | Train loss: 2.6661 | Valid loss: 3.2986
Epoch: 6/100 | Train loss: 2.5469 | Valid loss: 3.1518
Epoch: 7/100 | Train loss: 2.4584 | Valid loss: 3.0523
Epoch: 8/100 | Train loss: 2.3907 | Valid loss: 2.9591
Epoch: 9/100 | Train loss: 2.3379 | Valid loss: 2.8861
Epoch: 10/100 | Train loss: 2.2937 | Valid loss: 2.8167
Epoch: 11/100 | Train loss: 2.2584 | Valid loss: 2.7544
Epoch: 12/100 | Train loss: 2.2287 | Valid loss: 2.7148
Epoch: 13/100 | Train loss: 2.2032 | Valid loss: 2.6903
Epoch: 14/100 | Train loss: 2.1820 | Valid loss: 2.6507
Epoch: 15/100 | Train loss: 2.1633 | Valid loss: 2.6248
Epoch: 16/100 | Train loss: 2.1480 | Valid loss: 2.6088
Epoch: 17/100 | Train loss: 2.1359 | Valid loss: 2.5732
Epoch: 18/100 | Train loss: 2

In [162]:
# sampling
max_length = 20
num_words = 10

model.eval()
for _ in range(num_words):
    context = [0] * blck_size
    generated_word = []
    for _ in range(max_length):
        x = torch.tensor([context])
        y_hat = model(x)
        # output has shape (1, vocab_size), squeeze to (vocab_size)
        probs = F.softmax(y_hat, dim=1).squeeze()
        sampled_char_idx = torch.multinomial(probs, 1).item()
        if sampled_char_idx == 0:
            break
        generated_word.append(train_ds.int_to_char[sampled_char_idx])
        context = context[1:] + [sampled_char_idx]
    print(''.join(generated_word))

ellenzi
aysha
belles
ronix
larenley
gracerem
cora
jiai
haden
mace
