In [12]:
from typing import List
from collections import defaultdict
import math
import time
import random
import os, sys

import numpy as np

In [13]:
import torch
import torch.nn as nn

In [14]:
device = "mps" if torch.backends.mps.is_available() else "cpu"

In [15]:
def read_dataset(filename):
    with open(filename, "r") as f:
        for line in f:
            yield [w2i[x] for x in line.strip().split(" ")]

def convert_to_model_data(dataset: List[List[int]], n_gram:int):
    x, y = list(), list()
    for sentence in dataset:
        sentence_length = len(sentence)
        sentence = [S] * n_gram + sentence + [S]
        for i in range(sentence_length):
            x.append(sentence[i: i+n_gram])
            y.append(sentence[i+n_gram])
    return x, y

In [16]:
context_size = 3

w2i = defaultdict(lambda: len(w2i))
S = w2i["<s>"]
UNK = w2i["<unk>"]

# Training set
train = list(read_dataset("../data/ptb/train.txt"))
random.shuffle(train)
x_train, y_train = convert_to_model_data(train, context_size)
x_train, y_train = torch.tensor(x_train, device=device), torch.tensor(y_train, device=device)

# Dev set
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("../data/ptb/valid.txt"))
x_dev, y_dev = convert_to_model_data(dev, context_size)
x_dev, y_dev = torch.tensor(x_dev, device=device), torch.tensor(y_dev, device=device)



# Feed-forward neural network model

In [17]:
class FNN_LM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, context_size, dropout_prob=0):
        super(FNN_LM, self).__init__()

        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.context_size = context_size
        self.dropout_prob = dropout_prob
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, device=device)
        self.fnn = nn.Sequential(
            nn.Linear(context_size * embedding_dim, hidden_size, device=device),
            nn.Tanh(),
            nn.Dropout(dropout_prob),
            nn.Linear(hidden_size, vocab_size, device=device),
        )

    def forward(self, words):
        embeddings = self.embedding(words) # Size: batch_size, context_size, embedding_dim
        embeddings = embeddings.reshape(-1, self.context_size * self.embedding_dim)
        outputs = self.fnn(embeddings) # Size: batch_size, vocab_size
        return outputs

The training is just to see whether the implementation works or not. This is not the right way to train such a model

In [18]:
x_train.shape

torch.Size([887521, 3])

In [19]:
vocab_size = len(w2i)
embedding_dim = 256
hidden_size = 64

In [20]:
model = FNN_LM(vocab_size, embedding_dim, hidden_size, context_size)
loss_criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [21]:
1/len(w2i)

0.0001

In [22]:
n_epochs = 10
batch_size = 16*2048

n_batch = int(np.ceil(len(x_train)/batch_size))

for epoch in range(n_epochs):
    train_acc = 0
    for batch in range(n_batch):
        x_batch, y_batch = x_train[batch*batch_size:min((batch+1)*batch_size, len(x_train))], y_train[batch*batch_size:min((batch+1)*batch_size, len(x_train))]
        scores = model(x_batch)
        loss = loss_criterion(scores, y_batch)
        # torch.argmax yields some weird results and don't have time
        # to investigate this now
        predictions = np.argmax(scores.cpu().detach().numpy(), axis=1)
        train_acc = (predictions == y_batch.cpu().detach().numpy()).sum()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Train loss: {loss} - Train accuracy: {train_acc / len(x_train)}")




Train loss: 6.361086368560791 - Train accuracy: 0.0004495668271511322
Train loss: 5.652867317199707 - Train accuracy: 0.0005318184020434446
Train loss: 5.499007701873779 - Train accuracy: 0.0005757610242461868
Train loss: 5.258009433746338 - Train accuracy: 0.0006151967108383914
Train loss: 5.163666248321533 - Train accuracy: 0.0006332244532805421
Train loss: 5.026402950286865 - Train accuracy: 0.0006557591313332304
Train loss: 5.045945644378662 - Train accuracy: 0.0006670264703595746
Train loss: 4.963106632232666 - Train accuracy: 0.0006782938093859188
Train loss: 4.942806720733643 - Train accuracy: 0.0006861809467043596
Train loss: 4.864715576171875 - Train accuracy: 0.0006929413501201661
