In [1]:
from collections import defaultdict

import numpy as np
import torch
from torch import nn, optim

In [2]:
# Run on Apple's Metal (MPS) backend when PyTorch reports it as available;
# otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In this notebook, I build a simple n-gram language model that predicts the next word from an n-gram context.

In [4]:
# Word -> integer id. The first lookup of a new word assigns it the next free
# id, which is the current size of the mapping.
w2i = defaultdict(lambda: len(w2i))
# Looking up "<unk>" first gives the unknown-word token id 0.
UNK = w2i["<unk>"]

def read_dataset(filename):
    """Yield every line of ``filename`` as a list of word ids.

    Words are looked up in the module-level ``w2i``. The name is resolved
    when each line is processed, so rebinding ``w2i`` between calls changes
    how later calls treat words that are not in the mapping yet.

    Args:
        filename: Path to a UTF-8 text file with one whitespace-tokenised
            sentence per line.

    Yields:
        One list of integer ids per line, in file order. A blank line yields
        an empty list.
    """
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            # split() without an argument drops leading/trailing whitespace
            # and collapses runs of it, so blank lines and doubled spaces
            # never produce an empty-string "word" that would take up an id.
            yield [w2i[x] for x in line.split()]

In [5]:
# Reading the training split grows the vocabulary: every new word gets an id.
train = list(read_dataset("../data/ptb/train.txt"))


class _FrozenVocab(dict):
    """Vocabulary snapshot in which unknown words map to ``UNK``.

    A ``defaultdict`` would store every missing key it is asked for, so each
    unseen dev word would be added to the vocabulary with id ``UNK``. That
    inflates ``len(w2i)`` and lets ``i2w[UNK]`` end up pointing at an
    arbitrary unseen word. ``__missing__`` on a plain ``dict`` subclass
    returns the fallback without storing anything.
    """

    def __missing__(self, key):
        return UNK


# Freeze the vocabulary before touching the dev split. read_dataset looks up
# the global w2i while it runs, so rebinding the name here is enough.
w2i = _FrozenVocab(w2i)
dev = list(read_dataset("../data/ptb/valid.txt"))
# Reverse mapping (id -> word) for turning predictions back into text.
i2w = {v: k for k, v in w2i.items()}

In [6]:
# Vocabulary size: the number of entries currently held in the word -> id map.
nwords = len(w2i)
print(nwords)

9999


# NGram Language Model

In [79]:
class NGramLanguageModel(nn.Module):
    """Sum of per-position word embeddings plus a learned bias.

    Each of the ``N_grams`` context positions has its own ``nn.Embedding``
    table. ``forward`` looks up one entry per position, adds the lookups
    together and adds ``bias``.

    NOTE(review): the output width is ``embedding_dim``, while the notebook
    pairs this model with ``nn.CrossEntropyLoss``, which expects one score per
    class. Confirm whether a projection to ``vocab_size`` is still to be added.

    Args:
        N_grams: Number of context positions (one embedding table each).
        vocab_size: Number of rows in every embedding table.
        embedding_dim: Width of each embedding vector and of the bias.
        device: Device on which the parameters are created.
    """

    def __init__(self, N_grams, vocab_size, embedding_dim, device):
        super().__init__()
        self.N_grams = N_grams
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.device = device

        # One independent table per context position, created in position order.
        tables = []
        for _ in range(N_grams):
            tables.append(nn.Embedding(vocab_size, embedding_dim, device=self.device))
        self.word_embeddings = nn.ModuleList(tables)

        # Bias with a leading singleton axis so it broadcasts over the summed lookups.
        initial_bias = torch.zeros((1, embedding_dim), device=self.device)
        self.bias = nn.Parameter(initial_bias, requires_grad=True)

    def forward(self, words):
        """Combine the embeddings of the first ``N_grams`` entries of ``words``.

        Args:
            words: Indexable container; ``words[i]`` is the index tensor that
                is fed to the i-th embedding table.

        Returns:
            The summed lookups plus the bias, with a leading axis of size 1.
            For scalar index tensors the shape is ``(1, embedding_dim)``.
        """
        per_position = []
        for position in range(self.N_grams):
            table = self.word_embeddings[position]
            per_position.append(table(words[position]))

        # Stack along a new leading axis, then collapse it while keeping it as size 1.
        summed = torch.stack(per_position).sum(dim=0, keepdim=True)
        return summed + self.bias



In [84]:
# Hyperparameters: number of context positions and width of each embedding.
N, embedding_dim = 2, 10

# One embedding table per context position, sized to the current vocabulary.
model = NGramLanguageModel(
    N_grams=N,
    vocab_size=len(w2i),
    embedding_dim=embedding_dim,
    device=device,
)
loss_criterion = nn.CrossEntropyLoss()
# NOTE(review): lr=0.1 is 100x Adam's default of 1e-3 — confirm this is intended.
optimizer = optim.Adam(model.parameters(), lr=0.1)


# Model training and eval [WIP]