Code adapted from https://abhinavcreed13.github.io/blog/bengio-trigram-nplm-using-pytorch/

In [19]:
#load wikitext data

train = []
test = []

train_file_path = '../newtraincorpus.txt'
test_file_path = '../newtestcorpus.txt'

with open(train_file_path, 'r', encoding='utf-8') as file:
    for line in file:
            # Process each line as a sentence
            words = (line.strip().split())
            train.append(words)

with open(test_file_path, 'r', encoding='utf-8') as file:
    for line in file:
            # Process each line as a sentence
            words = (line.strip().split())
            test.append(words)

print(len(train))
print(len(test))

77363
9418


In [20]:
train[0]

['valkyria',
 'of',
 'the',
 'battlefield',
 '<num>',
 'commonly',
 'referred',
 'to',
 'as',
 'valkyria',
 'chronicles',
 'iii',
 'outside',
 'japan',
 'is',
 'a',
 'tactical',
 'role',
 '@-@',
 'playing',
 'video',
 'game',
 'developed',
 'by',
 'sega',
 'and',
 'media']

In [21]:
#create vocabulary


UNK_symbol = "<UNK>"
vocab = set([UNK_symbol])



# create term frequency of the words
words_term_frequency_train = {}
for doc in train:
    for word in doc:
        # this will calculate term frequency
        # since we are taking all words now
        words_term_frequency_train[word] = words_term_frequency_train.get(word,0) + 1

# create vocabulary
for doc in train:
    for word in doc:
        if words_term_frequency_train.get(word,0) >= 5:
            vocab.add(word)

# remove "@-@" from vocab
vocab.remove("@-@")

print(len(vocab))

19114


In [22]:
#create trigrams

import numpy as np
# create required lists
x_train = []
y_train = []

x_test = []
y_test = []


# create word to id mappings
word_to_id_mappings = {}
for idx,word in enumerate(vocab):
    word_to_id_mappings[word] = idx

# function to get id for a given word
# return <UNK> id if not found
def get_id_of_word(word):
    unknown_word_id = word_to_id_mappings['<UNK>']
    return word_to_id_mappings.get(word,unknown_word_id)

# creating training and dev set
for sentence in train:
    for i,word in enumerate(sentence):
        if i+2 >= len(sentence):
            # sentence boundary reached
            # ignoring sentence less than 3 words
            break
        # convert word to id
        x_extract = [get_id_of_word(word),get_id_of_word(sentence[i+1])]
        y_extract = [get_id_of_word(sentence[i+2])]

        x_train.append(x_extract)
        y_train.append(y_extract)

for sentence in test:
    for i,word in enumerate(sentence):
        if i+2 >= len(sentence):
            # sentence boundary reached
            # ignoring sentence less than 3 words
            break
        # convert word to id
        x_extract = [get_id_of_word(word),get_id_of_word(sentence[i+1])]
        y_extract = [get_id_of_word(sentence[i+2])]

        x_test.append(x_extract)
        y_test.append(y_extract)
  

# making numpy arrays
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)  
  
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(1480510, 2)
(1480510, 1)
(174126, 2)
(174126, 1)


In [24]:
# load libraries
import torch
import multiprocessing
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import time

In [25]:
# Trigram Neural Network Model
class TrigramNNmodel(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size, h):
        super(TrigramNNmodel, self).__init__()
        self.context_size = context_size
        self.embedding_dim = embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, h)
        self.linear2 = nn.Linear(h, vocab_size, bias = False)

    def forward(self, inputs):
        # compute x': concatenation of x1 and x2 embeddings
        embeds = self.embeddings(inputs).view((-1,self.context_size * self.embedding_dim))
        # compute h: tanh(W_1.x' + b)
        out = torch.tanh(self.linear1(embeds))
        # compute W_2.h
        out = self.linear2(out)
        # compute y: log_softmax(W_2.h)
        log_probs = F.log_softmax(out, dim=1)
        # return log probabilities
        # BATCH_SIZE x len(vocab)
        return log_probs

In [30]:
# create parameters
gpu = 0 
# word vectors size
EMBEDDING_DIM = 200
CONTEXT_SIZE = 2
BATCH_SIZE = 256
# hidden units
H = 100
torch.manual_seed(13013)

# check if gpu is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
available_workers = multiprocessing.cpu_count()

print("--- Creating training and dev dataloaders with {} batch size ---".format(BATCH_SIZE))
train_set = np.concatenate((x_train, y_train), axis=1)
test_set = np.concatenate((x_test, y_test), axis=1)
train_loader = DataLoader(train_set, batch_size = BATCH_SIZE, num_workers = available_workers)
test_loader = DataLoader(test_set, batch_size = BATCH_SIZE, num_workers = available_workers)

cuda
--- Creating training and dev dataloaders with 256 batch size ---


In [34]:
# helper function to get accuracy from log probabilities
def get_accuracy_from_log_probs(log_probs, labels):
    probs = torch.exp(log_probs)
    predicted_label = torch.argmax(probs, dim=1)
    acc = (predicted_label == labels).float().mean()
    return acc

# helper function to evaluate model on dev data
def evaluate(model, criterion, dataloader, gpu):
    model.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        dev_st = time.time()
        for it, data_tensor in enumerate(dataloader):
            context_tensor = data_tensor[:,0:2].type(torch.LongTensor)
            target_tensor = data_tensor[:,2].type(torch.LongTensor)
            context_tensor, target_tensor = context_tensor.cuda(gpu), target_tensor.cuda(gpu)
            log_probs = model(context_tensor)
            mean_loss += criterion(log_probs, target_tensor).item()
            mean_acc += get_accuracy_from_log_probs(log_probs, target_tensor)
            count += 1
            if it % 500 == 0: 
                print("Dev Iteration {} complete. Mean Loss: {}; Mean Acc:{}; Time taken (s): {}".format(it, mean_loss / count, mean_acc / count, (time.time()-dev_st)))
                dev_st = time.time()

    return mean_acc / count, mean_loss / count

In [29]:
for it, data_tensor in enumerate(train_loader):       
    print(data_tensor[:,0:2])
    print(data_tensor[:,2])
    break

tensor([[ 5552, 13071],
        [13071, 17778],
        [17778, 18091],
        [18091, 12167],
        [12167,  3047],
        [ 3047,  2101],
        [ 2101,  5533],
        [ 5533,  2148],
        [ 2148,  5552],
        [ 5552,   590],
        [  590, 18441],
        [18441,  1888],
        [ 1888, 13535],
        [13535,   102],
        [  102,  2322],
        [ 2322, 12841],
        [12841, 18311],
        [18311,  5328],
        [ 5328,  5491],
        [ 5491,  9542],
        [ 9542,  2586],
        [ 2586,  1855],
        [ 1855,  8556],
        [ 8556, 12548],
        [12548,  3402],
        [17672,  7354],
        [ 7354, 17778],
        [17778, 16050],
        [ 1617, 15790],
        [15790, 12055],
        [12055,  5342],
        [ 5342, 15790],
        [15790, 13535],
        [13535,  2940],
        [ 2940,   102],
        [  102, 17778],
        [17778,  9356],
        [ 9356,  2586],
        [ 2586, 15790],
        [15790, 17778],
        [17778,  5552],
        [ 5267, 

In [35]:
# Using negative log-likelihood loss
loss_function = nn.NLLLoss()

# create model
model = TrigramNNmodel(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE, H)

# load it to gpu
model.cuda(gpu)

# using ADAM optimizer
optimizer = optim.Adam(model.parameters(), lr = 2e-3)


# ------------------------- TRAIN & SAVE MODEL ------------------------
best_acc = 0
best_model_path = None
for epoch in range(5):
    st = time.time()
    print("\n--- Training model Epoch: {} ---".format(epoch+1))
    for it, data_tensor in enumerate(train_loader):       
        context_tensor = data_tensor[:,0:2].type(torch.LongTensor)
        target_tensor = data_tensor[:,2].type(torch.LongTensor)

        context_tensor, target_tensor = context_tensor.to(device), target_tensor.to(device)

        # zero out the gradients from the old instance
        model.zero_grad()

        # get log probabilities over next words
        log_probs = model(context_tensor)

        # calculate current accuracy
        acc = get_accuracy_from_log_probs(log_probs, target_tensor)

        # compute loss function
        loss = loss_function(log_probs, target_tensor)

        # backward pass and update gradient
        loss.backward()
        optimizer.step()

        if it % 500 == 0: 
            print("Training Iteration {} of epoch {} complete. Loss: {}; Acc:{}; Time taken (s): {}".format(it, epoch, loss.item(), acc, (time.time()-st)))
            st = time.time()

    print("\n--- Evaluating model on dev data ---")
    dev_acc, dev_loss = evaluate(model, loss_function, test_loader, gpu)
    print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(epoch, dev_acc, dev_loss))
    if dev_acc > best_acc:
        print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
        best_acc = dev_acc
        # set best model path
        best_model_path = 'best_model_{}.dat'.format(epoch)
        # saving best model
        torch.save(model.state_dict(), best_model_path)


--- Training model Epoch: 1 ---
Training Iteration 0 of epoch 0 complete. Loss: 9.886507034301758; Acc:0.0; Time taken (s): 21.88786554336548
Training Iteration 500 of epoch 0 complete. Loss: 6.325962543487549; Acc:0.0859375; Time taken (s): 3.190002202987671
Training Iteration 1000 of epoch 0 complete. Loss: 6.187276840209961; Acc:0.203125; Time taken (s): 2.7600018978118896
Training Iteration 1500 of epoch 0 complete. Loss: 6.743995189666748; Acc:0.18359375; Time taken (s): 2.3860015869140625
Training Iteration 2000 of epoch 0 complete. Loss: 6.36767578125; Acc:0.15234375; Time taken (s): 2.3800017833709717
Training Iteration 2500 of epoch 0 complete. Loss: 6.237010478973389; Acc:0.16015625; Time taken (s): 2.3690013885498047
Training Iteration 3000 of epoch 0 complete. Loss: 5.213693618774414; Acc:0.1875; Time taken (s): 2.3780016899108887
Training Iteration 3500 of epoch 0 complete. Loss: 6.578780651092529; Acc:0.13671875; Time taken (s): 2.3860015869140625
Training Iteration 4000