<a href="https://colab.research.google.com/github/gokulanv/NLP_Papers/blob/master/NNLM/neuralProbLanguageModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# code reference from @graykode
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

dtype = torch.FloatTensor

sentences = [ "i studied a book", "i read a newspaper", "i hate a phone", "I love a computer"]

word_list = " ".join(sentences).split()
word_list = list(set(word_list))
word_dict = {w: i for i, w in enumerate(word_list)}
number_dict = {i: w for i, w in enumerate(word_list)}
n_class = len(word_dict) # number of Vocabulary

# NNLM Parameter
n_step = 3 # n-1 in paper
n_hidden = 2 # h in paper
m = 2 # m in paper


In [0]:
def make_batch(sentences):
    input_batch = []
    target_batch = []

    for sen in sentences:
        word = sen.split()
        input = [word_dict[n] for n in word[:-1]]
        target = word_dict[word[-1]]

        input_batch.append(input)
        target_batch.append(target)

    return input_batch, target_batch

# Model
class NNLM(nn.Module):
    def __init__(self):
        super(NNLM, self).__init__()
        self.C = nn.Embedding(n_class, m)
        self.H = nn.Parameter(torch.randn(n_step * m, n_hidden).type(dtype))
        self.W = nn.Parameter(torch.randn(n_step * m, n_class).type(dtype))
        self.d = nn.Parameter(torch.randn(n_hidden).type(dtype))
        self.U = nn.Parameter(torch.randn(n_hidden, n_class).type(dtype))
        self.b = nn.Parameter(torch.randn(n_class).type(dtype))

    def forward(self, X):
      X = self.C(X)
      X = X.view(-1, n_step * m)
      hidden_layer_activation = torch.tanh(self.d + torch.mm(X, self.H))
      output = self.b + torch.mm(X, self.W) + torch.mm(hidden_layer_activation, self.U)
      return output

In [15]:

model = NNLM()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

input_batch, target_batch = make_batch(sentences)
input_batch = Variable(torch.LongTensor(input_batch))
target_batch = Variable(torch.LongTensor(target_batch))

print("Training....")
# Training
for epoch in range(10000):
    optimizer.zero_grad()
    output = model(input_batch)

    # output : [batch_size, n_class], target_batch : [batch_size] (LongTensor, not one-hot)
    loss = criterion(output, target_batch)
    if (epoch + 1)%1000 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

    loss.backward()
    optimizer.step()

# Predict
predict = model(input_batch).data.max(1, keepdim=True)[1]

print("Evaluating...")
# Test
print([sen.split()[:2] for sen in sentences], '->', [number_dict[n.item()] for n in predict.squeeze()])


Training....
Epoch: 1000 cost = 0.140552
Epoch: 2000 cost = 0.027548
Epoch: 3000 cost = 0.009706
Epoch: 4000 cost = 0.004267
Epoch: 5000 cost = 0.002090
Epoch: 6000 cost = 0.001085
Epoch: 7000 cost = 0.000583
Epoch: 8000 cost = 0.000320
Epoch: 9000 cost = 0.000179
Epoch: 10000 cost = 0.000101
Evaluating...
[['i', 'studied'], ['i', 'read'], ['i', 'hate'], ['I', 'love']] -> ['book', 'newspaper', 'phone', 'computer']


## RELU Comparison

In [0]:
# Model
class NNLMRelu(nn.Module):
    def __init__(self):
        super(NNLMRelu, self).__init__()
        self.C = nn.Embedding(n_class, m)
        self.H = nn.Parameter(torch.randn(n_step * m, n_hidden).type(dtype))
        self.W = nn.Parameter(torch.randn(n_step * m, n_class).type(dtype))
        self.d = nn.Parameter(torch.randn(n_hidden).type(dtype))
        self.U = nn.Parameter(torch.randn(n_hidden, n_class).type(dtype))
        self.b = nn.Parameter(torch.randn(n_class).type(dtype))

    def forward(self, X):
      X = self.C(X)
      X = X.view(-1, n_step * m)
      hidden_layer_activation = torch.relu(self.d + torch.mm(X, self.H))
      output = self.b + torch.mm(X, self.W) + torch.mm(hidden_layer_activation, self.U)
      return output

In [17]:

model = NNLMRelu()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

input_batch, target_batch = make_batch(sentences)
input_batch = Variable(torch.LongTensor(input_batch))
target_batch = Variable(torch.LongTensor(target_batch))

print("Training....")
# Training
for epoch in range(10000):
    optimizer.zero_grad()
    output = model(input_batch)

    # output : [batch_size, n_class], target_batch : [batch_size] (LongTensor, not one-hot)
    loss = criterion(output, target_batch)
    if (epoch + 1)%1000 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

    loss.backward()
    optimizer.step()

# Predict
predict = model(input_batch).data.max(1, keepdim=True)[1]

print("Evaluating...")
# Test
print([sen.split()[:2] for sen in sentences], '->', [number_dict[n.item()] for n in predict.squeeze()])


Training....
Epoch: 1000 cost = 0.197540
Epoch: 2000 cost = 0.031416
Epoch: 3000 cost = 0.010084
Epoch: 4000 cost = 0.004223
Epoch: 5000 cost = 0.001997
Epoch: 6000 cost = 0.001008
Epoch: 7000 cost = 0.000543
Epoch: 8000 cost = 0.000300
Epoch: 9000 cost = 0.000168
Epoch: 10000 cost = 0.000094
Evaluating...
[['i', 'studied'], ['i', 'read'], ['i', 'hate'], ['I', 'love']] -> ['book', 'newspaper', 'phone', 'computer']
