In [86]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from torch.nn.utils import clip_grad_norm_
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

torch.manual_seed(1)

<torch._C.Generator at 0x25fe22efa50>

In [4]:
import json
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import MWETokenizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Antonio\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [64]:
# tokenizer = TweetTokenizer(preserve_case=False)
tok = MWETokenizer([('<', 'unk', '>')], separator = '')

train_data_padded = []
train_data_not_padded = []
vocabulary = []
lengths_of_sentences = []
maximum_sentence_length = 50

with open("data/ptb.train.txt") as f:
    total_sentences = 0
    for i, line in enumerate(f):
        line = line.strip()
        s = word_tokenize(line)
        s = tok.tokenize(s)[:maximum_sentence_length - 2]
        s = ['<sos>'] + s + ['<eos>']
        train_data_not_padded.append(s.copy())
        lengths_of_sentences.append(len(s))
        s.extend(['<pad>']*(maximum_sentence_length - len(s))) 
        total_sentences += 1
        train_data_padded.append(s)
        vocabulary += s

        

In [65]:
longest_sentence_size = [len(x) for x in train_data_padded]
# print((longest_sentence_size))
# print(lengths_of_sentences)
print(train_data_not_padded[0])

vocabulary = sorted(list(set(vocabulary)))
word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

print("total sentences:", total_sentences)

['<sos>', 'aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec', 'ipo', 'kia', 'memotec', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim', 'snack-food', 'ssangyong', 'swapo', 'wachter', '<eos>']
total sentences: 42068


In [82]:
from gensim.models import Word2Vec
embedding_size = 100
word_embeddings = Word2Vec(train_data_not_padded, min_count=1, size=embedding_size, window=5)
word_embeddings.train(train_data_not_padded, len(train_data_not_padded), epochs=100, 
                      total_examples=word_embeddings.corpus_count)

(66651102, 96960000)

In [70]:
# word_embeddings.wv.most_similar("credit")
# word_embeddings.wv['credit']

[('loans', 0.517526388168335),
 ('loan', 0.4132787585258484),
 ('financial', 0.4076409935951233),
 ('debt', 0.4062206745147705),
 ('less-developed', 0.4037669599056244),
 ('savings', 0.3898395299911499),
 ('write-downs', 0.3844359517097473),
 ('payment', 0.38194894790649414),
 ('capital', 0.38089674711227417),
 ('financing', 0.3681441843509674)]

In [83]:
vocabulary_size = len(vocabulary) - 1 # - 1 for <'pad'>
input_size = 100
output_size = 100
hidden_size = 10

In [76]:
# make the word embeddings into a pythorch tensor
weights = torch.FloatTensor(word_embeddings.wv.vectors)

['<sos>', 'aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec', 'ipo', 'kia', 'memotec', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim', 'snack-food', 'ssangyong', 'swapo', 'wachter', '<eos>']


9968

In [58]:
class rnn_language_model(nn.Module):
    def __init__(self, vocabulary_size, input_size, hidden_size, num_layers, weights):
        super(rnn_language_model, self).__init__()
        self.embed = nn.Embedding.from_pretrained(weights)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, vocabulary_size)
        
    def forward(self, x, h):
        x = self.embed(torch.tensor(x, dtype=torch.long)).view(1, len(x), 100)
        out, (h, c) = self.lstm(x, h)
        out = out.reshape(out.size(0)*out.size(1), out.size(2))
        out = self.linear(out)
        return out, (h, c)

In [19]:
learning_rate = 0.0001
num_layers = 1
batch_size = 100
epochs = 100

model = rnn_language_model(vocabulary_size, input_size, hidden_size, num_layers, weights)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Fixed batch size - this should probably be changed later since the size will vary from sentence to sentence, not sure how...
def get_batch(data, batch_size):
    batches = []
    count = 0
    current_batch = []
    for i in range(len(data)):
        for j in range(len(data[i])):
            if count == batch_size + 1:
                batches.append(current_batch)
                current_batch = []
                count = 0
            else:
                current_batch.append(data[i][j])
                count += 1
    return batches  

batches = get_batch(train_data, batch_size)
print(len(batches))

9538


In [110]:
# constructing a batch for the forward pass
# print([word_embeddings.wv.vocab[x].index for x in train_data_padded[:2]])
for i in range(len(train_data_padded))
temp = torch.tensor([word_embeddings.wv.vocab[x].index for x in train_data_not_padded[0]])
#                      for i in range(len(train_data_not_padded))])
print(temp)
temp = torch.tensor([word_embeddings.wv.vocab[x].index for x in train_data_not_padded[i] for i in range(len(train_data_not_padded))])
X = pack_padded_sequence(temp,
                         lengths_of_sentences, batch_first=True)

tensor([   2, 9967, 9968, 9969, 9970, 9971, 9972, 9973, 9974, 9975, 9976, 9977,
        9978, 9979, 9980, 9981, 9982, 9983, 9984, 9985, 9986, 9987, 9988, 9989,
        9990,    3])


RuntimeError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [84]:
# Train
for epoch in range(epochs):
    (hidden, cell) = (torch.zeros(num_layers, batch_size, hidden_size), torch.zeros(num_layers, batch_size, hidden_size))
    for i in range(len(batches)):
        inputs = [word_embeddings.wv.vocab[x].index for x in batches[i][:-1] if x is not '<pad>' else np.zeros(embedding_size)]
        targets = torch.tensor([word_embeddings.wv.vocab[x].index for x in batches[i][1:] if x is not '<pad>' else np.zeros(embedding_size)], dtype=torch.long)

        hidden = hidden.detach()
        cell = cell.detach()
    
        outputs, (hidden, cell) = model(inputs, (hidden, cell))
        loss = nn.CrossEntropyLoss()(outputs, targets)

        model.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

    if epoch % 1 == 0:
        print ('Epoch [{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
               .format(epoch, epochs, loss.item(), np.exp(loss.item())))


SyntaxError: invalid syntax (<ipython-input-84-d1316243506b>, line 5)

In [57]:
# intialise first hidden state
(hidden, cell) = (torch.zeros(1, 1, hidden_size), torch.zeros(1, 1, hidden_size))

for idx in input_idx:
    print("input word:", word_embeddings.wv.index2word[idx])
    output = embedded(torch.tensor([idx], dtype=torch.long)).view(1, 1, 100)
    output, (hidden, cell) = lstm(output, (hidden, cell))
    output = softmax(out(output[0]))
    predicted_idx = np.argmax(output.detach().numpy())
    print("predicted_class: {0}".format(word_embeddings.wv.index2word[predicted_idx]))

input word: <unk>
predicted_class: <sos>
input word: with
predicted_class: at
input word: the
predicted_class: were
input word: $
predicted_class: do
input word: chairman
predicted_class: shares
input word: N
predicted_class: its
input word: the
predicted_class: were
input word: rewards
predicted_class: its
input word: <pad>
predicted_class: shares
input word: indicators
predicted_class: co.
input word: chips
predicted_class: its
input word: co.
predicted_class: shares
input word: <sos>
predicted_class: &
