In [10]:
import json
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

from torch.nn.utils import clip_grad_norm_
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from nltk.tokenize import word_tokenize
from nltk.tokenize import MWETokenizer

from gensim.models import Word2Vec

In [3]:
torch.manual_seed(1)

<torch._C.Generator at 0x10ea29f10>

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/eliisabethein/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load data

In [12]:
def load_data(filename, max_sentence_len=50):
    # the tokenizer splits <unk> so we use MWETokenizer to re-merge it
    merger = MWETokenizer([('<', 'unk', '>')], separator = '') 
    data_original = []
    data_padded = []
    with open(filename) as f:
        for line in f:
            sentence = word_tokenize(line.strip())       # tokenize sentence
            sentence = merger.tokenize(sentence)         # merge <unk>
            sentence = sentence[:max_sentence_len - 2]   # cut sentence at max_sentence_length
            sentence = ['<sos>'] + sentence + ['<eos>']  # add start and end-of-sentence tags
            
            # pad the rest of the sentence
            padded_sentence = sentence.copy()
            padded_sentence.extend(['<pad>']*(max_sentence_len - len(sentence))) 
            
            data_original.append(sentence)
            data_padded.append(padded_sentence)
    
    return data_original, data_padded

In [13]:
train_data, train_data_padded = load_data("data/ptb.train.txt")
val_data, val_data_padded = load_data("data/ptb.valid.txt")
test_data, test_data_padded = load_data("data/ptb.test.txt")

### Create Word2Vec word embeddings

In [19]:
embedding_size = 100
num_epochs_to_train = 100

In [20]:
word2vec_model = Word2Vec(train_data, min_count=1, size=embedding_size, window=5)
word2vec_model.train(train_data, epochs=num_epochs_to_train, total_examples=word_embeddings.corpus_count)
# word_embeddings.train(train_data, len(train_data), epochs=100, total_examples=word_embeddings.corpus_count)

(66649761, 96960000)

In [23]:
word2vec_model.wv.most_similar("credit")
# word2vec_model.wv['credit']

[('debt', 0.48688817024230957),
 ('loans', 0.47244635224342346),
 ('coverage', 0.39948761463165283),
 ('loan-loss', 0.3886515200138092),
 ('capital', 0.38623660802841187),
 ('less-developed', 0.3841148018836975),
 ('loan', 0.3809422254562378),
 ('savings', 0.3809009790420532),
 ('credit-card', 0.37522560358047485),
 ('reserves', 0.37364476919174194)]

In [21]:
vocabulary_size = len(word2vec_model.wv.vocab)
print("size of the vocabulary:", vocabulary_size)

size of the vocabulary: 10005


### Split data into batches

In [55]:
# OLD FUNCTION
# Fixed batch size - this should probably be changed later since the size will vary from sentence to sentence, not sure how...
def get_batch(data, data_padded, batch_size):
    batches = []
    count = 0
    current_batch = []
    for i in range(len(data)):
        for j in range(len(data[i])):
            if count == batch_size + 1:
                batches.append(current_batch)
                current_batch = []
                count = 0
            else:
                current_batch.append(data[i][j])
                count += 1
    return batches

In [155]:
# NEW FUNCTION
def get_batches(data, data_padded, batch_size, pad_index, word2vec_model):
    inputs = []
    targets = []
    lengths = []
    for i in range(len(data) // batch_size):
        # take batch_size sentences from the data each time
        batch_sentences = data[i*batch_size:(i+1)*batch_size]
        batch_sentence_lens = [len(x) for x in batch_sentences]
        
        # sentences in a batch have to be sorted in decreasing order of length (for pack_padded_sentence)
        sorted_pairs = sorted(zip(batch_sentence_lens,batch_sentences), reverse=True)
        batch_sentences = [sentence for length, sentence in sorted_pairs]
        batch_sentence_lens = [length-1 for length, sentence in sorted_pairs]
        
        # each input and target is a (batch_size x max_sentence_len-1 x 1) matrix
        # initially filled with the index for padding tag <pad>
        input_batch = np.ones((batch_size, len(data_padded[0])-1, 1)) * pad_index
        target_batch = np.ones((batch_size, len(data_padded[0])-1, 1)) * pad_index
        
        # for each sentence in the batch, fill the corresponding row in current_batch
        # with the indexed of the words in the sentence (except for <pad>)
        for j, sentence in enumerate(batch_sentences):
            word_indexes = np.array([word2vec_model.wv.vocab[word].index for word in sentence])
            input_batch[j,0:len(sentence)-1,0] = word_indexes[:-1]
            target_batch[j,0:len(sentence)-1,0] = word_indexes[1:]
        
        # make the matrices into torch tensors and append
#         inputs.append(torch.tensor(input_batch))
#         targets.append(torch.tensor(target_batch))
        inputs.append(input_batch)
        targets.append(target_batch)
        lengths.append(batch_sentence_lens)
    return inputs, targets, lengths

In [156]:
batch_size = 100
padding_index = vocabulary_size
train_batches, train_targets, train_sentence_lens = get_batches(train_data, train_data_padded, 
                                                                batch_size, padding_index, word2vec_model)

### Define RNNLM

In [236]:
class rnn_language_model(nn.Module):
    def __init__(self, vocabulary_size, embedding_size, hidden_size, num_layers, embedding_weights):
        super(rnn_language_model, self).__init__()
        self.embed = nn.Embedding.from_pretrained(embedding_weights)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocabulary_size)
        
    def forward(self, x, h):
        batch_size, max_len, _ = x.shape
        embedding_dim = 100
        
        x = self.embed(torch.tensor(x, dtype=torch.long)).view(batch_size, max_len, embedding_dim)
#         x = pack_padded_sequence(x, x_lens, batch_first=True) # CURRENTLY DOESN'T WORK
        
        out, (h, c) = self.lstm(x, h)
        out = out.reshape(out.size(0)*out.size(1), out.size(2))
        out = self.linear(out)
        return out, (h, c)

### Train RNNLM

In [234]:
input_size = 100
output_size = 100
hidden_size = 10

In [235]:
# make the word embeddings into a pythorch tensor
embedding_weights = word2vec_model.wv.vectors
embedding_weights = np.vstack((embedding_weights, np.zeros((1,embedding_size))))  # add zero vector for <pad>
embedding_weights = torch.tensor(embedding_weights)

In [237]:
learning_rate = 0.0001
num_layers = 1
epochs = 100

model = rnn_language_model(vocabulary_size, input_size, hidden_size, num_layers, embedding_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [238]:
for epoch in range(epochs):
    (hidden, cell) = (torch.zeros(num_layers, batch_size, hidden_size), torch.zeros(num_layers, batch_size, hidden_size))
    for i in range(len(train_batches)):
        x = train_batches[i]
        x_lens = train_sentence_lens[i]
        y = train_targets[i]

        hidden = hidden.detach()
        cell = cell.detach()
    
        outputs, (hidden, cell) = model(x, (hidden, cell))
        loss = nn.CrossEntropyLoss()(outputs, targets)

        model.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

    if epoch % 1 == 0:
        print ('Epoch [{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
               .format(epoch, epochs, loss.item(), np.exp(loss.item())))


RuntimeError: Expected object of scalar type Float but got scalar type Double for argument #4 'mat1'

In [57]:
# intialise first hidden state
(hidden, cell) = (torch.zeros(1, 1, hidden_size), torch.zeros(1, 1, hidden_size))

for idx in input_idx:
    print("input word:", word_embeddings.wv.index2word[idx])
    output = embedded(torch.tensor([idx], dtype=torch.long)).view(1, 1, 100)
    output, (hidden, cell) = lstm(output, (hidden, cell))
    output = softmax(out(output[0]))
    predicted_idx = np.argmax(output.detach().numpy())
    print("predicted_class: {0}".format(word_embeddings.wv.index2word[predicted_idx]))

input word: <unk>
predicted_class: <sos>
input word: with
predicted_class: at
input word: the
predicted_class: were
input word: $
predicted_class: do
input word: chairman
predicted_class: shares
input word: N
predicted_class: its
input word: the
predicted_class: were
input word: rewards
predicted_class: its
input word: <pad>
predicted_class: shares
input word: indicators
predicted_class: co.
input word: chips
predicted_class: its
input word: co.
predicted_class: shares
input word: <sos>
predicted_class: &
