In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from torch.nn.utils import clip_grad_norm_

torch.manual_seed(1)

<torch._C.Generator at 0x20bc1cc95f0>

In [2]:
import json
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import MWETokenizer

[nltk_data] Downloading package punkt to C:\Users\Mihaela
[nltk_data]     Stoycheva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# tokenizer = TweetTokenizer(preserve_case=False)
tok = MWETokenizer([('<', 'unk', '>')], separator = '')

train_data = []
vocabulary = []

with open("data/ptb.train.txt") as f:
    total_sentences = 0
    for i, line in enumerate(f):
        line = line.strip()
        s = word_tokenize(line)
        s = tok.tokenize(s)
        s = ['<sos>'] + s + ['<eos>']
        total_sentences += 1
        train_data.append(s)
        vocabulary += s

vocabulary = sorted(list(set(vocabulary)))
word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

print("total sentences:", total_sentences)

total sentences: 42068


In [4]:
print(vocabulary[0:100])

['#', '$', '&', "'", "'80s", "'d", "'ll", "'m", "'re", "'s", "'ve", '.', '10-year', '100-share', '12-month', '12-year', '13-week', '13th', '14-year-old', '190-point', '190.58-point', '1920s', '1930s', '1950s', '1960s', '1970s', '1980s', '1990s', '19th', '1\\/2-year', '2-for-1', '20-year', '20th', '24-hour', '26-week', '30-day', '30-share', '30-year', '300-a-share', '300-day', '40-year-old', '45-year-old', '500-stock', '52-week', '<eos>', '<sos>', '<unk>', 'N', '\\*', '\\*\\*', 'a', 'a.', 'a.c.', 'a.g.', 'a.m', 'a.m.', 'a.p', 'ab', 'aba', 'abandon', 'abandoned', 'abandoning', 'abbie', 'abc', 'ability', 'able', 'abm', 'aboard', 'abolish', 'abolished', 'aborted', 'abortion', 'abortion-rights', 'abortions', 'about', 'above', 'abrams', 'abramson', 'abroad', 'abrupt', 'abruptly', 'absence', 'absolutely', 'absorb', 'absorbed', 'absurd', 'abundant', 'abuse', 'abused', 'abuses', 'academic', 'academy', 'acadia', 'accelerate', 'accelerated', 'accelerating', 'acceleration', 'accept', 'acceptable',

In [5]:
from gensim.models import Word2Vec
model1 = Word2Vec(train_data, min_count=1, size=100, window=5)
model1.train(train_data, len(train_data), epochs=100, total_examples=model1.corpus_count)

(66890574, 97292600)

In [6]:
model1.wv.most_similar("stock")

[('shares', 0.6124134659767151),
 ('stocks', 0.5938287973403931),
 ('equity', 0.5454078912734985),
 ('share', 0.49382197856903076),
 ('junk-bond', 0.4811207056045532),
 ('mercantile', 0.42205220460891724),
 ('plunge', 0.41612717509269714),
 ('junk', 0.4154773950576782),
 ('over-the-counter', 0.4082963168621063),
 ('options', 0.40443819761276245)]

In [7]:
model1.wv.most_similar(".")

[('f.', 0.430519163608551),
 ('l.', 0.4049091041088104),
 ('u.s.a.', 0.4036238193511963),
 ('s.', 0.3933088183403015),
 ('w.', 0.3931712806224823),
 ('h.', 0.3906556963920593),
 ('t.', 0.3863432705402374),
 ('m.', 0.38257697224617004),
 ('r.', 0.3808728754520416),
 ('jr.', 0.37582874298095703)]

In [8]:
model1.wv.most_similar("credit")

[('loans', 0.4597245454788208),
 ('debt', 0.4546087682247162),
 ('loan', 0.4374544322490692),
 ('financial', 0.3911444842815399),
 ('payment', 0.3867035508155823),
 ('capital', 0.3803357779979706),
 ('financing', 0.3711961507797241),
 ('citicorp', 0.3703234791755676),
 ('borrowing', 0.36570313572883606),
 ('credit-card', 0.36446505784988403)]

In [9]:
model1.wv['credit']

array([ 5.0183839e-01, -8.5344392e-01,  3.2377582e+00, -7.0825732e-01,
       -8.6280912e-01,  1.1251957e+00,  1.6312332e+00,  3.1292782e+00,
       -1.6751940e+00,  5.7619697e-01, -2.1294656e+00, -3.1530521e+00,
       -2.8109512e+00, -1.6537700e+00,  5.8846813e-01,  1.1687769e+00,
        1.2105019e+00,  2.8258069e+00, -5.5699086e-01, -8.1041855e-01,
       -4.6090314e-01,  2.0775390e+00,  3.2119966e+00, -1.4342258e+00,
        1.0262493e+00, -2.8396878e-01, -4.2756662e-01, -2.1494949e+00,
       -2.5634935e+00, -7.2201431e-01, -4.5823645e-02,  5.1110852e-01,
        4.5512161e-01, -1.9884517e+00,  3.3549728e+00,  2.2448552e+00,
        2.7554889e+00, -2.4954386e+00, -2.8861184e+00,  2.2421050e+00,
        2.0623786e+00, -1.7116361e+00, -5.9748119e-01, -1.8433223e+00,
        1.5207335e-01, -2.9780552e+00, -2.2002859e+00, -6.3473225e-01,
        4.2796385e-02,  7.9394716e-01,  2.9584658e-01,  3.0709546e+00,
       -1.0523658e+00, -3.1390479e-01,  3.2061848e-01, -2.5827928e+00,
      

In [10]:
vocabulary_size = len(vocabulary)
input_size = 100
output_size = 100
hidden_size = 10

In [11]:
input_word = 'money'
embedding_vec = model1.wv[input_word]

In [18]:
# make the word embeddings into a pythorch tensor
weights = torch.FloatTensor(model1.wv.vectors)

# NN MODEL
embedded = nn.Embedding.from_pretrained(weights) #input layer
lstm = nn.LSTM(input_size, hidden_size, num_layers=1) #lstm layer
out = nn.Linear(hidden_size, output_size)
softmax = nn.LogSoftmax(dim=1)

# loss_function = nn.NLLLoss()
# optimizer = optim.SGD( lr=0.1)

In [19]:
input_sentence = train_data[2]
input_idx = [model1.wv.vocab[x].index for x in input_sentence]

In [20]:
# intialise first hidden state
(hidden, cell) = (torch.zeros(1, 1, hidden_size), torch.zeros(1, 1, hidden_size))

for idx in input_idx:
    print("input word:", model1.wv.index2word[idx])
    output = embedded(torch.tensor([idx], dtype=torch.long)).view(1, 1, 100)
    output, (hidden, cell) = lstm(output, (hidden, cell))
    output = softmax(out(output[0]))
    predicted_idx = np.argmax(output.detach().numpy())
    print("predicted_class: {0}".format(model1.wv.index2word[predicted_idx]))

input word: <sos>
predicted_class: president
input word: mr.
predicted_class: market
input word: <unk>
predicted_class: he
input word: is
predicted_class: co.
input word: chairman
predicted_class: which
input word: of
predicted_class: co.
input word: <unk>
predicted_class: president
input word: n.v.
predicted_class: shares
input word: the
predicted_class: to
input word: dutch
predicted_class: last
input word: publishing
predicted_class: most
input word: group
predicted_class: only
input word: <eos>
predicted_class: president


In [21]:
class rnn_language_model(nn.Module):
    def __init__(self, vocabulary_size, input_size, hidden_size, num_layers, weights):
        super(rnn_language_model, self).__init__()
        self.embed = nn.Embedding.from_pretrained(weights)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, vocabulary_size)
        
    def forward(self, x, h):
        x = self.embed(torch.tensor(x, dtype=torch.long)).view(1, len(x), 100)
        out, (h, c) = self.lstm(x, h)
        out = out.reshape(out.size(0)*out.size(1), out.size(2))
        out = self.linear(out)
        return out, (h, c)

In [30]:
learning_rate = 0.0001
num_layers = 1
batch_size = 100
epochs = 100

model = rnn_language_model(vocabulary_size, input_size, hidden_size, num_layers, weights)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Fixed batch size - this should probably be changed later since the size will vary from sentence to sentence, not sure how...
def get_batch(data, batch_size):
    batches = []
    count = 0
    current_batch = []
    for i in range(len(data)):
        for j in range(len(data[i])):
            if count == batch_size + 1:
                batches.append(current_batch)
                current_batch = []
                count = 0
            else:
                current_batch.append(data[i][j])
                count += 1
    return batches  

batches = get_batch(train_data[1:100], batch_size)
print(len(batches))

21


In [31]:
# Train
for epoch in range(epochs):
    (hidden, cell) = (torch.zeros(num_layers, batch_size, hidden_size), torch.zeros(num_layers, batch_size, hidden_size))
    for i in range(len(batches)):
        inputs = [model1.wv.vocab[x].index for x in batches[i][:-1]]
        targets = torch.tensor([model1.wv.vocab[x].index for x in batches[i][1:]], dtype=torch.long)

        hidden = hidden.detach()
        cell = cell.detach()
    
        outputs, (hidden, cell) = model(inputs, (hidden, cell))
        loss = nn.CrossEntropyLoss()(outputs, targets)

        model.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

    if epoch % 10 == 0:
        print ('Epoch [{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
               .format(epoch, epochs, loss.item(), np.exp(loss.item())))


Epoch [0/100], Loss: 9.3038, Perplexity: 10979.92
Epoch [10/100], Loss: 9.2138, Perplexity: 10035.12
Epoch [20/100], Loss: 9.0911, Perplexity: 8875.58
Epoch [30/100], Loss: 8.9311, Perplexity: 7563.61
Epoch [40/100], Loss: 8.7241, Perplexity: 6149.08
Epoch [50/100], Loss: 8.4693, Perplexity: 4766.16
Epoch [60/100], Loss: 8.1760, Perplexity: 3554.47
Epoch [70/100], Loss: 7.8569, Perplexity: 2583.50
Epoch [80/100], Loss: 7.5062, Perplexity: 1819.37
Epoch [90/100], Loss: 7.1628, Perplexity: 1290.56
