# Conditioned LSTM

In [1]:
import math
import random

import numpy as np

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data.sampler import SubsetRandomSampler

import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

import gensim
from gensim.models import KeyedVectors
from gensim.models.lsimodel import LsiModel
from gensim.corpora.dictionary import Dictionary



## Data

In [2]:
with open('data/tokenized.txt','r') as f:
    tokenized = eval(f.read())
    
with open('data/tokens.txt','r') as f:
    tokens = eval(f.read())

In [3]:
x_arr = np.load('data/x.npy')
y_arr = np.load('data/y.npy')

In [4]:
x_arr.shape

(5841, 100)

In [5]:
valid_split = 0.2

In [6]:
dataset_len = len(x_arr)
ids = list(range(dataset_len))

In [7]:
val_len = int(np.floor(valid_split * dataset_len))
val_ids = np.random.choice(ids, size=val_len, replace=False)

In [8]:
x_val = np.array(x_arr)[val_ids] 
y_val = np.array(y_arr)[val_ids]

In [9]:
x_val.shape, y_val.shape

((1168, 100), (1168, 100))

In [10]:
x_train = np.delete(x_arr, val_ids, axis=0)
y_train = np.delete(y_arr, val_ids, axis=0)

In [11]:
x_train.shape, y_train.shape

((4673, 100), (4673, 100))

In [12]:
def batches(x_arr, y_arr, batch_size):
    
    pos = 0
    
    for n in range(batch_size, x_arr.shape[0], batch_size):
        x = x_arr[pos:n]
        y = y_arr[pos:n]
        pos = n
        
        yield x, y

In [13]:
x_data = {'train': x_train , 'val': x_val}
y_data = {'train': y_train , 'val': y_val}

## Word Embeddings

### Word2Vec

In [14]:
w2v_model = gensim.models.KeyedVectors.load("w2v.model", mmap='r')

In [15]:
vocab_size, emdedding_size = w2v_model.wv.vectors.shape
vocab_size, emdedding_size

(17862, 100)

In [16]:
embedding_size = w2v_model.wv.vectors.shape[1]
embedding_size

100

In [17]:
w2v_tensors = torch.FloatTensor(w2v_model.wv.vectors)

### ngram Word2Vec

## Topic Modelling

In [18]:
# model hyperparams used as params for LSA and PCA
n_hidden = 256
batch_size = 64
n_layers = 3

### Corpus

In [19]:
dct = Dictionary(tokenized)
dct.filter_extremes(no_below=5, no_above=0.3)

In [20]:
len(dct)

5485

In [21]:
print(dct)

Dictionary(5485 unique tokens: ['across', 'all', 'annotation', 'arabic', 'baselines']...)


In [22]:
corpus = [dct.doc2bow(a) for a in tokenized]

### Latent Semantic Analysis (LSA)

In [23]:
lsi = LsiModel(corpus, id2word=dct, num_topics=n_hidden, decay=0.2)

In [24]:
lsi.show_topics(5)

[(0,
  '0.862*"e" + 0.293*"de" + 0.163*"d" + 0.125*"les" + 0.122*"la" + 0.121*"des" + 0.108*"l" + 0.095*"et" + 0.087*"s" + 0.081*"le"'),
 (1,
  '0.177*"word" + 0.151*"text" + 0.137*"learning" + 0.135*"using" + 0.134*"information" + 0.132*"it" + 0.130*"performance" + 0.126*"tasks" + 0.122*"training" + 0.121*"or"'),
 (2,
  '-0.777*"word" + -0.237*"embeddings" + -0.202*"words" + 0.142*"text" + -0.107*"languages" + 0.100*"domain" + 0.096*"knowledge" + 0.094*"dataset" + 0.091*"question" + 0.083*"training"'),
 (3,
  '0.663*"translation" + 0.271*"english" + 0.271*"machine" + 0.168*"nmt" + 0.158*"languages" + -0.126*"information" + 0.118*"parallel" + 0.117*"mt" + -0.110*"knowledge" + 0.096*"source"'),
 (4,
  '-0.305*"corpus" + 0.185*"neural" + 0.184*"training" + -0.180*"languages" + 0.162*"translation" + -0.154*"speech" + 0.152*"tasks" + 0.144*"propose" + -0.135*"system" + 0.134*"sentence"')]

In [25]:
lsi.show_topic(3, topn=10)

[('translation', 0.6633031598226204),
 ('english', 0.2709285453655678),
 ('machine', 0.27051568908349555),
 ('nmt', 0.1682297018972616),
 ('languages', 0.1579911776102907),
 ('information', -0.12648798661584168),
 ('parallel', 0.11843316153211839),
 ('mt', 0.11745082696863827),
 ('knowledge', -0.11032758426646193),
 ('source', 0.09550392386188096)]

In [26]:
trans_topics = np.transpose(lsi.projection.u)

### Principal Component Analysis (PCA)

In [27]:
pca_topics = PCA(n_components=n_layers*batch_size, svd_solver='full').fit_transform(trans_topics)

In [28]:
pca_trans = np.transpose(pca_topics)

## Models

### LSTM

In [83]:
class LSTM(nn.Module):
    
    def __init__(self, n_hidden=256, n_layers=2, drop_prob=0.2, lr=0.001):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        self.emb_layer = nn.Embedding(vocab_size, embedding_size)

        # LSTM
        self.lstm = nn.LSTM(embedding_size, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        # fully-connected layer
        self.fc = nn.Linear(n_hidden, vocab_size)      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
        
        x = x.long()

        # pass input through embedding layer
        embedded = self.emb_layer(x)     
        
        # get outputs and new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)
        
        # pass through a dropout layer
        out = self.dropout(lstm_output)
        
        # flatten out
        out = out.reshape(-1, self.n_hidden) 

        # put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' initializes hidden state '''

        hidden = (torch.zeros(self.n_layers, batch_size, self.n_hidden),
                 torch.zeros(self.n_layers, batch_size, self.n_hidden))

        return hidden

In [84]:
# instantiate the model
lstm = LSTM(n_hidden=n_hidden, n_layers=n_layers)

print(lstm)

LSTM(
  (emb_layer): Embedding(17862, 100)
  (lstm): LSTM(100, 256, num_layers=3, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=256, out_features=17862, bias=True)
)


### LSTM + Word2Vec

In [85]:
class LSTM_Word2Vec(nn.Module):
    
    def __init__(self, n_hidden=256, n_layers=2, drop_prob=0.2, lr=0.001):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        self.emb_layer = nn.Embedding.from_pretrained(w2v_tensors)

        # LSTM
        self.lstm = nn.LSTM(embedding_size, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        # fully-connected layer
        self.fc = nn.Linear(n_hidden, vocab_size)      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
        
        x = x.long()

        # pass input through embedding layer
        embedded = self.emb_layer(x)     
        
        # get outputs and new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)
        
        # pass through a dropout layer
        out = self.dropout(lstm_output)
        
        # flatten out
        out = out.reshape(-1, self.n_hidden) 

        # put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' initializes hidden state '''

        hidden = (torch.zeros(self.n_layers, batch_size, self.n_hidden),
                 torch.zeros(self.n_layers, batch_size, self.n_hidden))

        return hidden

In [86]:
# instantiate the model
lstm_w2v = LSTM_Word2Vec(n_hidden=n_hidden, n_layers=n_layers, drop_prob=0)

print(lstm_w2v)

LSTM_Word2Vec(
  (emb_layer): Embedding(17862, 100)
  (lstm): LSTM(100, 256, num_layers=3, batch_first=True)
  (dropout): Dropout(p=0, inplace=False)
  (fc): Linear(in_features=256, out_features=17862, bias=True)
)


### Conditioned LSTM

In [87]:
class Conditioned_LSTM(nn.Module):
    
    def __init__(self, n_hidden=256, n_layers=2, drop_prob=0.2, lr=0.001):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        self.emb_layer = nn.Embedding(vocab_size, embedding_size)

        # LSTM
        self.lstm = nn.LSTM(embedding_size, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        # fully-connected layer
        self.fc = nn.Linear(n_hidden, vocab_size)      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
        
        x = x.long()

        # pass input through embedding layer
        embedded = self.emb_layer(x)     
        
        # get outputs and new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)
        
        # pass through a dropout layer
        out = self.dropout(lstm_output)
        
        # flatten out
        out = out.reshape(-1, self.n_hidden) 

        # put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' initializes hidden state '''

        hidden = (torch.FloatTensor(pca_trans.reshape(self.n_layers, batch_size, self.n_hidden)),
                 torch.zeros(self.n_layers, batch_size, self.n_hidden))

        return hidden

In [88]:
# instantiate the model
cond_lstm = Conditioned_LSTM(n_hidden=n_hidden, n_layers=n_layers)

print(cond_lstm)

Conditioned_LSTM(
  (emb_layer): Embedding(17862, 100)
  (lstm): LSTM(100, 256, num_layers=3, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=256, out_features=17862, bias=True)
)


### Condtiioned LSTM + Word2Vec

In [89]:
class Conditioned_LSTM_Word2Vec(nn.Module):
    
    def __init__(self, n_hidden=256, n_layers=2, drop_prob=0.2, lr=0.001):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        self.emb_layer = nn.Embedding.from_pretrained(w2v_tensors)

        # LSTM
        self.lstm = nn.LSTM(embedding_size, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        # fully-connected layer
        self.fc = nn.Linear(n_hidden, vocab_size)      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
        
        x = x.long()

        # pass input through embedding layer
        embedded = self.emb_layer(x)     
        
        # get outputs and new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)
        
        # pass through a dropout layer
        out = self.dropout(lstm_output)
        
        # flatten out
        out = out.reshape(-1, self.n_hidden) 

        # put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' initializes hidden state '''

        hidden = (torch.FloatTensor(pca_trans.reshape(self.n_layers, batch_size, self.n_hidden)),
                 torch.zeros(self.n_layers, batch_size, self.n_hidden))

        return hidden

In [90]:
# instantiate the model
cond_lstm_w2v = Conditioned_LSTM_Word2Vec(n_hidden=n_hidden, n_layers=n_layers, drop_prob=0)

print(cond_lstm_w2v)

Conditioned_LSTM_Word2Vec(
  (emb_layer): Embedding(17862, 100)
  (lstm): LSTM(100, 256, num_layers=3, batch_first=True)
  (dropout): Dropout(p=0, inplace=False)
  (fc): Linear(in_features=256, out_features=17862, bias=True)
)


## Training

In [91]:
def train(model, epochs=10, batch_size=32, lr=0.001, clip=1, print_every=32):
    
    # optimizer
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    
    # loss criterion
    criterion = nn.CrossEntropyLoss()
    
    # loss values
    train_loss = []
    val_loss = []
    
    # dict to assing to right loss list according to phase
    losses = {'train': train_loss , 'val': val_loss}

    for e in range(epochs):
        
        for phase in ['train', 'val']:
            model.train(True) if phase == 'train' else model.train(False)
            
            batch = 0

            train_epoch_loss = []
            val_epoch_loss = []
            
            # dict to assing to right epoch loss list according to phase
            epoch_loss = {'train': train_epoch_loss , 'val': val_epoch_loss} 

            for x, y in batches(x_data[phase], y_data[phase], batch_size):

                batch += 1

                # initialize hidden state
                h = model.init_hidden(batch_size)

                # convert numpy arrays to PyTorch arrays
                inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

                # detach hidden states
                h = tuple([each.data for each in h])

                # zero accumulated gradients
                model.zero_grad()

                # get the output from the model
                output, h = model(inputs, h)

                # calculate the loss
                loss = criterion(output, targets.view(-1).long())

                if phase == 'train':
                    # back-propagate error
                    loss.backward()

                    # `clip_grad_norm` helps prevent exploding gradient
                    nn.utils.clip_grad_norm_(model.parameters(), clip)

                    # update weigths
                    opt.step()
                
                # add current batch loss to epoch loss list
                epoch_loss[phase].append(loss.item())

                # show epoch - batch - loss every n batches
                if batch % print_every == 0:

                    tot_batches = int(x_data[phase].shape[0] / batch_size)

                    print("Epoch: {}/{} -".format(e+1, epochs),
                          "Batch: {}/{} -".format(batch, tot_batches),
                          "{} loss: {:.5f}".format(phase.capitalize(), loss))
                    
            # calculate average epoch loss
            avg_epoch_loss = sum(epoch_loss[phase])/len(epoch_loss[phase])
                    
            # print average train and val loss at the end of each epoch
            print("\nEpoch: {}/{} -".format(e+1, epochs),
                  "Average {} loss: {:.5f}\n".format(phase, avg_epoch_loss))           

            # save average epoch loss for training and validation
            losses[phase].append(avg_epoch_loss)

    return train_loss, val_loss

In [36]:
epochs = 10
path = 'weights/cond_lstm.pt'
loss = 0.2

optimizer = torch.optim.Adam(cond_lstm.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [37]:
# save checkpoints
torch.save({
            'epoch': epochs,
            'model_state_dict': cond_lstm.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': criterion
            }, path)

  "type " + obj.__name__ + ". It won't be checked "


In [53]:
# ones
train_loss, val_loss = train(cond_lstm, batch_size=batch_size, epochs=20, print_every=5)

Epoch: 1/20 - Batch: 5/73 - Train loss: 6.98827
Epoch: 1/20 - Batch: 10/73 - Train loss: 6.88689
Epoch: 1/20 - Batch: 15/73 - Train loss: 6.75206
Epoch: 1/20 - Batch: 20/73 - Train loss: 6.93269
Epoch: 1/20 - Batch: 25/73 - Train loss: 6.92339
Epoch: 1/20 - Batch: 30/73 - Train loss: 6.88088
Epoch: 1/20 - Batch: 35/73 - Train loss: 6.93053
Epoch: 1/20 - Batch: 40/73 - Train loss: 6.85434
Epoch: 1/20 - Batch: 45/73 - Train loss: 6.80789
Epoch: 1/20 - Batch: 50/73 - Train loss: 6.81354
Epoch: 1/20 - Batch: 55/73 - Train loss: 6.84916
Epoch: 1/20 - Batch: 60/73 - Train loss: 7.45116
Epoch: 1/20 - Batch: 65/73 - Train loss: 6.81456
Epoch: 1/20 - Batch: 70/73 - Train loss: 7.00605

Epoch: 1/20 - Average train loss: 6.96200

Epoch: 1/20 - Batch: 5/18 - Val loss: 6.92356
Epoch: 1/20 - Batch: 10/18 - Val loss: 7.06453
Epoch: 1/20 - Batch: 15/18 - Val loss: 7.02465

Epoch: 1/20 - Average val loss: 6.95191

Epoch: 2/20 - Batch: 5/73 - Train loss: 6.87585
Epoch: 2/20 - Batch: 10/73 - Train loss: 

In [34]:
# ones
train_loss_w2v, val_loss_w2v = train(cond_lstm_w2v, batch_size=batch_size, epochs=20, print_every=5)

Epoch: 1/20 - Batch: 5/73 - Train loss: 9.62338
Epoch: 1/20 - Batch: 10/73 - Train loss: 8.07467
Epoch: 1/20 - Batch: 15/73 - Train loss: 7.10355
Epoch: 1/20 - Batch: 20/73 - Train loss: 6.98821
Epoch: 1/20 - Batch: 25/73 - Train loss: 7.01397
Epoch: 1/20 - Batch: 30/73 - Train loss: 6.96979
Epoch: 1/20 - Batch: 35/73 - Train loss: 6.98377
Epoch: 1/20 - Batch: 40/73 - Train loss: 6.90053
Epoch: 1/20 - Batch: 45/73 - Train loss: 6.84488
Epoch: 1/20 - Batch: 50/73 - Train loss: 6.85529
Epoch: 1/20 - Batch: 55/73 - Train loss: 6.88553
Epoch: 1/20 - Batch: 60/73 - Train loss: 7.63706
Epoch: 1/20 - Batch: 65/73 - Train loss: 6.86007
Epoch: 1/20 - Batch: 70/73 - Train loss: 7.03847

Epoch: 1/20 - Average train loss: 7.37194

Epoch: 1/20 - Batch: 5/18 - Val loss: 6.96791
Epoch: 1/20 - Batch: 10/18 - Val loss: 7.08659
Epoch: 1/20 - Batch: 15/18 - Val loss: 7.06550

Epoch: 1/20 - Average val loss: 6.98385

Epoch: 2/20 - Batch: 5/73 - Train loss: 7.02748
Epoch: 2/20 - Batch: 10/73 - Train loss: 

In [54]:
# ones (50)
train_loss_w2v, val_loss_w2v = train(cond_lstm_w2v, batch_size=batch_size, epochs=50, print_every=9)

Epoch: 1/50 - Batch: 9/73 - Train loss: 5.06064
Epoch: 1/50 - Batch: 18/73 - Train loss: 5.31354
Epoch: 1/50 - Batch: 27/73 - Train loss: 5.05585
Epoch: 1/50 - Batch: 36/73 - Train loss: 5.09107
Epoch: 1/50 - Batch: 45/73 - Train loss: 4.96276
Epoch: 1/50 - Batch: 54/73 - Train loss: 4.85034
Epoch: 1/50 - Batch: 63/73 - Train loss: 4.96236
Epoch: 1/50 - Batch: 72/73 - Train loss: 4.96441

Epoch: 1/50 - Average train loss: 5.10028

Epoch: 1/50 - Batch: 9/18 - Val loss: 5.55602
Epoch: 1/50 - Batch: 18/18 - Val loss: 5.53312

Epoch: 1/50 - Average val loss: 5.55398

Epoch: 2/50 - Batch: 9/73 - Train loss: 4.95294
Epoch: 2/50 - Batch: 18/73 - Train loss: 5.18676
Epoch: 2/50 - Batch: 27/73 - Train loss: 4.95267
Epoch: 2/50 - Batch: 36/73 - Train loss: 4.98276
Epoch: 2/50 - Batch: 45/73 - Train loss: 4.87594
Epoch: 2/50 - Batch: 54/73 - Train loss: 4.76573
Epoch: 2/50 - Batch: 63/73 - Train loss: 4.86437
Epoch: 2/50 - Batch: 72/73 - Train loss: 4.87721

Epoch: 2/50 - Average train loss: 4.99

### Training ...

In [38]:
train_loss_base, val_loss_base = train(lstm, batch_size=batch_size, epochs=10, print_every=9)

Epoch: 1/10 - Batch: 9/73 - Train loss: 8.46450
Epoch: 1/10 - Batch: 18/73 - Train loss: 7.20999
Epoch: 1/10 - Batch: 27/73 - Train loss: 7.08247
Epoch: 1/10 - Batch: 36/73 - Train loss: 7.03150
Epoch: 1/10 - Batch: 45/73 - Train loss: 6.88186
Epoch: 1/10 - Batch: 54/73 - Train loss: 6.87250
Epoch: 1/10 - Batch: 63/73 - Train loss: 6.84805
Epoch: 1/10 - Batch: 72/73 - Train loss: 6.87861

Epoch: 1/10 - Average train loss: 7.43174

Epoch: 1/10 - Batch: 9/18 - Val loss: 7.05984
Epoch: 1/10 - Batch: 18/18 - Val loss: 7.06798

Epoch: 1/10 - Average val loss: 7.00296

Epoch: 2/10 - Batch: 9/73 - Train loss: 6.86131
Epoch: 2/10 - Batch: 18/73 - Train loss: 7.04414
Epoch: 2/10 - Batch: 27/73 - Train loss: 6.94994
Epoch: 2/10 - Batch: 36/73 - Train loss: 6.95191
Epoch: 2/10 - Batch: 45/73 - Train loss: 6.82791
Epoch: 2/10 - Batch: 54/73 - Train loss: 6.82035
Epoch: 2/10 - Batch: 63/73 - Train loss: 6.80660
Epoch: 2/10 - Batch: 72/73 - Train loss: 6.83378

Epoch: 2/10 - Average train loss: 6.99

In [40]:
train_loss_base_w2v, val_loss_base_w2v = train(lstm_w2v, batch_size=batch_size, epochs=20, print_every=9)

Epoch: 1/20 - Batch: 9/73 - Train loss: 6.83064
Epoch: 1/20 - Batch: 18/73 - Train loss: 7.16712
Epoch: 1/20 - Batch: 27/73 - Train loss: 6.99470
Epoch: 1/20 - Batch: 36/73 - Train loss: 6.96883
Epoch: 1/20 - Batch: 45/73 - Train loss: 6.83091
Epoch: 1/20 - Batch: 54/73 - Train loss: 6.81606
Epoch: 1/20 - Batch: 63/73 - Train loss: 6.79231
Epoch: 1/20 - Batch: 72/73 - Train loss: 6.81642

Epoch: 1/20 - Average train loss: 7.05078

Epoch: 1/20 - Batch: 9/18 - Val loss: 7.05578
Epoch: 1/20 - Batch: 18/18 - Val loss: 7.06100

Epoch: 1/20 - Average val loss: 6.99340

Epoch: 2/20 - Batch: 9/73 - Train loss: 6.77742
Epoch: 2/20 - Batch: 18/73 - Train loss: 6.97837
Epoch: 2/20 - Batch: 27/73 - Train loss: 6.89206
Epoch: 2/20 - Batch: 36/73 - Train loss: 6.88186
Epoch: 2/20 - Batch: 45/73 - Train loss: 6.78793
Epoch: 2/20 - Batch: 54/73 - Train loss: 6.77578
Epoch: 2/20 - Batch: 63/73 - Train loss: 6.76455
Epoch: 2/20 - Batch: 72/73 - Train loss: 6.78660

Epoch: 2/20 - Average train loss: 6.93

In [None]:
train_loss_base_w2v, val_loss_base_w2v = train(lstm_w2v, batch_size=batch_size, epochs=20, print_every=9)

In [None]:
train_loss, val_loss = train(cond_lstm, batch_size=batch_size, epochs=20, print_every=9)

In [None]:
train_loss_w2v, val_loss_w2v = train(cond_lstm_w2v, batch_size=batch_size, epochs=20, print_every=9)

### Saving / Loading

In [55]:
torch.save(cond_lstm_w2v.state_dict(), path)

In [56]:
cond_lstm_w2v.load_state_dict(torch.load(path))

<All keys matched successfully>

In [57]:
cond_lstm_w2v.eval()

CondLSTM_Word2Vec(
  (emb_layer): Embedding(17862, 100)
  (lstm): LSTM(100, 256, num_layers=3, batch_first=True)
  (dropout): Dropout(p=0, inplace=False)
  (fc): Linear(in_features=256, out_features=17862, bias=True)
)

In [42]:
for param_tensor in cond_lstm_w2v.state_dict():
    print(param_tensor, "\t", cond_lstm_w2v.state_dict()[param_tensor].size())

emb_layer.weight 	 torch.Size([17862, 100])
lstm.weight_ih_l0 	 torch.Size([1024, 100])
lstm.weight_hh_l0 	 torch.Size([1024, 256])
lstm.bias_ih_l0 	 torch.Size([1024])
lstm.bias_hh_l0 	 torch.Size([1024])
lstm.weight_ih_l1 	 torch.Size([1024, 256])
lstm.weight_hh_l1 	 torch.Size([1024, 256])
lstm.bias_ih_l1 	 torch.Size([1024])
lstm.bias_hh_l1 	 torch.Size([1024])
lstm.weight_ih_l2 	 torch.Size([1024, 256])
lstm.weight_hh_l2 	 torch.Size([1024, 256])
lstm.bias_ih_l2 	 torch.Size([1024])
lstm.bias_hh_l2 	 torch.Size([1024])
fc.weight 	 torch.Size([17862, 256])
fc.bias 	 torch.Size([17862])


## Generation

In [43]:
def word_to_id(word):
    return w2v_model.wv.key_to_index[word]

def id_to_word(id):
    return w2v_model.wv.index_to_key[id]

In [44]:
# predict next token
def predict(model, t, h=None): # default value as None for first iteration
         
    # tensor inputs
    x = np.array([[word_to_id(t)]])
    inputs = torch.from_numpy(x)

    # detach hidden state from history
    h = tuple([each.data for each in h])

    # get the output of the model
    out, h = model(inputs, h)

    # get the token probabilities
    p = F.softmax(out, dim=1).data
    
    p = p.numpy()
    p = p.reshape(p.shape[1],)

    # get indices of top n values
    top_ids = p.argsort()[-10:][::-1]

    # sample id of next word from top n values
    next_id = top_ids[random.sample([0,1,2,3,4,5,6,7,8,9],1)[0]]

    # return the value of the predicted word and the hidden state
    return id_to_word(next_id), h

In [45]:
gen_batch_size = 1

In [46]:
# PCA for generation batch size
gen_pca_topics = PCA(n_components=n_layers * gen_batch_size, svd_solver='full').fit_transform(trans_topics)
gen_pca_trans = np.transpose(gen_pca_topics)

In [47]:
# function to generate text
def generate(model=cond_lstm, n=10, prompt='in this paper'):
    
    model.eval()
    
    h = (torch.FloatTensor(gen_pca_trans.reshape(n_layers, gen_batch_size, n_hidden)),
         torch.ones(n_layers, gen_batch_size, n_hidden))

    words = prompt.split()

    # predict next token
    for t in prompt.split():
        token, h = predict(model, t, h)
    
    words.append(token)

    # predict subsequent tokens
    for i in range(n-1):
        token, h = predict(model, words[-1], h)
        words.append(token)

    return ' '.join(words)

In [48]:
generate()

'in this paper comprehended sslc avoids mwetoolkit mwetoolkit acknowledge there tta there abbreviation'

In [74]:
generate(model=cond_lstm, n=50)

'in this paper show this system for automatic and syntactic representations as the word between text we also use two datasets that are able to achieve more useful performance on both languages with other domains to extract different sentences we evaluate an algorithm for predicting this analysis of text and then show in'

In [77]:
generate(model=cond_lstm_w2v, n=100, prompt='here we propose a better approach to')

'here we propose a better approach to this work on a diverse data annotated using large corpora for a common language we present a novel corpus in particular in an unambiguous domain specific language learning however existing tools is expensive to obtain and its impact of insufficient quality we consider them we explore how two categories derived structures are calculated adjectives we observethat a given text corpus crawled on three languages i i are a challenging domain the paper is one of them into any popular real world world data with this data from different sources including a number in a standardized repository to users it is'

## Evaluation

In [65]:
models = {lstm: 'LSTM', lstm_w2v: 'LSTM + Word2Vec', cond_lstm: 'Conditioned LSTM', cond_lstm_w2v: 'Conditioned LSTM + Word2Vec'}
loss = {lstm: val_loss_base, lstm_cond_lstm: val_loss_base_w2v, cond_lstm: val_loss, cond_lstm_w2v: val_loss_w2v}

In [73]:
# get minimimum validation loss within a set num of epochs
def min_val_loss(model, max_epochs=100):
    return min(loss[model][:max_epochs])

In [72]:
for m in models.keys():
    print("Minimum validation loss for {}: {:.5f}".format(models[m], min_val_loss(m, 50)))
    print("Perplexity for model {}: {:.2f}\n".format(models[m], math.exp(min_val_loss(m))))

Minimum validation loss for Conditioned LSTM: 5.66657
Perplexity for model Conditioned LSTM: 289.04

Minimum validation loss for Conditioned LSTM + Word2Vec: 5.53955
Perplexity for model Conditioned LSTM + Word2Vec: 254.56

