# Conditioned LSTM

In [1]:
import math
import random
import string

import numpy as np

import torch
from torch import nn
import torch.nn.functional as F

import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

import gensim
from gensim.models import KeyedVectors
from gensim.models.lsimodel import LsiModel
from gensim.corpora.dictionary import Dictionary



## Data

In [2]:
# Read the two text files and turn their contents back into Python objects.
# NOTE(review): eval() executes whatever Python expression the file contains,
# so these files must come from a trusted source. If they only hold plain
# literals (lists / dicts / strings), ast.literal_eval would be a safer
# drop-in replacement — TODO confirm the file format before switching.
with open('data/tokenized.txt','r') as f:
    tokenized = eval(f.read())
    
with open('data/tokens.txt','r') as f:
    tokens = eval(f.read())

In [3]:
x_arr = np.load('data/x.npy')
y_arr = np.load('data/y.npy')

In [4]:
x_arr.shape

(11029, 159)

In [5]:
valid_split = 0.2

In [6]:
dataset_len = len(x_arr)
ids = list(range(dataset_len))

In [7]:
# Draw the validation rows from a seeded generator so that the train/validation
# split is the same on every Restart & Run All (the original used the unseeded
# global NumPy RNG, giving a different split on each run).
split_rng = np.random.default_rng(42)

# number of validation rows: `valid_split` fraction of the dataset, rounded down
val_len = int(np.floor(valid_split * dataset_len))

# sample without replacement so no row is selected twice
val_ids = split_rng.choice(ids, size=val_len, replace=False)

In [8]:
# pick the validation rows out of both arrays (index arrays return a copy)
x_val, y_val = x_arr[val_ids], y_arr[val_ids]

In [9]:
x_val.shape, y_val.shape

((2205, 159), (2205, 159))

In [10]:
x_train = np.delete(x_arr, val_ids, axis=0)
y_train = np.delete(y_arr, val_ids, axis=0)

In [11]:
x_train.shape, y_train.shape

((8824, 159), (8824, 159))

In [12]:
def batches(x_arr, y_arr, batch_size):
    """Yield aligned, consecutive, full-size mini-batches from two arrays.

    Parameters
    ----------
    x_arr : array-like exposing `.shape` and slicing
        Inputs; its first dimension determines how many batches are produced.
    y_arr : array-like supporting slicing
        Targets, sliced with the same row ranges as `x_arr`.
    batch_size : int
        Number of rows per batch. Must be positive.

    Yields
    ------
    tuple
        `(x_arr[start:end], y_arr[start:end])` with `end - start == batch_size`.

    Raises
    ------
    ValueError
        If `batch_size` is not positive.

    Notes
    -----
    Only full batches are produced: a trailing remainder shorter than
    `batch_size` is dropped, and nothing is yielded when the array has fewer
    than `batch_size` rows.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    n_rows = x_arr.shape[0]

    # range() excludes its stop value, so stop at n_rows + 1: otherwise the
    # last full batch is lost whenever n_rows is an exact multiple of batch_size.
    for end in range(batch_size, n_rows + 1, batch_size):
        start = end - batch_size
        yield x_arr[start:end], y_arr[start:end]

In [13]:
x_data = {'train': x_train , 'val': x_val}
y_data = {'train': y_train , 'val': y_val}

## Word Embeddings

In [14]:
w2v_model = gensim.models.KeyedVectors.load('w2v.model', mmap='r')

In [15]:
# shape of the word2vec matrix: (number of vocabulary entries, embedding width)
# (the second name was previously misspelled `emdedding_size`)
vocab_size, embedding_size = w2v_model.wv.vectors.shape
vocab_size, embedding_size

(28674, 128)

In [16]:
embedding_size = w2v_model.wv.vectors.shape[1]
embedding_size

128

In [17]:
w2v_tensors = torch.FloatTensor(w2v_model.wv.vectors)

## Topic Modelling

### Parameters

In [18]:
# model parameters used for LSA and PCA
n_hidden = 256
batch_size = 64
n_layers = 3

### Corpus

In [19]:
# strip punctuation tokens from every tokenized abstract, editing each list in place
# NOTE(review): `word not in string.punctuation` is a substring test against the
# punctuation string, so a multi-character token such as '()' is dropped while
# '...' is kept — confirm this is the intended filter.
for abstract in tokenized:
    abstract[:] = list(filter(lambda word: word not in string.punctuation, abstract))

In [20]:
dct = Dictionary(tokenized)
dct.filter_extremes(no_below=5, no_above=0.2)

In [21]:
len(dct)

8457

In [22]:
print(dct)

Dictionary(8457 unique tokens: ['across', 'all', 'annotation', 'arabic', 'baselines']...)


In [23]:
corpus = [dct.doc2bow(a) for a in tokenized]

### Latent Semantic Analysis (LSA)

In [24]:
lsi = LsiModel(corpus, id2word=dct, num_topics=n_hidden, decay=0.2)

In [25]:
lsi.show_topics(5)

[(0,
  '-0.856*"e" + -0.311*"de" + -0.160*"d" + -0.134*"la" + -0.124*"les" + -0.124*"des" + -0.098*"et" + -0.096*"l" + -0.087*"s" + -0.076*"r"'),
 (1,
  '0.153*"translation" + 0.137*"system" + 0.126*"languages" + 0.117*"method" + 0.117*"dataset" + 0.114*"english" + 0.113*"corpus" + 0.113*"knowledge" + 0.110*"methods" + 0.109*"systems"'),
 (2,
  '0.672*"translation" + 0.274*"machine" + 0.259*"english" + 0.193*"languages" + 0.167*"nmt" + -0.151*"knowledge" + -0.104*"semantic" + 0.095*"parallel" + 0.093*"mt" + 0.092*"systems"'),
 (3,
  '0.359*"languages" + 0.353*"corpus" + -0.289*"translation" + -0.199*"knowledge" + 0.158*"speech" + -0.148*"sentence" + -0.140*"attention" + 0.134*"annotation" + -0.124*"question" + -0.111*"generation"'),
 (4,
  '0.397*"system" + -0.308*"languages" + -0.290*"embeddings" + 0.219*"systems" + 0.202*"human" + 0.169*"dialogue" + -0.163*"words" + -0.149*"representations" + -0.147*"cross" + 0.142*"evaluation"')]

In [26]:
lsi.show_topic(2, topn=10)

[('translation', 0.6723360125025517),
 ('machine', 0.2736211480917252),
 ('english', 0.25912720188495086),
 ('languages', 0.19283504511699173),
 ('nmt', 0.16663990460887895),
 ('knowledge', -0.15052054479132088),
 ('semantic', -0.10360516785585246),
 ('parallel', 0.09502014078225775),
 ('mt', 0.09339068984252816),
 ('systems', 0.09234711163158948)]

In [27]:
trans_topics = np.transpose(lsi.projection.u)

### Principal Component Analysis (PCA)

In [28]:
pca_topics = PCA(n_components=n_layers*batch_size, svd_solver='full').fit_transform(trans_topics)

In [29]:
pca_trans = np.transpose(pca_topics)

## Models

### Conditioned LSTM

In [30]:
class Conditioned_LSTM(nn.Module):
    """LSTM language model over token ids with a freshly initialised embedding.

    Pipeline: embedding -> stacked LSTM -> dropout -> linear layer producing one
    score per vocabulary entry for every time step.

    The class reads the notebook-level names `vocab_size`, `embedding_size`
    (layer sizes) and `pca_trans` (source of the initial recurrent state).
    """

    def __init__(self, n_hidden=256, n_layers=2, drop_prob=0.2, lr=0.001):
        """Build the layers.

        n_hidden  -- LSTM hidden size
        n_layers  -- number of stacked LSTM layers
        drop_prob -- dropout probability (between LSTM layers and after the LSTM)
        lr        -- stored on the instance; not used inside the class
        """
        super().__init__()

        # keep the hyper-parameters on the instance
        self.drop_prob, self.n_layers = drop_prob, n_layers
        self.n_hidden, self.lr = n_hidden, lr

        # token id -> dense vector
        self.emb_layer = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embedding_size)

        # recurrent core; batch_first means inputs are (batch, seq, feature)
        self.lstm = nn.LSTM(input_size=embedding_size,
                            hidden_size=n_hidden,
                            num_layers=n_layers,
                            batch_first=True,
                            dropout=drop_prob)

        # regularisation applied to the LSTM outputs
        self.dropout = nn.Dropout(p=drop_prob)

        # projection from hidden size to vocabulary scores
        self.fc = nn.Linear(in_features=n_hidden, out_features=vocab_size)

    def forward(self, x, hidden):
        """Run one forward pass.

        x      -- tensor of token ids (cast to long before the embedding lookup)
        hidden -- recurrent state handed straight to the LSTM

        Returns `(scores, hidden)`, where `scores` has one row per
        (batch element, time step) pair and `hidden` is the state returned
        by the LSTM.
        """
        vectors = self.emb_layer(x.long())
        lstm_output, hidden = self.lstm(vectors, hidden)

        # dropout, then collapse batch and time into a single leading dimension
        flat = self.dropout(lstm_output).reshape(-1, self.n_hidden)

        return self.fc(flat), hidden

    def init_hidden(self, batch_size):
        """Return the initial `(hidden, cell)` state built from `pca_trans`.

        Both tensors hold the same values: `pca_trans` reshaped to
        (n_layers, batch_size, n_hidden).
        """
        state_shape = (self.n_layers, batch_size, self.n_hidden)

        h0 = torch.FloatTensor(pca_trans.reshape(state_shape))
        c0 = torch.FloatTensor(pca_trans.reshape(state_shape))

        return (h0, c0)

In [31]:
# instantiate the model
cond_lstm = Conditioned_LSTM(n_hidden=n_hidden, n_layers=n_layers)

print(cond_lstm)

Conditioned_LSTM(
  (emb_layer): Embedding(28674, 128)
  (lstm): LSTM(128, 256, num_layers=3, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=256, out_features=28674, bias=True)
)


### Conditioned LSTM + Word2Vec

In [32]:
class Conditioned_LSTM_Word2Vec(nn.Module):
    """LSTM language model whose embedding is loaded from pre-trained vectors.

    Pipeline: pre-trained embedding (`w2v_tensors`) -> stacked LSTM -> dropout
    -> linear layer producing one score per vocabulary entry per time step.

    The class reads the notebook-level names `w2v_tensors`, `embedding_size`,
    `vocab_size` (layer construction) and `pca_trans` (initial recurrent state).
    """

    def __init__(self, n_hidden=256, n_layers=2, drop_prob=0.2, lr=0.001):
        """Build the layers.

        n_hidden  -- LSTM hidden size
        n_layers  -- number of stacked LSTM layers
        drop_prob -- dropout probability (between LSTM layers and after the LSTM)
        lr        -- stored on the instance; not used inside the class
        """
        super().__init__()

        # hyper-parameters kept for later reference
        self.n_hidden = n_hidden
        self.n_layers = n_layers
        self.drop_prob = drop_prob
        self.lr = lr

        # embedding weights come from the word2vec matrix
        # (from_pretrained keeps them frozen unless freeze=False is passed)
        self.emb_layer = nn.Embedding.from_pretrained(w2v_tensors)

        # recurrent core; batch_first means inputs are (batch, seq, feature)
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=n_hidden,
            num_layers=n_layers,
            batch_first=True,
            dropout=drop_prob,
        )

        # regularisation applied to the LSTM outputs
        self.dropout = nn.Dropout(p=drop_prob)

        # projection from hidden size to vocabulary scores
        self.fc = nn.Linear(in_features=n_hidden, out_features=vocab_size)

    def forward(self, x, hidden):
        """Run one forward pass.

        x      -- tensor of token ids (cast to long before the embedding lookup)
        hidden -- recurrent state handed straight to the LSTM

        Returns `(scores, hidden)`, where `scores` has one row per
        (batch element, time step) pair and `hidden` is the state returned
        by the LSTM.
        """
        token_ids = x.long()
        sequence, hidden = self.lstm(self.emb_layer(token_ids), hidden)

        # dropout first, then merge batch and time into one leading dimension
        dropped = self.dropout(sequence)
        scores = self.fc(dropped.reshape(-1, self.n_hidden))

        return scores, hidden

    def init_hidden(self, batch_size):
        """Return the initial `(hidden, cell)` state built from `pca_trans`.

        Both tensors hold the same values: `pca_trans` reshaped to
        (n_layers, batch_size, n_hidden).
        """
        dims = (self.n_layers, batch_size, self.n_hidden)

        hidden_state = torch.FloatTensor(pca_trans.reshape(dims))
        cell_state = torch.FloatTensor(pca_trans.reshape(dims))

        return (hidden_state, cell_state)

In [33]:
# instantiate the model
cond_lstm_w2v = Conditioned_LSTM_Word2Vec(n_hidden=n_hidden, n_layers=n_layers, drop_prob=0)

print(cond_lstm_w2v)

Conditioned_LSTM_Word2Vec(
  (emb_layer): Embedding(28674, 128)
  (lstm): LSTM(128, 256, num_layers=3, batch_first=True)
  (dropout): Dropout(p=0, inplace=False)
  (fc): Linear(in_features=256, out_features=28674, bias=True)
)


## Training

In [34]:
def train(model, epochs=10, batch_size=32, lr=0.001, clip=1, print_every=32):
    """Fit `model` with Adam and report train / validation loss per epoch.

    Every epoch runs a training pass followed by a validation pass. Data is
    read from the notebook-level `x_data` / `y_data` dicts (keys 'train' and
    'val') and split into mini-batches by `batches`. The recurrent state is
    rebuilt with `model.init_hidden(batch_size)` for every batch.

    Parameters
    ----------
    model : nn.Module
        Must provide `init_hidden(batch_size)` and be callable as
        `model(inputs, hidden)` returning `(scores, hidden)`.
    epochs : int
        Number of passes over the data.
    batch_size : int
        Rows per mini-batch; also passed to `model.init_hidden`.
    lr : float
        Learning rate of the Adam optimizer created here.
    clip : float
        Maximum gradient norm given to `clip_grad_norm_`.
    print_every : int
        Print a progress line every this many batches. A falsy value (0 / None)
        turns the per-batch lines off.

    Returns
    -------
    tuple of (list, list)
        `(train_loss, val_loss)`: the average loss of each epoch for the
        training and the validation phase respectively.

    Raises
    ------
    ValueError
        If a phase yields no batch at all (e.g. the split has fewer rows than
        `batch_size`), since no average loss can be computed.
    """
    # optimizer
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    # loss criterion
    criterion = nn.CrossEntropyLoss()

    # average loss per epoch, one list per phase
    losses = {'train': [], 'val': []}

    for e in range(epochs):

        for phase in ['train', 'val']:
            is_training = phase == 'train'

            # switches dropout on for training and off for validation
            model.train(is_training)

            # only used for the progress line
            tot_batches = int(x_data[phase].shape[0] / batch_size)

            # losses of the individual batches of this phase
            epoch_loss = []

            batch_iter = batches(x_data[phase], y_data[phase], batch_size)
            for batch, (x, y) in enumerate(batch_iter, start=1):

                # fresh initial state for every batch, detached from any history
                h = model.init_hidden(batch_size)
                h = tuple(each.data for each in h)

                # convert numpy arrays to PyTorch tensors
                inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

                # zero accumulated gradients
                model.zero_grad()

                # gradients are only tracked while training; the validation
                # pass therefore builds no autograd graph
                with torch.set_grad_enabled(is_training):
                    output, h = model(inputs, h)
                    loss = criterion(output, targets.view(-1).long())

                if is_training:
                    # back-propagate error
                    loss.backward()

                    # `clip_grad_norm_` helps prevent exploding gradients
                    nn.utils.clip_grad_norm_(model.parameters(), clip)

                    # update weights
                    opt.step()

                # keep the loss as a plain float
                batch_loss = loss.item()
                epoch_loss.append(batch_loss)

                # show epoch - batch - loss every n batches
                if print_every and batch % print_every == 0:
                    print("Epoch: {}/{} -".format(e+1, epochs),
                          "Batch: {}/{} -".format(batch, tot_batches),
                          "{} loss: {:.5f}".format(phase.capitalize(), batch_loss))

            if not epoch_loss:
                raise ValueError(
                    "no batches produced for phase '{}': the split has fewer "
                    "rows than batch_size={}".format(phase, batch_size))

            # calculate average epoch loss
            avg_epoch_loss = sum(epoch_loss) / len(epoch_loss)

            # print average loss of this phase at the end of each epoch
            print("\nEpoch: {}/{} -".format(e+1, epochs),
                  "Average {} loss: {:.5f}\n".format(phase, avg_epoch_loss))

            # save average epoch loss for training and validation
            losses[phase].append(avg_epoch_loss)

    return losses['train'], losses['val']

In [35]:
path = 'weights/training_checkpoints.pt'
epochs = 10

# optimizers
lstm_opt = torch.optim.Adam(cond_lstm.parameters(), lr=0.001)
lstm_w2v_opt = torch.optim.Adam(cond_lstm_w2v.parameters(), lr=0.001)

# loss criterion
criterion = nn.CrossEntropyLoss()

In [36]:
# save checkpoints
# Writes one dict with both models' and both optimizers' state dicts to `path`.
# NOTE(review): 'loss' stores `criterion`, i.e. the nn.CrossEntropyLoss module
# object itself, not a loss value; the truncated warning in this cell's output
# is presumably caused by pickling that module — confirm what should be saved.
# NOTE(review): the optimizer keys are spelled 'lst_opt_...' (not 'lstm_opt_...');
# any loading code has to use exactly these keys.
# NOTE(review): as positioned in the notebook this cell runs before the train()
# calls, and `path` is reused further down by
# `torch.save(cond_lstm_w2v.state_dict(), path)`, which overwrites this file.
torch.save({
            'epoch': epochs,
            'loss': criterion,
            'lstm_state_dict': cond_lstm.state_dict(),
            'lstm_w2v_state_dict': cond_lstm_w2v.state_dict(),
            'lst_opt_state_dict': lstm_opt.state_dict(),
            'lst_opt_w2v_state_dict': lstm_w2v_opt.state_dict(),
            }, path)

  "type " + obj.__name__ + ". It won't be checked "


In [None]:
train_loss_w2v, val_loss_w2v = train(cond_lstm_w2v, batch_size=batch_size, epochs=20, print_every=17)

In [None]:
train_loss, val_loss = train(cond_lstm, batch_size=batch_size, epochs=20, print_every=17)

### Saving / Loading

In [None]:
torch.save(cond_lstm_w2v.state_dict(), path)

In [None]:
cond_lstm_w2v.load_state_dict(torch.load(path))

In [None]:
cond_lstm_w2v.eval()

In [None]:
# list every state-dict entry of the model together with its tensor size
for param_tensor, tensor in cond_lstm_w2v.state_dict().items():
    print(param_tensor, "\t", tensor.size())

## Generation

In [None]:
def word_to_id(word):
    """Return the index stored for `word` in the word2vec `key_to_index` mapping."""
    vocabulary = w2v_model.wv.key_to_index
    return vocabulary[word]

def id_to_word(id):
    """Return the word stored at position `id` of the word2vec `index_to_key` list."""
    words_by_index = w2v_model.wv.index_to_key
    return words_by_index[id]

In [None]:
# predict next token
def predict(model, t, h=None, top_k=10):
    """Sample the token that follows `t` from the model's most likely candidates.

    Parameters
    ----------
    model : nn.Module
        Callable as `model(inputs, hidden)` returning `(scores, hidden)`.
    t : str
        Current token; converted to an id with `word_to_id`, whose lookup
        error propagates for tokens it does not know.
    h : tuple of tensors or None
        Recurrent state from the previous call. `None` (first iteration) is
        handed to the model unchanged — for a torch.nn.LSTM that means a
        zero initial state.
    top_k : int
        Number of highest-probability candidates to sample from uniformly.
        Values larger than the vocabulary are capped by the slicing.

    Returns
    -------
    (str, tuple)
        The sampled next word (via `id_to_word`) and the new recurrent state.

    Raises
    ------
    ValueError
        If `top_k` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    # single-token batch: shape (1, 1)
    x = np.array([[word_to_id(t)]])
    inputs = torch.from_numpy(x)

    # detach hidden state from history; None has nothing to detach
    if h is not None:
        h = tuple(each.data for each in h)

    # inference only: no autograd graph is needed
    with torch.no_grad():
        out, h = model(inputs, h)

        # get the token probabilities
        p = F.softmax(out, dim=1)

    # one input token gives one row of scores; flatten it to a 1-D vector
    p = p.numpy().reshape(-1)

    # indices of the top_k most probable tokens, most probable first
    top_ids = p.argsort()[-top_k:][::-1]

    # sample the next word uniformly among those candidates
    next_id = top_ids[random.randrange(len(top_ids))]

    # return the value of the predicted word and the hidden state
    return id_to_word(next_id), h

In [None]:
gen_batch_size = 1

In [None]:
# PCA for generation batch size
gen_pca_topics = PCA(n_components=n_layers * gen_batch_size, svd_solver='full').fit_transform(trans_topics)
gen_pca_trans = np.transpose(gen_pca_topics)

In [None]:
# function to generate text
def generate(model=cond_lstm, n=10, prompt='in this paper'):
    """Continue `prompt` with tokens sampled from `model`.

    The prompt is split on whitespace and fed to `predict` token by token to
    warm up the recurrent state; the token predicted after the last prompt
    word is the first generated word, and each further word is predicted from
    the previous one.

    Parameters
    ----------
    model : nn.Module
        Model handed to `predict`; it is switched to eval mode.
    n : int
        Number of words to generate. One word is always appended after the
        prompt, so values below 1 still produce a single generated word.
    prompt : str
        Whitespace-separated start of the text; must contain at least one token.

    Returns
    -------
    str
        Prompt words and generated words joined by single spaces.

    Raises
    ------
    ValueError
        If `prompt` contains no token (empty or whitespace only).
    """
    words = prompt.split()
    if not words:
        # without a prompt token there is nothing to predict from
        raise ValueError("prompt must contain at least one token")

    model.eval()

    # initial state: hidden state from the generation-size PCA array, cell state zeros
    # NOTE(review): init_hidden() of the models uses the PCA values for BOTH
    # the hidden and the cell state — confirm the zero cell state here is intended.
    h = (torch.FloatTensor(gen_pca_trans.reshape(n_layers, gen_batch_size, n_hidden)),
         torch.zeros(n_layers, gen_batch_size, n_hidden))

    # run the prompt through the model; only the last prediction is kept
    for t in words:
        token, h = predict(model, t, h)

    words.append(token)

    # predict subsequent tokens, each from the previously generated one
    for _ in range(n - 1):
        token, h = predict(model, words[-1], h)
        words.append(token)

    return ' '.join(words)

In [None]:
generate()

In [None]:
generate(model=cond_lstm, n=50)

In [None]:
generate(model=cond_lstm_w2v, n=100, prompt='here we propose a better approach to')

## Evaluation

In [None]:
models = {cond_lstm: 'Conditioned LSTM', cond_lstm_w2v: 'Conditioned LSTM + Word2Vec'}
loss = {cond_lstm: val_loss, cond_lstm_w2v: val_loss_w2v}

In [None]:
# smallest validation loss reached within the first `max_epochs` epochs
def min_val_loss(model, max_epochs=100):
    """Return the minimum of the first `max_epochs` entries of `loss[model]`."""
    history = loss[model]
    considered = history[:max_epochs]
    return min(considered)

In [None]:
# report the best validation loss (first 50 epochs) and its perplexity per model
for m in models:
    best_val_loss = min_val_loss(m, 50)
    print("Minimum validation loss for {}: {:.5f}".format(models[m], best_val_loss))
    print("Perplexity for model {}: {:.2f}\n".format(models[m], math.exp(best_val_loss)))