# Conditioned LSTM

In [1]:
import math
import random

import numpy as np

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data.sampler import SubsetRandomSampler

import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

import gensim
from gensim.models import KeyedVectors
from gensim.models.lsimodel import LsiModel
from gensim.corpora.dictionary import Dictionary



## Data

In [2]:
with open('data/tokenized.txt','r') as f:
    tokenized = eval(f.read())
    
with open('data/tokens.txt','r') as f:
    tokens = eval(f.read())

In [3]:
x_arr = np.load('data/x.npy')
y_arr = np.load('data/y.npy')

In [4]:
x_arr.shape

(5841, 100)

In [5]:
valid_split = 0.2

In [6]:
dataset_len = len(x_arr)
ids = list(range(dataset_len))

In [7]:
val_len = int(np.floor(valid_split * dataset_len))
val_ids = np.random.choice(ids, size=val_len, replace=False)

In [8]:
x_val = np.array(x_arr)[val_ids] 
y_val = np.array(y_arr)[val_ids]

In [9]:
x_val.shape, y_val.shape

((1168, 100), (1168, 100))

In [10]:
x_train = np.delete(x_arr, val_ids, axis=0)
y_train = np.delete(y_arr, val_ids, axis=0)

In [11]:
x_train.shape, y_train.shape

((4673, 100), (4673, 100))

In [12]:
def batches(x_arr, y_arr, batch_size):
    
    pos = 0
    
    for n in range(batch_size, x_arr.shape[0], batch_size):
        x = x_arr[pos:n]
        y = y_arr[pos:n]
        pos = n
        
        yield x, y

In [13]:
x_data = {'train': x_train , 'val': x_val}
y_data = {'train': y_train , 'val': y_val}

## Word2Vec

In [14]:
w2v_model = gensim.models.KeyedVectors.load("w2v.model", mmap='r')

In [15]:
vocab_size, emdedding_size = w2v_model.wv.vectors.shape
vocab_size, emdedding_size

(17862, 100)

In [16]:
embedding_size = w2v_model.wv.vectors.shape[1]
embedding_size

100

In [17]:
w2v_tensors = torch.FloatTensor(w2v_model.wv.vectors)

## Topic Modelling

In [18]:
# model hyperparams used as params for LSA and PCA
n_hidden = 256
batch_size = 64
n_layers = 3

### Corpus

In [19]:
dct = Dictionary(tokenized)
dct.filter_extremes(no_below=5, no_above=0.3)

In [20]:
len(dct)

5485

In [21]:
print(dct)

Dictionary(5485 unique tokens: ['across', 'all', 'annotation', 'arabic', 'baselines']...)


In [22]:
corpus = [dct.doc2bow(a) for a in tokenized]

### Latent Semantic Analysis (LSA)

In [23]:
lsi = LsiModel(corpus, id2word=dct, num_topics=n_hidden, decay=0.2)

In [24]:
lsi.show_topics(5)

[(0,
  '-0.862*"e" + -0.293*"de" + -0.163*"d" + -0.125*"les" + -0.122*"la" + -0.121*"des" + -0.108*"l" + -0.095*"et" + -0.087*"s" + -0.081*"le"'),
 (1,
  '0.177*"word" + 0.151*"text" + 0.137*"learning" + 0.135*"using" + 0.134*"information" + 0.132*"it" + 0.130*"performance" + 0.126*"tasks" + 0.122*"training" + 0.121*"or"'),
 (2,
  '-0.777*"word" + -0.237*"embeddings" + -0.202*"words" + 0.142*"text" + -0.107*"languages" + 0.100*"domain" + 0.096*"knowledge" + 0.094*"dataset" + 0.091*"question" + 0.083*"training"'),
 (3,
  '0.663*"translation" + 0.271*"english" + 0.270*"machine" + 0.168*"nmt" + 0.158*"languages" + -0.126*"information" + 0.118*"parallel" + 0.117*"mt" + -0.110*"knowledge" + 0.096*"source"'),
 (4,
  '-0.305*"corpus" + 0.185*"neural" + 0.184*"training" + -0.180*"languages" + 0.162*"translation" + -0.154*"speech" + 0.152*"tasks" + 0.144*"propose" + -0.135*"system" + 0.134*"sentence"')]

In [25]:
lsi.show_topic(3, topn=10)

[('translation', 0.6633289995938222),
 ('english', 0.2709146302982518),
 ('machine', 0.2704907609861921),
 ('nmt', 0.16818859043068632),
 ('languages', 0.15798442798251336),
 ('information', -0.12648352967900076),
 ('parallel', 0.1184804697673785),
 ('mt', 0.11745677543873267),
 ('knowledge', -0.11031036182134815),
 ('source', 0.09550332661415212)]

In [26]:
trans_topics = np.transpose(lsi.projection.u)

### Principal Component Analysis (PCA)

In [27]:
pca_topics = PCA(n_components=n_layers*batch_size, svd_solver='full').fit_transform(trans_topics)

In [28]:
pca_trans = np.transpose(pca_topics)

## Models

### Conditioned LSTM

In [29]:
class ConditionedLSTM(nn.Module):
    
    def __init__(self, n_hidden=256, n_layers=2, drop_prob=0.2, lr=0.001):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        self.emb_layer = nn.Embedding(vocab_size, embedding_size)

        # LSTM
        self.lstm = nn.LSTM(embedding_size, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        # fully-connected layer
        self.fc = nn.Linear(n_hidden, vocab_size)      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
        
        x = x.long()

        # pass input through embedding layer
        embedded = self.emb_layer(x)     
        
        # get outputs and new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)
        
        # pass through a dropout layer
        out = self.dropout(lstm_output)
        
        # flatten out
        out = out.reshape(-1, self.n_hidden) 

        # put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' initializes hidden state '''

        hidden = (torch.FloatTensor(pca_trans.reshape(self.n_layers, batch_size, self.n_hidden)),
                 torch.ones(self.n_layers, batch_size, self.n_hidden))

        return hidden

In [30]:
# instantiate the model
cond_lstm = ConditionedLSTM(n_hidden=n_hidden, n_layers=n_layers)

print(cond_lstm)

ConditionedLSTM(
  (emb_layer): Embedding(17862, 100)
  (lstm): LSTM(100, 256, num_layers=3, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=256, out_features=17862, bias=True)
)


### Condtiioned LSTM + Word2Vec

In [31]:
class CondLSTM_Word2Vec(nn.Module):
    
    def __init__(self, n_hidden=256, n_layers=2, drop_prob=0.2, lr=0.001):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        self.emb_layer = nn.Embedding.from_pretrained(w2v_tensors)

        # LSTM
        self.lstm = nn.LSTM(embedding_size, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        # fully-connected layer
        self.fc = nn.Linear(n_hidden, vocab_size)      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
        
        x = x.long()

        # pass input through embedding layer
        embedded = self.emb_layer(x)     
        
        # get outputs and new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)
        
        # pass through a dropout layer
        out = self.dropout(lstm_output)
        
        # flatten out
        out = out.reshape(-1, self.n_hidden) 

        # put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' initializes hidden state '''

        hidden = (torch.FloatTensor(pca_trans.reshape(self.n_layers, batch_size, self.n_hidden)),
                 torch.ones(self.n_layers, batch_size, self.n_hidden))

        return hidden

In [32]:
# instantiate the model
cond_lstm_w2v = CondLSTM_Word2Vec(n_hidden=n_hidden, n_layers=n_layers, drop_prob=0)

print(cond_lstm_w2v)

CondLSTM_Word2Vec(
  (emb_layer): Embedding(17862, 100)
  (lstm): LSTM(100, 256, num_layers=3, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=256, out_features=17862, bias=True)
)


## Training

In [33]:
def train(model, epochs=10, batch_size=32, lr=0.001, clip=1, print_every=32):
    
    # optimizer
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    
    # loss criterion
    criterion = nn.CrossEntropyLoss()
    
    # loss values
    train_loss = []
    val_loss = []
    
    # dict to assing to right loss list according to phase
    losses = {'train': train_loss , 'val': val_loss}

    for e in range(epochs):
        
        for phase in ['train', 'val']:
            model.train(True) if phase == 'train' else model.train(False)
            
            batch = 0

            train_epoch_loss = []
            val_epoch_loss = []
            
            # dict to assing to right epoch loss list according to phase
            epoch_loss = {'train': train_epoch_loss , 'val': val_epoch_loss} 

            for x, y in batches(x_data[phase], y_data[phase], batch_size):

                batch += 1

                # initialize hidden state
                h = model.init_hidden(batch_size)

                # convert numpy arrays to PyTorch arrays
                inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

                # detach hidden states
                h = tuple([each.data for each in h])

                # zero accumulated gradients
                model.zero_grad()

                # get the output from the model
                output, h = model(inputs, h)

                # calculate the loss
                loss = criterion(output, targets.view(-1).long())

                if phase == 'train':
                    # back-propagate error
                    loss.backward()

                    # `clip_grad_norm` helps prevent exploding gradient
                    nn.utils.clip_grad_norm_(model.parameters(), clip)

                    # update weigths
                    opt.step()
                
                # add current batch loss to epoch loss list
                epoch_loss[phase].append(loss.item())

                # show epoch - batch - loss every n batches
                if batch % print_every == 0:

                    tot_batches = int(x_data[phase].shape[0] / batch_size)

                    print("Epoch: {}/{} -".format(e+1, epochs),
                          "Batch: {}/{} -".format(batch, tot_batches),
                          "{} loss: {:.5f}".format(phase.capitalize(), loss))
                    
            # calculate average epoch loss
            avg_epoch_loss = sum(epoch_loss[phase])/len(epoch_loss[phase])
                    
            # print average train and val loss at the end of each epoch
            print("\nEpoch: {}/{} -".format(e+1, epochs),
                  "Average {} loss: {:.5f}\n".format(phase, avg_epoch_loss))           

            # save average epoch loss for training and validation
            losses[phase].append(avg_epoch_loss)

    return train_loss, val_loss

In [34]:
epochs = 10
path = 'weights/cond_lstm_val.pt'
loss = 0.2

optimizer = torch.optim.Adam(cond_lstm.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [35]:
# save checkpoints
torch.save({
            'epoch': epochs,
            'model_state_dict': cond_lstm.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': criterion
            }, path)

  "type " + obj.__name__ + ". It won't be checked "


In [36]:
train_loss, val_loss = train(cond_lstm, batch_size=batch_size, epochs=20, print_every=5)

Updating loss after training
Updating loss after training
Updating loss after training
Updating loss after training
Updating loss after training
Epoch: 1/20 - Batch: 5/73 - Train loss: 9.66679
Updating loss after training
Updating loss after training
Updating loss after training
Updating loss after training
Updating loss after training
Epoch: 1/20 - Batch: 10/73 - Train loss: 8.05498
Updating loss after training
Updating loss after training
Updating loss after training
Updating loss after training
Updating loss after training
Epoch: 1/20 - Batch: 15/73 - Train loss: 7.17266
Updating loss after training
Updating loss after training
Updating loss after training
Updating loss after training
Updating loss after training
Epoch: 1/20 - Batch: 20/73 - Train loss: 7.06228
Updating loss after training
Updating loss after training
Updating loss after training
Updating loss after training
Updating loss after training
Epoch: 1/20 - Batch: 25/73 - Train loss: 7.07005
Updating loss after training
Up

KeyboardInterrupt: 

In [None]:
train_loss_w2v, val_loss_w2v = train(cond_lstm_w2v, batch_size=batch_size, epochs=20, print_every=5)

In [None]:
loss = min(val_loss)

6.8643065558539496

In [55]:
perplexity = math.exp(loss)
perplexity

957.4816494960307

### Saving / Loading

In [None]:
torch.save(cond_lstm_w2v.state_dict(), path)

In [None]:
cond_lstm_w2v.load_state_dict(torch.load(path))

In [None]:
cond_lstm_w2v.eval()

In [None]:
for param_tensor in cond_lstm_w2v.state_dict():
    print(param_tensor, "\t", cond_lstm_w2v.state_dict()[param_tensor].size())

## Generation

In [38]:
def word_to_id(word):
    return w2v_model.wv.key_to_index[word]

def id_to_word(id):
    return w2v_model.wv.index_to_key[id]

In [39]:
# predict next token
def predict(model, t, h=None): # default value as None for first iteration
         
    # tensor inputs
    x = np.array([[word_to_id(t)]])
    inputs = torch.from_numpy(x)

    # detach hidden state from history
    h = tuple([each.data for each in h])

    # get the output of the model
    out, h = model(inputs, h)

    # get the token probabilities
    p = F.softmax(out, dim=1).data
    
    p = p.numpy()
    p = p.reshape(p.shape[1],)

    # get indices of top n values
    top_ids = p.argsort()[-10:][::-1]

    # sample id of next word from top n values
    next_id = top_ids[random.sample([0,1,2,3,4,5,6,7,8,9],1)[0]]

    # return the value of the predicted word and the hidden state
    return id_to_word(next_id), h

In [40]:
gen_batch_size = 1

In [41]:
# PCA for generation batch size
gen_pca_topics = PCA(n_components=n_layers * gen_batch_size, svd_solver='full').fit_transform(trans_topics)
gen_pca_trans = np.transpose(gen_pca_topics)

In [42]:
# function to generate text
def generate(model=cond_lstm, n=10, prompt='in this paper'):
    
    model.eval()
    
    h = (torch.FloatTensor(gen_pca_trans.reshape(n_layers, gen_batch_size, n_hidden)),
         torch.ones(n_layers, gen_batch_size, n_hidden))

    words = prompt.split()

    # predict next token
    for t in prompt.split():
        token, h = predict(model, t, h)
    
    words.append(token)

    # predict subsequent tokens
    for i in range(n-1):
        token, h = predict(model, words[-1], h)
        words.append(token)

    return ' '.join(words)

In [43]:
generate(model=cond_lstm_w2v, n=50)

'in this paper to and this language a a this be of of an be word we a of language in word language that a be paper to word a of and word word we be of this paper on to language a a paper for language a word we language of of'

In [44]:
generate(model=cond_lstm_w2v, n=100, prompt='a better method to')

'a better method to present of this an word language we the language this and this the paper in a and paper we a and an a the of word language that is of paper of an the word in be paper on to language the and word to a paper on language this and be a an this of an word the the and and a language for be paper in an an of be this language to present word word we an and and language for word this and and and be to word a an word and be paper we present'

## Evaluation