# Conditioned LSTM

In [339]:
import random

import numpy as np

import torch
from torch import nn
import torch.nn.functional as F

from sklearn.decomposition import PCA

import gensim
from gensim.models import KeyedVectors
from gensim.models.lsimodel import LsiModel
from gensim.corpora.dictionary import Dictionary

In [294]:
# Load the tokenized corpus and the token list; each file is read whole and
# evaluated as a Python expression.
# NOTE(review): eval() executes whatever code the file contains, so this is only
# safe for trusted, locally produced files. If the files hold plain literals,
# ast.literal_eval would be the safe equivalent — confirm before switching.
with open('data/tokenized.txt','r') as f:
    tokenized = eval(f.read())
    
with open('data/tokens.txt','r') as f:
    tokens = eval(f.read())

In [320]:
# Training inputs and targets; both are sliced along their first axis by
# get_batches() and consumed by train().
x_arr = np.load('data/x.npy')
y_arr = np.load('data/y.npy')

In [314]:
def get_batches(arr_x, arr_y, batch_size):
    """Yield aligned ``(x, y)`` mini-batches of exactly ``batch_size`` rows.

    Both arrays are sliced along their first axis with the same bounds, so
    row ``i`` of ``x`` always corresponds to row ``i`` of ``y``.

    Args:
        arr_x: input array; ``arr_x.shape[0]`` is the number of samples.
        arr_y: target array, sliced with the same row bounds as ``arr_x``.
        batch_size: number of rows per batch.

    Yields:
        ``(x, y)`` tuples of ``batch_size`` rows each. A trailing partial
        batch is dropped on purpose: the model's initial hidden state is
        built for a fixed batch size.
    """
    n_rows = arr_x.shape[0]

    # The stop value is n_rows + 1 so that the last *full* batch is still
    # produced when n_rows is an exact multiple of batch_size.
    for end in range(batch_size, n_rows + 1, batch_size):
        start = end - batch_size
        yield arr_x[start:end], arr_y[start:end]

## Word2Vec embeddings

In [297]:
# Load the saved embedding model memory-mapped read-only (mmap='r'); the rest
# of the notebook reaches the vectors and vocabulary through its `.wv` attribute.
w2v_model = gensim.models.KeyedVectors.load("w2v.model", mmap='r')

In [332]:
# Unpack the embedding matrix shape as (vocabulary size, embedding dimension).
# NOTE: `emdedding_size` is a misspelling of "embedding_size"; the name is kept
# unchanged because other cells may refer to it.
vocab_size, emdedding_size = w2v_model.wv.vectors.shape
vocab_size,emdedding_size

(9569, 100)

In [331]:
# Convert the embedding matrix to a float tensor; ConditionedLSTM passes it to
# nn.Embedding.from_pretrained.
w2v_tensors = torch.FloatTensor(w2v_model.wv.vectors)

In [340]:
def word_to_id(word):
    """Return the vocabulary index of ``word`` in the loaded embedding model.

    Raises ``KeyError`` when ``word`` is not in the vocabulary.
    """
    index_by_word = w2v_model.wv.key_to_index
    return index_by_word[word]

def id_to_word(id):
    """Return the vocabulary word stored at index ``id`` in the embedding model."""
    words_by_index = w2v_model.wv.index_to_key
    return words_by_index[id]

## Topic Modelling

### Corpus

In [302]:
# Build the gensim dictionary from the tokenized documents, then prune it:
# keep only tokens that occur in at least 5 documents and in no more than
# 30% of all documents.
dct = Dictionary(tokenized)
dct.filter_extremes(no_below=5, no_above=0.3)

In [303]:
len(dct)

2520

In [304]:
print(dct)

Dictionary(2520 unique tokens: ['across', 'all', 'annotation', 'arabic', 'baselines']...)


In [305]:
# Bag-of-words representation of every document, using the pruned dictionary.
corpus = [dct.doc2bow(a) for a in tokenized]

### Latent Semantic Analysis (LSA)

In [336]:
# Exploratory 20-topic LSI model, used in the next cells only to inspect the
# topics; `lsi` is rebound later to a model with n_hidden topics.
lsi = LsiModel(corpus, id2word=dct, num_topics=20, decay=0.2)

In [337]:
lsi.show_topics(20)

[(0,
  '0.863*"e" + 0.298*"de" + 0.161*"d" + 0.139*"des" + 0.119*"la" + 0.101*"l" + 0.098*"les" + 0.093*"r" + 0.091*"et" + 0.088*"s"'),
 (1,
  '0.225*"word" + 0.157*"using" + 0.135*"it" + 0.133*"text" + 0.126*"information" + 0.122*"performance" + 0.121*"languages" + 0.120*"these" + 0.119*"or" + 0.117*"learning"'),
 (2,
  '0.672*"word" + 0.259*"prediction" + 0.171*"speech" + 0.158*"part" + 0.138*"sequence" + 0.136*"words" + -0.130*"text" + 0.119*"using" + -0.092*"dataset" + -0.092*"domain"'),
 (3,
  '0.542*"translation" + 0.311*"languages" + 0.279*"english" + 0.220*"machine" + 0.194*"corpus" + -0.160*"knowledge" + -0.141*"information" + 0.132*"resource" + 0.125*"mt" + 0.102*"parallel"'),
 (4,
  '-0.258*"word" + -0.218*"translation" + 0.202*"corpus" + -0.175*"neural" + -0.155*"embeddings" + -0.153*"sentence" + 0.146*"system" + 0.143*"speech" + -0.140*"propose" + 0.132*"it"'),
 (5,
  '0.429*"languages" + -0.395*"translation" + -0.175*"information" + -0.173*"system" + -0.155*"machine" + 0.

In [338]:
lsi.show_topic(2, topn=10)

[('word', 0.6718268660364355),
 ('prediction', 0.2587625932581793),
 ('speech', 0.17050256601405472),
 ('part', 0.15848019955709758),
 ('sequence', 0.13807435632553952),
 ('words', 0.1362266850580958),
 ('text', -0.13036403772437627),
 ('using', 0.1193112840138552),
 ('dataset', -0.09209859443510808),
 ('domain', -0.09200113501132114)]

## Conditioned LSTM

In [308]:
# Hyper-parameters shared by the model, the LSI/PCA conditioning and training.
n_hidden = 256   # LSTM hidden size; also the number of LSI topics fitted below
batch_size = 64  # training batch size; also sizes the PCA projection below
n_layers = 3     # number of stacked LSTM layers

In [306]:
class ConditionedLSTM(nn.Module):
    """Word-level LSTM language model with a topic-conditioned initial state.

    Token ids are embedded with the pretrained vectors in the module-level
    ``w2v_tensors`` (frozen, the default of ``nn.Embedding.from_pretrained``),
    passed through a stacked LSTM and projected onto the vocabulary.
    ``init_hidden`` builds the initial hidden state from the module-level
    ``pca_trans`` array instead of zeros.
    """

    def __init__(self, n_hidden=256, n_layers=2, drop_prob=0.2, lr=0.001):
        """Build the embedding, LSTM, dropout and output layers.

        Args:
            n_hidden: size of the LSTM hidden state.
            n_layers: number of stacked LSTM layers.
            drop_prob: dropout probability, used both between LSTM layers
                and on the LSTM output.
            lr: learning rate; only stored on the instance, not used by the
                class itself.
        """
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        vocab_size = len(w2v_model.wv)

        self.emb_layer = nn.Embedding.from_pretrained(w2v_tensors)

        # The LSTM input size follows the embedding matrix instead of a
        # hard-coded 100, so a model with another vector size still works.
        self.lstm = nn.LSTM(self.emb_layer.embedding_dim, n_hidden, n_layers,
                            dropout=drop_prob, batch_first=True)

        # dropout applied to the LSTM output
        self.dropout = nn.Dropout(drop_prob)

        # projection from hidden state to vocabulary logits
        self.fc = nn.Linear(n_hidden, vocab_size)

    def forward(self, x, hidden):
        """Run one forward pass.

        Args:
            x: tensor of token ids; batch dimension first (``batch_first=True``).
            hidden: ``(h, c)`` LSTM state tuple, handed to ``nn.LSTM`` as is.

        Returns:
            ``(out, hidden)``: ``out`` holds one row of vocabulary logits per
            input position (all batch and sequence positions flattened into
            the first dimension); ``hidden`` is the updated LSTM state.
        """
        x = x.long()

        embedded = self.emb_layer(x)

        lstm_output, hidden = self.lstm(embedded, hidden)

        out = self.dropout(lstm_output)

        # flatten (batch, seq, n_hidden) -> (batch * seq, n_hidden) for the
        # linear layer; reshape also copes with non-contiguous tensors
        out = out.reshape(-1, self.n_hidden)

        out = self.fc(out)

        return out, hidden

    def init_hidden(self, batch_size):
        """Return the initial ``(h, c)`` state for a batch of ``batch_size``.

        ``h`` is the module-level ``pca_trans`` array reshaped to
        ``(n_layers, batch_size, n_hidden)``, so ``pca_trans`` must hold
        exactly that many values (the reshape raises otherwise). ``c`` is
        initialised to ones.
        """
        hidden = (torch.FloatTensor(pca_trans.reshape(self.n_layers, batch_size, self.n_hidden)),
                  torch.ones(self.n_layers, batch_size, self.n_hidden))

        return hidden

### Latent Semantic Analysis (LSA)

In [334]:
# Refit LSI with n_hidden topics and transpose its projection matrix.
# NOTE(review): presumably `lsi.projection.u` is (terms x topics), making
# `trans_topics` one row per topic — confirm against the gensim docs.
lsi = LsiModel(corpus, id2word=dct, num_topics=n_hidden, decay=0.2)
trans_topics = np.transpose(lsi.projection.u)

### Principal Component Analysis (PCA)

In [309]:
# Project every row of `trans_topics` onto n_layers*batch_size principal
# components, then transpose; ConditionedLSTM.init_hidden reshapes the result
# to (n_layers, batch_size, n_hidden) as the initial hidden state.
pca_topics = PCA(n_components=n_layers*batch_size, svd_solver='full').fit_transform(trans_topics)
pca_trans = np.transpose(pca_topics)

In [310]:
# instantiate the model with the notebook-level hidden size and layer count
model = ConditionedLSTM(n_hidden=n_hidden, n_layers=n_layers)

# keep all parameters on the CPU
model.cpu()

# show the layer summary
print(model)

ConditionedLSTM(
  (emb_layer): Embedding(9569, 100)
  (lstm): LSTM(100, 256, num_layers=3, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=256, out_features=9569, bias=True)
)


In [322]:
def train(model, epochs=10, batch_size=32, lr=0.001, clip=1, print_every=32):
    """Train ``model`` on the module-level ``x_arr`` / ``y_arr`` arrays (CPU only).

    Args:
        model: network exposing ``init_hidden(batch_size)`` and callable as
            ``model(inputs, hidden)`` returning ``(logits, hidden)``.
        epochs: number of passes over the data.
        batch_size: rows per mini-batch; also passed to ``init_hidden``.
        lr: learning rate of the Adam optimizer created here.
        clip: maximum gradient norm used for clipping.
        print_every: log the loss every this many optimisation steps.

    The optimizer and loss are created inside this function; nothing is
    returned.
    """
    criterion = nn.CrossEntropyLoss()
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    # everything stays on the CPU
    model.cpu()
    model.train()

    step = 0  # global step counter, not reset between epochs

    for epoch in range(epochs):
        for x, y in get_batches(x_arr, y_arr, batch_size):
            step += 1

            # fresh initial state for every batch, detached from any graph
            hidden = tuple(state.data for state in model.init_hidden(batch_size))

            # numpy -> torch, kept on the CPU
            inputs = torch.from_numpy(x).cpu()
            targets = torch.from_numpy(y).cpu()

            # clear gradients left over from the previous step
            model.zero_grad()

            output, hidden = model(inputs, hidden)

            # targets are flattened to line up with the flattened logits
            loss = criterion(output, targets.view(-1).long())
            loss.backward()

            # gradient-norm clipping guards against exploding gradients
            nn.utils.clip_grad_norm_(model.parameters(), clip)

            opt.step()

            if step % print_every != 0:
                continue

            print("Epoch: {}/{} -".format(epoch + 1, epochs),
                  "Step: {} -".format(step),
                  "Loss: {}".format(loss))

In [323]:
epochs = 20
path = 'weights/cond_lstm.pt'
# NOTE(review): train() assigns only its own local `loss`, so this global keeps
# the value 0.2 after training.
loss = 0.2

# NOTE(review): train() creates its own Adam optimizer and CrossEntropyLoss,
# so these two objects are never stepped/used by the training shown here.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [324]:
train(model, batch_size=batch_size, epochs=1, print_every=4)

Epoch: 1/1 - Step: 4 - Loss: 8.72928237915039
Epoch: 1/1 - Step: 8 - Loss: 8.014249801635742
Epoch: 1/1 - Step: 12 - Loss: 7.408889293670654
Epoch: 1/1 - Step: 16 - Loss: 7.175367832183838
Epoch: 1/1 - Step: 20 - Loss: 6.928103446960449


In [239]:
train(model, batch_size=batch_size, epochs=epochs, print_every=4)

Epoch: 1/20 - Step: 4 - Loss: 9.094103813171387
Epoch: 1/20 - Step: 8 - Loss: 8.207086563110352
Epoch: 1/20 - Step: 12 - Loss: 7.28662109375
Epoch: 1/20 - Step: 16 - Loss: 7.0488996505737305
Epoch: 1/20 - Step: 20 - Loss: 6.928389072418213
Epoch: 2/20 - Step: 24 - Loss: 6.978840351104736
Epoch: 2/20 - Step: 28 - Loss: 6.922019958496094
Epoch: 2/20 - Step: 32 - Loss: 6.974577903747559
Epoch: 2/20 - Step: 36 - Loss: 6.861565589904785
Epoch: 2/20 - Step: 40 - Loss: 6.821847438812256
Epoch: 2/20 - Step: 44 - Loss: 6.800374507904053
Epoch: 3/20 - Step: 48 - Loss: 7.03978157043457
Epoch: 3/20 - Step: 52 - Loss: 6.947199821472168
Epoch: 3/20 - Step: 56 - Loss: 6.8963847160339355
Epoch: 3/20 - Step: 60 - Loss: 6.830890655517578
Epoch: 3/20 - Step: 64 - Loss: 6.849308013916016
Epoch: 3/20 - Step: 68 - Loss: 7.050714492797852
Epoch: 4/20 - Step: 72 - Loss: 6.874112606048584
Epoch: 4/20 - Step: 76 - Loss: 7.313183784484863
Epoch: 4/20 - Step: 80 - Loss: 6.838290214538574
Epoch: 4/20 - Step: 84 - 

In [240]:
# Write a checkpoint to `path`.
# NOTE(review): `optimizer` is the module-level Adam instance, not the one
# train() steps, so in the code shown here its state_dict carries no training
# state. 'loss' stores the CrossEntropyLoss module (`criterion`), not a loss
# value — confirm what the loading code expects before changing either.
torch.save({
            'epoch': epochs,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': criterion
            }, path)

  "type " + obj.__name__ + ". It won't be checked "


In [326]:
# predict next token
def predict(model, t, h=None, top_n=5):
    """Sample the next token after ``t`` from the model's most likely words.

    Args:
        model: network callable as ``model(inputs, hidden)`` and returning
            ``(logits, hidden)`` with one row of logits per input position.
        t: current token; looked up with ``word_to_id``.
        h: ``(hidden, cell)`` state from the previous step, or ``None`` on the
            first call, in which case ``None`` is handed to the model as is
            (``nn.LSTM`` then starts from a zero state).
        top_n: number of highest-probability candidates to sample from
            uniformly.

    Returns:
        ``(next_token, hidden)``: the sampled word and the updated state.

    Raises:
        KeyError: if ``t`` is not in the embedding vocabulary.
        ValueError: if ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    # single-token batch of shape (1, 1), kept on the CPU
    inputs = torch.from_numpy(np.array([[word_to_id(t)]])).cpu()

    # detach hidden state from history; a missing state is passed through
    if h is not None:
        h = tuple(each.data for each in h)

    # inference only: no autograd graph is needed
    with torch.no_grad():
        out, h = model(inputs, h)
        # token probabilities for the single input position
        p = F.softmax(out, dim=1)

    p = p.cpu().numpy().reshape(-1)

    # indices of the top_n most probable tokens, most probable first
    top_n_idx = p.argsort()[-top_n:][::-1]

    # pick one of the candidates uniformly at random
    choice = random.sample(range(len(top_n_idx)), 1)[0]
    sampled_token_index = top_n_idx[choice]

    # return the predicted word and the hidden state
    return id_to_word(sampled_token_index), h

In [327]:
# Generation runs one sequence at a time, so the conditioning state below is
# built for a batch of size 1.
gen_batch_size = 1

In [328]:
# PCA for generation: same projection as for training, but with
# n_layers * gen_batch_size components so the transposed result can be
# reshaped to (n_layers, gen_batch_size, n_hidden) inside generate().
gen_pca_topics = PCA(n_components=n_layers * gen_batch_size, svd_solver='full').fit_transform(trans_topics)
gen_pca_trans = np.transpose(gen_pca_topics)

In [329]:
# function to generate text
def generate(model, size, prompt='in this paper'):
    """Generate ``size`` tokens that continue ``prompt``.

    The initial LSTM state is built from the module-level ``gen_pca_trans``
    array (hidden state) and ones (cell state), using the module-level
    ``n_layers``, ``gen_batch_size`` and ``n_hidden``. The prompt is fed to
    the model token by token, then tokens are sampled one at a time with
    ``predict``.

    Args:
        model: trained network; it is moved to the CPU and left in eval mode.
        size: number of tokens to generate after the prompt; values below 1
            generate nothing.
        prompt: whitespace-separated seed text; every word must be in the
            embedding vocabulary (``predict`` raises ``KeyError`` otherwise).

    Returns:
        The prompt tokens followed by the generated tokens, joined by single
        spaces.

    Raises:
        ValueError: if ``prompt`` contains no tokens.
    """
    prompt_tokens = prompt.split()
    if not prompt_tokens:
        # without a seed token there is nothing to predict from
        raise ValueError("prompt must contain at least one token")

    # push to CPU
    model.cpu()

    model.eval()

    h = (torch.FloatTensor(gen_pca_trans.reshape(n_layers, gen_batch_size, n_hidden)),
         torch.ones(n_layers, gen_batch_size, n_hidden))

    toks = list(prompt_tokens)

    # run the prompt through the model; the prediction made after the last
    # prompt token is the first generated token
    for t in prompt_tokens:
        token, h = predict(model, t, h)

    if size < 1:
        return ' '.join(toks)

    toks.append(token)

    # predict subsequent tokens, each conditioned on the previous one
    for _ in range(size - 1):
        token, h = predict(model, toks[-1], h)
        toks.append(token)

    return ' '.join(toks)

In [330]:
generate(model, 20)

'in this paper did did regions did worse did universal worse universal regions demo did worse regions worse did worse universal demo demo'