# Conditional LSTM

In [465]:
import random
import itertools

import numpy as np

import torch
from torch import nn
import torch.nn.functional as F

from sklearn.decomposition import PCA

import preprocessing

from nltk.tokenize import RegexpTokenizer

import gensim
from gensim.models import KeyedVectors
from gensim.models.lsimodel import LsiModel
from gensim.corpora.dictionary import Dictionary

## Preprocessing

In [2]:
tokenized = preprocessing.tokenized

In [3]:
abstracts = []

for a in tokenized:
    joined = ' '.join(a)
    abstracts.append(joined)

In [4]:
avg_words = int(len(preprocessing.tokens)/len(abstracts))

In [5]:
seq_len = avg_words + 1

def sequence(l, n):
    for i in range(0, len(l), n): 
        yield l[i:i + n]
        
seqs = list(sequence(preprocessing.tokens, seq_len))

In [6]:
# create inputs and targets (x and y)
x = []
y = []

for s in seqs:
    x.append(" ".join(s[:-1]))
    y.append(" ".join(s[1:]))

In [478]:
# create integer-to-token mapping
int2token = {}
counter = 0

for w in set(" ".join(abstracts).split()):
    int2token[counter] = w
    counter += 1

# create token-to-integer mapping
token2int = {t: a for a, t in int2token.items()}

token2int["the"], int2token[42]

(6643, 'bmw')

In [479]:
vocab_size = len(int2token)
vocab_size

37613

In [480]:
def get_integer_seq(seq):
    return [token2int[w] for w in seq.split()]

# convert text sequences to integer sequences
x_int_all = [get_integer_seq(i) for i in x]
y_int_all = [get_integer_seq(i) for i in y]

In [481]:
len(x_int_all),len(y_int_all)

(262490, 262490)

In [11]:
# delete all sequences not == len_seq

x_int = list(filter(lambda x: (len(x) == seq_len-1), x_int_all))
y_int = list(filter(lambda y: (len(y) == seq_len-1), y_int_all))

In [12]:
len(x_int),len(y_int)

(262489, 262489)

In [13]:
# convert lists to numpy arrays
x_int = np.array(x_int)
y_int = np.array(y_int)

In [14]:
def get_batches(arr_x, arr_y, batch_size):
         
    # iterate through the arrays
    prv = 0
    for n in range(batch_size, arr_x.shape[0], batch_size):
        x = arr_x[prv:n]
        y = arr_y[prv:n]
        prv = n
        yield x, y

## Word2Vec embeddings

In [473]:
w2v_abs_model = KeyedVectors.load('word2vec_arxiv_abstracts.model', mmap='r')

In [474]:
vocab_size, emdedding_size = w2v_abs_model.wv.vectors.shape
print(vocab_size,emdedding_size)

37613 100


In [485]:
w2v_vectors = w2v_abs_model.wv.vectors

In [486]:
w2v_vectors.shape

(37613, 100)

In [487]:
w2v_tensors = torch.FloatTensor(w2v_vectors)

## Topic Modelling

### Corpus

In [442]:
dct = Dictionary(tokenized)
dct.filter_extremes(no_below=5, no_above=0.3)

KeyboardInterrupt: 

In [51]:
len(dct)

37582

In [52]:
print(dct)

Dictionary(37582 unique tokens: ['adaptation', 'address', 'albert', 'approach', 'art']...)


In [53]:
corpus = [dct.doc2bow(a) for a in tokenized]

### Latent Semantic Analysis (LSA)

In [58]:
lsi = LsiModel(corpus, id2word=dct, num_topics=20, decay=0.2)

In [59]:
lsi.show_topics(20)

[(0,
  '0.862*"e" + 0.299*"de" + 0.155*"d" + 0.132*"la" + 0.123*"des" + 0.122*"les" + 0.101*"l" + 0.097*"et" + 0.088*"s" + 0.082*"le"'),
 (1,
  '0.218*"word" + 0.143*"using" + 0.138*"corpus" + 0.137*"it" + 0.134*"text" + 0.132*"information" + 0.128*"translation" + 0.123*"neural" + 0.122*"or" + 0.121*"system"'),
 (2,
  '-0.761*"word" + -0.264*"embeddings" + 0.249*"corpus" + -0.178*"words" + 0.125*"annotation" + 0.115*"system" + -0.093*"representations" + -0.091*"embedding" + 0.091*"translation" + 0.085*"text"'),
 (3,
  '0.619*"translation" + 0.287*"machine" + -0.248*"corpus" + 0.229*"neural" + 0.205*"nmt" + 0.157*"system" + -0.132*"annotation" + 0.122*"english" + -0.117*"text" + 0.099*"mt"'),
 (4,
  '0.458*"corpus" + 0.302*"word" + 0.289*"translation" + -0.195*"neural" + 0.171*"languages" + 0.160*"annotation" + -0.153*"learning" + 0.152*"english" + -0.129*"state" + -0.123*"network"'),
 (5,
  '-0.537*"sentiment" + 0.301*"corpus" + -0.269*"analysis" + 0.228*"semantic" + -0.140*"text" + -0

In [61]:
lsi.show_topic(18, topn=10)

[('features', -0.3812280153671461),
 ('domain', 0.3664241332007311),
 ('knowledge', -0.24539420945523566),
 ('it', 0.22406247594077064),
 ('words', 0.18685416476785296),
 ('system', 0.1808808794074729),
 ('using', -0.16605763582043478),
 ('parsing', 0.1259840150474611),
 ('annotation', -0.12515906329679294),
 ('information', 0.1075151959073353)]

In [62]:
lsi[corpus]

<gensim.interfaces.TransformedCorpus at 0x1d72a800358>

In [63]:
topic_representation = lsi.projection.u

In [65]:
topic_representation.shape

(37582, 20)

In [66]:
topic_vectors = torch.FloatTensor(topic_representation)

### PCA

In [None]:
trans_topics = np.transpose(topic_representation)
trans_topics.shape

In [None]:
pca = PCA(n_components=64)
pca_topics = pca.fit_transform(trans_topics)
pca_topics.shape

In [None]:
pca_trans = np.transpose(pca_topics)

In [None]:
pca_topics_reshaped = pca_trans.reshape(2, 32, 256)
type(pca_topics_reshaped)

## Conditioned LSTM

In [496]:
class ConditionedLSTM(nn.Module):
    
    def __init__(self, n_hidden=256, n_layers=2, drop_prob=0.2, lr=0.001):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        self.emb_layer = nn.Embedding.from_pretrained(w2v_tensors)

        ## define the LSTM
        self.lstm = nn.LSTM(100, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        ## define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        ## define the fully-connected layer
        self.fc = nn.Linear(n_hidden, vocab_size)      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
        
        x = x.long()

        ## pass input through embedding layer
        embedded = self.emb_layer(x)     
        
        ## Get the outputs and the new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)
        
        ## pass through a dropout layer
        out = self.dropout(lstm_output)
        
        #out = out.contiguous().view(-1, self.n_hidden) 
        out = out.reshape(-1, self.n_hidden) 

        ## put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' initializes hidden state '''

        hidden = (torch.FloatTensor(pca_trans.reshape(self.n_layers, batch_size, self.n_hidden)),
                 torch.ones(self.n_layers, batch_size, self.n_hidden))

        return hidden

In [497]:
n_hidden = 64
batch_size = 64
n_layers = 1

In [102]:
# LSA
lsi = LsiModel(corpus, id2word=dct, num_topics=n_hidden, decay=0.2)
trans_topics = np.transpose(lsi.projection.u)

# PCA
pca_topics = PCA(n_components=n_layers * batch_size, svd_solver='full').fit_transform(trans_topics)
pca_trans = np.transpose(pca_topics)

In [498]:
# instantiate the model
model = ConditionedLSTM(n_hidden=n_hidden, n_layers=n_layers)

model.cpu()

print(model)

ConditionedLSTM(
  (emb_layer): Embedding(37613, 100)
  (lstm): LSTM(100, 64, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=64, out_features=37613, bias=True)
)


  "num_layers={}".format(dropout, num_layers))


In [499]:
def train(model, epochs=10, batch_size=32, lr=0.001, clip=1, print_every=32):
    
    # optimizer
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    
    # loss
    criterion = nn.CrossEntropyLoss()
    
    # push model to CPU
    model.cpu()
    
    counter = 0

    model.train()

    for e in range(epochs):
        
        for x, y in get_batches(x_int, y_int, batch_size):
            counter+= 1
            
            # initialize hidden state
            h = model.init_hidden(batch_size)
            
            # convert numpy arrays to PyTorch arrays
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            # push tensors to GPU
            inputs, targets = inputs.cpu(), targets.cpu()

            # detach hidden states
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            model.zero_grad()
            
            # get the output from the model
            output, h = model(inputs, h)
            
            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(-1).long())

            # back-propagate error
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(model.parameters(), clip)

            # update weigths
            opt.step()            
            
            if counter % print_every == 0:
            
              print("Epoch: {}/{} -".format(e+1, epochs),
                    "Step: {} -".format(counter),
                    "Loss: {}".format(loss))

In [104]:
epochs = 1
path = 'weights/cond_lstm.pt'
loss = 0.2

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

  "type " + obj.__name__ + ". It won't be checked "


In [106]:
train(model, batch_size=batch_size, epochs=epochs, print_every=1)

Epoch: 1/1 - Step: 1 - Loss: 11.459970474243164
Epoch: 1/1 - Step: 2 - Loss: 11.45486831665039
Epoch: 1/1 - Step: 3 - Loss: 11.443846702575684
Epoch: 1/1 - Step: 4 - Loss: 11.432527542114258
Epoch: 1/1 - Step: 5 - Loss: 11.419405937194824
Epoch: 1/1 - Step: 6 - Loss: 11.404845237731934
Epoch: 1/1 - Step: 7 - Loss: 11.381332397460938
Epoch: 1/1 - Step: 8 - Loss: 11.355045318603516
Epoch: 1/1 - Step: 9 - Loss: 11.334622383117676
Epoch: 1/1 - Step: 10 - Loss: 11.306285858154297
Epoch: 1/1 - Step: 11 - Loss: 11.26911449432373
Epoch: 1/1 - Step: 12 - Loss: 11.223613739013672
Epoch: 1/1 - Step: 13 - Loss: 11.174215316772461
Epoch: 1/1 - Step: 14 - Loss: 11.122401237487793
Epoch: 1/1 - Step: 15 - Loss: 11.057987213134766
Epoch: 1/1 - Step: 16 - Loss: 10.976066589355469
Epoch: 1/1 - Step: 17 - Loss: 10.886181831359863
Epoch: 1/1 - Step: 18 - Loss: 10.78087329864502
Epoch: 1/1 - Step: 19 - Loss: 10.678793907165527
Epoch: 1/1 - Step: 20 - Loss: 10.543740272521973
Epoch: 1/1 - Step: 21 - Loss: 10

In [107]:
torch.save({
            'epoch': epochs,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': criterion
            }, path)

In [111]:
train(model, batch_size=batch_size, epochs=epochs, print_every=1)

Epoch: 1/1 - Step: 1 - Loss: 6.43322229385376
Epoch: 1/1 - Step: 2 - Loss: 6.422721862792969
Epoch: 1/1 - Step: 3 - Loss: 6.343315601348877


KeyboardInterrupt: 

In [514]:
# predict next token
def predict(model, t, h=None): # default value as None for first iteration
         
    # tensor inputs
    x = np.array([[token2int[t]]])
    inputs = torch.from_numpy(x)
  
    # push to CPU
    inputs = inputs.cpu()

    # detach hidden state from history
    h = tuple([each.data for each in h])

    # get the output of the model
    out, h = model(inputs, h)

    # get the token probabilities
    p = F.softmax(out, dim=1).data

    p = p.cpu()

    p = p.numpy()
    p = p.reshape(p.shape[1],)

    # get indices of top 3 values
    top_n_idx = p.argsort()[-5:][::-1]

    # randomly select one of the three indices
    sampled_token_index = top_n_idx[random.sample([0,1,2,3,4],1)[0]]

    # return the encoded value of the predicted char and the hidden state
    return int2token[sampled_token_index], h

In [515]:
gen_batch_size = 1

In [502]:
# PCA
gen_pca_topics = PCA(n_components=n_layers * gen_batch_size, svd_solver='full').fit_transform(trans_topics)
gen_pca_trans = np.transpose(gen_pca_topics)

In [516]:
# function to generate text
def generate(model, size, prompt='in this paper'):
        
    # push to CPU
    model.cpu()
    
    model.eval()
    
    h = (torch.FloatTensor(gen_pca_trans.reshape(n_layers, gen_batch_size, n_hidden)),
         torch.ones(n_layers, gen_batch_size, n_hidden))

    toks = prompt.split()

    # predict next token
    for t in prompt.split():
        token, h = predict(model, t, h)
    
    toks.append(token)

    # predict subsequent tokens
    for i in range(size-1):
        token, h = predict(model, toks[-1], h)
        toks.append(token)

    return ' '.join(toks)

In [520]:
generate(model, 50, 'in this paper')

'in this paper browsers sofc cnlvr sonante parlvote through flairnlp analyser tection delved ratmcu procedure editor seqs proactivity procedure editor editor please shows lipschitz lowerbound shows rodynamiques exercices implicit semdis hosting cmce vocaux sva ratmcu tection cmce narrativeqa sva ben cmce implicit criticisms cmce ben members hack hack hack scalar maple identifiers dsvi'