# Conditional LSTM

In [1]:
import random
import itertools

import numpy as np

import torch
from torch import nn
import torch.nn.functional as F

from sklearn.decomposition import PCA

from nltk.tokenize import RegexpTokenizer

import gensim
from gensim.models import KeyedVectors
from gensim.models.lsimodel import LsiModel
from gensim.corpora.dictionary import Dictionary



## Preprocessing

In [5]:
# Load the pre-tokenized corpus. Both files are parsed with eval(), so they
# are expected to contain a Python expression (presumably a list of token
# lists for `tokenized` and a flat token list for `tokens` — TODO confirm).
# NOTE(review): eval() executes whatever expression the file contains; if
# these files can ever come from an untrusted source, switch to
# ast.literal_eval or a JSON dump after confirming the file format.
with open('data/tokenized.txt','r') as f:
    tokenized = eval(f.read())
    
with open('data/tokens.txt','r') as f:
    tokens = eval(f.read())

In [3]:
# One space-joined string per tokenized abstract.
abstracts = [' '.join(a) for a in tokenized]

In [6]:
avg_words = int(len(tokens)/len(abstracts))

In [7]:
seq_len = avg_words + 1

def sequence(l, n):
    ''' Yield consecutive slices of `l`, each at most `n` items long.

        The last slice is shorter when len(l) is not a multiple of `n`. '''
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)
        
seqs = list(sequence(tokens, seq_len))

In [None]:
# Build next-token training pairs (x and y): each input is a chunk without
# its last token, each target is the same chunk without its first token.
x = [" ".join(chunk[:-1]) for chunk in seqs]
y = [" ".join(chunk[1:]) for chunk in seqs]

In [None]:
# create integer-to-token mapping
# The vocabulary is every whitespace-separated token in the abstracts. It is
# sorted before ids are assigned: iterating a bare set of strings follows hash
# order, which changes between interpreter runs, so unsorted ids (and any
# weights saved against them) would not line up across sessions.
# NOTE(review): these ids are later fed to an embedding layer built from the
# word2vec vectors — confirm both use the same index space.
int2token = dict(enumerate(sorted(set(" ".join(abstracts).split()))))

# create token-to-integer mapping
token2int = {t: a for a, t in int2token.items()}

token2int["the"], int2token[42]

In [None]:
vocab_size = len(int2token)
vocab_size

In [None]:
def get_integer_seq(seq):
    ''' Map a whitespace-separated token string to its list of integer ids.

        Each token is looked up in the module-level `token2int`; a token that
        is not a key there raises KeyError. '''
    words = seq.split()
    return [token2int[word] for word in words]

# convert text sequences to integer sequences
x_int_all = list(map(get_integer_seq, x))
y_int_all = list(map(get_integer_seq, y))

In [None]:
len(x_int_all),len(y_int_all)

In [None]:
# keep only the id sequences whose length is exactly seq_len - 1;
# anything of a different length is dropped

x_int = [ids for ids in x_int_all if len(ids) == seq_len - 1]
y_int = [ids for ids in y_int_all if len(ids) == seq_len - 1]

In [None]:
len(x_int),len(y_int)

In [None]:
# convert lists to numpy arrays
x_int = np.array(x_int)
y_int = np.array(y_int)

In [None]:
def get_batches(arr_x, arr_y, batch_size):
    ''' Yield aligned (x, y) slices of `batch_size` rows each.

        arr_x, arr_y: arrays sliced along their first axis with the same bounds
        batch_size:   number of rows per batch

        Only full batches are produced; a trailing remainder shorter than
        `batch_size` is skipped. The number of batches is taken from
        arr_x.shape[0]. '''

    n_rows = arr_x.shape[0]

    # The last start index must still leave room for a full batch. The "+ 1"
    # keeps the final batch when n_rows is an exact multiple of batch_size.
    for start in range(0, n_rows - batch_size + 1, batch_size):
        stop = start + batch_size
        yield arr_x[start:stop], arr_y[start:stop]

In [None]:
# NOTE(review): `preprocessing` is not defined or imported anywhere in the
# visible part of this notebook, so this cell would raise NameError when the
# cells are run in order — confirm whether it is a leftover to be removed.
preprocessing.train_x

## Word2Vec embeddings

In [None]:
w2v_abs_model = KeyedVectors.load('word2vec_arxiv_abstracts.model', mmap='r')

In [None]:
# Unpack the shape of the word2vec vector matrix (presumably one row per word
# and one column per embedding dimension — TODO confirm).
# NOTE(review): this rebinds `vocab_size`, which was set earlier to
# len(int2token); code that reads `vocab_size` after this cell sees the
# word2vec value — confirm the two vocabularies are meant to coincide.
vocab_size, emdedding_size = w2v_abs_model.wv.vectors.shape
print(vocab_size,emdedding_size)

In [None]:
w2v_vectors = w2v_abs_model.wv.vectors

In [None]:
w2v_vectors.shape

In [None]:
w2v_tensors = torch.FloatTensor(w2v_vectors)

## Topic Modelling

### Corpus

In [None]:
dct = Dictionary(tokenized)
dct.filter_extremes(no_below=5, no_above=0.3)

In [None]:
len(dct)

In [None]:
print(dct)

In [None]:
corpus = [dct.doc2bow(a) for a in tokenized]

### Latent Semantic Analysis (LSA)

In [None]:
lsi = LsiModel(corpus, id2word=dct, num_topics=20, decay=0.2)

In [None]:
lsi.show_topics(20)

In [None]:
lsi.show_topic(18, topn=10)

In [None]:
lsi[corpus]

In [None]:
topic_representation = lsi.projection.u

In [None]:
topic_representation.shape

In [None]:
topic_vectors = torch.FloatTensor(topic_representation)

### PCA

In [None]:
trans_topics = np.transpose(topic_representation)
trans_topics.shape

In [None]:
# Project the transposed topic matrix onto 64 principal components.
# NOTE(review): trans_topics presumably has one row per LSA topic (20 in the
# LsiModel call above); scikit-learn's PCA requires
# n_components <= min(n_samples, n_features), so asking for 64 components from
# 20 rows would be rejected — confirm this cell still runs as written.
pca = PCA(n_components=64)
pca_topics = pca.fit_transform(trans_topics)
pca_topics.shape

In [None]:
pca_trans = np.transpose(pca_topics)

In [None]:
# NOTE(review): reshape(2, 32, 256) needs exactly 2 * 32 * 256 = 16384 values;
# confirm pca_trans holds that many. The dimensions look like
# (n_layers, batch_size, n_hidden) from an earlier configuration — TODO verify.
pca_topics_reshaped = pca_trans.reshape(2, 32, 256)
type(pca_topics_reshaped)

## Conditioned LSTM

In [None]:
class ConditionedLSTM(nn.Module):
    ''' Token-level LSTM language model whose initial hidden state comes from
        an external array instead of zeros.

        Token ids are embedded with the pretrained matrix `w2v_tensors`, run
        through an LSTM, and projected to `vocab_size` scores. `w2v_tensors`,
        `vocab_size` and `pca_trans` are module-level names read at call time.

        n_hidden:  size of the LSTM hidden state
        n_layers:  number of stacked LSTM layers
        drop_prob: dropout probability (between LSTM layers and on the LSTM output)
        lr:        stored on the instance; not used by the model itself '''

    def __init__(self, n_hidden=256, n_layers=2, drop_prob=0.2, lr=0.001):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        ## pretrained embeddings (from_pretrained keeps them frozen by default)
        self.emb_layer = nn.Embedding.from_pretrained(w2v_tensors)

        ## define the LSTM
        # The input size follows the embedding width instead of a hard-coded
        # 100, so vectors of any dimensionality work. Inter-layer dropout only
        # exists with more than one layer; passing a non-zero value for a
        # single layer has no effect and only triggers a PyTorch warning.
        self.lstm = nn.LSTM(
            self.emb_layer.embedding_dim,
            n_hidden,
            n_layers,
            dropout=drop_prob if n_layers > 1 else 0.0,
            batch_first=True,
        )

        ## define a dropout layer
        self.dropout = nn.Dropout(drop_prob)

        ## define the fully-connected layer
        self.fc = nn.Linear(n_hidden, vocab_size)

    def forward(self, x, hidden):
        ''' Forward pass through the network.

            x:      token ids laid out as (batch, sequence); cast to long here
            hidden: the (hidden, cell) state pair handed to the LSTM

            Returns the scores flattened to (batch * sequence, vocab_size)
            together with the new (hidden, cell) state. '''

        x = x.long()

        ## pass input through embedding layer
        embedded = self.emb_layer(x)

        ## Get the outputs and the new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)

        ## pass through a dropout layer
        out = self.dropout(lstm_output)

        ## one row per time step so the linear layer scores every position
        out = out.reshape(-1, self.n_hidden)

        ## put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden

    def init_hidden(self, batch_size):
        ''' Build the initial (hidden, cell) state pair.

            The hidden state is the module-level `pca_trans` array reshaped to
            (n_layers, batch_size, n_hidden), so `pca_trans` must hold exactly
            that many values; the cell state is filled with ones. '''

        shape = (self.n_layers, batch_size, self.n_hidden)

        h0 = torch.FloatTensor(pca_trans.reshape(shape))
        c0 = torch.ones(shape)

        return h0, c0

In [None]:
n_hidden = 64
batch_size = 64
n_layers = 1

In [None]:
# LSA: fit as many topics as the LSTM has hidden units
lsi = LsiModel(
    corpus,
    id2word=dct,
    num_topics=n_hidden,
    decay=0.2,
)
trans_topics = np.transpose(lsi.projection.u)

# PCA: n_layers * batch_size components, then transpose the result
pca_topics = PCA(
    n_components=n_layers * batch_size,
    svd_solver='full',
).fit_transform(trans_topics)
pca_trans = pca_topics.T

In [None]:
# instantiate the model
model = ConditionedLSTM(n_hidden=n_hidden, n_layers=n_layers)

model.cpu()

print(model)

In [None]:
def train(model, epochs=10, batch_size=32, lr=0.001, clip=1, print_every=32):
    ''' Train `model` on the module-level `x_int` / `y_int` arrays, on the CPU.

        model:       network exposing `init_hidden(batch_size)` and callable as
                     `model(inputs, hidden) -> (scores, hidden)`
        epochs:      number of passes over the data
        batch_size:  rows per batch (batches come from `get_batches`)
        lr:          learning rate for the Adam optimizer
        clip:        maximum gradient norm
        print_every: print the loss every this many steps '''

    # optimizer
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    # loss
    criterion = nn.CrossEntropyLoss()

    # keep the model on the CPU and switch on training behaviour (dropout)
    model.cpu()
    model.train()

    counter = 0

    for e in range(epochs):

        for x, y in get_batches(x_int, y_int, batch_size):
            counter += 1

            # every batch starts from a fresh initial state, detached so no
            # gradient history is carried between batches
            h = tuple(each.detach() for each in model.init_hidden(batch_size))

            # convert numpy arrays to PyTorch tensors (they already live on the CPU)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

            # zero accumulated gradients
            model.zero_grad()

            # get the output from the model; the returned state is not reused
            output, _ = model(inputs, h)

            # targets are flattened to match the model's one-row-per-token output
            loss = criterion(output, targets.view(-1).long())

            # back-propagate error
            loss.backward()

            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(model.parameters(), clip)

            # update weights
            opt.step()

            if counter % print_every == 0:
                # .item() prints the plain number rather than the tensor repr
                print("Epoch: {}/{} -".format(e+1, epochs),
                      "Step: {} -".format(counter),
                      "Loss: {}".format(loss.item()))

In [None]:
epochs = 1
path = 'weights/cond_lstm.pt'
loss = 0.2

# NOTE(review): train() builds its own Adam optimizer and CrossEntropyLoss, so
# the two objects below are not used by it — confirm whether later cells
# (e.g. checkpointing to `path`) rely on them before removing.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [None]:
train(model, batch_size=batch_size, epochs=epochs, print_every=1)

## Generation

In [None]:
# predict next token
def predict(model, t, h=None, top_n=5):
    ''' Sample the token that follows `t`.

        model: callable as `model(inputs, hidden) -> (scores, hidden)`
        t:     current token; must be a key of the module-level `token2int`
        h:     (hidden, cell) state from the previous step, or None on the
               first step (passed through to the model unchanged; nn.LSTM
               treats None as an all-zero state)
        top_n: sample uniformly among this many most probable tokens

        Returns the sampled token (looked up in `int2token`) and the new
        hidden state. Raises ValueError if `top_n` is smaller than 1. '''

    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    # tensor inputs: one sequence holding one token id
    inputs = torch.from_numpy(np.array([[token2int[t]]]))

    # detach hidden state from history (nothing to detach on the first step)
    if h is not None:
        h = tuple(each.detach() for each in h)

    # inference only, so skip building the autograd graph
    with torch.no_grad():
        out, h = model(inputs, h)

        # get the token probabilities
        p = F.softmax(out, dim=1)

    p = p.cpu().numpy().reshape(-1)

    # indices of the top_n most probable tokens, most probable first
    # (fewer than top_n when the vocabulary is smaller)
    top_n_idx = p.argsort()[-top_n:][::-1]

    # randomly select one of those indices
    sampled_token_index = int(top_n_idx[random.randrange(len(top_n_idx))])

    # return the predicted token and the hidden state
    return int2token[sampled_token_index], h

In [None]:
gen_batch_size = 1

# PCA: refit with n_layers * gen_batch_size components for generation
gen_pca_topics = PCA(
    n_components=n_layers * gen_batch_size,
    svd_solver='full',
).fit_transform(trans_topics)
gen_pca_trans = gen_pca_topics.T

In [None]:
# function to generate text
def generate(model, size, prompt='in this paper'):
    ''' Continue `prompt` with `size` sampled tokens and return the full text.

        model:  trained network, switched to CPU and eval mode here
        size:   number of tokens to append to the prompt
        prompt: whitespace-separated seed text; every token must be known to
                `predict`

        The initial state is built from the module-level `gen_pca_trans`,
        `n_layers`, `gen_batch_size` and `n_hidden`.
        Raises ValueError if `prompt` contains no tokens. '''

    toks = prompt.split()
    if not toks:
        # without a seed token there is nothing to feed to the first prediction
        raise ValueError("prompt must contain at least one token")

    # push to CPU
    model.cpu()

    model.eval()

    h = (torch.FloatTensor(gen_pca_trans.reshape(n_layers, gen_batch_size, n_hidden)),
         torch.ones(n_layers, gen_batch_size, n_hidden))

    # run all but the last prompt token through the model only to advance the
    # hidden state; the tokens sampled here are discarded
    for t in toks[:-1]:
        _, h = predict(model, t, h)

    # each step feeds the latest token and appends the sampled follow-up
    for _ in range(size):
        token, h = predict(model, toks[-1], h)
        toks.append(token)

    return ' '.join(toks)

In [None]:
generate(model, 50, 'in this paper')

## Evaluation