# Conditional LSTM

In [1]:
import random
import itertools

import numpy as np

import torch
from torch import nn
import torch.nn.functional as F

from sklearn.decomposition import PCA

from nltk.tokenize import RegexpTokenizer

import gensim
from gensim.models import KeyedVectors
from gensim.models.lsimodel import LsiModel
from gensim.corpora.dictionary import Dictionary



## Preprocessing

In [2]:
with open('data/tokenized.txt','r') as f:
    tokenized = eval(f.read())
    
with open('data/tokens.txt','r') as f:
    tokens = eval(f.read())

In [3]:
abstracts = []

for a in tokenized:
    joined = ' '.join(a)
    abstracts.append(joined)

In [4]:
avg_words = int(len(preprocessing.tokens)/len(abstracts))

In [186]:
seq_len = 100

def sequence(l, n):
    for i in range(0, len(l), n): 
        yield l[i:i + n]
        
seqs = list(sequence(tokens, seq_len))
len(seqs)

1486

In [175]:
# create inputs and targets (x and y)
x = []
y = []

for s in seqs:
    x.append(" ".join(s[:-1]))
    y.append(" ".join(s[1:]))

In [179]:
abstracts[-1]

'for humans the main functions of a dictionary is to store information concerning words and to reveal it when needed while readers are interested in the meaning of words writers look for answers concerning usage spelling grammar or word forms lemma we will focus here on this latter task help authors to find the word they are looking for word they may know but whose form is eluding them put differently we try to build a resource helping authors to overcome the tip of the tongue problem tot obviously in order to access a word it must be stored somewhere brain resource yet this is by no means sufficient we will illustrate this here by comparing wordnet wn to an equivalent lexical resource bootstrapped from wikipedia wipi both may contain a given word but ease and success of access may be different depending on other factors like quality of the query proximity type of connections etc next we will show under what conditions wn is suitable for word access and finally we will present a roadma

In [180]:
len(set(tokens))

9569

In [148]:
# create integer-to-token mapping
int2token = {}
counter = 0

for w in set(" ".join(abstracts).split()):
    int2token[counter] = w
    counter += 1

# create token-to-integer mapping
token2int = {t: a for a, t in int2token.items()}

token2int["the"], int2token[42]

(2963, 'signals')

In [149]:
vocab_size = len(int2token)
vocab_size

9569

In [150]:
def get_integer_seq(seq):
    return [token2int[w] for w in seq.split()]

# convert text sequences to integer sequences
x_int_all = [get_integer_seq(i) for i in x]
y_int_all = [get_integer_seq(i) for i in y]

In [151]:
len(x_int_all),len(y_int_all)

(1486, 1486)

In [152]:
# delete all sequences not == len_seq

x_int = list(filter(lambda x: (len(x) == seq_len-1), x_int_all))
y_int = list(filter(lambda y: (len(y) == seq_len-1), y_int_all))

In [153]:
len(x_int),len(y_int)

(1485, 1485)

In [154]:
# convert lists to numpy arrays
x_int = np.array(x_int)
y_int = np.array(y_int)

In [155]:
def get_batches(arr_x, arr_y, batch_size):
         
    # iterate through the arrays
    prv = 0
    for n in range(batch_size, arr_x.shape[0], batch_size):
        x = arr_x[prv:n]
        y = arr_y[prv:n]
        prv = n
        yield x, y

## Word2Vec embeddings

In [183]:
# TODO: adjust sentence_len to avg of abstract length
sentence_len = 100

sent_split = zip(*[iter(tokens)] * sentence_len)
sentences = [list(s) for s in sent_split]

In [187]:
len(seqs)

1486

In [188]:
w2v_model = gensim.models.Word2Vec(seqs, vector_size=100, min_count=1, epochs=1)

In [189]:
vocab_size, emdedding_size = w2v_model.wv.vectors.shape
print(vocab_size,emdedding_size)

9569 100


In [190]:
w2v_vectors = w2v_model.wv.vectors

In [191]:
w2v_vectors.shape

(9569, 100)

In [192]:
w2v_tensors = torch.FloatTensor(w2v_vectors)

## Topic Modelling

### Corpus

In [161]:
dct = Dictionary(tokenized)
dct.filter_extremes(no_below=5, no_above=0.3)

In [162]:
len(dct)

2520

In [163]:
print(dct)

Dictionary(2520 unique tokens: ['across', 'all', 'annotation', 'arabic', 'baselines']...)


In [164]:
corpus = [dct.doc2bow(a) for a in tokenized]

### Latent Semantic Analysis (LSA)

In [58]:
lsi = LsiModel(corpus, id2word=dct, num_topics=20, decay=0.2)

In [59]:
lsi.show_topics(20)

[(0,
  '0.862*"e" + 0.299*"de" + 0.155*"d" + 0.132*"la" + 0.123*"des" + 0.122*"les" + 0.101*"l" + 0.097*"et" + 0.088*"s" + 0.082*"le"'),
 (1,
  '0.218*"word" + 0.143*"using" + 0.138*"corpus" + 0.137*"it" + 0.134*"text" + 0.132*"information" + 0.128*"translation" + 0.123*"neural" + 0.122*"or" + 0.121*"system"'),
 (2,
  '-0.761*"word" + -0.264*"embeddings" + 0.249*"corpus" + -0.178*"words" + 0.125*"annotation" + 0.115*"system" + -0.093*"representations" + -0.091*"embedding" + 0.091*"translation" + 0.085*"text"'),
 (3,
  '0.619*"translation" + 0.287*"machine" + -0.248*"corpus" + 0.229*"neural" + 0.205*"nmt" + 0.157*"system" + -0.132*"annotation" + 0.122*"english" + -0.117*"text" + 0.099*"mt"'),
 (4,
  '0.458*"corpus" + 0.302*"word" + 0.289*"translation" + -0.195*"neural" + 0.171*"languages" + 0.160*"annotation" + -0.153*"learning" + 0.152*"english" + -0.129*"state" + -0.123*"network"'),
 (5,
  '-0.537*"sentiment" + 0.301*"corpus" + -0.269*"analysis" + 0.228*"semantic" + -0.140*"text" + -0

In [61]:
lsi.show_topic(18, topn=10)

[('features', -0.3812280153671461),
 ('domain', 0.3664241332007311),
 ('knowledge', -0.24539420945523566),
 ('it', 0.22406247594077064),
 ('words', 0.18685416476785296),
 ('system', 0.1808808794074729),
 ('using', -0.16605763582043478),
 ('parsing', 0.1259840150474611),
 ('annotation', -0.12515906329679294),
 ('information', 0.1075151959073353)]

In [62]:
lsi[corpus]

<gensim.interfaces.TransformedCorpus at 0x1d72a800358>

In [63]:
topic_representation = lsi.projection.u

In [65]:
topic_representation.shape

(37582, 20)

In [66]:
topic_vectors = torch.FloatTensor(topic_representation)

### PCA

In [None]:
trans_topics = np.transpose(topic_representation)
trans_topics.shape

In [None]:
pca = PCA(n_components=64)
pca_topics = pca.fit_transform(trans_topics)
pca_topics.shape

In [None]:
pca_trans = np.transpose(pca_topics)

In [None]:
pca_topics_reshaped = pca_trans.reshape(2, 32, 256)
type(pca_topics_reshaped)

## Conditioned LSTM

In [203]:
class ConditionedLSTM(nn.Module):
    
    def __init__(self, n_hidden=256, n_layers=2, drop_prob=0.2, lr=0.001):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        vocab_size = len(w2v_model.wv)
        
        self.emb_layer = nn.Embedding.from_pretrained(w2v_tensors)

        ## define the LSTM
        self.lstm = nn.LSTM(100, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        ## define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        ## define the fully-connected layer
        self.fc = nn.Linear(n_hidden, vocab_size)      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
        
        x = x.long()

        ## pass input through embedding layer
        embedded = self.emb_layer(x)     
        
        ## Get the outputs and the new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)
        
        ## pass through a dropout layer
        out = self.dropout(lstm_output)
        
        #out = out.contiguous().view(-1, self.n_hidden) 
        out = out.reshape(-1, self.n_hidden) 

        ## put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' initializes hidden state '''

        hidden = (torch.FloatTensor(pca_trans.reshape(self.n_layers, batch_size, self.n_hidden)),
                 torch.ones(self.n_layers, batch_size, self.n_hidden))

        return hidden

In [234]:
n_hidden = 256
batch_size = 64
n_layers = 3

In [235]:
# LSA
lsi = LsiModel(corpus, id2word=dct, num_topics=n_hidden, decay=0.2)
trans_topics = np.transpose(lsi.projection.u)

# PCA|
pca_topics = PCA(n_components=n_layers*batch_size, svd_solver='full').fit_transform(trans_topics)
pca_trans = np.transpose(pca_topics)

In [236]:
# instantiate the model
model = ConditionedLSTM(n_hidden=n_hidden, n_layers=n_layers)

model.cpu()

print(model)

ConditionedLSTM(
  (emb_layer): Embedding(9569, 100)
  (lstm): LSTM(100, 256, num_layers=3, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=256, out_features=9569, bias=True)
)


In [237]:
def train(model, epochs=10, batch_size=32, lr=0.001, clip=1, print_every=32):
    
    # optimizer
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    
    # loss
    criterion = nn.CrossEntropyLoss()
    
    # push model to CPU
    model.cpu()
    
    counter = 0

    model.train()

    for e in range(epochs):
        
        for x, y in get_batches(x_int, y_int, batch_size):
            counter+= 1
            
            # initialize hidden state
            h = model.init_hidden(batch_size)
            
            # convert numpy arrays to PyTorch arrays
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            # push tensors to GPU
            inputs, targets = inputs.cpu(), targets.cpu()

            # detach hidden states
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            model.zero_grad()
            
            # get the output from the model
            output, h = model(inputs, h)
            
            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(-1).long())

            # back-propagate error
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(model.parameters(), clip)

            # update weigths
            opt.step()            
            
            if counter % print_every == 0:
            
              print("Epoch: {}/{} -".format(e+1, epochs),
                    "Step: {} -".format(counter),
                    "Loss: {}".format(loss))

In [238]:
epochs = 20
path = 'weights/cond_lstm.pt'
loss = 0.2

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [239]:
train(model, batch_size=batch_size, epochs=epochs, print_every=4)

Epoch: 1/20 - Step: 4 - Loss: 9.094103813171387
Epoch: 1/20 - Step: 8 - Loss: 8.207086563110352
Epoch: 1/20 - Step: 12 - Loss: 7.28662109375
Epoch: 1/20 - Step: 16 - Loss: 7.0488996505737305
Epoch: 1/20 - Step: 20 - Loss: 6.928389072418213
Epoch: 2/20 - Step: 24 - Loss: 6.978840351104736
Epoch: 2/20 - Step: 28 - Loss: 6.922019958496094
Epoch: 2/20 - Step: 32 - Loss: 6.974577903747559
Epoch: 2/20 - Step: 36 - Loss: 6.861565589904785
Epoch: 2/20 - Step: 40 - Loss: 6.821847438812256
Epoch: 2/20 - Step: 44 - Loss: 6.800374507904053
Epoch: 3/20 - Step: 48 - Loss: 7.03978157043457
Epoch: 3/20 - Step: 52 - Loss: 6.947199821472168
Epoch: 3/20 - Step: 56 - Loss: 6.8963847160339355
Epoch: 3/20 - Step: 60 - Loss: 6.830890655517578
Epoch: 3/20 - Step: 64 - Loss: 6.849308013916016
Epoch: 3/20 - Step: 68 - Loss: 7.050714492797852
Epoch: 4/20 - Step: 72 - Loss: 6.874112606048584
Epoch: 4/20 - Step: 76 - Loss: 7.313183784484863
Epoch: 4/20 - Step: 80 - Loss: 6.838290214538574
Epoch: 4/20 - Step: 84 - 

In [240]:
torch.save({
            'epoch': epochs,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': criterion
            }, path)

  "type " + obj.__name__ + ". It won't be checked "


In [228]:
# predict next token
def predict(model, t, h=None): # default value as None for first iteration
         
    # tensor inputs
    x = np.array([[token2int[t]]])
    inputs = torch.from_numpy(x)
  
    # push to CPU
    inputs = inputs.cpu()

    # detach hidden state from history
    h = tuple([each.data for each in h])

    # get the output of the model
    out, h = model(inputs, h)

    # get the token probabilities
    p = F.softmax(out, dim=1).data

    p = p.cpu()

    p = p.numpy()
    p = p.reshape(p.shape[1],)

    # get indices of top 3 values
    top_n_idx = p.argsort()[-5:][::-1]

    # randomly select one of the three indices
    sampled_token_index = top_n_idx[random.sample([0,1,2,3,4],1)[0]]

    # return the encoded value of the predicted char and the hidden state
    return int2token[sampled_token_index], h

In [229]:
gen_batch_size = 1

In [230]:
# PCA
gen_pca_topics = PCA(n_components=n_layers * gen_batch_size, svd_solver='full').fit_transform(trans_topics)
gen_pca_trans = np.transpose(gen_pca_topics)

In [231]:
# function to generate text
def generate(model, size, prompt='in this paper'):
        
    # push to CPU
    model.cpu()
    
    model.eval()
    
    h = (torch.FloatTensor(gen_pca_trans.reshape(n_layers, gen_batch_size, n_hidden)),
         torch.ones(n_layers, gen_batch_size, n_hidden))

    toks = prompt.split()

    # predict next token
    for t in prompt.split():
        token, h = predict(model, t, h)
    
    toks.append(token)

    # predict subsequent tokens
    for i in range(size-1):
        token, h = predict(model, toks[-1], h)
        toks.append(token)

    return ' '.join(toks)

In [233]:
generate(model, 50, 'in this paper we')

'in this paper we to a to a the to a of and of a to the to a a of to to the the of and the the a the the the to a a and the of a the of a and a of a to and to the and the of'