# Conditional LSTM

In [117]:
import os
import numpy as np

from pybtex.database import parse_file

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from gensim.parsing.preprocessing import remove_stopwords, preprocess_string
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.ldamodel import LdaModel
from gensim.models.lsimodel import LsiModel
from gensim.corpora.dictionary import Dictionary

In [2]:
bib_data = parse_file('data/anthology+abstracts.bib')

In [3]:
list(bib_data.entries.keys())[-1]

'lieberman-etal-1965-automatic'

In [4]:
len(list(bib_data.entries.keys()))

66113

In [5]:
bib_data.entries['lieberman-etal-1965-automatic'].fields['year']

'1965'

In [None]:
for k in bib_data.entries.keys():
    try:
        year = bib_data.entries[k].fields['year']
        abstract = bib_data.entries[k].fields['abstract']
        
        if year > '2015':
            f = open('data/datasets/abstracts_%s.txt' %year, 'a')
            f.write(abstract)
            f.close()
            
    except (KeyError, UnicodeEncodeError): # entries without abstracts are excluded
        pass

In [6]:
# eliminate stop words
def tokenize_input(input):
    # make everything lowercase
    input = input.lower()

    # use tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)

    # filter out stopwords
    final = filter(lambda token: token not in stopwords.words('english'), tokens)
    
    # end result in final
    return " ".join(final)

In [None]:
for year in range(2016,2022):        
        with open('data/datasets/abstracts_%s.txt' %year) as abstr:
            lines = abstr.readlines()
            processed = tokenize_input(lines[0])
            
            # create individual year files
            y = open('data/datasets/%s.txt' %year, 'a')
            y.write(processed)
            
            # create all years file
            a = open('data/datasets/all.txt', 'a')
            a.write(processed)
            
            y.close()
            a.close()

In [None]:
for k in bib_data.entries.keys():
    try:
        year = bib_data.entries[k].fields['year']
        abstract = bib_data.entries[k].fields['abstract']
        
        if year > '2015':
            f = open('data/datasets/abstracts_%s.txt' %year, 'a')
            f.write(abstract + '\n')
            f.close()
            
    except (KeyError, UnicodeEncodeError): # entries without abstracts are excluded
        pass

In [7]:
f = open('data/datasets/abstracts_2016.txt')
text = f.read()
abstracts = text.split('\n')

In [8]:
trimmed = [remove_stopwords(a) for a in abstracts]
lowercase = [a.lower() for a in trimmed]

In [9]:
tokenizer = RegexpTokenizer(r'\w+')
tokenized = [tokenizer.tokenize(a) for a in lowercase]

In [143]:
# total number of tokenized abstracts
len(tokenized)

1755

In [144]:
# example of tokenized abstract
tokenized[2]

['this',
 'article',
 'proposes',
 'universal',
 'dependency',
 'annotation',
 'scheme',
 'mandarin',
 'chinese',
 'including',
 'pos',
 'tags',
 'dependency',
 'analysis',
 'we',
 'identify',
 'cases',
 'idiosyncrasy',
 'mandarin',
 'chinese',
 'difficult',
 'fit',
 'current',
 'schema',
 'mainly',
 'based',
 'descriptions',
 'indo',
 'european',
 'languages',
 'we',
 'discuss',
 'differences',
 'scheme',
 'stanford',
 'chinese',
 'dependencies',
 'chinese',
 'dependency',
 'treebank']

In [548]:
# single word
tokenized[0][4]

'annotation'

In [579]:
words = [word for abstract in tokenized for word in abstract] # could use itertools for improved performance

In [582]:
len(words)

162958

### Dictionary

In [530]:
dct = Dictionary(tokenized)
dct[42]

'newly'

In [531]:
len(dct)

13614

In [532]:
# trim dict at 1000 tokens
dct.filter_extremes(no_below=1, no_above=0.1, keep_n=1000)

In [533]:
len(dct)

1000

In [535]:
dct[42]

'layers'

In [541]:
print(dct)

Dictionary(1000 unique tokens: ['0', '4', 'according', 'annotating', 'case']...)


In [16]:
corpus = [dct.doc2bow(text) for text in tokenized]

### LDA (unordered)

In [17]:
lda = LdaModel(corpus, num_topics=10, id2word=dct)

In [18]:
lda.show_topics()

[(0,
  '0.023*"la" + 0.020*"des" + 0.016*"les" + 0.016*"l" + 0.015*"d" + 0.012*"r" + 0.011*"une" + 0.008*"le" + 0.008*"es" + 0.007*"du"'),
 (1,
  '0.010*"d" + 0.009*"sense" + 0.008*"l" + 0.007*"le" + 0.007*"wordnet" + 0.006*"terms" + 0.006*"lexicon" + 0.006*"les" + 0.006*"categories" + 0.005*"la"'),
 (2,
  '0.014*"les" + 0.013*"d" + 0.011*"nlp" + 0.009*"sentence" + 0.007*"des" + 0.007*"en" + 0.007*"alignment" + 0.006*"une" + 0.006*"time" + 0.006*"extraction"'),
 (3,
  '0.010*"sentiment" + 0.009*"chinese" + 0.008*"detection" + 0.007*"cross" + 0.006*"target" + 0.006*"tweets" + 0.005*"event" + 0.005*"arabic" + 0.005*"existing" + 0.005*"provide"'),
 (4,
  '0.010*"questions" + 0.009*"domain" + 0.009*"sentiment" + 0.009*"neural" + 0.008*"event" + 0.007*"document" + 0.007*"search" + 0.006*"online" + 0.006*"documents" + 0.006*"web"'),
 (5,
  '0.013*"d" + 0.012*"les" + 0.011*"des" + 0.010*"la" + 0.008*"entity" + 0.008*"named" + 0.007*"le" + 0.007*"en" + 0.007*"c" + 0.007*"dans"'),
 (6,
  '0.010

### Doc2Vec (unordered)

In [19]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenized)]

In [21]:
doc2vec_model = Doc2Vec(documents, vector_size=10, window=2, min_count=1, workers=4)

### LSA (ordered)

In [22]:
lsi = LsiModel(corpus, id2word=dct, num_topics=10, decay=0.5)

In [23]:
lsi.show_topics()

[(0,
  '0.403*"d" + 0.369*"la" + 0.342*"les" + 0.332*"des" + 0.258*"l" + 0.232*"le" + 0.189*"r" + 0.181*"en" + 0.180*"une" + 0.177*"dans"'),
 (1,
  '-0.184*"sentiment" + -0.162*"neural" + -0.135*"classification" + -0.129*"domain" + -0.123*"sentence" + -0.116*"nlp" + -0.106*"relations" + -0.102*"network" + -0.100*"sentences" + -0.098*"0"'),
 (2,
  '-0.835*"sentiment" + -0.134*"chinese" + -0.133*"lexicon" + 0.114*"relations" + -0.113*"tutorial" + -0.109*"arabic" + -0.082*"tools" + -0.077*"lexicons" + -0.076*"social" + -0.072*"polarity"'),
 (3,
  '-0.536*"neural" + -0.280*"network" + -0.216*"attention" + -0.162*"sentence" + -0.157*"networks" + 0.139*"lexicon" + -0.133*"al" + 0.130*"arabic" + 0.112*"wordnet" + 0.112*"tools"'),
 (4,
  '0.616*"d" + -0.480*"les" + -0.315*"des" + 0.174*"nous" + 0.166*"une" + -0.139*"la" + 0.115*"sur" + 0.114*"l" + -0.103*"que" + 0.100*"article"'),
 (5,
  '-0.680*"la" + 0.497*"les" + 0.283*"d" + -0.188*"le" + -0.130*"parole" + -0.127*"du" + 0.112*"sont" + -0.10

In [25]:
lsi.show_topic(8, topn=20)

[('arabic', -0.4034451193802862),
 ('nlp', 0.2934368452795288),
 ('domain', -0.26109461691817776),
 ('lexicon', -0.23909680583390266),
 ('0', -0.15035794497924815),
 ('terms', -0.14177416563946646),
 ('applications', 0.12972419053767675),
 ('entity', -0.12925842883420413),
 ('named', -0.12435832177626265),
 ('chinese', 0.11888435328098947),
 ('lexicons', -0.11743091268468898),
 ('mt', -0.11389644348425909),
 ('dependency', 0.11088695819433446),
 ('relations', 0.11038807326066608),
 ('representation', 0.10541354821936533),
 ('tools', 0.10074511719805448),
 ('question', 0.0999808924339344),
 ('users', 0.09888998611455983),
 ('inference', 0.09794536455095064),
 ('translations', -0.09588008681166743)]

In [26]:
lsi[corpus]

<gensim.interfaces.TransformedCorpus at 0x2a27f02ec88>

In [40]:
topic_representation = lsi.projection.u

In [41]:
lsi.projection.s.shape

(10,)

In [43]:
topic_representation.shape

(1000, 10)

## LSTM from scratch #1 (OK)

### Dataset

In [798]:
import torch
import pandas as pd
from collections import Counter

class Dataset(torch.utils.data.Dataset):
    def __init__(self, words, dictionary, sequence_length=5): # TODO: incorporate dictionary
        self.words = words[:2000]
        self.uniq_words = self.get_uniq_words()
        self.sequence_length = sequence_length

        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}

        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def get_uniq_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        return len(self.words_indexes) - self.sequence_length

    def __getitem__(self, index):
        return (
            torch.tensor(self.words_indexes[index:index+self.sequence_length]),
            torch.tensor(self.words_indexes[index+1:index+self.sequence_length+1]),
        )

### Model architecture

In [799]:
from torch import nn

class Model(nn.Module):
    def __init__(self, dataset, lstm_size=256, emdedding_dim=256, num_layers=2, dropout=0.2):
        super(Model, self).__init__()
        self.lstm_size = lstm_size
        self.embedding_dim = emdedding_dim
        self.num_layers = num_layers
        self.sequence_length = dataset.sequence_length
        self.dropout = dropout

        n_vocab = len(dataset.uniq_words)
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.lstm_size,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=self.dropout,
        )
        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)

        return logits, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers, self.sequence_length, self.lstm_size),
                torch.zeros(self.num_layers, self.sequence_length, self.lstm_size))

### Training

In [800]:
from torch import nn, optim
from torch.utils.data import DataLoader

def train(dataset, model, batch_size=128, max_epochs=3):
    model.train()

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
    )

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(max_epochs):
        state_h, state_c = model.init_state(model.sequence_length)

        for batch, (x, y) in enumerate(dataloader):

            optimizer.zero_grad()

            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            loss = criterion(y_pred.transpose(1, 2), y)

            state_h = state_h.detach()
            state_c = state_c.detach()

            loss.backward()
            optimizer.step()
            
            if batch % 50 == 0:
                print({ 'Epoch': epoch, 'Batch': batch, 'Loss': loss.item() })

### Generation

In [801]:
def generate(dataset, model, text, next_words=100):
    output = text.split(' ')
    model.eval()

    state_h, state_c = model.init_state(model.sequence_length)

    for i in range(0, next_words):
        x = torch.tensor([[dataset.word_to_index[w] for w in output[i:]]])
        y_pred, (state_h, state_c) = model(x, (state_h, state_c))

        last_word_logits = y_pred[0][-1]
        
        p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().numpy()
        word_index = np.random.choice(len(last_word_logits), p=p)
        output.append(dataset.index_to_word[word_index])

    return ' '.join(output)

In [802]:
text = 'in this paper we propose'

dataset = Dataset(words, dct, sequence_length=len(text.split(' ')))
model = Model(dataset)

train(dataset, model, max_epochs=2)

{'Epoch': 0, 'Batch': 0, 'Loss': 6.910434722900391}


KeyboardInterrupt: 

In [783]:
generate(dataset, model, text=text, next_words=10)

tensor([[10,  9,  7,  1, 62]])
299
tensor([[  9,   7,   1,  62, 299]])
674
tensor([[  7,   1,  62, 299, 674]])
209
tensor([[  1,  62, 299, 674, 209]])
316
tensor([[ 62, 299, 674, 209, 316]])
135
tensor([[299, 674, 209, 316, 135]])
667
tensor([[674, 209, 316, 135, 667]])
598
tensor([[209, 316, 135, 667, 598]])
871
tensor([[316, 135, 667, 598, 871]])
14
tensor([[135, 667, 598, 871,  14]])
254


'in this paper we propose good 04 current sources pos an both vocalization set generated'

In [771]:
dataset.index_to_word[62]

'propose'

## LSTM from scratch #2

In [487]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_dim):
        super(LSTM, self).__init__()

        # Number of samples per time step
        self.batch_size = 1

        # Dimension of weight vectors
        self.hidden_dim = hidden_dim

        # Dimension of embedded tensor
        self.embedding_dim = 2

        # Vocabulary size
        self.input_size = input_size

        # Number of time steps
        self.sequence_len = 1
    
        # Initialize embedding layer
        self.embedding = nn.Embedding(self.input_size, self.embedding_dim, padding_idx=0)
    
        # Initialize LSTM Cell
        self.lstm_cell = nn.LSTMCell(self.embedding_dim, self.hidden_dim)
        
    def forward(self, x):
        
        x = torch.tensor(x).to(device).long()

        # batch_size x hidden_size
        hidden_state = torch.zeros(x.size(0), self.hidden_dim)
        cell_state = torch.zeros(x.size(0), self.hidden_dim)
        hidden_state_2 = torch.zeros(x.size(0), self.hidden_dim)
        cell_state_2 = torch.zeros(x.size(0), self.hidden_dim)

        # weights initialization
        torch.nn.init.xavier_normal_(hidden_state)
        torch.nn.init.xavier_normal_(cell_state)
        torch.nn.init.xavier_normal_(hidden_state_2)
        torch.nn.init.xavier_normal_(cell_state_2)

        # From idx to embedding
        out = self.embedding(x)

        # Prepare the shape for LSTMCell
        out = out.view(self.sequence_len, x.size(0), -1)
    
        # Unfolding LSTM
        # Last hidden_state will be used to feed the fully connected neural net
        for i in range(self.sequence_len):
            hidden_state, cell_state = self.lstm_cell_1(out[i], (hidden_state, cell_state))
            hidden_state_2, cell_state_2 = self.lstm_cell_2(hidden_state, (hidden_state_2, cell_state_2))
            
        # Last hidden state is passed through a fully connected neural net
        out = self.fully_connected(hidden_state_2)
    
        return out

In [488]:
# def forward(self, x):

#     # batch_size x hidden_size
#     hidden_state = torch.zeros(x.size(0), self.hidden_dim)
#     cell_state = torch.zeros(x.size(0), self.hidden_dim)
#     hidden_state_2 = torch.zeros(x.size(0), self.hidden_dim)
#     cell_state_2 = torch.zeros(x.size(0), self.hidden_dim)

#     # weights initialization
#     torch.nn.init.xavier_normal_(hidden_state)
#     torch.nn.init.xavier_normal_(cell_state)
#     torch.nn.init.xavier_normal_(hidden_state_2)
#     torch.nn.init.xavier_normal_(cell_state_2)

#     # From idx to embedding
#     out = self.embedding(x)

#     # Prepare the shape for LSTMCell
#     out = out.view(self.sequence_len, x.size(0), -1)
    
#     # Unfolding LSTM
#     # Last hidden_state will be used to feed the fully connected neural net
#     for i in range(self.sequence_len):
#         hidden_state, cell_state = self.lstm_cell_1(out[i], (hidden_state, cell_state))
#         hidden_state_2, cell_state_2 = self.lstm_cell_2(hidden_state, (hidden_state_2, cell_state_2))
        
#     # Last hidden state is passed through a fully connected neural net
#     out = self.fully_connected(hidden_state_2)
    
#     return out

In [489]:
model = LSTM(input_size=5, hidden_dim=10)

In [490]:
model

LSTM(
  (embedding): Embedding(5, 2, padding_idx=0)
  (lstm_cell): LSTMCell(2, 10)
)

In [491]:
n_layers = 1

In [492]:
inp = torch.randn(model.batch_size, model.sequence_len, model.input_size)
hidden_state = torch.randn(n_layers, model.batch_size, model.hidden_dim)
cell_state = torch.randn(n_layers, model.batch_size, model.hidden_dim)
hidden = (hidden_state, cell_state)

In [493]:
inp.shape

torch.Size([1, 1, 5])

In [494]:
hidden_state.shape

torch.Size([1, 1, 10])

In [495]:
cell_state.shape

torch.Size([1, 1, 10])

In [496]:
out = model(inp)



RuntimeError: index out of range: Tried to access index -2 out of table with 4 rows. at C:\Users\builder\AppData\Local\Temp\pip-req-build-0i480kur\aten\src\TH/generic/THTensorEvenMoreMath.cpp:418

In [316]:
input_dim = 5
hidden_dim = 10
n_layers = 1

lstm_layer = nn.LSTM(input_dim, hidden_dim)

batch_size = 1
seq_len = 1

inp = torch.randn(batch_size, seq_len, input_dim)
hidden_state = torch.randn(n_layers, batch_size, hidden_dim)
cell_state = torch.randn(n_layers, batch_size, hidden_dim)
hidden = (hidden_state, cell_state)

In [319]:
hidden

(tensor([[[-1.2030, -1.1694,  0.6893,  0.0536, -0.6025, -0.4385,  0.3766,
            1.4402, -0.6188, -0.2681]]]),
 tensor([[[-1.1527, -0.6507,  0.7959, -2.2271, -0.5886, -1.1727,  0.9629,
            0.2820, -0.3897, -0.5524]]]))

In [309]:
out, hidden = lstm_layer(inp, hidden)

In [310]:
out.shape

torch.Size([1, 1, 10])

### LSTM #3

In [525]:
class LSTM(nn.Module):
    def __init__(self):
        super(LSTM, self).__init__()
        self.lstm1 = nn.LSTMCell(1, 51)
        self.lstm2 = nn.LSTMCell(51, 51)
        self.linear = nn.Linear(51, 1)

    def forward(self, input):
        outputs = []
        h_t = torch.zeros(input.size(0), 51, dtype=torch.double)
        c_t = torch.zeros(input.size(0), 51, dtype=torch.double)
        h_t2 = torch.zeros(input.size(0), 51, dtype=torch.double)
        c_t2 = torch.zeros(input.size(0), 51, dtype=torch.double)

        for input_t in input.split(1, dim=1):
            h_t, c_t = self.lstm1(input_t, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t2)
            outputs += [output]
        return outputs

In [526]:
model = LSTM()

In [527]:
model

LSTM(
  (lstm1): LSTMCell(1, 51)
  (lstm2): LSTMCell(51, 51)
  (linear): Linear(in_features=51, out_features=1, bias=True)
)