# Conditional LSTM

In [1]:
import os
import numpy as np

from pybtex.database import parse_file

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from gensim.parsing.preprocessing import remove_stopwords, preprocess_string
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.ldamodel import LdaModel
from gensim.models.lsimodel import LsiModel
from gensim.corpora.dictionary import Dictionary



In [None]:
bib_data = parse_file('data/anthology+abstracts.bib')

In [None]:
list(bib_data.entries.keys())[-1]

In [None]:
len(list(bib_data.entries.keys()))

In [None]:
bib_data.entries['lieberman-etal-1965-automatic'].fields['year']

In [None]:
for k in bib_data.entries.keys():
    try:
        year = bib_data.entries[k].fields['year']
        abstract = bib_data.entries[k].fields['abstract']
        
        if year > '2015':
            f = open('data/datasets/abstracts_%s.txt' %year, 'a')
            f.write(abstract)
            f.close()
            
    except (KeyError, UnicodeEncodeError): # entries without abstracts are excluded
        pass

In [None]:
# eliminate stop words
def tokenize_input(input):
    # make everything lowercase
    input = input.lower()

    # use tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)

    # filter out stopwords
    final = filter(lambda token: token not in stopwords.words('english'), tokens)
    
    # end result in final
    return " ".join(final)

In [None]:
for year in range(2016,2022):        
        with open('data/datasets/abstracts_%s.txt' %year) as abstr:
            lines = abstr.readlines()
            processed = tokenize_input(lines[0])
            
            # create individual year files
            y = open('data/datasets/%s.txt' %year, 'a')
            y.write(processed)
            
            # create all years file
            a = open('data/datasets/all.txt', 'a')
            a.write(processed)
            
            y.close()
            a.close()

In [None]:
for k in bib_data.entries.keys():
    try:
        year = bib_data.entries[k].fields['year']
        abstract = bib_data.entries[k].fields['abstract']
        
        if year > '2015':
            f = open('data/datasets/abstracts_%s.txt' %year, 'a')
            f.write(abstract + '\n')
            f.close()
            
    except (KeyError, UnicodeEncodeError): # entries without abstracts are excluded
        pass

In [2]:
f = open('data/datasets/abstracts_2016.txt')
text = f.read()
abstracts = text.split('\n')

In [3]:
trimmed = [remove_stopwords(a) for a in abstracts]
lowercase = [a.lower() for a in trimmed]

In [4]:
tokenizer = RegexpTokenizer(r'\w+')
tokenized = [tokenizer.tokenize(a) for a in lowercase]

In [5]:
# total number of tokenized abstracts
len(tokenized)

1755

In [6]:
# example of tokenized abstract
tokenized[2]

['this',
 'article',
 'proposes',
 'universal',
 'dependency',
 'annotation',
 'scheme',
 'mandarin',
 'chinese',
 'including',
 'pos',
 'tags',
 'dependency',
 'analysis',
 'we',
 'identify',
 'cases',
 'idiosyncrasy',
 'mandarin',
 'chinese',
 'difficult',
 'fit',
 'current',
 'schema',
 'mainly',
 'based',
 'descriptions',
 'indo',
 'european',
 'languages',
 'we',
 'discuss',
 'differences',
 'scheme',
 'stanford',
 'chinese',
 'dependencies',
 'chinese',
 'dependency',
 'treebank']

In [7]:
# single word
tokenized[0][4]

'annotation'

In [8]:
words = [word for abstract in tokenized for word in abstract] # could use itertools for improved performance

In [9]:
len(words)

162958

### Dictionary

In [10]:
dct = Dictionary(tokenized)
dct[42]

'newly'

In [11]:
len(dct)

13614

In [12]:
# trim dict at 1000 tokens
dct.filter_extremes(no_below=1, no_above=0.1, keep_n=1000)

In [13]:
len(dct)

1000

In [14]:
dct[42]

'layers'

In [15]:
print(dct)

Dictionary(1000 unique tokens: ['0', '4', 'according', 'annotating', 'case']...)


In [16]:
corpus = [dct.doc2bow(text) for text in tokenized]

### LDA (unordered)

In [None]:
lda = LdaModel(corpus, num_topics=10, id2word=dct)

In [None]:
lda.show_topics()

### Doc2Vec (unordered)

In [None]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenized)]

In [None]:
doc2vec_model = Doc2Vec(documents, vector_size=10, window=2, min_count=1, workers=4)

### LSA (ordered)

In [None]:
lsi = LsiModel(corpus, id2word=dct, num_topics=10, decay=0.5)

In [None]:
lsi.show_topics()

In [None]:
lsi.show_topic(8, topn=20)

In [None]:
lsi[corpus]

In [None]:
topic_representation = lsi.projection.u

In [None]:
lsi.projection.s.shape

In [None]:
topic_representation.shape

## LSTM from scratch #1 (OK)

### Dataset

In [17]:
import torch
import pandas as pd
from collections import Counter

class Dataset(torch.utils.data.Dataset):
    def __init__(self, words, dictionary, sequence_length=5): # TODO: incorporate dictionary
        self.words = words[:2000]
        self.uniq_words = self.get_uniq_words()
        self.sequence_length = sequence_length

        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}

        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def get_uniq_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        return len(self.words_indexes) - self.sequence_length

    def __getitem__(self, index):
        return (
            torch.tensor(self.words_indexes[index:index+self.sequence_length]),
            torch.tensor(self.words_indexes[index+1:index+self.sequence_length+1]),
        )

### Model architecture

In [18]:
from torch import nn

class Model(nn.Module):
    def __init__(self, dataset, lstm_size=256, emdedding_dim=256, num_layers=2, dropout=0.2):
        super(Model, self).__init__()
        self.lstm_size = lstm_size
        self.embedding_dim = emdedding_dim
        self.num_layers = num_layers
        self.sequence_length = dataset.sequence_length
        self.dropout = dropout

        n_vocab = len(dataset.uniq_words)
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.lstm_size,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=self.dropout,
        )
        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)

        return logits, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers, self.sequence_length, self.lstm_size),
                torch.zeros(self.num_layers, self.sequence_length, self.lstm_size))

### Training

In [19]:
from torch import nn, optim
from torch.utils.data import DataLoader

def train(dataset, model, batch_size=128, max_epochs=3):
    model.train()

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
    )

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(max_epochs):
        state_h, state_c = model.init_state(model.sequence_length)

        for batch, (x, y) in enumerate(dataloader):

            optimizer.zero_grad()

            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            loss = criterion(y_pred.transpose(1, 2), y)

            state_h = state_h.detach()
            state_c = state_c.detach()

            loss.backward()
            optimizer.step()
            
            if batch % 50 == 0:
                print({ 'Epoch': epoch, 'Batch': batch, 'Loss': loss.item() })

### Generation

In [20]:
def generate(dataset, model, text, next_words=100):
    output = text.split(' ')
    model.eval()

    state_h, state_c = model.init_state(model.sequence_length)

    for i in range(0, next_words):
        x = torch.tensor([[dataset.word_to_index[w] for w in output[i:]]])
        y_pred, (state_h, state_c) = model(x, (state_h, state_c))

        last_word_logits = y_pred[0][-1]
        
        p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().numpy()
        word_index = np.random.choice(len(last_word_logits), p=p)
        output.append(dataset.index_to_word[word_index])

    return ' '.join(output)

In [None]:
text = 'in this paper we propose'

dataset = Dataset(words, dct, sequence_length=len(text.split(' ')))
model = Model(dataset)

train(dataset, model, max_epochs=2)

{'Epoch': 0, 'Batch': 0, 'Loss': 6.907998085021973}


In [None]:
generate(dataset, model, text=text, next_words=10)

In [None]:
dataset.index_to_word[62]

## LSTM from scratch #2

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_dim):
        super(LSTM, self).__init__()

        # Number of samples per time step
        self.batch_size = 1

        # Dimension of weight vectors
        self.hidden_dim = hidden_dim

        # Dimension of embedded tensor
        self.embedding_dim = 2

        # Vocabulary size
        self.input_size = input_size

        # Number of time steps
        self.sequence_len = 1
    
        # Initialize embedding layer
        self.embedding = nn.Embedding(self.input_size, self.embedding_dim, padding_idx=0)
    
        # Initialize LSTM Cell
        self.lstm_cell = nn.LSTMCell(self.embedding_dim, self.hidden_dim)
        
    def forward(self, x):
        
        x = torch.tensor(x).to(device).long()

        # batch_size x hidden_size
        hidden_state = torch.zeros(x.size(0), self.hidden_dim)
        cell_state = torch.zeros(x.size(0), self.hidden_dim)
        hidden_state_2 = torch.zeros(x.size(0), self.hidden_dim)
        cell_state_2 = torch.zeros(x.size(0), self.hidden_dim)

        # weights initialization
        torch.nn.init.xavier_normal_(hidden_state)
        torch.nn.init.xavier_normal_(cell_state)
        torch.nn.init.xavier_normal_(hidden_state_2)
        torch.nn.init.xavier_normal_(cell_state_2)

        # From idx to embedding
        out = self.embedding(x)

        # Prepare the shape for LSTMCell
        out = out.view(self.sequence_len, x.size(0), -1)
    
        # Unfolding LSTM
        # Last hidden_state will be used to feed the fully connected neural net
        for i in range(self.sequence_len):
            hidden_state, cell_state = self.lstm_cell_1(out[i], (hidden_state, cell_state))
            hidden_state_2, cell_state_2 = self.lstm_cell_2(hidden_state, (hidden_state_2, cell_state_2))
            
        # Last hidden state is passed through a fully connected neural net
        out = self.fully_connected(hidden_state_2)
    
        return out

In [None]:
# def forward(self, x):

#     # batch_size x hidden_size
#     hidden_state = torch.zeros(x.size(0), self.hidden_dim)
#     cell_state = torch.zeros(x.size(0), self.hidden_dim)
#     hidden_state_2 = torch.zeros(x.size(0), self.hidden_dim)
#     cell_state_2 = torch.zeros(x.size(0), self.hidden_dim)

#     # weights initialization
#     torch.nn.init.xavier_normal_(hidden_state)
#     torch.nn.init.xavier_normal_(cell_state)
#     torch.nn.init.xavier_normal_(hidden_state_2)
#     torch.nn.init.xavier_normal_(cell_state_2)

#     # From idx to embedding
#     out = self.embedding(x)

#     # Prepare the shape for LSTMCell
#     out = out.view(self.sequence_len, x.size(0), -1)
    
#     # Unfolding LSTM
#     # Last hidden_state will be used to feed the fully connected neural net
#     for i in range(self.sequence_len):
#         hidden_state, cell_state = self.lstm_cell_1(out[i], (hidden_state, cell_state))
#         hidden_state_2, cell_state_2 = self.lstm_cell_2(hidden_state, (hidden_state_2, cell_state_2))
        
#     # Last hidden state is passed through a fully connected neural net
#     out = self.fully_connected(hidden_state_2)
    
#     return out

In [None]:
model = LSTM(input_size=5, hidden_dim=10)

In [None]:
model

In [None]:
n_layers = 1

In [None]:
inp = torch.randn(model.batch_size, model.sequence_len, model.input_size)
hidden_state = torch.randn(n_layers, model.batch_size, model.hidden_dim)
cell_state = torch.randn(n_layers, model.batch_size, model.hidden_dim)
hidden = (hidden_state, cell_state)

In [None]:
inp.shape

In [None]:
hidden_state.shape

In [None]:
cell_state.shape

In [None]:
out = model(inp)

In [None]:
input_dim = 5
hidden_dim = 10
n_layers = 1

lstm_layer = nn.LSTM(input_dim, hidden_dim)

batch_size = 1
seq_len = 1

inp = torch.randn(batch_size, seq_len, input_dim)
hidden_state = torch.randn(n_layers, batch_size, hidden_dim)
cell_state = torch.randn(n_layers, batch_size, hidden_dim)
hidden = (hidden_state, cell_state)

In [None]:
hidden

In [None]:
out, hidden = lstm_layer(inp, hidden)

In [None]:
out.shape

### LSTM #3

In [None]:
class LSTM(nn.Module):
    def __init__(self):
        super(LSTM, self).__init__()
        self.lstm1 = nn.LSTMCell(1, 51)
        self.lstm2 = nn.LSTMCell(51, 51)
        self.linear = nn.Linear(51, 1)

    def forward(self, input):
        outputs = []
        h_t = torch.zeros(input.size(0), 51, dtype=torch.double)
        c_t = torch.zeros(input.size(0), 51, dtype=torch.double)
        h_t2 = torch.zeros(input.size(0), 51, dtype=torch.double)
        c_t2 = torch.zeros(input.size(0), 51, dtype=torch.double)

        for input_t in input.split(1, dim=1):
            h_t, c_t = self.lstm1(input_t, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t2)
            outputs += [output]
        return outputs

In [None]:
model = LSTM()

In [None]:
model