# Conditional LSTM

In [18]:
import os
import torch
import numpy as np
import pandas as pd

from pybtex.database import parse_file

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from gensim.parsing.preprocessing import remove_stopwords, preprocess_string
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.ldamodel import LdaModel
from gensim.models.lsimodel import LsiModel
from gensim.corpora.dictionary import Dictionary

### Parse bibtex files

In [None]:
bib_data = parse_file('data/anthology+abstracts.bib')

In [None]:
list(bib_data.entries.keys())[-1]

In [None]:
len(list(bib_data.entries.keys()))

In [None]:
bib_data.entries['lieberman-etal-1965-automatic'].fields['year']

In [None]:
for k in bib_data.entries.keys():
    try:
        year = bib_data.entries[k].fields['year']
        abstract = bib_data.entries[k].fields['abstract']
        
        if year > '2015':
            f = open('data/datasets/abstracts_%s.txt' %year, 'a')
            f.write(abstract)
            f.close()
            
    except (KeyError, UnicodeEncodeError): # entries without abstracts are excluded
        pass

In [None]:
# eliminate stop words
def tokenize_input(input):
    # make everything lowercase
    input = input.lower()

    # use tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)

    # filter out stopwords
    final = filter(lambda token: token not in stopwords.words('english'), tokens)
    
    # end result in final
    return " ".join(final)

In [None]:
for year in range(2016,2022):        
        with open('data/datasets/abstracts_%s.txt' %year) as abstr:
            lines = abstr.readlines()
            processed = tokenize_input(lines[0])
            
            # create individual year files
            y = open('data/datasets/%s.txt' %year, 'a')
            y.write(processed)
            
            # create all years file
            a = open('data/datasets/all.txt', 'a')
            a.write(processed)
            
            y.close()
            a.close()

In [None]:
for k in bib_data.entries.keys():
    try:
        year = bib_data.entries[k].fields['year']
        abstract = bib_data.entries[k].fields['abstract']
        
        if year > '2015':
            f = open('data/datasets/abstracts_%s.txt' %year, 'a')
            f.write(abstract + '\n')
            f.close()
            
    except (KeyError, UnicodeEncodeError): # entries without abstracts are excluded
        pass

In [1236]:
f = open('data/datasets/abstracts_2020.txt')
text = f.read()
abstracts_2020 = text.split('\n')

In [1237]:
trimmed = [remove_stopwords(a) for a in abstracts_2020]
lowercase = [a.lower() for a in trimmed]

In [1238]:
tokenizer = RegexpTokenizer(r'\w+')
tokenized = [tokenizer.tokenize(a) for a in lowercase]

In [1239]:
# total number of tokenized abstracts
len(tokenized)

6499

In [1241]:
# example of tokenized abstract
tokenized[:2]

[['the',
  'relatedness',
  'research',
  'articles',
  'patents',
  'court',
  'rulings',
  'web',
  'pages',
  'document',
  'types',
  'calculated',
  'citation',
  'hyperlink',
  'based',
  'approaches',
  'like',
  'co',
  'citation',
  'proximity',
  'analysis',
  'the',
  'main',
  'limitation',
  'citation',
  'based',
  'approaches',
  'documents',
  'receive',
  'little',
  'citations',
  'we',
  'propose',
  'virtual',
  'citation',
  'proximity',
  'vcp',
  'siamese',
  'neural',
  'network',
  'architecture',
  'combines',
  'advantages',
  'co',
  'citation',
  'proximity',
  'analysis',
  'diverse',
  'notions',
  'relatedness',
  'high',
  'recommendation',
  'performance',
  'advantage',
  'content',
  'based',
  'filtering',
  'high',
  'coverage',
  'vcp',
  'trained',
  'corpus',
  'documents',
  'textual',
  'features',
  'real',
  'citation',
  'proximity',
  'ground',
  'truth',
  'vcp',
  'predicts',
  'documents',
  'based',
  'title',
  'abstract',
  'proximit

In [397]:
# single word
tokenized[0][4]

'patents'

In [398]:
words = [word for abstract in tokenized for word in abstract] # could use itertools to improve performance

In [399]:
len(words)

606420

## Conditioned LSTM with Word2Vec embeddings #1

### Dataset

In [551]:
from collections import Counter

class Dataset(torch.utils.data.Dataset):
    def __init__(self, words, sequence_length=5): # TODO: incorporate dictionary
        self.words = words[:2000]
        self.uniq_words = self.get_uniq_words()
        self.sequence_length = sequence_length

        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}

        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def get_uniq_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        return len(self.words_indexes) - self.sequence_length

    def __getitem__(self, index):
        return (
            torch.tensor(self.words_indexes[index:index+self.sequence_length]),
            torch.tensor(self.words_indexes[index+1:index+self.sequence_length+1]),
        )

### Embeddings

In [592]:
f = open('data/datasets/abstracts_2020.txt')
text = f.read()
abstracts = text.split('\n')

In [599]:
words = [word for abstract in tokenized for word in abstract] # to fix

In [1193]:
len(words)

360959

In [553]:
import gensim

w2v_model = gensim.models.Word2Vec(abstracts, min_count=1, vector_size=256, epochs=5)

In [554]:
vectors = w2v_model.wv.vectors[:928]

In [555]:
vectors.shape

(928, 256)

In [556]:
weights = torch.FloatTensor(vectors)

### Model architecture

In [557]:
class Model(nn.Module):
    def __init__(self, dataset, lstm_size=256, emdedding_dim=256, num_layers=2, dropout=0.2):
        super(Model, self).__init__()
        self.lstm_size = lstm_size
        self.embedding_dim = emdedding_dim
        self.num_layers = num_layers
        self.sequence_length = dataset.sequence_length
        self.dropout = dropout

        n_vocab = len(dataset.uniq_words)
        
        self.embedding = nn.Embedding.from_pretrained(weights)
        
        self.lstm = nn.LSTM(
            input_size=self.lstm_size,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=self.dropout,
        )
        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)

        return logits, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers, self.sequence_length, self.lstm_size),
                torch.zeros(self.num_layers, self.sequence_length, self.lstm_size))

### Training

In [558]:
from torch import nn, optim
from torch.utils.data import DataLoader

def train(dataset, model, batch_size=128, max_epochs=3):
    model.train()

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
    )

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.02)

    for epoch in range(max_epochs):
        state_h, state_c = model.init_state(model.sequence_length)

        for batch, (x, y) in enumerate(dataloader):

            optimizer.zero_grad()

            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            loss = criterion(y_pred.transpose(1, 2), y)

            state_h = state_h.detach()
            state_c = state_c.detach()

            loss.backward()
            optimizer.step()
            
            if batch % 50 == 0:
                print({ 'Epoch': epoch, 'Batch': batch, 'Loss': loss.item() })

### Generation

In [576]:
from itertools import cycle, islice

def generate(dataset, model, text, next_words=100):
    output = text.split(' ')
    model.eval()

    state_h, state_c = model.init_state(model.sequence_length)

    for i in range(0, next_words):
        x = torch.tensor([[dataset.word_to_index[w] for w in output[i:]]])        
        y_pred, (state_h, state_c) = model(x, (state_h, state_c))
        last_word_logits = y_pred[0][-1]
        p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().numpy()
        word_index = np.random.choice(len(last_word_logits), p=p)
        output.append(dataset.index_to_word[word_index])

        return ' '.join(output)

In [577]:
input_text = 'in this paper we propose'

dataset = Dataset(words, sequence_length=5)
model = Model(dataset)

train(dataset, model, max_epochs=1)

{'Epoch': 0, 'Batch': 0, 'Loss': 6.8361334800720215}


In [579]:
generate(dataset, model, text=text, next_words=20)

In [580]:
dataset.index_to_word[62]

'documents'

In [449]:
words = words[:2000]

In [472]:
n_vocab = len(dataset.uniq_words)
dataset.uniq_words[810]

'evaluates'

In [590]:
len(dataset.uniq_words)

928

## Conditioned LSTM with Word2Vec embeddings #2

In [3]:
import re
import random
import pickle

import torch.nn.functional as F

In [4]:
# read file
f = open('data/datasets/abstracts_2021.txt')
text = f.read()
abstracts = text.split('\n')

# count of movie plot summaries
len(abstracts)

1735

In [5]:
abstracts = [re.sub("[^a-z', ]", "", a) for a in abstracts[:500]]

In [7]:
random.sample(abstracts, 2)

['he goal of text ranking is to generate an ordered list of texts retrieved from a corpus in response to a query for a particular task lthough the most common formulation of text ranking is search, instances of the task can also be found in many text processing applications his tutorial provides an overview of text ranking with neural network architectures known as transformers, of which  idirectional ncoder epresentations from ransformers is the bestknown example hese models produce high quality results across many domains, tasks, and settings his tutorial, which is based on the preprint of a forthcoming book to be published by organ and  laypool under the ynthesis ectures on uman anguage echnologies series, provides an overview of existing work as a single point of entry for practitioners who wish to deploy transformers for text ranking in realworld applications and researchers who wish to pursue work in this area e cover a wide range of techniques, grouped into two categories transf

### Preprocessing

In [29]:
# create sequences of default length 5 tokens
def create_seq(text, seq_len = 5):
    
    sequences = []

    # if the number of tokens in 'text' is greater than 5
    if len(text.split()) > seq_len:
      for i in range(seq_len, len(text.split())):
        # select sequence of tokens
        seq = text.split()[i-seq_len:i+1]
        # add to the list
        sequences.append(" ".join(seq))

      return sequences

    # if the number of tokens in 'text' is less than or equal to 5
    else:
      
      return [text]

In [30]:
seqs = [create_seq(a) for a in abstracts]

# merge list-of-lists into a single list
seqs = sum(seqs, []) # could use itertools to improve performance

# count of sequences
len(seqs)

60851

In [31]:
# create inputs and targets (x and y)
x = []
y = []

for s in seqs:
  x.append(" ".join(s.split()[:-1]))
  y.append(" ".join(s.split()[1:]))

In [32]:
type(x)

list

In [33]:
# create integer-to-token mapping
int2token = {}
counter = 0

for w in set(" ".join(abstracts).split()):
  int2token[counter] = w
  counter += 1

# create token-to-integer mapping
token2int = {t: a for a, t in int2token.items()}

token2int["the"], int2token[42]

(7081, 'stale')

In [34]:
vocab_size = len(int2token)
vocab_size

7775

In [35]:
def get_integer_seq(seq):
  return [token2int[w] for w in seq.split()]

# convert text sequences to integer sequences
x_int = [get_integer_seq(i) for i in x]
y_int = [get_integer_seq(i) for i in y]

In [36]:
len(x_int),len(y_int)

(60851, 60851)

In [37]:
# delete all sequences not == 5

x_int = list(filter(lambda x: (len(x) == 5), x_int))
y_int = list(filter(lambda y: (len(y) == 5), y_int))

In [38]:
len(x_int),len(y_int)

(60851, 60851)

In [39]:
# convert lists to numpy arrays
x_int = np.array(x_int)
y_int = np.array(y_int)

In [40]:
def get_batches(arr_x, arr_y, batch_size):
         
    # iterate through the arrays
    prv = 0
    for n in range(batch_size, arr_x.shape[0], batch_size):
      x = arr_x[prv:n]
      y = arr_y[prv:n]
      prv = n
      yield x, y

### Model

In [41]:
from torch import nn

class Model(nn.Module):
    
    def __init__(self, n_hidden=256, n_layers=2, drop_prob=0.2, lr=0.01):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        self.emb_layer = nn.Embedding(vocab_size, 256)

        ## define the LSTM
        self.lstm = nn.LSTM(256, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        ## define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        ## define the fully-connected layer
        self.fc = nn.Linear(n_hidden, vocab_size)      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
        
        x = x.long()

        ## pass input through embedding layer
        embedded = self.emb_layer(x)     
        
        ## Get the outputs and the new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)
        
        ## pass through a dropout layer
        out = self.dropout(lstm_output)
        
        #out = out.contiguous().view(-1, self.n_hidden) 
        out = out.reshape(-1, self.n_hidden) 

        ## put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        # if GPU is available
        if (torch.cuda.is_available()):
          hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                    weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        
        # if GPU is not available
        else:
          hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                    weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden

In [46]:
# instantiate the model
model = Model()

model.cpu()

print(model)

Model(
  (emb_layer): Embedding(7776, 256)
  (lstm): LSTM(256, 256, num_layers=2, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=256, out_features=7776, bias=True)
)


In [47]:
def train(model, epochs=10, batch_size=32, lr=0.01, clip=1, print_every=32):
    
    # optimizer
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    
    # loss
    criterion = nn.CrossEntropyLoss()
    
    # push model to CPU
    model.cpu()
    
    counter = 0

    model.train()

    for e in range(epochs):

        # initialize hidden state
        h = model.init_hidden(batch_size)
        
        for x, y in get_batches(x_int, y_int, batch_size):
            counter+= 1
            
            # convert numpy arrays to PyTorch arrays
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            # push tensors to GPU
            inputs, targets = inputs.cpu(), targets.cpu()

            # detach hidden states
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            model.zero_grad()
            
            # get the output from the model
            output, h = model(inputs, h)
            
            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(-1).long())

            # back-propagate error
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(model.parameters(), clip)

            # update weigths
            opt.step()            
            
            if counter % print_every == 0:
            
              print("Epoch: {}/{} -".format(e+1, epochs),
                    "Step: {} -".format(counter),
                    "Loss: {}".format(loss))

In [48]:
train(model, batch_size=32, epochs=5, print_every=50)

Epoch: 1/5 - Step: 50 - Loss: 7.671520233154297
Epoch: 1/5 - Step: 100 - Loss: 7.775917053222656
Epoch: 1/5 - Step: 150 - Loss: 7.7291579246521


KeyboardInterrupt: 

In [1160]:
# predict next token
def predict(model, tkn, h=None):
         
  # tensor inputs
  x = np.array([[token2int[tkn]]])
  inputs = torch.from_numpy(x)
  
  # push to CPU
  inputs = inputs.cpu()

  # detach hidden state from history
  h = tuple([each.data for each in h])

  # get the output of the model
  out, h = model(inputs, h)

  # get the token probabilities
  p = F.softmax(out, dim=1).data

  p = p.cpu()

  p = p.numpy()
  p = p.reshape(p.shape[1],)

  # get indices of top 3 values
  top_n_idx = p.argsort()[-3:][::-1]

  # randomly select one of the three indices
  sampled_token_index = top_n_idx[random.sample([0,1,2],1)[0]]

  # return the encoded value of the predicted char and the hidden state
  return int2token[sampled_token_index], h

In [1168]:
# function to generate text
def generate(model, size, prompt='in this paper'):
        
    # push to CPU
    model.cpu()
    
    model.eval()

    # batch size is 1
    h = model.init_hidden(1)

    toks = prompt.split()

    # predict next token
    for t in prompt.split():
      token, h = predict(model, t, h)
    
    toks.append(token)

    # predict subsequent tokens
    for i in range(size-1):
        token, h = predict(model, toks[-1], h)
        toks.append(token)

    return ' '.join(toks)

In [1233]:
generate(model, 50, prompt='temporal word embeddings')

'temporal word embeddings and the corresponding and a new training and the language understanding and a new training models and the corresponding the best neural network on three the same datasets to the model to a novel model can effectively a new training models to a novel reasoning the model can achieve the'

### Embeddings

In [10]:
import gensim
from gensim.models import KeyedVectors

In [11]:
import itertools

words_abstracts = [a.split(' ') for a in abstracts] # could use itertools to improve performance
words = list(itertools.chain(*words_abstracts))

In [12]:
len(words_abstracts), len(words)

(500, 65068)

In [1330]:
words_abstracts[:2]

[['espite',
  'the',
  'recent',
  'successes',
  'of',
  'transformerbased',
  'models',
  'in',
  'terms',
  'of',
  'effectiveness',
  'on',
  'a',
  'variety',
  'of',
  'tasks,',
  'their',
  'decisions',
  'often',
  'remain',
  'opaque',
  'to',
  'humans',
  'xplanations',
  'are',
  'particularly',
  'important',
  'for',
  'tasks',
  'like',
  'offensive',
  'language',
  'or',
  'toxicity',
  'detection',
  'on',
  'social',
  'media',
  'because',
  'a',
  'manual',
  'appeal',
  'process',
  'is',
  'often',
  'in',
  'place',
  'to',
  'dispute',
  'automatically',
  'flagged',
  'content',
  'n',
  'this',
  'work,',
  'we',
  'propose',
  'a',
  'technique',
  'to',
  'improve',
  'the',
  'interpretability',
  'of',
  'these',
  'models,',
  'based',
  'on',
  'a',
  'simple',
  'and',
  'powerful',
  'assumption',
  'a',
  'post',
  'is',
  'at',
  'least',
  'as',
  'toxic',
  'as',
  'its',
  'most',
  'toxic',
  'span',
  'e',
  'incorporate',
  'this',
  'assumpti

In [13]:
w2v_model = gensim.models.Word2Vec(words_abstracts, min_count=1, vector_size=256, epochs=50)

In [20]:
emb_vectors = w2v_model.wv.vectors

In [21]:
emb_vectors.save('vectors.kv')

AttributeError: 'numpy.ndarray' object has no attribute 'save'

In [23]:
emb_vectors.shape

(7776, 256)

In [24]:
emb_tensors = torch.FloatTensor(emb_vectors)

In [25]:
from torchtext.vocab import Vectors
from torchtext import data

In [26]:
def tokenize_fct(text):  # create a tokenizer function
    return [tok.text for tok in nlp.tokenizer(text)]

In [27]:
text_field = data.Field(sequential=True, use_vocab=True,
                        lower=True, tokenize=tokenize_fct)

In [1350]:
test_embs = Vectors(name='vectors.kv')

  0%|                                                             | 0/26056 [00:00<?, ?it/s]


RuntimeError: Vector for token b'\x80\x04\x95\x05\x00\x01\x00\x00\x00\x00\x00\x8c\x1agensim.models.keyedvectors\x94\x8c\x0cKeyedVectors\x94\x93\x94)\x81\x94}\x94(\x8c\x0bvector_size\x94M\x00\x01\x8c\x0cindex_to_key\x94]\x94(\x8c\x03the\x94\x8c\x02of\x94\x8c\x03and\x94\x8c\x00\x94\x8c\x02to\x94\x8c\x01a\x94\x8c\x02in\x94\x8c\x03for\x94\x8c\x02on\x94\x8c\x04that\x94\x8c\x02we\x94\x8c\x02is\x94\x8c\x01e\x94\x8c\x04with\x94\x8c\x04this\x94\x8c\x06models\x94\x8c\x05model\x94\x8c\x02as\x94\x8c\x04from\x94\x8c\x02by\x94\x8c\x08language\x94\x8c\x03are\x94\x8c\x02an\x94\x8c\x01n\x94\x8c\x02he\x94\x8c\x03our\x94\x8c\x04data\x94\x8c\x04task\x94\x8c\x05which\x94\x8c\x02be\x94\x8c\x03can\x94\x8c\x05tasks\x94\x8c\x0bperformance\x94\x8c\x01,\x94\x8c\x02or\x94\x8c\x07results\x94\x8c\x02ur\x94\x8c\x05using\x94\x8c\x03his\x94\x8c\x04show\x94\x8c\x04have\x94\x8c\x08learning\x94\x8c\x08training\x94\x8c\x03has\x94\x8c\x03two\x94\x8c\x05based\x94\x8c\x04text\x94\x8c\x04such\x94\x8c\x02it\x94\x8c\x07propose\x94\x8c\tlanguages\x94\x8c\x08approach\x94\x8c\x07dataset\x94\x8c\tdifferent\x94\x8c\x03new\x94\x8c\x0binformation\x94\x8c\x05their\x94\x8c\x07present\x94\x8c\x02at\x94\x8c\x04more\x94\x8c\x03not\x94\x8c\x05rabic\x94\x8c\x06method\x94\x8c\x05these\x94\x8c\x04both\x94\x8c\x06neural\x94\x8c\x06paper,\x94\x8c\x03use\x94\x8c\x07methods\x94\x8c\x04work\x94\x8c\x04been\x94\x8c\x07between\x94\x8c\x04also\x94\x8c\rstateoftheart\x94\x8c\x05paper\x94\x8c\x01o\x94\x8c\x04over\x94\x8c\x06shared\x94\x8c' has 0 dimensions, but previously read vectors have None dimensions. All vectors must have the same number of dimensions.

In [1348]:
text_field.build_vocab(words, vectors=test_embs)

In [1349]:
print(f'text vocab size {len(text_field.vocab)}')

text vocab size 30


#### Gensim

In [1248]:
dct = Dictionary(words_abstracts)
dct[42]

'its'

In [1282]:
len(dct)

7776

In [1250]:
print(dct)

Dictionary(7776 unique tokens: ['a', 'according', 'analysis', 'and', 'appeal']...)


In [589]:
corpus = [dct.doc2bow(text) for text in tokenized]

### LDA (unordered)

In [None]:
lda = LdaModel(corpus, num_topics=10, id2word=dct)

In [None]:
lda.show_topics()

### Doc2Vec (unordered)

In [None]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenized)]

In [None]:
doc2vec_model = Doc2Vec(documents, vector_size=10, window=2, min_count=1, workers=4)

### LSA (ordered)

In [None]:
lsi = LsiModel(corpus, id2word=dct, num_topics=10, decay=0.5)

In [None]:
lsi.show_topics()

In [None]:
lsi.show_topic(8, topn=20)

In [None]:
lsi[corpus]

In [None]:
topic_representation = lsi.projection.u

In [None]:
lsi.projection.s.shape

In [1251]:
topic_representation.shape

NameError: name 'topic_representation' is not defined

In [49]:
class Model(nn.Module):
    
    def __init__(self, n_hidden=256, n_layers=2, drop_prob=0.2, lr=0.01):
        super().__init__()

        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        vocab_size = emb_vectors.shape[0]
        
        self.emb_layer = nn.Embedding.from_pretrained(emb_tensors)

        ## define the LSTM
        self.lstm = nn.LSTM(256, n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        ## define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        ## define the fully-connected layer
        self.fc = nn.Linear(n_hidden, vocab_size)      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
        
        x = x.long()

        ## pass input through embedding layer
        embedded = self.emb_layer(x)     
        
        ## Get the outputs and the new hidden state from the lstm
        lstm_output, hidden = self.lstm(embedded, hidden)
        
        ## pass through a dropout layer
        out = self.dropout(lstm_output)
        
        #out = out.contiguous().view(-1, self.n_hidden) 
        out = out.reshape(-1, self.n_hidden) 

        ## put "out" through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        # if GPU is available
        if (torch.cuda.is_available()):
          hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                    weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        
        # if GPU is not available
        else:
          hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                    weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden