In [None]:
import numpy as np
from nltk.tokenize import sent_tokenize,word_tokenize
from tensorflow import keras
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, Activation, Embedding, LSTM
from keras import optimizers

# Question 2

In [None]:
import nltk
nltk.download('punkt')

In [None]:
with open('jane_austen.txt','r') as f:
    data = f.read()
    f.close()
data = data.lower()

In [None]:
sent_list = sent_tokenize(data)

In [None]:
sent_words=[]
words={}
mod_sent_list=[]
for s in sent_list:
    temp = word_tokenize(s)
    w_list=['sss']
    for w in temp:
        if w.isalpha() or (('-' in w) and len(w)>1):
            w_list.append(w)
            if w not in words:
                words[w] = 1
            else:
                words[w]+=1
    w_list.append('eee')
    sent_words.append(w_list)
    mod_sent_list.append(' '.join(w_list))
mod_corpus = ' '.join(mod_sent_list)
words['sss']=len(sent_list)
words['eee']=len(sent_list)

In [None]:
tokens = word_tokenize(mod_corpus)

In [None]:
def train_test_split(train_split, sent_list):
    l = np.shape(sent_list)[0]
    tr_ind = int(train_split*l)
    train_data = sent_list[:tr_ind]
    test_data = sent_list[tr_ind:]
    return train_data, test_data

In [None]:
train_data, test_data = train_test_split(0.8, mod_sent_list)

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokens)
N = len(tokenizer.word_index) + 1

In [None]:
print(train_data[1])
tokenizer.texts_to_sequences([train_data[1]])

In [None]:
input_sequences = []
for s in train_data:
    token_num = tokenizer.texts_to_sequences([s])[0]
    for i in range(0, len(token_num)):
            n_gram_sequence = token_num[:i+1]
            input_sequences.append(n_gram_sequence)
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
label = to_categorical(label, num_classes=N)

In [None]:
def LSTM_model(N,max_sequence_len):
    model = Sequential()
    model.add(Embedding(N, 10, input_length=max_sequence_len-1))
#     model.add(LSTM(150, return_sequences = True))
    model.add(LSTM(100))
    model.add(Dense(N, activation='softmax'))
    
    adam = optimizers.Adam(lr = 0.1)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
    return model

def vanilla_RNN(N,max_sequence_len):
    model = Sequential()
    model.add(Embedding(N, 30, input_length=max_sequence_len-1))
    model.add(SimpleRNN(50, input_shape = (30,1)))
    model.add(Dense(N, activation='softmax'))
    
    adam = optimizers.Adam(lr = 0.001)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
    return model

In [None]:
rnn = vanilla_RNN(N,max_sequence_len)
lstm = LSTM_model(N,max_sequence_len)

rnn.fit(predictors[:1000,:],label[:1000,:],epochs=10)
lstm.fit(predictors[:1000,:],label[:1000,:],epochs=10)

Since the training was taking too much time, RNN and LSTM model were run only for 1000 sentences.

In [None]:
rnn_text=['are']
while(True):
    token_num = tokenizer.texts_to_sequences([rnn_text])[0]
    token_num = pad_sequences([token_num], maxlen=max_sequence_len-1, padding='pre')
    pred = rnn.predict_classes(token_num, verbose=1)
    for w, ind in tokenizer.word_index.items():
        if ind == pred:
            rnn_text.append(w)
            break
    if w == 'eee':
        break
lstm_text=['sss']
while(True):
    token_num = tokenizer.texts_to_sequences(lstm_text)[0]
    token_num = pad_sequences([token_num], maxlen=max_sequence_len-1, padding='pre')
    pred = lstm.predict_classes(token_num, verbose=0)
    for w, ind in tokenizer.word_index.items():
        if ind == pred:
            lstm_text.append(w)
            break
    if w == 'eee':
        break

In [66]:
print('Text generated from RNN: ',rnn_text)
print('Text generated from LSTM: ',lstm_text)

Text generated from RNN:  ['are', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']
Text generated from LSTM:  ['sss', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of', 'of']


Since the model was only run over 1000 sentences, the generated text is biased towards the 'the' and 'of' for the models RNN and LSTM respectively.

In [65]:
rnn_perp = np.exp(rnn.history.history['loss'][-1])
lstm_perp = np.exp(lstm.history.history['loss'][-1])

print('Perplexity on vanilla RNN model: ',rnn_perp)
print('Perplexity on LSTM: ',lstm_perp)

Perplexity on vanilla RNN model:  253.93716074496
Perplexity on LSTM:  262.9122704384553


The perplexity is calculated as $e^{loss of the model}$

Neural model tends to behave better as it can assert long distance relationship between the words whereas N-grams can only do that upto some extent. Since the model in this assignment wasn't trained completely, the power of the RNN and LSTM is not visible.

Reference:
    https://medium.com/@shivambansal36/language-modelling-text-generation-using-lstms-deep-learning-for-nlp-ed36b224b275
Collaborated with Prateek Chennuri(16110042)