## Level 2 - Word Prediction using LSTM

Follow the same steps as in Char Prediction (Level 1) but at the word level than at the Char Level. 

### Importing Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

from sklearn.model_selection import train_test_split

# KERAS API
import keras
# Sequence to attain Padding
from keras.preprocessing import sequence
# Importing RNN's LSTM
from keras.layers import LSTM, Dense
from keras.layers import Embedding
# Applying Sequential algorithm to model
from keras.models import Sequential


# NLTK PACKAGE
import nltk
# Tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize, PunktSentenceTokenizer, RegexpTokenizer
# Stemming and Lemmatizing
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Stopwords
from nltk.corpus import stopwords, state_union

Using TensorFlow backend.


### Storing the Document

In [2]:
file = open('AliceInWonderland').read()

#### Words Tokenizer

In [3]:
words = word_tokenize(file)

In [4]:
len(words)

35585

#### Calculating number of Unique Letters in the document

In [5]:
unique_words = list(set(words))
unique_output_values = len(unique_words)
unique_output_values

4054

#### Conversions

In [6]:
word_to_int = dict(zip(unique_words, [i for i in range(len(unique_words))]))
int_to_word = dict(zip([i for i in range(len(unique_words))],unique_words))

### METHODS

In [7]:
''' SLIDING FUNCTION: Slides over the input text file word by word'''

def generate_word_Dataset(data, slide):
    
    x = []
    y = []
    
    for index in range(len(data) - slide):
        x.append([word for word in data[index:index+slide]])
        y.append(data[index+slide:index+slide+1])
        
    return x,y

In [8]:
''' WORD TO INT CONVERSION FUNCTION: Converts words dataset to int dataset '''

def word_Dataset_to_int_Dataset(x,y, word_to_int):
    
    input_to_int = []
    output_to_int = []
    
    for index in range(len(x)):
        input_to_int.append([word_to_int[word] for word in x[index]])
        output_to_int.append([word_to_int[word] for word in y[index]])
    
    return input_to_int, output_to_int

In [9]:
''' (BACK) INT TO WORD CONVERSION FUNCTION: Accepts y(i.e. List of lists) '''

def int_Dataset_to_word_Dataset(y, int_to_word):
    
    back_to_word = []

    for index in range(len(y)):
        back_to_word.append([int_to_word[y[index][0]]])
        
    return back_to_word

In [10]:
''' INTIALIZATION FUNCTION: Accepts tokenized words, slide, list of unique words from the doc '''

def initialize(data, slide, word_to_int):
    
    word_Dataset = generate_word_Dataset(data, slide)
    int_Dataset = word_Dataset_to_int_Dataset(word_Dataset[0], word_Dataset[1], word_to_int)
    
    # INPUT: e.g. [[12,21,34], [12,33,41], ...] - List of Lists
    seqInput = int_Dataset[0]
    
    # OUTPUT: e.g. flatten([[12],[24],[2],[5] ...] - List of Lists = [12,24,2,5....]
    seqOutput = list(np.array(int_Dataset[1]).flatten())
    
    seqInput_RESHAPED = np.array(seqInput).reshape(len(seqInput), slide, 1)
    
    return seqInput_RESHAPED, seqOutput

### Initializing

In [11]:
DATA_SET = initialize(words, 100, word_to_int)

X = DATA_SET[0]
Y = DATA_SET[1]

In [12]:
''' X=(35485, 100, 1) 

    Number of samples = 35485
    Number of inputs  = 100 (Word1, Word2...., Word100)
               Output = 1 (Word101th)
'''

X.shape

(35485, 100, 1)

### Defining Paramters

In [13]:
num_words = 20000
batch_size = 32
epochs = 2
num_classes = unique_words

### Training and Testing units

In [14]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.01, random_state=1)

In [15]:
print("X: - TOTAL WORDS =", len(X), "TRAIN=", x_train.shape, "TEST=", x_test.shape,"\nY: - TOTAL WORDS =", len(Y), "TRAIN=", len(y_train), "TEST=",len(y_test))

X: - TOTAL WORDS = 35485 TRAIN= (35130, 100, 1) TEST= (355, 100, 1) 
Y: - TOTAL WORDS = 35485 TRAIN= 35130 TEST= 355


### One-Hot-Encoding

In [16]:
y_train_oneHotEncoded = keras.utils.to_categorical(y_train, num_classes=unique_output_values)
y_test_oneHotEncoded = keras.utils.to_categorical(y_test, num_classes=unique_output_values)

In [17]:
y_test_oneHotEncoded.shape

(355, 4054)

### Model

In [19]:
# Model Architecture:  Consists of 2 LSTM Layers and a Output Dense Layer

#       t0 - t1 - t2 - t3 ------ t99    = Sample 1                
#       t0 - t1 - t2 - t3 ------ t99    = Sample 2
#                                         ...
#       t0 - t1 - t2 - t3 ------ t99    = Sample 35485 

model = Sequential()
model.add(LSTM(64, input_shape=(x_train.shape[1], x_train.shape[2]), return_sequences = True))
model.add(LSTM(128))
model.add(Dense(unique_output_values, activation="sigmoid"))

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

model.fit(x_train, y_train_oneHotEncoded, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test_oneHotEncoded))

Train on 35130 samples, validate on 355 samples
Epoch 1/1


<keras.callbacks.History at 0x7f7e0b945e80>

##### Loading Dropouts

In [21]:
#model.add(Dropout(32, input_shape=(x_train.shape[1], x_train.shape[2]))

### Predictions

In [22]:
predict = model.predict(x_test)

### Accuracy

In [23]:
evaluate = model.evaluate(x_test, y_test_oneHotEncoded)



In [24]:
accuracy = evaluate[1]
accuracy*100

5.6338028169014089

### TEST INPUT

In [25]:
test_input = file[:5000]

In [26]:
len(word_tokenize(test_input))

1066

In [27]:
test_input_seqIn_withChar,test_input_seqOut_withChar = generate_word_Dataset(word_tokenize(test_input), 100)

In [28]:
'''Converting char input sequence to Integer'''
test_input_seqIn = []

for i in range(len(test_input_seqIn_withChar)):
    test_input_seqIn.append([word_to_int[letter] for letter in test_input_seqIn_withChar[i]])

In [29]:
'''Reshaping seqIn sample'''

test_input_seqIn_reshape = np.array(test_input_seqIn).reshape(np.array(test_input_seqIn).shape[0], np.array(test_input_seqIn).shape[1], 1)

In [30]:
test_input_seqIn_reshape.shape

(966, 100, 1)

In [31]:
predictions = model.predict_classes(test_input_seqIn_reshape)



#### Storing inputs and outputs in a proper string

In [33]:
input = []

for i in range(len(test_input_seqIn_withChar)):
    input.append(''.join(test_input_seqIn_withChar[i]))

In [34]:
output = []

for i in predictions:
    output.append(int_to_word[i])

### OUTPUT

In [35]:
input[0]

'ProjectGutenberg’sAlice’sAdventuresinWonderland,byLewisCarrollThiseBookisfortheuseofanyoneanywhereatnocostandwithalmostnorestrictionswhatsoever.Youmaycopyit,giveitawayorre-useitunderthetermsoftheProjectGutenbergLicenseincludedwiththiseBookoronlineatwww.gutenberg.orgTitle:Alice’sAdventuresinWonderlandAuthor:LewisCarrollPostingDate:June25,2008[EBook#11]ReleaseDate:March,1994LastUpdated:October6,2016Language:EnglishCharactersetencoding:UTF-8***'

In [36]:
output[0]

'the'

In [37]:
''.join(output)

'thethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethethe