In [3]:
#Importing necessary libraries

import numpy as np
from nltk.tokenize import RegexpTokenizer
from keras.models import Sequential, load_model
from keras.layers import LSTM
from tensorflow.keras.layers import Dense, Activation
from keras.optimizers import RMSprop
import pickle
import heapq

**Dataset Information**

We have chosen 'The Adventures of Sherlock Holmes' by Sir Arthur Conan Doyle as our primary dataset.


'SHERLOCK.txt', the text file from which we are extracting words, contains the entire transcript of the novel in plain-text form without formatting.


Datasets like this are useful for testing text-prediction software because they contain no slang, slurred words or shorthand that might throw off the neural network. The consistent use of British English also keeps the likelihood of grammatical errors to a minimum, as it is a widely recognized written standard across the world.

In [4]:
#Loading the dataset

path = 'SHERLOCK.txt'
#Read inside a context manager so the file handle is closed as soon as the text is in memory.
#NOTE(review): the platform default encoding is used, as before; pass encoding=... explicitly if the file is read on a machine with a different default.
with open(path) as source_file:
    text = source_file.read().lower()   #Lower-case everything so the vocabulary is case-insensitive
print('Text length:', len(text))        #Number of characters (including whitespace)

Text length: 581889


In [5]:
#Tokenise the corpus into words, dropping punctuation and other special characters

tokenizer = RegexpTokenizer(r'\w+')  #Each token is a maximal run of word characters (\w+); everything else acts as a separator
words = tokenizer.tokenize(text)     #Tokens of the lower-cased corpus, in reading order
print(words[:5])                     #Preview the first five tokens

['project', 'gutenberg', 's', 'the', 'adventures']


In [None]:
#Vocabulary: the distinct tokens of the corpus, as returned by np.unique
unique_words = np.unique(words)
#Reverse lookup from a token to its position in unique_words
unique_word_index = {token: position for position, token in enumerate(unique_words)}

In [None]:
#Number of preceding words the model sees when predicting the next one
WORD_LENGTH = 5

#Slide a window of WORD_LENGTH words over the corpus: each window is one sample,
#and the word immediately following the window is its target.
prev_words = [words[start:start + WORD_LENGTH] for start in range(len(words) - WORD_LENGTH)]
next_words = [words[start + WORD_LENGTH] for start in range(len(words) - WORD_LENGTH)]

#Show the first (context, target) pair
print(prev_words[0])
print(next_words[0])

['project', 'gutenberg', 's', 'the', 'adventures']
of


In [None]:
#One-hot encode the samples.
#X has shape (samples, WORD_LENGTH, vocabulary) and Y has shape (samples, vocabulary);
#both start as all-False boolean arrays.
X = np.zeros((len(prev_words), WORD_LENGTH, len(unique_words)), dtype=bool)
Y = np.zeros((len(next_words), len(unique_words)), dtype=bool)

for row, (context, target) in enumerate(zip(prev_words, next_words)):
    for position, token in enumerate(context):
        X[row, position, unique_word_index[token]] = True   #Mark the vocabulary slot of the word at this position
    Y[row, unique_word_index[target]] = True                #Mark the vocabulary slot of the word to predict

In [None]:
print(X[0, 0])   #One-hot row for the first word of the first sample

[False False False ... False False False]


**Building the model**

In [None]:
#Network: one LSTM layer over the word window, then a dense layer with one unit
#per vocabulary word, turned into a probability distribution by softmax.
model = Sequential([
    LSTM(128, input_shape=(WORD_LENGTH, len(unique_words))),  #128 LSTM units; input is one window of one-hot rows
    Dense(len(unique_words)),                                 #One score per vocabulary word
    Activation('softmax'),                                    #Scores -> probabilities
])

**Training & model creation**

In [None]:
#RMSprop optimizer with a learning rate of 0.01.
#The keyword is spelled `learning_rate`: the old `lr` alias is deprecated and rejected by current Keras releases.
optimizer = RMSprop(learning_rate=0.01)
#Categorical cross-entropy matches the one-hot targets in Y; accuracy is tracked as an extra metric
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
#Train for 2 epochs, holding out 5% of the samples for validation; keep only the per-epoch metrics dict
history = model.fit(X, Y, validation_split=0.05, batch_size=128, epochs=2, shuffle=True).history

Epoch 1/2
Epoch 2/2


**Saving and reloading the model for evaluation**

In [None]:
#Saving our trained model and its training history for future use
model.save('keras_next_word_model.h5')
with open("history.p", "wb") as history_file:   #Context manager guarantees the pickle is flushed and closed before it is read back
    pickle.dump(history, history_file)

#Reloading both artefacts from disk
model = load_model('keras_next_word_model.h5')
#NOTE: unpickling can execute arbitrary code - only load a history file that this notebook produced
with open("history.p", "rb") as history_file:
    history = pickle.load(history_file)

In [None]:
#Inspect the per-epoch metrics recorded by model.fit (reloaded from history.p in the previous cell)
history

{'accuracy': [0.10810628533363342, 0.1470426619052887],
 'loss': [6.00450325012207, 5.780788421630859],
 'val_accuracy': [0.10161112993955612, 0.10637129098176956],
 'val_loss': [7.069894313812256, 7.935789108276367]}

**Setting up prediction**

In [None]:
def prepare_input(text, verbose=True):
    """One-hot encode a whitespace-separated phrase into a single model input.

    Parameters
    ----------
    text : str
        Phrase to encode. It is split on whitespace; if it holds more than
        WORD_LENGTH words, only the last WORD_LENGTH (the ones nearest the
        word to be predicted) are used.
    verbose : bool, default True
        When True, print each word as it is encoded (the original behaviour).

    Returns
    -------
    numpy.ndarray
        Float array of shape (1, WORD_LENGTH, len(unique_words)). Row ``t``
        holds a 1 in the vocabulary slot of the t-th word. Rows for words
        missing from the vocabulary, and rows past the end of a short
        phrase, stay all-zero.
    """
    tokens = text.split()[-WORD_LENGTH:]                    #Never index past the model's input window
    x = np.zeros((1, WORD_LENGTH, len(unique_words)))
    for t, word in enumerate(tokens):
        if verbose:
            print(word)
        index = unique_word_index.get(word)                 #None for out-of-vocabulary words
        if index is not None:
            x[0, t, index] = 1
    return x

prepare_input("It is not a lack".lower())                   #Try the encoder on a five-word phrase, lower-cased like the training text

it
is
not
a
lack


array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

In [None]:
def sample(preds, top_n=3):
    """Return the indices of the ``top_n`` largest entries of ``preds``.

    Parameters
    ----------
    preds : array-like of float
        Scores or probabilities, one per candidate.
    top_n : int, default 3
        How many indices to return.

    Returns
    -------
    list of int
        Indices into ``preds``, ordered from highest to lowest score. Ties
        keep the lower index first. Fewer than ``top_n`` indices are
        returned when ``preds`` is shorter than ``top_n``.
    """
    preds = np.asarray(preds).astype('float64')
    # The ranking is taken on the raw values. The former log -> exp -> renormalise
    # round trip did not change the order, but it warned on zero probabilities
    # and produced NaN for an all-zero vector.
    return heapq.nlargest(top_n, range(len(preds)), preds.take)

In [None]:
def predict_completions(text, n=3):
    """Suggest the ``n`` most likely next words for ``text``.

    ``text`` is encoded with ``prepare_input``, scored by ``model``, and the
    best-scoring vocabulary indices from ``sample`` are mapped back to words
    in ``unique_words``. An empty string short-circuits and yields the
    string ``"0"`` instead of a list.
    """
    if text == "":
        return "0"
    encoded = prepare_input(text)
    probabilities = model.predict(encoded, verbose=0)[0]   #predict returns one row per input; only one input is passed
    return [unique_words[position] for position in sample(probabilities, n)]

**Text Prediction**

In [None]:
#Sentence 1

q=input("Enter correct sentence: ")
#Non-interactive alternative:
#q =  "Your life will never be the same again"
print("correct sentence: ",q)
#Keep only as many leading words as the model's input window holds (WORD_LENGTH), so this stays in sync with the model
seq = " ".join(tokenizer.tokenize(q.lower())[:WORD_LENGTH])
print("Sequence: ",seq)
print("next possible words: ", predict_completions(seq, 5))   #Top 5 suggestions

Enter correct sentence: Your life will never be the same again
correct sentence:  Your life will never be the same again
Sequence:  your life will never be
your
life
will
never
be
next possible words:  ['a', 'the', 'of', 'so', 'very']


In [None]:
#Sentence 2

q=input("Enter correct sentence: ")
#Non-interactive alternative:
#q =  "As I glanced at the"
print("correct sentence: ",q)
#Keep only as many leading words as the model's input window holds (WORD_LENGTH), so this stays in sync with the model
seq = " ".join(tokenizer.tokenize(q.lower())[:WORD_LENGTH])
print("Sequence: ",seq)
print("next possible words: ", predict_completions(seq, 5))   #Top 5 suggestions

Enter correct sentence: As I glanced at the
correct sentence:  As I glanced at the
Sequence:  as i glanced at the
as
i
glanced
at
the
next possible words:  ['door', 'time', 'and', 'house', 'corner']


In [None]:
#Sentence 3

q=input("Enter correct sentence: ")
#Non-interactive alternative:
#q =  "What could be the reason of his overpowering terror"
print("correct sentence: ",q)
#Keep only as many leading words as the model's input window holds (WORD_LENGTH), so this stays in sync with the model
seq = " ".join(tokenizer.tokenize(q.lower())[:WORD_LENGTH])
print("Sequence: ",seq)
print("next possible words: ", predict_completions(seq, 5))   #Top 5 suggestions

Enter correct sentence: What could be the reason of his overpowering terror
correct sentence:  What could be the reason of his overpowering terror
Sequence:  what could be the reason
what
could
be
the
reason
next possible words:  ['of', 'to', 'that', 'i', 'and']


**Creating a Function for Text Prediction**

In [None]:
def text_pred(sentence=None, n=5):
    """Print next-word suggestions for a sentence.

    Parameters
    ----------
    sentence : str, optional
        Sentence to complete. When omitted, the user is prompted for one,
        so calling ``text_pred()`` behaves as before.
    n : int, default 5
        Number of suggestions to print.

    The sentence is lower-cased and tokenised with ``tokenizer``. Its first
    WORD_LENGTH tokens form the sequence handed to ``predict_completions``.
    Nothing is returned; the sentence, the sequence and the suggestions are
    printed.
    """
    q = input("Enter correct sentence: ") if sentence is None else sentence
    print("Correct Sentence: ", q)
    # Slice by WORD_LENGTH rather than a literal 5 so the sequence always fits the model's input window
    seq = " ".join(tokenizer.tokenize(q.lower())[:WORD_LENGTH])
    print("Sequence: ", seq)
    print("Next Possible Words: ", predict_completions(seq, n))

In [None]:
text_pred()

Enter correct sentence: Over last few years I
Correct Sentence:  Over last few years I
Sequence:  over last few years i
over
last
few
years
i
Next Possible Words:  ['have', 'had', 'was', 'am', 'found']


In [None]:
text_pred()

Enter correct sentence: Over last few years I
Correct Sentence:  Over last few years I
Sequence:  over last few years i
over
last
few
years
i
Next Possible Words:  ['had', 'have', 'was', 'could', 'think']


In [None]:
text_pred()

Enter correct sentence: I knew little of my
Correct Sentence:  I knew little of my
Sequence:  i knew little of my
i
knew
little
of
my
Next Possible Words:  ['own', 'that', 'mind', 'room', 'house']


In [None]:
text_pred()

Enter correct sentence: I should have thought a
Correct Sentence:  I should have thought a
Sequence:  i should have thought a
i
should
have
thought
a
Next Possible Words:  ['to', 'very', 'little', 'of', 'in']


In [None]:
text_pred()

Enter correct sentence: How do I know that
Correct Sentence:  How do I know that
Sequence:  how do i know that
how
do
i
know
that
Next Possible Words:  ['i', 'it', 'he', 'you', 'the']


In [None]:
text_pred()

Enter correct sentence: my wife has given her
Correct Sentence:  my wife has given her
Sequence:  my wife has given her
my
wife
has
given
her
Next Possible Words:  ['to', 'the', 'a', 'his', 'her']


In [None]:
text_pred()

Enter correct sentence: He chuckled to himself and
Correct Sentence:  He chuckled to himself and
Sequence:  he chuckled to himself and
he
chuckled
to
himself
and
Next Possible Words:  ['i', 'a', 'he', 'his', 'the']


In [None]:
text_pred()

Enter correct sentence: Obviously they have been caused
Correct Sentence:  Obviously they have been caused
Sequence:  obviously they have been caused
obviously
they
have
been
caused
Next Possible Words:  ['to', 'in', 'by', 'upon', 'for']


In [None]:
text_pred()

Enter correct sentence: A man entered who could
Correct Sentence:  A man entered who could
Sequence:  a man entered who could
a
man
entered
who
could
Next Possible Words:  ['not', 'have', 'be', 'see', 'hardly']
