# Next Word Prediction:

### Importing The Required Libraries:

In [1]:
# !pip install tensorflow
# !pip install keras
# !pip install pydot
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [2]:
"""
    Dataset: http://www.gutenberg.org/cache/epub/5200/pg5200.txt
    Remove all the unnecessary data and label it as Metamorphosis-clean.
    The starting and ending lines should be as follows.

"""


file = open("metamorphosis_clean.txt", "r", encoding = "utf8")
lines = []

for i in file:
    lines.append(i)
    
print("The First Line: ", lines[0])
print("The Last Line: ", lines[-1])

The First Line:  ﻿One morning, when Gregor Samsa woke from troubled dreams, he found

The Last Line:  first to get up and stretch out her young body.


### Cleaning the data:

In [3]:
data = ""

for i in lines:
    data = ' '. join(lines)
    
# data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')
data = data.replace('\r', '').replace('\ufeff', '')
data[:360]

'One morning, when Gregor Samsa woke from troubled dreams, he found\n himself transformed in his bed into a horrible vermin.  He lay on\n his armour-like back, and if he lifted his head a little he could\n see his brown belly, slightly domed and divided by arches into stiff\n sections.  The bedding was hardly able to cover it and seemed ready\n to slide off any mo'

In [4]:
import string

translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) #map punctuation to space
new_data = data.translate(translator)

new_data[:500]

'One morning  when Gregor Samsa woke from troubled dreams  he found\n himself transformed in his bed into a horrible vermin   He lay on\n his armour like back  and if he lifted his head a little he could\n see his brown belly  slightly domed and divided by arches into stiff\n sections   The bedding was hardly able to cover it and seemed ready\n to slide off any moment   His many legs  pitifully thin compared\n with the size of the rest of him  waved about helplessly as he\n looked \n \n  What s happened t'

In [5]:
z = []

for i in data.split():
#     print(i)
    if i.lower() not in z:
        z.append(i)

data = ' '.join(z)
data[:500]
# print(data.split())

'One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin. lay on armour-like back, and if lifted head little could see brown belly, slightly domed divided by arches stiff sections. The bedding was hardly able to cover it seemed ready slide off any moment. many legs, pitifully thin compared with the size of rest him, waved about helplessly as looked. "What\'s happened me?" thought. wasn\'t dream. room, proper human room although too sm'

### Tokenization:

In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# saving the tokenizer for predict function.
pickle.dump(tokenizer, open('tokenizer1.pkl', 'wb'))

sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:10]

[31, 49, 703, 1, 4, 704, 280, 705, 281, 16]

In [7]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

2617


In [8]:
sequences = []

for i in range(1, len(sequence_data)):
    words = sequence_data[i-1:i+1]
    sequences.append(words)
    
print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  4229


array([[ 31,  49],
       [ 49, 703],
       [703,   1],
       [  1,   4],
       [  4, 704],
       [704, 280],
       [280, 705],
       [705, 281],
       [281,  16],
       [ 16, 706]])

In [9]:
X = []
y = []

for i in sequences:
    X.append(i[0])
    y.append(i[1])
    
X = np.array(X)
y = np.array(y)

In [10]:
print("The Data is: ", X[:5])
print("The responses are: ", y[:5])

The Data is:  [ 31  49 703   1   4]
The responses are:  [ 49 703   1   4 704]


In [11]:
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

### Creating the Model:

In [12]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))

In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1, 10)             26170     
_________________________________________________________________
lstm (LSTM)                  (None, 1, 1000)           4044000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dense (Dense)                (None, 1000)              1001000   
_________________________________________________________________
dense_1 (Dense)              (None, 2617)              2619617   
Total params: 15,694,787
Trainable params: 15,694,787
Non-trainable params: 0
_________________________________________________________________


### Plot The Model:

In [14]:
# https://graphviz.gitlab.io/download/
from tensorflow import keras
from keras.utils.vis_utils import plot_model

keras.utils.plot_model(model, to_file='model.png', show_layer_names=True)

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


### Callbacks:

In [15]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import TensorBoard

checkpoint = ModelCheckpoint("nextword1.h5", monitor='loss', verbose=1,
    save_best_only=True, mode='auto')

reduce = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=3, min_lr=0.0001, verbose = 1)

logdir='logsnextword1'
tensorboard_Visualization = TensorBoard(log_dir=logdir)

### Compile The Model:

In [16]:
model.compile(loss="categorical_crossentropy", optimizer=Adam(lr=0.001))

In [17]:
# model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd",
#               metrics=["accuracy"])

### Fit The Model:

In [18]:
# history = model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid))
# history = model.fit(X, y, epochs=20)

In [19]:
model.fit(X, y, epochs=150, batch_size=64, callbacks=[checkpoint, reduce, tensorboard_Visualization])

Epoch 1/150

Epoch 00001: loss improved from inf to 7.84613, saving model to nextword1.h5
Epoch 2/150

Epoch 00002: loss improved from 7.84613 to 7.58893, saving model to nextword1.h5
Epoch 3/150

Epoch 00003: loss improved from 7.58893 to 7.36449, saving model to nextword1.h5
Epoch 4/150

Epoch 00004: loss improved from 7.36449 to 7.13444, saving model to nextword1.h5
Epoch 5/150

Epoch 00005: loss improved from 7.13444 to 6.98182, saving model to nextword1.h5
Epoch 6/150

Epoch 00006: loss improved from 6.98182 to 6.84596, saving model to nextword1.h5
Epoch 7/150

Epoch 00007: loss improved from 6.84596 to 6.73612, saving model to nextword1.h5
Epoch 8/150

Epoch 00008: loss improved from 6.73612 to 6.63589, saving model to nextword1.h5
Epoch 9/150

Epoch 00009: loss improved from 6.63589 to 6.54805, saving model to nextword1.h5
Epoch 10/150

Epoch 00010: loss improved from 6.54805 to 6.45725, saving model to nextword1.h5
Epoch 11/150

Epoch 00011: loss improved from 6.45725 to 6.3699

Epoch 51/150

Epoch 00051: loss improved from 3.85220 to 3.79048, saving model to nextword1.h5
Epoch 52/150

Epoch 00052: loss improved from 3.79048 to 3.73686, saving model to nextword1.h5
Epoch 53/150

Epoch 00053: loss improved from 3.73686 to 3.71318, saving model to nextword1.h5
Epoch 54/150

Epoch 00054: loss improved from 3.71318 to 3.70286, saving model to nextword1.h5
Epoch 55/150

Epoch 00055: loss improved from 3.70286 to 3.67178, saving model to nextword1.h5
Epoch 56/150

Epoch 00056: loss improved from 3.67178 to 3.64353, saving model to nextword1.h5
Epoch 57/150

Epoch 00057: loss improved from 3.64353 to 3.61776, saving model to nextword1.h5
Epoch 58/150

Epoch 00058: loss improved from 3.61776 to 3.57400, saving model to nextword1.h5
Epoch 59/150

Epoch 00059: loss improved from 3.57400 to 3.55194, saving model to nextword1.h5
Epoch 60/150

Epoch 00060: loss improved from 3.55194 to 3.53041, saving model to nextword1.h5
Epoch 61/150

Epoch 00061: loss improved from 3.53


Epoch 00101: loss improved from 2.20727 to 2.14648, saving model to nextword1.h5
Epoch 102/150

Epoch 00102: loss improved from 2.14648 to 2.14201, saving model to nextword1.h5
Epoch 103/150

Epoch 00103: loss improved from 2.14201 to 2.13201, saving model to nextword1.h5
Epoch 104/150

Epoch 00104: loss improved from 2.13201 to 2.11901, saving model to nextword1.h5
Epoch 105/150

Epoch 00105: loss improved from 2.11901 to 2.07606, saving model to nextword1.h5
Epoch 106/150

Epoch 00106: loss improved from 2.07606 to 2.06745, saving model to nextword1.h5
Epoch 107/150

Epoch 00107: loss improved from 2.06745 to 2.02884, saving model to nextword1.h5
Epoch 108/150

Epoch 00108: loss did not improve from 2.02884
Epoch 109/150

Epoch 00109: loss did not improve from 2.02884
Epoch 110/150

Epoch 00110: loss improved from 2.02884 to 2.00454, saving model to nextword1.h5
Epoch 111/150

Epoch 00111: loss improved from 2.00454 to 1.98504, saving model to nextword1.h5
Epoch 112/150

Epoch 00112

<tensorflow.python.keras.callbacks.History at 0x1b1090096d0>

### Graph:

In [None]:
# https://stackoverflow.com/questions/26649716/how-to-show-pil-image-in-ipython-notebook
# tensorboard --logdir="./logsnextword1"
# http://DESKTOP-U3TSCVT:6006/

# from IPython.display import Image 
# pil_img = Image(filename='graph1.png')
# display(pil_img)

In [None]:
# import pandas as pd
# import matplotlib.pyplot as plt

# pd.DataFrame(X, y).plot(figsize=(8, 5))
# plt.grid(True)
# plt.gca().set_ylim(0, 1) 
# plt.show()

## Observation:
### We are able to develop a decent next word prediction model and are able to get a declining loss and an overall decent performance.