In [15]:
import nltk 
#nltk.download('gutenberg')

from nltk.corpus import gutenberg
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from  sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout  
from tensorflow.keras.callbacks import EarlyStopping


In [3]:
# Load the raw text of Hamlet from NLTK's Gutenberg corpus. On a fresh
# environment the corpus is not installed and the lookup raises LookupError,
# so fetch it on demand and retry instead of failing the whole run.
try:
    data = gutenberg.raw('shakespeare-hamlet.txt')
except LookupError:
    nltk.download('gutenberg')
    data = gutenberg.raw('shakespeare-hamlet.txt')

# Save a local copy; the next cell reads the text back from this file.
with open('hamlet.txt', 'w') as file:
    file.write(data)


In [4]:
## Read the saved play back in and lower-case it

with open('hamlet.txt', 'r') as hamlet_file:
    raw_contents = hamlet_file.read()
text = raw_contents.lower()

# Fit a word-level tokenizer on the whole text, treated as a single document

tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Vocabulary size: one slot per entry in word_index, plus one extra slot
# because Keras word ids start at 1 (0 is reserved for padding)
total_words = 1 + len(tokenizer.word_index)
total_words


4818

In [5]:
# Build the training n-grams: for every line of the text, keep each prefix of
# its token ids that is at least two tokens long

inputsequences = [
    line_tokens[:prefix_len]
    for line_tokens in tokenizer.texts_to_sequences(text.split("\n"))
    for prefix_len in range(2, len(line_tokens) + 1)
]

In [6]:
# Left-pad every n-gram up to the length of the longest one

max_sequence_len = max(map(len, inputsequences))
inputsequences = np.array(
    pad_sequences(inputsequences, padding='pre', maxlen=max_sequence_len)
)

In [7]:
# Split each padded sequence: every token but the last is the model input,
# the last token is the word to predict

X = inputsequences[:, :-1]
# One-hot encode the target word ids across the full vocabulary
y = tf.keras.utils.to_categorical(inputsequences[:, -1], num_classes=total_words)

In [8]:
# Hold out 20% of the samples for validation. random_state pins the shuffle so
# the split (and every metric computed on it) is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Stop training once validation loss has gone 3 epochs without improving,
# then roll the model back to the weights of its best epoch
callbacks = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

In [13]:
## Define the LSTM network as a single layer stack
model = Sequential([
    # Map each word id to a 100-dimensional vector
    Embedding(total_words, 100, input_length=max_sequence_len-1),
    # First recurrent layer returns its full output sequence for the next LSTM
    LSTM(150, return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    # One softmax output per vocabulary slot
    Dense(total_words, activation="softmax"),
])

# Configure training: cross-entropy against the one-hot targets, Adam, accuracy

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 13, 100)           481800    
                                                                 
 lstm_2 (LSTM)               (None, 13, 150)           150600    
                                                                 
 dropout_1 (Dropout)         (None, 13, 150)           0         
                                                                 
 lstm_3 (LSTM)               (None, 100)               100400    
                                                                 
 dense_1 (Dense)             (None, 4818)              486618    
                                                                 
Total params: 1219418 (4.65 MB)
Trainable params: 1219418 (4.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
# Train the model, evaluating on the held-out split after every epoch.
# validation_data is passed as an (inputs, targets) tuple, which is the form
# Keras documents; a list is not reliably unpacked into inputs and targets.
# The EarlyStopping callback defined earlier may end training before epoch 50.

history = model.fit(
    X_train,
    y_train,
    epochs=50,
    validation_data=(X_test, y_test),
    verbose=1,
    callbacks=[callbacks],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50


In [24]:
# Prediction function

def predict_next_word(model, tokenizer, text, max_sequence_len):
    """Return the word the model scores highest as the continuation of ``text``.

    Args:
        model: Trained model whose ``predict`` accepts a batch of padded
            token-id sequences of length ``max_sequence_len - 1`` and returns
            one row of per-index scores for each sequence.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` (id -> word) mapping.
        text: Seed text to continue.
        max_sequence_len: Sequence length used during training, target word
            included; the model input is one token shorter.

    Returns:
        The predicted word, or ``None`` when the highest-scoring index has no
        word attached to it (for example the padding index 0).
    """
    token_list = tokenizer.texts_to_sequences([text])[0]

    # Keep only the most recent tokens that fit into the model's input window.
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-(max_sequence_len-1):]

    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)

    # Exactly one sequence was submitted, so read row 0's best index as a plain
    # int rather than comparing a length-1 array against every vocabulary id.
    predicted_word_index = int(np.argmax(predicted, axis=1)[0])

    # Direct reverse lookup replaces the linear scan over tokenizer.word_index.
    return tokenizer.index_word.get(predicted_word_index)
 

# Try the model on a sample prompt
input_text = "To be or not be"
print(f"Input Text:{input_text}")

# The model input is one token shorter than the full training sequence, so
# recover the full length from the model's input shape
max_sequence_len = model.input_shape[1] + 1
next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=input_text,
    max_sequence_len=max_sequence_len,
)
print(f"Next word prediction:{next_word}")

# Persist the trained model; the tokenizer is pickled in a separate cell
model.save("next_word_lstm.h5")



Input Text:To be or not be
Next word prediction:blame


In [27]:
import pickle

# Serialize the fitted tokenizer to disk so the same vocabulary can be reloaded
with open("tokenizer.pickle", 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)