In [28]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [2]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg

data = gutenberg.raw("shakespeare-hamlet.txt")

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\deepa\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [3]:
print(data)

[The Tragedie of Hamlet by William Shakespeare 1599]


Actus Primus. Scoena Prima.

Enter Barnardo and Francisco two Centinels.

  Barnardo. Who's there?
  Fran. Nay answer me: Stand & vnfold
your selfe

   Bar. Long liue the King

   Fran. Barnardo?
  Bar. He

   Fran. You come most carefully vpon your houre

   Bar. 'Tis now strook twelue, get thee to bed Francisco

   Fran. For this releefe much thankes: 'Tis bitter cold,
And I am sicke at heart

   Barn. Haue you had quiet Guard?
  Fran. Not a Mouse stirring

   Barn. Well, goodnight. If you do meet Horatio and
Marcellus, the Riuals of my Watch, bid them make hast.
Enter Horatio and Marcellus.

  Fran. I thinke I heare them. Stand: who's there?
  Hor. Friends to this ground

   Mar. And Leige-men to the Dane

   Fran. Giue you good night

   Mar. O farwel honest Soldier, who hath relieu'd you?
  Fra. Barnardo ha's my place: giue you goodnight.

Exit Fran.

  Mar. Holla Barnardo

   Bar. Say, what is Horatio there?
  Hor. A peece of

In [4]:
with open("hamlet.txt","w") as file:
    file.write(data)

In [5]:
# Loading the dataset

with open("hamlet.txt","r") as file:
    text = file.read().lower()

In [6]:
text



In [7]:
# Tokenize the text

tokenizer = Tokenizer()

tokenizer.fit_on_texts([text])

In [8]:
len(tokenizer.word_index)+1

4818

In [29]:
total_words = len(tokenizer.word_index)+1

In [9]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'of': 4,
 'i': 5,
 'you': 6,
 'a': 7,
 'my': 8,
 'it': 9,
 'in': 10,
 'that': 11,
 'ham': 12,
 'is': 13,
 'not': 14,
 'his': 15,
 'this': 16,
 'with': 17,
 'your': 18,
 'but': 19,
 'for': 20,
 'me': 21,
 'lord': 22,
 'as': 23,
 'what': 24,
 'he': 25,
 'be': 26,
 'so': 27,
 'him': 28,
 'haue': 29,
 'king': 30,
 'will': 31,
 'no': 32,
 'our': 33,
 'we': 34,
 'on': 35,
 'are': 36,
 'if': 37,
 'all': 38,
 'then': 39,
 'shall': 40,
 'by': 41,
 'thou': 42,
 'come': 43,
 'or': 44,
 'hamlet': 45,
 'good': 46,
 'do': 47,
 'hor': 48,
 'her': 49,
 'let': 50,
 'now': 51,
 'thy': 52,
 'how': 53,
 'more': 54,
 'they': 55,
 'from': 56,
 'enter': 57,
 'at': 58,
 'was': 59,
 'oh': 60,
 'like': 61,
 'most': 62,
 'there': 63,
 'well': 64,
 'know': 65,
 'selfe': 66,
 'would': 67,
 'them': 68,
 'loue': 69,
 'may': 70,
 "'tis": 71,
 'vs': 72,
 'sir': 73,
 'qu': 74,
 'which': 75,
 'did': 76,
 'why': 77,
 'laer': 78,
 'giue': 79,
 'thee': 80,
 'ile': 81,
 'must': 82,
 'hath': 

In [10]:
tokenizer.word_counts

OrderedDict([('the', 993),
             ('tragedie', 4),
             ('of', 610),
             ('hamlet', 100),
             ('by', 105),
             ('william', 1),
             ('shakespeare', 1),
             ('1599', 1),
             ('actus', 2),
             ('primus', 1),
             ('scoena', 1),
             ('prima', 1),
             ('enter', 85),
             ('barnardo', 8),
             ('and', 862),
             ('francisco', 2),
             ('two', 22),
             ('centinels', 1),
             ("who's", 2),
             ('there', 76),
             ('fran', 8),
             ('nay', 26),
             ('answer', 9),
             ('me', 228),
             ('stand', 15),
             ('vnfold', 3),
             ('your', 253),
             ('selfe', 68),
             ('bar', 7),
             ('long', 17),
             ('liue', 15),
             ('king', 171),
             ('he', 196),
             ('you', 522),
             ('come', 104),
             ('most', 77),
  

In [14]:
# Input sequence

input_sequence = []
for line in text.split("\n"):
    token_list = tokenizer.texts_to_sequences([line])[0]
    # print(token_list)
    for i in range(1,len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequence.append(n_gram_sequence)
input_sequence

[[1, 687],
 [1, 687, 4],
 [1, 687, 4, 45],
 [1, 687, 4, 45, 41],
 [1, 687, 4, 45, 41, 1886],
 [1, 687, 4, 45, 41, 1886, 1887],
 [1, 687, 4, 45, 41, 1886, 1887, 1888],
 [1180, 1889],
 [1180, 1889, 1890],
 [1180, 1889, 1890, 1891],
 [57, 407],
 [57, 407, 2],
 [57, 407, 2, 1181],
 [57, 407, 2, 1181, 177],
 [57, 407, 2, 1181, 177, 1892],
 [407, 1182],
 [407, 1182, 63],
 [408, 162],
 [408, 162, 377],
 [408, 162, 377, 21],
 [408, 162, 377, 21, 247],
 [408, 162, 377, 21, 247, 882],
 [18, 66],
 [451, 224],
 [451, 224, 248],
 [451, 224, 248, 1],
 [451, 224, 248, 1, 30],
 [408, 407],
 [451, 25],
 [408, 6],
 [408, 6, 43],
 [408, 6, 43, 62],
 [408, 6, 43, 62, 1893],
 [408, 6, 43, 62, 1893, 96],
 [408, 6, 43, 62, 1893, 96, 18],
 [408, 6, 43, 62, 1893, 96, 18, 566],
 [451, 71],
 [451, 71, 51],
 [451, 71, 51, 1894],
 [451, 71, 51, 1894, 567],
 [451, 71, 51, 1894, 567, 378],
 [451, 71, 51, 1894, 567, 378, 80],
 [451, 71, 51, 1894, 567, 378, 80, 3],
 [451, 71, 51, 1894, 567, 378, 80, 3, 273],
 [451, 71

In [37]:
max_seq_len = max([len(x) for x in input_sequence])
max_seq_len

14

In [38]:
input_sequence = np.array(pad_sequences(input_sequence,maxlen=max_seq_len,padding="pre"))

In [39]:
input_sequence

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]], dtype=int32)

In [40]:
print(input_sequence)

[[   0    0    0 ...    0    1  687]
 [   0    0    0 ...    1  687    4]
 [   0    0    0 ...  687    4   45]
 ...
 [   0    0    0 ...    4   45 1047]
 [   0    0    0 ...   45 1047    4]
 [   0    0    0 ... 1047    4  193]]


In [41]:
x,y = input_sequence[:,:-1],input_sequence[:,-1]

In [42]:
x

array([[   0,    0,    0, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       ...,
       [   0,    0,    0, ...,  687,    4,   45],
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4]], dtype=int32)

In [43]:
y

array([ 687,    4,   45, ..., 1047,    4,  193], dtype=int32)

In [44]:
y = tf.keras.utils.to_categorical(y,num_classes=total_words)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [45]:
# Train test split

X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [46]:
X_train

array([[   0,    0,    0, ...,   38,    1,  272],
       [   0,    0,    0, ..., 3108, 1260, 1590],
       [   0,    0,    0, ...,    5,   92,   62],
       ...,
       [   0,    0,    0, ...,    0,    0,  100],
       [   0,    0,    0, ...,    0,    0,   19],
       [   0,    0,    0, ..., 2629,    4,  111]], dtype=int32)

In [47]:
X_test

array([[   0,    0,    0, ...,    0, 3165,    3],
       [   0,    0,    0, ...,    2,   85,   14],
       [   0,    0,    0, ...,   21,  213,    5],
       ...,
       [   0,    0,    0, ...,   88,   34,   10],
       [   0,    0,    0, ...,    0,    6,  158],
       [   0,    0,    0, ...,  653,  779,    3]], dtype=int32)

In [48]:
y_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [49]:
y_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [57]:
# Training the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [59]:
model = Sequential()
model.add(Embedding(total_words,100,input_length = max_seq_len-1))
model.add(LSTM(150,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words,activation="softmax"))

model.compile(optimizer="adam",loss="categorical_crossentropy",metrics=['accuracy'])

In [60]:
model.summary()

In [61]:
model.fit(X_train,y_train,
          epochs=50,verbose=1,
          validation_data=(X_test,y_test)
          )

Epoch 1/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 25ms/step - accuracy: 0.0307 - loss: 7.1392 - val_accuracy: 0.0373 - val_loss: 6.7759
Epoch 2/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 25ms/step - accuracy: 0.0354 - loss: 6.4896 - val_accuracy: 0.0379 - val_loss: 6.8807
Epoch 3/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 23ms/step - accuracy: 0.0470 - loss: 6.2977 - val_accuracy: 0.0507 - val_loss: 6.9060
Epoch 4/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 22ms/step - accuracy: 0.0549 - loss: 6.1369 - val_accuracy: 0.0505 - val_loss: 6.9150
Epoch 5/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 23ms/step - accuracy: 0.0596 - loss: 5.9867 - val_accuracy: 0.0515 - val_loss: 6.9628
Epoch 6/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 23ms/step - accuracy: 0.0643 - loss: 5.8407 - val_accuracy: 0.0593 - val_loss: 7.0112
Epoch 7/50
[1m6

<keras.src.callbacks.history.History at 0x184ac096850>

In [66]:
# Prediction function

def predict_next_word(model,tokenizer,text,max_seq_len):
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_seq_len:
        token_list = token_list[-(max_seq_len-1):]
    token_list=pad_sequences([token_list],padding="pre",maxlen=max_seq_len-1)

    predicted = model.predict(token_list,verbose=0)
    predicted_word_index = np.argmax(predicted,axis=1)
    for word,index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None

In [71]:
input_text = "He may approue our"
print(f"My input is: {input_text}")

max_seq_len = model.input_shape[1]+1
next_word = predict_next_word(model,tokenizer,input_text,max_seq_len)

print(f"Next Word: {next_word}")

My input is: He may approue our
Next Word: eyes


In [72]:
# Saving the mode and tokenizer

model.save("next_word_lstm.h5")



In [73]:
import pickle
with open("tokenizer.pkl","wb") as handle:
    pickle.dump(tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)
