In [30]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd

data = gutenberg.raw('shakespeare-macbeth.txt')

with open('macbeth.txt', 'w') as f:
    f.write(data)

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\eddie\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [31]:
##data processing
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

with open('macbeth.txt', 'r') as f:
    text = f.read().lower()

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1
total_words

3553

In [32]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'of': 4,
 'i': 5,
 'a': 6,
 'that': 7,
 'my': 8,
 'you': 9,
 'in': 10,
 'is': 11,
 'not': 12,
 'it': 13,
 'with': 14,
 'his': 15,
 'be': 16,
 'macb': 17,
 'your': 18,
 'our': 19,
 'haue': 20,
 'but': 21,
 'me': 22,
 'he': 23,
 'for': 24,
 'what': 25,
 'this': 26,
 'all': 27,
 'so': 28,
 'him': 29,
 'as': 30,
 'thou': 31,
 'we': 32,
 'enter': 33,
 'which': 34,
 'are': 35,
 'will': 36,
 'they': 37,
 'shall': 38,
 'no': 39,
 'then': 40,
 'macbeth': 41,
 'their': 42,
 'thee': 43,
 'vpon': 44,
 'on': 45,
 'macd': 46,
 'from': 47,
 'yet': 48,
 'thy': 49,
 'vs': 50,
 'come': 51,
 'king': 52,
 'now': 53,
 'at': 54,
 'hath': 55,
 'more': 56,
 'by': 57,
 'good': 58,
 'rosse': 59,
 'them': 60,
 'lady': 61,
 'would': 62,
 'time': 63,
 'was': 64,
 'do': 65,
 'who': 66,
 'like': 67,
 'her': 68,
 'if': 69,
 'should': 70,
 'did': 71,
 'when': 72,
 'there': 73,
 'say': 74,
 'were': 75,
 'where': 76,
 'doe': 77,
 'lord': 78,
 'make': 79,
 'or': 80,
 '1': 81,
 'must': 82,

In [33]:
inputsequences = []
for line in text.split('\n'):
    #covert text to sequences
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        inputsequences.append(n_gram_sequence)

In [34]:
inputsequences

[[1, 885],
 [1, 885, 4],
 [1, 885, 4, 41],
 [1, 885, 4, 41, 57],
 [1, 885, 4, 41, 57, 1388],
 [1, 885, 4, 41, 57, 1388, 1389],
 [1, 885, 4, 41, 57, 1388, 1389, 1390],
 [418, 1391],
 [418, 1391, 1392],
 [418, 1391, 1392, 419],
 [270, 2],
 [270, 2, 886],
 [270, 2, 886, 33],
 [270, 2, 886, 33, 196],
 [270, 2, 886, 33, 196, 298],
 [81, 72],
 [81, 72, 38],
 [81, 72, 38, 32],
 [81, 72, 38, 32, 196],
 [81, 72, 38, 32, 196, 336],
 [81, 72, 38, 32, 196, 336, 131],
 [10, 270],
 [10, 270, 886],
 [10, 270, 886, 80],
 [10, 270, 886, 80, 10],
 [10, 270, 886, 80, 10, 1393],
 [128, 72],
 [128, 72, 1],
 [128, 72, 1, 1394],
 [128, 72, 1, 1394, 1395],
 [128, 72, 1, 1394, 1395, 84],
 [72, 1],
 [72, 1, 1396],
 [72, 1, 1396, 365],
 [72, 1, 1396, 365, 2],
 [72, 1, 1396, 365, 2, 887],
 [135, 7],
 [135, 7, 36],
 [135, 7, 36, 16],
 [135, 7, 36, 16, 172],
 [135, 7, 36, 16, 172, 1],
 [135, 7, 36, 16, 172, 1, 299],
 [135, 7, 36, 16, 172, 1, 299, 4],
 [135, 7, 36, 16, 172, 1, 299, 4, 666],
 [81, 76],
 [81, 76, 1],


In [35]:
## Pad Sequesnces
max_sequence_length = max([len(x) for x in inputsequences])
max_sequence_length

14

In [36]:
input_sequences = np.array(pad_sequences(inputsequences, maxlen=max_sequence_length, padding='pre'))
input_sequences


array([[   0,    0,    0, ...,    0,    1,  885],
       [   0,    0,    0, ...,    1,  885,    4],
       [   0,    0,    0, ...,  885,    4,   41],
       ...,
       [   0,    0,    0, ..., 3552,    1,  885],
       [   0,    0,    0, ...,    1,  885,    4],
       [   0,    0,    0, ...,  885,    4,   41]])

In [37]:
##create predictors and label
import tensorflow as tf
X, y = input_sequences[:, :-1], input_sequences[:, -1]
X

array([[   0,    0,    0, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    1,  885],
       [   0,    0,    0, ...,    1,  885,    4],
       ...,
       [   0,    0,    0, ...,    0, 3552,    1],
       [   0,    0,    0, ..., 3552,    1,  885],
       [   0,    0,    0, ...,    1,  885,    4]])

In [38]:
y

array([885,   4,  41, ..., 885,   4,  41])

In [39]:
y=tf.keras.utils.to_categorical(y, num_classes=total_words)


In [40]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [41]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [42]:
##train 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_length -1))
model.add(LSTM(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))#softmax activation for multi-class classification

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 13, 100)           355300    
                                                                 
 lstm_4 (LSTM)               (None, 13, 150)           150600    
                                                                 
 dropout_2 (Dropout)         (None, 13, 150)           0         
                                                                 
 lstm_5 (LSTM)               (None, 150)               180600    
                                                                 
 dense_2 (Dense)             (None, 3553)              536503    
                                                                 
Total params: 1223003 (4.67 MB)
Trainable params: 1223003 (4.67 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [45]:
history = model.fit(x_train, y_train, epochs=50, verbose =1, validation_data=(x_test, y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [50]:
def predict_next_word(model, tokenizer, text, max_sequence_length):
    # Tokenize the input text
    token_list = tokenizer.texts_to_sequences([text])[0]
    
    if len(token_list) > max_sequence_length:
        token_list = token_list[-(max_sequence_length - 1):]
    token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted, axis=-1)[0]
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None

In [51]:
input_text = "Out, out brief candle! Life's but a walking shadow, a poor player"
print("Input text:", input_text)
max_sequence_length = model.input_shape[1]+1
predicted_word = predict_next_word(model, tokenizer, input_text, max_sequence_length)
print("Predicted next word:", predicted_word)

Input text: Out, out brief candle! Life's but a walking shadow, a poor player
Predicted next word: aright


In [52]:
#
model.save('macbeth_word_predictor.h5')

with open ('tokenizer.pkl', 'wb') as f:
    import pickle
    pickle.dump(tokenizer, f,protocol=pickle.HIGHEST_PROTOCOL)

  saving_api.save_model(
