In [1]:
import pandas as pd

In [2]:
# Data collection: pull Hamlet out of the NLTK Gutenberg corpus

import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd


# Read the whole play as one raw string
data = gutenberg.raw('shakespeare-hamlet.txt')

# Write the raw text to hamlet.txt in the working directory
with open('hamlet.txt', 'w') as out_file:
    out_file.write(data)


[nltk_data] Downloading package gutenberg to C:\Users\Sneha
[nltk_data]     Chaudhary\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [6]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Read the saved play back from disk and lower-case it
with open('hamlet.txt', 'r') as src:
    text = src.read().lower()

# Fit a tokenizer on the full text so every distinct word gets an integer index
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Vocabulary size: number of indexed words plus one
# (Keras word indices start at 1; index 0 is left free and is used for padding)
total_words = len(tokenizer.word_index) + 1
total_words


4818

In [7]:
# Build the training sequences: for every line of the text, emit each prefix
# of its token list that contains at least two tokens (context + next word).

# Encode all lines in one call; each line becomes its own list of integer tokens
encoded_lines = tokenizer.texts_to_sequences(text.split('\n'))

input_sequences = [
    encoded[:stop]                          # first `stop` tokens of the line
    for encoded in encoded_lines
    for stop in range(2, len(encoded) + 1)  # prefixes of length 2 .. len(line)
]


In [None]:
# Preview only the first few n-gram sequences; displaying the whole list
# would flood the cell output with one entry per generated prefix.
input_sequences[:10]


In [None]:
# Length of the longest n-gram sequence; every sequence is padded to this length
max_sequence_len = max(map(len, input_sequences))
max_sequence_len

In [None]:
# Left-pad ('pre') every sequence with zeros up to max_sequence_len and
# collect the result into a single 2-D NumPy array
input_sequences = np.array(
    pad_sequences(
        input_sequences,
        maxlen=max_sequence_len,
        padding='pre',
    )
)
input_sequences

In [None]:
# Create predictors and label

import tensorflow as tf

# Each padded row is [context tokens ..., target token]:
# x keeps every column but the last, y is the last column
x, y = input_sequences[:, :-1], input_sequences[:, -1]


In [None]:
# One-hot encode the target token ids: one column per vocabulary entry
y = tf.keras.utils.to_categorical(
    y,
    num_classes=total_words,
)
y


In [None]:
# Hold out 20% of the samples for validation.
# A fixed random_state makes the shuffle (and therefore every downstream
# metric) reproducible across kernel restarts.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
# Define and compile the next-word LSTM model (training happens in a later cell)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,Input


## Define the model
# The input shape is declared with an explicit Input layer instead of
# Embedding(input_length=...): that argument is deprecated and ignored by
# Keras 3, which would leave the model unbuilt. An Input layer works on both
# Keras 2 and Keras 3 and keeps model.input_shape[1] == max_sequence_len-1.
model=Sequential()
model.add(Input(shape=(max_sequence_len-1,)))        # one context window of token ids
model.add(Embedding(total_words,100))                # token id -> 100-dim dense vector
model.add(LSTM(150,return_sequences=True))           # emits one output per time step for the next LSTM
model.add(Dropout(0.2))
model.add(LSTM(100))                                 # emits only the final state
model.add(Dense(total_words,activation='softmax'))   # probability for each vocabulary index

# Labels are one-hot encoded, hence categorical (not sparse) cross-entropy
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])



In [None]:
# Print a summary of the model's layers
model.summary()

In [None]:
# Fit the model for 50 epochs, evaluating on the held-out split after each one;
# the returned object is kept in `history`

history = model.fit(
    x_train,
    y_train,
    epochs=50,
    validation_data=(x_test, y_test),
    verbose=1,
)

In [None]:
#prediction

def predict_next_word(model,tokenizer,text,max_sequence_len):
    """Predict the word most likely to follow ``text``.

    Args:
        model: Object exposing ``predict(batch, verbose=0)`` that returns one
            row of per-index scores for each input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and a
            ``word_index`` mapping (``index_word`` is used when available).
        text: Seed text to continue.
        max_sequence_len: Training sequence length including the label, so
            the model input holds ``max_sequence_len - 1`` tokens.

    Returns:
        The predicted word, or ``None`` when the highest-scoring index has no
        word attached (e.g. the padding index 0).
    """
    token_list = tokenizer.texts_to_sequences([text])[0]

    # Too long for the model: keep only the most recent max_sequence_len-1 tokens
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-(max_sequence_len - 1):]

    padded = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

    predicted = model.predict(padded, verbose=0)

    # Exactly one input row was submitted, so take row 0 and convert it to a
    # plain int rather than comparing against a 1-element array later on.
    predicted_word_index = int(np.argmax(predicted, axis=1)[0])

    # Use the tokenizer's reverse map when present; otherwise derive it
    # from word_index.
    index_word = getattr(tokenizer, 'index_word', None)
    if not index_word:
        index_word = {idx: word for word, idx in tokenizer.word_index.items()}

    return index_word.get(predicted_word_index)

In [None]:
# Seed text whose continuation we want the model to guess
input_text = "Mar.Horatio saies, 'tis but our"
print(f"Input text:{input_text}")

# Recover the training sequence length from the model itself:
# the model input holds one token fewer than a full training sequence
max_sequence_len = model.input_shape[1] + 1

next_word = predict_next_word(
    model,
    tokenizer,
    input_text,
    max_sequence_len,
)
print(f"Next word prediction:{next_word}")

In [None]:


# Persist the trained model under the path "next_word_lstm"
# (no file extension; per the original note this is the TensorFlow native
# format, needs no h5py, and produces a folder)
model.save("next_word_lstm")

import pickle

# Persist the fitted tokenizer next to the model, using the highest pickle protocol
with open('tokenizer.pickle', 'wb') as fh:
    pickle.dump(tokenizer, fh, protocol=pickle.HIGHEST_PROTOCOL)
