In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
from nltk.corpus import gutenberg
import pickle

In [3]:
nltk.download('gutenberg')
nltk.download('punkt')
data = gutenberg.raw('shakespeare-hamlet.txt')

[nltk_data] Downloading package gutenberg to C:\Users\Gangotri
[nltk_data]     Mishra\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Gangotri
[nltk_data]     Mishra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# CONVERT SENTENCE INTO TOKENS
from nltk.tokenize import word_tokenize
data_token = word_tokenize(data)

In [5]:
# CONVERT TOKENS INTO INTEGER
from tensorflow.keras.preprocessing.text import Tokenizer
data_tokenizer = Tokenizer()

# It create the vocabulary from the data that we are sharing
# and count the frequency of each word
# Building Frequency Dictionary
data_tokenizer.fit_on_texts([data_token])

#  After the vocabulary has been built using fit_on_texts, this method takes new texts (or the same ones)
#  and replaces each token with its corresponding integer based on the vocabulary
sequences = data_tokenizer.texts_to_sequences([data_token])

In [6]:
with (open('data_tokenizer.pkl','wb')) as file:
    pickle.dump(data_tokenizer,file)

In [5]:
with open('data.txt','w') as file:
  file.write(data)

with open('data.txt','r') as file:
  text = file.read().lower()

In [6]:
## Here we will create sequential compbination of all words present in a sentence for all the sentences
line_seq=[]
for sentence in text.split('\n'):
  sent_seq = data_tokenizer.texts_to_sequences([sentence])[0]

  for i in range(1,len(sent_seq)):
    n_gram = sent_seq[:i+1]
    line_seq.append(n_gram)

In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = max(len(x) for x in line_seq)
pad_seq = pad_sequences(line_seq, maxlen= max_len, padding= 'pre')

## Its an array
pad_seq.shape

(25227, 14)

In [9]:
# x: creating independent feature
# y: creating dependent feature
x, y = pad_seq[:,:-1], pad_seq[:,-1]

# total unique words
total_words = len(data_tokenizer.word_index)

# converting all values in dependet feature values in categories (one hotencode)
y = tf.keras.utils.to_categorical(y, num_classes= total_words)

In [10]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state= 42, test_size= 0.2)

In [7]:
## Tran LSTM

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout,GRU
## We can add GRU also instead of LSTM or with wil LSTM to0

## Specify the total words : which mean defnining total number of different words we have in voc
model = Sequential()
model.add(Embedding(total_words, 100, input_length= x_train.shape[1]))
## Use return_sequences= True: The LSTM layer returns the output for each time step in the input sequence, when you need the outputs for each time step. 
model.add(LSTM(150, return_sequences=True))
model.add(Dropout(0.2))
## return_sequences=False (default): This is typically used for tasks where you only need the final output, such as classification tasks.
model.add(LSTM(150))
model.add(Dense(total_words, activation= 'softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics='accuracy')

NameError: name 'total_words' is not defined

In [46]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 13, 100)           480700    
_________________________________________________________________
lstm_11 (LSTM)               (None, 13, 150)           150600    
_________________________________________________________________
dropout_4 (Dropout)          (None, 13, 150)           0         
_________________________________________________________________
lstm_12 (LSTM)               (None, 150)               180600    
_________________________________________________________________
dense_4 (Dense)              (None, 4807)              725857    
Total params: 1,537,757
Trainable params: 1,537,757
Non-trainable params: 0
_________________________________________________________________


In [50]:
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor = 'Val_loss', patience = 5, restore_best_weights=True)

In [49]:
history = model.fit(x_train, y_train, epochs= 10, validation_data= (x_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [141]:
st = 'lets go to the'
def preprocessing_predict(inp):
    ## we have to provide input string inside bracket otherwise data_tokenizer will treat each word separately and create token in separate lists
    inp_seq = data_tokenizer.texts_to_sequences([inp])[0]
    
    if len(inp_seq) >= max_len:
        inp_seq = inp_seq[-(max_len-1):]

    ## in pad_sequence we do padding for multiple lists together that's why we put them in backet, so right now we have one list only but we will still put them in bracket
    padded_in= pad_sequences([inp_seq],  padding= 'pre', maxlen= max_len-1)
    pred = model.predict(padded_in)

    ## getting maximum value which represents the most probable word
    pred = np.argmax(pred, axis=1)

    for word, index in data_tokenizer.word_index.items():
        if index == pred:
            return word
    return None

In [144]:
print(preprocessing_predict(st))

king


In [145]:
model.save('LSTM_predict.h5')