In [1]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding, LSTM, Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.layers import Bidirectional, GRU
import pickle
import numpy as np
import os




In [2]:
# Read the raw corpus. The context manager guarantees the handle is closed,
# even if reading raises.
with open('ML_text_file_test.txt', "r", encoding="utf8") as file:
    lines = file.readlines()

# Merge all lines into one string with a single join (the previous version
# re-joined the entire list once per line, producing the same result slowly).
data = ' '.join(lines)

# Drop characters that carry no lexical information: newline, carriage return,
# the byte-order mark and curly quotes. They are removed outright (replaced by
# the empty string), not replaced by a space.
for unwanted in ('\n', '\r', '\ufeff', '“', '”'):
    data = data.replace(unwanted, '')

# Collapse every run of whitespace into a single space.
data = ' '.join(data.split())

# Preview the first 500 characters of the cleaned corpus.
data[:500]

'It is a truth universally acknowledged, that a single man in possession of a good fortune must be in want of a wife. However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered as the rightful property of some one or other of their daughters. My dear Mr. Bennet, said his lady to him one day, have you heard that Netherfield Park is let at last? Mr. Bennet replied '

In [3]:
# Fit a word-level tokenizer on the whole cleaned corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Save the tokenizer for the predict function. Using `with` closes (and
# therefore flushes) the pickle file deterministically instead of relying on
# garbage collection of an anonymous file handle.
with open('token.pkl', 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

# Encode the corpus as one flat list of integer word ids.
sequence_data = tokenizer.texts_to_sequences([data])[0]

# One extra slot on top of the number of distinct words; per the Keras
# Tokenizer documentation index 0 is reserved and never assigned to a word.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

# Slide a window over the corpus: `context_len` input words followed by the
# word to predict.
context_len = 3
window_len = context_len + 1
sequences = [
    sequence_data[start:start + window_len]
    for start in range(len(sequence_data) - context_len)
]

print("The Length of sequences are: ", len(sequences))

# Force a 2-D (n_windows, window_len) shape so the column slicing below also
# works when the corpus is too short to yield any window.
sequences = np.array(sequences).reshape(-1, window_len)

# Split every window into its context (first columns) and its target (last column).
X = sequences[:, :context_len]
y = sequences[:, context_len]
print("Data: ", X[:10])
print("Response: ", y[:10])

# One-hot encode the target ids to match the softmax / categorical_crossentropy setup.
y = to_categorical(y, num_classes=vocab_size)
y[:5]

1031
The Length of sequences are:  4493
Data:  [[ 18  21   5]
 [ 21   5 291]
 [  5 291 448]
 [291 448 292]
 [448 292  17]
 [292  17   5]
 [ 17   5 171]
 [  5 171  59]
 [171  59  10]
 [ 59  10 293]]
Response:  [291 448 292  17   5 171  59  10 293   3]


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [4]:
# Stacked-LSTM next-word model: three word ids in, a probability
# distribution over the vocabulary out.
model = Sequential([
    # Map each word id to a 10-dimensional dense vector.
    Embedding(vocab_size, 10, input_length=3),
    # Return the output at every timestep so the second LSTM receives a sequence.
    LSTM(1000, return_sequences=True),
    # Only the final state is passed on to the dense head.
    LSTM(1000),
    Dense(1000, activation="relu"),
    # One output unit per vocabulary entry.
    Dense(vocab_size, activation="softmax"),
])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             10310     
                                                                 
 lstm (LSTM)                 (None, 3, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 1031)              1032031   
                                                                 
Total params: 14091341 (53.75 MB)
Trainable params: 14091341 (53.75 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [5]:
# Bidirectional-LSTM variant of the next-word model: same embedding and dense
# head, but each recurrent layer is wrapped in Bidirectional.
model_BLSTM = Sequential([
    # Map each word id to a 10-dimensional dense vector.
    Embedding(vocab_size, 10, input_length=3),
    # Return the output at every timestep so the second recurrent layer receives a sequence.
    Bidirectional(LSTM(1000, return_sequences=True)),
    Bidirectional(LSTM(1000)),
    Dense(1000, activation="relu"),
    # One output unit per vocabulary entry.
    Dense(vocab_size, activation="softmax"),
])
model_BLSTM.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 3, 10)             10310     
                                                                 
 bidirectional (Bidirection  (None, 3, 2000)           8088000   
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 2000)              24008000  
 onal)                                                           
                                                                 
 dense_2 (Dense)             (None, 1000)              2001000   
                                                                 
 dense_3 (Dense)             (None, 1031)              1032031   
                                                                 
Total params: 35139341 (134.05 MB)
Trainable params: 3

In [6]:
# GRU variant of the next-word model: identical layout to the LSTM model with
# the recurrent layers swapped for GRU.
model_GRU = Sequential([
    # Map each word id to a 10-dimensional dense vector.
    Embedding(vocab_size, 10, input_length=3),
    # Return the output at every timestep so the second GRU receives a sequence.
    GRU(1000, return_sequences=True),
    GRU(1000),
    Dense(1000, activation="relu"),
    # One output unit per vocabulary entry.
    Dense(vocab_size, activation="softmax"),
])
model_GRU.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 3, 10)             10310     
                                                                 
 gru (GRU)                   (None, 3, 1000)           3036000   
                                                                 
 gru_1 (GRU)                 (None, 1000)              6006000   
                                                                 
 dense_4 (Dense)             (None, 1000)              1001000   
                                                                 
 dense_5 (Dense)             (None, 1031)              1032031   
                                                                 
Total params: 11085341 (42.29 MB)
Trainable params: 11085341 (42.29 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [7]:
from keras.callbacks import ModelCheckpoint

# Configure the LSTM model for multi-class next-word prediction.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
)

# Overwrite the saved model only when the training loss reaches a new minimum
# (no validation data is passed to fit, so training loss is what is monitored).
checkpoint = ModelCheckpoint(
    filepath="Model_LSTM_Predictor.h5",
    monitor='loss',
    save_best_only=True,
    verbose=1,
)

model.fit(
    X,
    y,
    batch_size=64,
    epochs=70,
    callbacks=[checkpoint],
)

Epoch 1/70

Epoch 1: loss improved from inf to 6.24377, saving model to Model_LSTM_Predictor.h5
Epoch 2/70
 1/71 [..............................] - ETA: 4s - loss: 5.9645

  saving_api.save_model(


Epoch 2: loss improved from 6.24377 to 5.88535, saving model to Model_LSTM_Predictor.h5
Epoch 3/70
Epoch 3: loss improved from 5.88535 to 5.84908, saving model to Model_LSTM_Predictor.h5
Epoch 4/70
Epoch 4: loss improved from 5.84908 to 5.81439, saving model to Model_LSTM_Predictor.h5
Epoch 5/70
Epoch 5: loss improved from 5.81439 to 5.73045, saving model to Model_LSTM_Predictor.h5
Epoch 6/70
Epoch 6: loss improved from 5.73045 to 5.58810, saving model to Model_LSTM_Predictor.h5
Epoch 7/70
Epoch 7: loss improved from 5.58810 to 5.37205, saving model to Model_LSTM_Predictor.h5
Epoch 8/70
Epoch 8: loss improved from 5.37205 to 5.13612, saving model to Model_LSTM_Predictor.h5
Epoch 9/70
Epoch 9: loss improved from 5.13612 to 4.93257, saving model to Model_LSTM_Predictor.h5
Epoch 10/70
Epoch 10: loss improved from 4.93257 to 4.72544, saving model to Model_LSTM_Predictor.h5
Epoch 11/70
Epoch 11: loss improved from 4.72544 to 4.54743, saving model to Model_LSTM_Predictor.h5
Epoch 12/70
Epoch

<keras.src.callbacks.History at 0x2fabedfbd50>

In [8]:
# Configure the bidirectional-LSTM model for multi-class next-word prediction.
model_BLSTM.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
)

# Overwrite the saved model only when the training loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath="Model_BLSTM_Predictor.h5",
    monitor='loss',
    save_best_only=True,
    verbose=1,
)

model_BLSTM.fit(
    X,
    y,
    batch_size=64,
    epochs=70,
    callbacks=[checkpoint],
)

Epoch 1/70
Epoch 1: loss improved from inf to 6.26001, saving model to Model_BLSTM_Predictor.h5
Epoch 2/70
Epoch 2: loss improved from 6.26001 to 5.83287, saving model to Model_BLSTM_Predictor.h5
Epoch 3/70
Epoch 3: loss improved from 5.83287 to 5.69350, saving model to Model_BLSTM_Predictor.h5
Epoch 4/70
Epoch 4: loss improved from 5.69350 to 5.49005, saving model to Model_BLSTM_Predictor.h5
Epoch 5/70
Epoch 5: loss improved from 5.49005 to 5.34335, saving model to Model_BLSTM_Predictor.h5
Epoch 6/70
Epoch 6: loss improved from 5.34335 to 5.19658, saving model to Model_BLSTM_Predictor.h5
Epoch 7/70
Epoch 7: loss improved from 5.19658 to 5.03841, saving model to Model_BLSTM_Predictor.h5
Epoch 8/70
Epoch 8: loss improved from 5.03841 to 4.83637, saving model to Model_BLSTM_Predictor.h5
Epoch 9/70
Epoch 9: loss improved from 4.83637 to 4.67939, saving model to Model_BLSTM_Predictor.h5
Epoch 10/70
Epoch 10: loss improved from 4.67939 to 4.41236, saving model to Model_BLSTM_Predictor.h5
Ep

<keras.src.callbacks.History at 0x2fabed66a10>

In [9]:
# Configure the GRU model for multi-class next-word prediction.
model_GRU.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
)

# Overwrite the saved model only when the training loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath="Model_GRU_Predictor.h5",
    monitor='loss',
    save_best_only=True,
    verbose=1,
)

model_GRU.fit(
    X,
    y,
    batch_size=64,
    epochs=70,
    callbacks=[checkpoint],
)

Epoch 1/70
Epoch 1: loss improved from inf to 6.22250, saving model to Model_GRU_Predictor.h5
Epoch 2/70
Epoch 2: loss improved from 6.22250 to 5.88725, saving model to Model_GRU_Predictor.h5
Epoch 3/70
Epoch 3: loss improved from 5.88725 to 5.74906, saving model to Model_GRU_Predictor.h5
Epoch 4/70
Epoch 4: loss improved from 5.74906 to 5.54754, saving model to Model_GRU_Predictor.h5
Epoch 5/70
Epoch 5: loss improved from 5.54754 to 5.29723, saving model to Model_GRU_Predictor.h5
Epoch 6/70
Epoch 6: loss improved from 5.29723 to 5.07440, saving model to Model_GRU_Predictor.h5
Epoch 7/70
Epoch 7: loss improved from 5.07440 to 4.84909, saving model to Model_GRU_Predictor.h5
Epoch 8/70
Epoch 8: loss improved from 4.84909 to 4.58155, saving model to Model_GRU_Predictor.h5
Epoch 9/70
Epoch 9: loss improved from 4.58155 to 4.32797, saving model to Model_GRU_Predictor.h5
Epoch 10/70
Epoch 10: loss improved from 4.32797 to 4.05134, saving model to Model_GRU_Predictor.h5
Epoch 11/70
Epoch 11: 

<keras.src.callbacks.History at 0x2fb0d170bd0>