In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

file = open("dataset.txt", "r", encoding = "utf8")
lines = []

In [2]:
for i in file:
    lines.append(i)
    
print("The First Line: ", lines[0])
print("The Last Line: ", lines[-1])

The First Line:  The Project Gutenberg EBook of Metamorphosis, by Franz Kafka

The Last Line:  subscribe to our email newsletter to hear about new eBooks.


In [3]:
data = ""

for i in lines:
    data = ' '. join(lines)
    
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')
data[:360000]



In [4]:
import string

translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) #map punctuation to space
new_data = data.translate(translator)

new_data[:360000]



In [5]:
z = []

for i in data.split():
    if i not in z:
        z.append(i)
        
data = ' '.join(z)
data[:360000]



In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# saving the tokenizer for predict function.
pickle.dump(tokenizer, open('tokenizer1.pkl', 'wb'))

sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:1000000]

[20,
 175,
 2,
 50,
 51,
 176,
 356,
 882,
 883,
 884,
 885,
 357,
 13,
 50,
 29,
 52,
 20,
 30,
 177,
 886,
 178,
 14,
 358,
 15,
 82,
 887,
 888,
 359,
 10,
 360,
 361,
 8,
 362,
 8,
 53,
 83,
 363,
 30,
 179,
 364,
 31,
 365,
 13,
 889,
 54,
 2,
 5,
 84,
 366,
 50,
 890,
 180,
 85,
 891,
 181,
 892,
 11,
 367,
 893,
 176,
 894,
 895,
 357,
 896,
 368,
 897,
 898,
 899,
 50,
 86,
 87,
 369,
 360,
 900,
 901,
 182,
 370,
 371,
 902,
 903,
 904,
 88,
 51,
 13,
 175,
 2,
 50,
 176,
 181,
 55,
 372,
 21,
 89,
 373,
 6,
 32,
 905,
 90,
 906,
 374,
 16,
 907,
 33,
 908,
 375,
 91,
 909,
 376,
 910,
 16,
 911,
 56,
 912,
 57,
 92,
 34,
 913,
 183,
 93,
 377,
 94,
 914,
 184,
 378,
 915,
 916,
 917,
 379,
 380,
 918,
 58,
 381,
 919,
 35,
 920,
 382,
 383,
 921,
 384,
 59,
 95,
 375,
 385,
 185,
 922,
 923,
 924,
 925,
 386,
 36,
 926,
 96,
 927,
 97,
 186,
 98,
 99,
 100,
 187,
 8,
 928,
 387,
 60,
 929,
 388,
 60,
 188,
 61,
 389,
 390,
 930,
 391,
 931,
 932,
 189,
 84,
 392,
 933,
 934,


In [7]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

3068


In [8]:
sequences = []

for i in range(1, len(sequence_data)):
    words = sequence_data[i-1:i+1]
    sequences.append(words)
    
print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:1000000]

The Length of sequences are:  4714


array([[  20,  175],
       [ 175,    2],
       [   2,   50],
       ...,
       [ 169, 3065],
       [3065, 3066],
       [3066, 3067]])

In [9]:
X = []
y = []

for i in sequences:
    X.append(i[0])
    y.append(i[1])
    
X = np.array(X)
y = np.array(y)

In [10]:
print("The Data is: ", X[:5])
print("The responses are: ", y[:5])

The Data is:  [ 20 175   2  50  51]
The responses are:  [175   2  50  51 176]


In [11]:
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [12]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))

In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1, 10)             30680     
_________________________________________________________________
lstm (LSTM)                  (None, 1, 1000)           4044000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dense (Dense)                (None, 1000)              1001000   
_________________________________________________________________
dense_1 (Dense)              (None, 3068)              3071068   
Total params: 16,150,748
Trainable params: 16,150,748
Non-trainable params: 0
_________________________________________________________________


In [14]:
from tensorflow import keras
from keras.utils.vis_utils import plot_model

keras.utils.plot_model(model, to_file='model.png', show_layer_names=True)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [15]:
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001))

In [16]:
model.fit(x=None, y=None, epochs=150, batch_size=64, callbacks=[checkpoint, reduce, tensorboard_Visualization])

NameError: name 'checkpoint' is not defined

In [None]:
from IPython.display import Image 
pil_img = Image(filename='graph1.png')
display(pil_img)