In [1]:

import numpy as np

In [2]:
np.random.seed(42)

# Load the Data

In [3]:
# Read the whole book into memory. The `with` block closes the file handle as
# soon as the text has been read instead of leaving it open for the rest of the
# kernel session.
# NOTE: with encoding='utf8' a byte-order mark in the file is kept as a '\ufeff'
# character (it shows up in the tokenizer vocabulary); 'utf-8-sig' would strip a
# leading one.
with open('Pride_and_Prejudice.txt', encoding='utf8') as book_file:
    book_text = book_file.read()

FileNotFoundError: [Errno 2] No such file or directory: 'Pride_and_Prejudice.txt'

In [7]:
len(book_text)

684781

# Build Tokenizer

In [8]:
from tensorflow.python.keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters


In [9]:
t = Tokenizer(char_level=True)

In [10]:
t.fit_on_texts(book_text)

Number of unique characters

In [12]:
vocab_size = len(t.word_index)

In [13]:
vocab_size

78

In [14]:
t.word_index

{'\n': 15,
 ' ': 1,
 '!': 48,
 '(': 67,
 ')': 68,
 '*': 61,
 ',': 21,
 '-': 32,
 '.': 24,
 '0': 76,
 '1': 64,
 '2': 66,
 '3': 70,
 '4': 71,
 '5': 69,
 '6': 72,
 '7': 74,
 '8': 73,
 '9': 75,
 ':': 58,
 ';': 31,
 '?': 49,
 'A': 50,
 'B': 33,
 'C': 43,
 'D': 46,
 'E': 39,
 'F': 57,
 'G': 56,
 'H': 41,
 'I': 27,
 'J': 52,
 'K': 60,
 'L': 38,
 'M': 30,
 'N': 53,
 'O': 54,
 'P': 55,
 'R': 59,
 'S': 47,
 'T': 37,
 'U': 63,
 'V': 65,
 'W': 42,
 'Y': 51,
 'Z': 77,
 '_': 36,
 'a': 4,
 'b': 23,
 'c': 16,
 'd': 11,
 'e': 2,
 'f': 18,
 'g': 20,
 'h': 8,
 'i': 7,
 'j': 45,
 'k': 26,
 'l': 12,
 'm': 14,
 'n': 6,
 'o': 5,
 'p': 22,
 'q': 44,
 'r': 10,
 's': 9,
 't': 3,
 'u': 13,
 'v': 25,
 'w': 19,
 'x': 35,
 'y': 17,
 'z': 34,
 '‘': 62,
 '’': 40,
 '“': 28,
 '”': 29,
 '\ufeff': 78}

Convert characters to Numbers

In [15]:
book_num = t.texts_to_sequences(book_text)

In [16]:
number_chars = len(book_num)

In [17]:
number_chars

684781

# Build Input and Output

In [18]:
sequence_length = 100 

Input and output containers
- Each input sample is a sequence of 100 consecutive characters
- Each output sample is the single character that immediately follows its 100-character input sequence

In [33]:
input_data = []

In [34]:
output_data = []

In [35]:
# Slide a window of `sequence_length` characters over the encoded book: each
# window becomes one training input, and the encoded character that comes right
# after the window becomes the matching target.
n_windows = number_chars - sequence_length
for window_start in range(n_windows):
    window_end = window_start + sequence_length
    input_data.append(book_num[window_start:window_end])
    output_data.append(book_num[window_end])

In [36]:
output_data[5]

[1]

Reshape and Normalize the input

In [37]:
input_data = np.reshape(input_data, (len(input_data),sequence_length,1))

In [38]:
input_data.shape

(684681, 100, 1)

In [39]:
input_data = input_data / vocab_size

One hot encode the output

In [40]:
from tensorflow.python.keras.utils import to_categorical

In [41]:
# One-hot encode every target character index into a row of `vocab_size` columns.
# NOTE(review): the indices in t.word_index run from 1 to vocab_size (see the
# mapping displayed earlier), while num_classes=vocab_size only provides columns
# 0..vocab_size-1. This call therefore relies on the highest index ('\ufeff')
# never occurring as a target character - presumably true because a BOM only
# sits at the very start of the file; confirm, or size the encoding (and the
# model's output layer) as vocab_size + 1.
output_data = to_categorical(output_data,num_classes=vocab_size)

In [42]:
output_data[0:1]

array([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

# Build the Model

In [43]:
from tensorflow.python.keras.models import Sequential

In [44]:
from tensorflow.python.keras.layers import LSTM, Dense, Dropout

In [45]:
model = Sequential()

In [46]:
model.add(LSTM(128, input_shape=(input_data.shape[1],input_data.shape[2])))

In [47]:
model.add(Dropout(0.2))

In [48]:
model.add(Dense(vocab_size, activation='softmax'))

In [49]:
model.compile(optimizer='adam',loss='categorical_crossentropy')

# Execute the model

The goal of training is to minimize the loss.

In [None]:
model.fit(input_data, output_data, batch_size=128, epochs=20)

Epoch 1/20
113664/684681 [===>..........................] - ETA: 42:29 - loss: 3.1410

In [None]:
# Persist the trained model to disk so it can be restored without retraining.
# The file name matches the one passed to load_model() in the
# "Loading a trained Model" section.
model.save('char_book.hd5')

# Build a random starting point for prediction

In [None]:
start = np.random.randint(0, input_data.shape[0]-1)

In [None]:
start

In [None]:
# Take the `sequence_length` encoded characters beginning at `start` and flatten
# the per-character index lists into a single flat list of ints.
data = [
    char_id
    for char_ids in book_num[start : start + sequence_length]
    for char_id in char_ids
]

Build Int to Char routine

In [None]:
# Invert the tokenizer's char -> index mapping so that predicted indices can be
# turned back into characters.
int_to_char = {index: char for char, index in t.word_index.items()}

In [None]:
int_to_char

Start Predicting String

In [None]:
print('STARTING DATA: ')
print(''.join(int_to_char[char_val] for char_val in data))
print('\nPREDICTED: ')

# Generate 100 characters one at a time, feeding each prediction back in as the
# newest element of the input window.
for i in range(100):
    # Shape the current window as (1, timesteps, 1) and scale it the same way
    # the training input was scaled.
    prediction = model.predict(np.reshape(data, (1, len(data), 1)) / vocab_size)

    # Pick the most probable character. Index 0 is skipped because no character
    # maps to it in t.word_index, so int_to_char has no entry for it and a plain
    # argmax that returned 0 would raise KeyError.
    char_index_predicted = int(np.argmax(np.ravel(prediction)[1:])) + 1

    # Convert the index back to its character and emit it without a newline.
    char_predicted = int_to_char[char_index_predicted]
    print(char_predicted, end='')

    # Slide the window: drop the oldest index and append the new one. A new
    # list is built so the previous window object is not mutated in place.
    data = data[1:] + [char_index_predicted]

# Loading a trained Model

In [None]:
from tensorflow.python.keras.models import load_model

In [None]:
model = load_model('char_book.hd5')

In [None]:
prediction = model.predict(np.reshape(data,(1, len(data), 1))/vocab_size)

In [None]:
prediction.shape

In [None]:
np.argmax(prediction)

In [None]:
int_to_char[2]