### Load the data

In [0]:
import numpy as np
#Seed NumPy's global random generator so that the np.random calls made later
#in the notebook (e.g. picking the random test sequence) repeat across runs.
#NOTE(review): this seeds NumPy only; TensorFlow's own RNG is not seeded here.
np.random.seed(42)

Download the data from the Project Gutenberg site: http://www.gutenberg.org/files/1342/1342-0.txt

In [0]:
#Download book
!wget -O Pride_and_Prejudice.txt http://www.gutenberg.org/files/1342/1342-0.txt --quiet

#Read it as string
book_text = open('Pride_and_Prejudice.txt', encoding='utf8').read()
len(book_text)

In [0]:
#Preview the first 500 characters of the downloaded text
book_text[:500]

### Data Preprocessing

1. Tokenize the data at character level

In [0]:
#Import from the public Keras API; tensorflow.python.* is a private namespace
#that is not covered by TensorFlow's compatibility guarantees.
from tensorflow.keras.preprocessing.text import Tokenizer

#Tokenize at character level
t = Tokenizer(char_level=True)

#Fit tokenizer on the book
t.fit_on_texts(book_text)

#Vocabulary size (number of distinct characters seen by the tokenizer)
vocab_size = len(t.word_index)

print('Number of unique characters: ', vocab_size)

#Convert characters in the book to numbers.
#NOTE(review): book_text is passed as a plain string, so presumably every
#character is handled as its own text and book_num holds one entry per character.
book_num = t.texts_to_sequences(book_text)

#Total number of encoded characters available for building training windows
number_chars = len(book_num)

In [0]:
#Show the character -> integer index mapping learned by the tokenizer
t.word_index

## Build Input and Output

Input and output container
- Input data will have sequences with 100 characters
- Output data will have one character which comes after 100 characters in the input data

In [0]:
#Number of characters in every input window
sequence_length = 100

#One training example per start position: the window of sequence_length
#encoded characters is the input, the character right after it is the target.
window_starts = range(number_chars - sequence_length)
input_data = [book_num[start : start + sequence_length] for start in window_starts]
output_data = [book_num[start + sequence_length] for start in window_starts]

In [0]:
#Report how many examples were built, then show one input window and its target
sample_index = 10
print(len(input_data))
for label, sample in (("Input Data\n:", input_data[sample_index]),
                      ("Output Data\n:", output_data[sample_index])):
    print(label, sample)

Reshape and Normalize the input

In [0]:
#Arrange the windows as (samples, timesteps, features=1) and scale the
#character indices by the vocabulary size in a single expression.
#Note: input_data is rebound from itself, so running this cell twice scales
#the values twice; re-run the window-building cell first if you need to repeat it.
num_sequences = len(input_data)
input_data = np.asarray(input_data).reshape(num_sequences, sequence_length, 1) / vocab_size

One hot encode the output

In [0]:
#Import from the public Keras API rather than the private tensorflow.python namespace
from tensorflow.keras.utils import to_categorical

#One-hot encode the target characters. vocab_size+1 classes are used so that
#the largest character index (vocab_size) still has a column of its own.
output_data = to_categorical(output_data,num_classes=vocab_size+1)

In [0]:
#Show the one-hot encoded target of sample 10
print(output_data[10])

In [0]:
#Second axis of input_data: number of timesteps per sample (sequence_length after the reshape)
print(input_data.shape[1])

In [0]:
#Third axis of input_data: number of features per timestep (1 after the reshape)
print(input_data.shape[2])

In [0]:
#Full shape of the model input: (samples, timesteps, features)
print(input_data.shape)

# Build the Model

In [0]:
#Import from the public Keras API; tensorflow.python.* is a private namespace
#that is not covered by TensorFlow's compatibility guarantees.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import LambdaCallback

#Build the Model
model = Sequential()
#Single LSTM layer reading windows shaped (timesteps, features)
model.add(LSTM(256, input_shape=(input_data.shape[1],input_data.shape[2])))
#Randomly drop 20% of the LSTM outputs during training to limit overfitting
model.add(Dropout(0.2))
#Probability distribution over the next character; the width matches the
#one-hot targets, which were encoded with vocab_size+1 classes
model.add(Dense(vocab_size+1, activation='softmax'))

model.compile(optimizer='adam',
              loss='categorical_crossentropy') #No accuracy tracking here

# Execute the model

In [0]:
#Pick one training window at random; it seeds the text generated after each epoch
seed_index = np.random.randint(0, high=len(input_data))
test_seq = input_data[seed_index]

In [0]:
#Reverse lookup table: integer index -> character (inverse of t.word_index)
int_to_char = {index: char for char, index in t.word_index.items()}

In [0]:
def predict_seq(epoch, logs=None):
    """Generate and print 50 characters from the model, starting at test_seq.

    Intended as an ``on_epoch_end`` hook for ``LambdaCallback``. Starting from
    a copy of the global ``test_seq`` window, the model's most probable next
    character is predicted, appended to the output, and fed back in as the
    newest timestep; this is repeated 50 times.

    Args:
        epoch: Index of the epoch that just finished (unused).
        logs: Metrics dictionary supplied by Keras (unused).

    Relies on the notebook globals ``model``, ``test_seq``, ``int_to_char``
    and ``vocab_size``. ``test_seq`` itself is never modified.
    """

    print('Output sequence is: ')

    #Collect generated characters and join once at the end
    generated_chars = []

    #lets predict 50 next chars
    current_seq = np.copy(test_seq)
    for _ in range(50):
        #The model expects a batch axis: (1, timesteps, features)
        data_input = np.reshape(current_seq, (1,
                                              current_seq.shape[0],
                                              current_seq.shape[1]))

        #Get the char int with maximum probability; verbose=0 keeps predict()
        #from printing a progress bar on every one of the 50 calls
        predicted_char_int = int(np.argmax(model.predict(data_input, verbose=0)[0]))

        #Convert int to char. The softmax has vocab_size+1 classes but
        #int_to_char has no entry for class 0, so an (undertrained) model that
        #predicts 0 contributes no character instead of raising KeyError and
        #aborting training from inside the callback.
        generated_chars.append(int_to_char.get(predicted_char_int, ''))

        #Slide the window one timestep along the time axis and put the new
        #character, scaled the same way as the training inputs, at the end
        current_seq = np.roll(current_seq, -1, axis=0)
        current_seq[-1] = predicted_char_int / vocab_size

    print(''.join(generated_chars))

In [0]:
#Create a LambdaCallback that runs predict_seq at the end of every epoch
checkpoint = LambdaCallback(on_epoch_end=predict_seq)

In [0]:
#Print random starting sequence for prediction
print('Initial sequence is: ')
#test_seq stores character indices divided by vocab_size. Multiplying back can
#land a hair below the original integer because of floating-point rounding, so
#round to the nearest integer; truncating with int() could select the previous
#index and print the wrong character.
for i in range(sequence_length):
    char_index = int(round(float(test_seq[i, 0]) * vocab_size))
    print(int_to_char[char_index], end='')

In [0]:
#Train for 50 epochs in mini-batches of 128 windows; the checkpoint callback
#prints a freshly generated sample after every epoch.
model.fit(input_data, output_data, 
          batch_size=128, 
          epochs=50,
          callbacks=[checkpoint])