### Load tensorflow

In [1]:
import numpy as np
import tensorflow as tf
#Start from a fresh default graph so re-running the notebook does not reuse old ops
tf.reset_default_graph()

#Seed TensorFlow and NumPy: NumPy's generator is used further down to pick the
#random seed sequence, so it has to be seeded as well for reproducible runs
tf.set_random_seed(42)
np.random.seed(42)

### Collect Data
<font size="2">Download data from Project Gutenberg site -> http://www.gutenberg.org/files/1342/1342-0.txt</font>

In [None]:
# Download the plain-text book into the working directory as 1342-0.txt (quiet mode, no progress output).
!wget http://www.gutenberg.org/files/1342/1342-0.txt --quiet

In [2]:
#Read the whole book into memory as one string; the context manager closes the
#file handle as soon as the text has been read
with open('1342-0.txt', encoding='utf8') as book_file:
    book_text = book_file.read()
print('Length of the book: ' , len(book_text))

Length of the book:  704190


In [3]:
# Confirm the book was loaded as a single Python string.
type(book_text)

str

In [4]:
# Peek at a 500-character slice to sanity-check the loaded text.
book_text[1000:1500]

'ful property\nof some one or other of their daughters.\n\n“My dear Mr. Bennet,” said his lady to him one day, “have you heard that\nNetherfield Park is let at last?”\n\nMr. Bennet replied that he had not.\n\n“But it is,” returned she; “for Mrs. Long has just been here, and she\ntold me all about it.”\n\nMr. Bennet made no answer.\n\n“Do you not want to know who has taken it?” cried his wife impatiently.\n\n“_You_ want to tell me, and I have no objection to hearing it.”\n\nThis was invitation enough.\n\n“Why, my de'

### Tokenize the data

In [5]:
#Tokenize at character level, keeping upper and lower case distinct
t = tf.keras.preprocessing.text.Tokenizer(char_level=True, lower=False)

#Fit tokenizer on the book. fit_on_texts expects a list of texts, so the book is
#wrapped in a one-element list and handled as a single document; a bare string
#would be iterated character by character, each one counted as its own document.
t.fit_on_texts([book_text])

#Vocabulary size (number of distinct characters found in the book)
vocab_size = len(t.word_index)

print('Number of unique characters: ', vocab_size)

Number of unique characters:  86


In [6]:
#Character vocabulary: maps every character to its integer index
print(t.word_index)

{' ': 1, 'e': 2, 't': 3, 'a': 4, 'o': 5, 'n': 6, 'i': 7, 'h': 8, 'r': 9, 's': 10, 'd': 11, 'l': 12, 'u': 13, '\n': 14, 'm': 15, 'c': 16, 'y': 17, 'f': 18, 'w': 19, 'g': 20, ',': 21, 'p': 22, 'b': 23, '.': 24, 'v': 25, 'k': 26, 'I': 27, '“': 28, '”': 29, 'M': 30, ';': 31, '-': 32, 'B': 33, 'z': 34, 'T': 35, 'x': 36, 'E': 37, '_': 38, 'L': 39, "'": 40, 'H': 41, 'C': 42, 'W': 43, 'j': 44, 'q': 45, 'D': 46, 'S': 47, 'A': 48, '!': 49, '?': 50, 'Y': 51, 'J': 52, 'P': 53, 'N': 54, 'G': 55, 'O': 56, 'F': 57, 'R': 58, ':': 59, 'K': 60, '1': 61, 'U': 62, '*': 63, '(': 64, ')': 65, '2': 66, '3': 67, '4': 68, '0': 69, 'V': 70, '5': 71, '/': 72, '8': 73, '6': 74, '9': 75, '7': 76, 'Z': 77, 'X': 78, '@': 79, '$': 80, '\ufeff': 81, '[': 82, '#': 83, ']': 84, '%': 85, 'Q': 86}


In [7]:
#Convert characters in the book to numbers
#NOTE: the book is passed as a bare string, so every character is encoded as its
#own one-element sequence; book_num is therefore a list of single-item lists
book_num = t.texts_to_sequences(book_text)
print(book_num[1000:1050])

[[18], [13], [12], [1], [22], [9], [5], [22], [2], [9], [3], [17], [14], [5], [18], [1], [10], [5], [15], [2], [1], [5], [6], [2], [1], [5], [9], [1], [5], [3], [8], [2], [9], [1], [5], [18], [1], [3], [8], [2], [7], [9], [1], [11], [4], [13], [20], [8], [3], [2]]


In [8]:
#Invert the tokenizer's character -> index mapping so that predicted indices
#can be decoded back into characters
int_to_char = {index: char for char, index in t.word_index.items()}

In [9]:
# Spot-check the reverse lookup for index 15.
int_to_char[15]

'm'

### Prepare Input and Output Sequences

Input and output container
- Input data will have sequences with 100 characters
- Output data will have one character which comes after 100 characters in the input data

In [10]:
sequence_length = 100 #Number of characters in every input window

#Slide a window of sequence_length characters over the encoded book, one step at
#a time. Each window is one input example and the character that immediately
#follows it is the matching output.
num_examples = len(book_num) - sequence_length

#Input windows
input_data = [book_num[start : start + sequence_length]
              for start in range(num_examples)]

#Character following each window
output_data = [book_num[start + sequence_length]
               for start in range(num_examples)]

In [11]:
# Sanity-check the number of examples and the size of one input/output pair (example 10).
print('Total number of input arrays: ', len(input_data))
print('Total number of Output arrays: ', len(output_data))
print("Input Data length: ",len(input_data[10]))
print("Output Data length: ",len(output_data[10]))

Total number of input arrays:  704090
Total number of Output arrays:  704090
Input Data length:  100
Output Data length:  1


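book_example = 'I am home'  (9 characters)
example_size = 5  # number of characters in each example
1st example = 'I am '
2nd example = ' am h'
3rd example = 'am ho'
4th example = 'm hom'
5th example = ' home'

In total there are 5 windows:
9 - 5 + 1 = 5

For training, every window also needs the character that follows it as its output, so the last window (which has no next character) is dropped and the number of examples is length - example_size:
704190 - 100 = 704090 examples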

In [12]:
# Output (next-character index) for example 10.
output_data[10]

[2]

In [13]:
# Input window for example 10: a list of single-item lists, one per character index.
input_data[10]

[[16],
 [3],
 [1],
 [55],
 [13],
 [3],
 [2],
 [6],
 [23],
 [2],
 [9],
 [20],
 [1],
 [37],
 [33],
 [5],
 [5],
 [26],
 [1],
 [5],
 [18],
 [1],
 [53],
 [9],
 [7],
 [11],
 [2],
 [1],
 [4],
 [6],
 [11],
 [1],
 [53],
 [9],
 [2],
 [44],
 [13],
 [11],
 [7],
 [16],
 [2],
 [21],
 [1],
 [23],
 [17],
 [1],
 [52],
 [4],
 [6],
 [2],
 [1],
 [48],
 [13],
 [10],
 [3],
 [2],
 [6],
 [14],
 [14],
 [35],
 [8],
 [7],
 [10],
 [1],
 [2],
 [33],
 [5],
 [5],
 [26],
 [1],
 [7],
 [10],
 [1],
 [18],
 [5],
 [9],
 [1],
 [3],
 [8],
 [2],
 [1],
 [13],
 [10],
 [2],
 [1],
 [5],
 [18],
 [1],
 [4],
 [6],
 [17],
 [5],
 [6],
 [2],
 [1],
 [4],
 [6],
 [17],
 [19],
 [8]]

### One Hot encoding for Input and Output

In [None]:
# Bytes needed for the one hot encoded outputs: examples x 1 character x 87 classes x 4 bytes per value.
704090 * 1 * 87 * 4

input_data = 704090 x 100 x 87 = 6,125,583,000 values
Each value = 4 bytes
Number of bytes = 24,502,332,000 ≈ 24.5 GB
Output = 704090 x 1 x 87 x 4 bytes ≈ 245 MB

In [14]:
num_classes = vocab_size + 1 #character indices start at 1, so index 0 needs a slot too

#Input data one hot encoding
#A float32 one hot tensor of every input window needs about 24.5 GB and raises
#MemoryError, so the tensor is built directly as uint8 (1 byte per value, about
#6.1 GB for the whole book) instead of going through to_categorical.
input_ids = np.asarray(input_data).reshape(-1)
input_data_one_hot = np.zeros((input_ids.size, num_classes), dtype=np.uint8)
input_data_one_hot[np.arange(input_ids.size), input_ids] = 1
input_data_one_hot = input_data_one_hot.reshape(len(input_data),
                                                sequence_length,
                                                num_classes)
del input_ids #the flat index array is only needed while filling the tensor

#Output data one hot encoding (one row per example, small enough for float32)
output_data = tf.keras.utils.to_categorical(output_data,num_classes=vocab_size+1)

MemoryError: 

- Input = 700K+ sequences
- Each sequence = 100 characters
- 700K * 100 = 70M numbers
- 1 character = 86 numbers after one-hot encoding
- Total numbers = 700K * 100 * 86 = 6,020,000,000
- 1 number = 4 bytes
- 6B * 4 bytes = 24B bytes = 24 GB

In [18]:
# NOTE(review): presumably the bytes for a smaller one hot tensor (2000 examples x 100 characters x 60 classes x 4 bytes) — TODO confirm what the factors stand for.
2000 * 100 * 60 * 4

48000000

### Reshaping input data

In [None]:
#Give the one hot inputs an explicit 3 dimensional layout:
#number of examples x sequence_length x vocab_size+1
one_hot_shape = (len(input_data_one_hot), sequence_length, vocab_size+1)
input_data_one_hot = np.reshape(input_data_one_hot, one_hot_shape)

print(input_data_one_hot.shape)

### Build the Model

In [None]:
#Single LSTM layer followed by dropout and a softmax over all character classes
model = tf.keras.models.Sequential()
#input_data is a plain Python list and has no .shape attribute, so the window
#shape is spelled out from the constants that define the one hot encoding:
#sequence_length time steps, each a vector of vocab_size+1 values
model.add(tf.keras.layers.LSTM(256, input_shape=(sequence_length, vocab_size+1)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(vocab_size+1, activation='softmax'))

model.compile(optimizer='adam',
              loss='categorical_crossentropy') #No accuracy tracking here

### Execute the model

In [None]:
#Identify a random sequence which we will use to generate output
#input_data is a Python list, so its size comes from len() rather than .shape;
#the chosen window is turned into an array of character indices because the
#prediction code copies, rolls and indexes it with NumPy
test_seq = np.array(input_data[np.random.randint(0, high=len(input_data))])

In [None]:
def predict_seq(epoch, logs=None, num_chars=50):
    """Print text generated greedily by the model, starting from test_seq.

    Written to be used as the on_epoch_end hook of a Keras LambdaCallback,
    which calls it with the epoch index and the logs dictionary.

    Args:
        epoch: index of the epoch that just ended (not used).
        logs: metrics dictionary supplied by Keras (not used).
        num_chars: number of characters to generate (default 50).
    """
    print('Output sequence is: ')

    #Characters predicted so far
    predicted_chars = []

    #Work on a flat copy of the seed indices so test_seq itself is never changed
    current_seq = np.array(test_seq).reshape(-1)
    for _ in range(num_chars):
        #One hot encode the current window and add a leading batch dimension of 1
        current_seq_one_hot = tf.keras.utils.to_categorical(current_seq,
                                                            num_classes=vocab_size+1)
        data_input = current_seq_one_hot[np.newaxis, ...]

        #Get the char int with maximum probability
        predicted_char_int = int(np.argmax(model.predict(data_input)[0]))

        #Convert int to char; index 0 has no character in the vocabulary, so it
        #contributes nothing instead of raising KeyError
        predicted_chars.append(int_to_char.get(predicted_char_int, ''))

        #Slide the window: drop the oldest index and append the new prediction
        current_seq = np.roll(current_seq, -1)
        current_seq[-1] = predicted_char_int

    print(''.join(predicted_chars))

In [None]:
#Create a LambdaCallback to do prediction at end of every epoch
#(referenced through tf.keras because LambdaCallback is not imported by name)
checkpoint = tf.keras.callbacks.LambdaCallback(on_epoch_end=predict_seq)

In [None]:
#Print random starting sequence for prediction
print('Initial sequence is: ')
#test_seq holds raw character indices, so each one is looked up directly;
#np.ravel flattens the single-item entries into plain indices first
for char_int in np.ravel(test_seq):
    print(int_to_char[int(char_int)], end='')

In [None]:
#Train on the one hot encoded windows, the same encoding predict_seq feeds to
#model.predict; input_data itself is a nested Python list of raw indices.
#predict_seq runs through the checkpoint callback after every epoch.
model.fit(input_data_one_hot, output_data,
          batch_size=128,
          epochs=50,
          callbacks=[checkpoint])