In [16]:
%load_ext autoreload 
%autoreload 2
%matplotlib inline

from __future__ import print_function
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
import keras.utils
import numpy as np
import random
import sys
import io
import os
import re
import itertools
from collections import Counter

from src.read_data import read_trump_speeches
from src.utils import print_tensorflow_devices
print_tensorflow_devices()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[name: "/cpu:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17010289459145202023
]


In [11]:
# Parameters
seq_len = 21 # includes next word.

# NN parameters
batch_size = 64

examples_file_loc = 'examples/examples.txt'

In [78]:
speeches = read_trump_speeches('data/speeches.txt')
words = np.unique(speeches)
word_index = dict((c, i) for i, c in enumerate(words))
index_word = dict((i, c) for i, c in enumerate(words))
n_words = len(words)

speeches_indexed = [word_index[x] for x in speeches]
sentence_ranges = [range(i,i+seq_len) for i in range(0,len(speeches)-seq_len,step)]
sentences = [[speeches[y] for y in x] for x in sentence_ranges]
sentences_indexed = [[speeches_indexed[y] for y in x] for x in sentence_ranges]

In [92]:
# modified from https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly.html
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, sentences_indexed, n_words, batch_size=32, shuffle=True):
        'Initialization'
        self.sentences_indexed = sentences_indexed
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.n_words = n_words
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.sentences_indexed) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        # Find list of sentences
        sentences_indexed_temp = [self.sentences_indexed[k] for k in indexes]
        # Generate data
        X, y = self.__data_generation(sentences_indexed_temp)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.sentences_indexed))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, sentences_indexed_temp):
        'Generates data containing batch_size samples' 
        X = np.zeros((self.batch_size, seq_len-1), dtype=np.int)
        y = np.zeros((self.batch_size, self.n_words), dtype=np.bool)

        # Generate data
        for i, sentence_idx in enumerate(sentences_indexed_temp):
            X[i,] = sentence_idx[:-1]                                             
            y[i, sentence_idx[-1]] = 1

        return X, y

In [94]:
# Function from keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Function modified from https://github.com/enriqueav/lstm_lyrics/blob/master/lstm_train.py
def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence
    seed_index = np.random.randint(len(sentences_test))
    seed = (sentences_test)[seed_index]

    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        sentence = seed
        examples_file.write('\n----- Diversity:' + str(diversity) + '\n')
        examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')

        sentence = sentence.copy()
        full_sentence = sentence.copy()

        for i in range(50):
            x_pred = np.zeros((1, seq_len-1, len(word_indices)))
            for t, word in enumerate(sentence):
                x_pred[0, t, word_indices[word]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = indices_word[next_index]
            sentence = sentence[1:]
            sentence.append(next_word)
            full_sentence.append(next_word)
        examples_file.write(' '.join(full_sentence))
    examples_file.write('\n' + '='*80 + '\n\n')
    examples_file.flush()

In [99]:
# Train test split
random.shuffle(sentences_indexed)
train_split = int(0.95*len(sentences_indexed))
sentences_indexed_train = sentences_indexed[:train_split]
sentences_indexed_test = sentences_indexed[train_split:]
print('Train: ' + str(len(sentences_indexed_train)))
print('Test: ' +str(len(sentences_indexed_test)))

Train: 176350
Test: 9282


In [96]:
def get_model():
    print('Build model...')
    model = Sequential()
    model.add(Embedding(len(word_indices), 50, input_length=seq_len-1))
    model.add(LSTM(100,return_sequences=True))
    model.add(Dropout(0.2))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(n_words, activation='softmax'))
    return model

In [100]:
model = get_model()
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
early_stopping = EarlyStopping(monitor='val_acc', patience=5)
callbacks_list = [print_callback, early_stopping]

examples_file = open(examples_file_loc, "w")

model.fit_generator(DataGenerator(sentences_indexed_train, n_words, batch_size),
                    steps_per_epoch=int(len(sentences_indexed)/batch_size) + 1,
                    epochs=100,
                    callbacks=callbacks_list,
                    validation_data=DataGenerator(sentences_indexed_test, n_words, batch_size),
                    validation_steps=int(len(sentences_indexed_test)/batch_size) + 1)

Build model...
Epoch 1/100[[5410, 1063, 4, 2734, 5200, 2969, 5399, 205, 658, 5422, 1610, 5405, 6019, 4985, 3591, 401, 3960, 474, 917, 3121, 2894], [3960, 4, 5888, 2452, 5479, 1148, 5405, 1472, 4, 5888, 2452, 5479, 2405, 2987, 1741, 4, 401, 5479, 2405, 2987, 1741], [2, 401, 1902, 5690, 680, 1404, 919, 5405, 4166, 4, 6020, 2, 2759, 3339, 206, 5056, 401, 1802, 5405, 5056, 2759], [401, 5413, 5848, 3207, 2669, 1185, 586, 401, 5848, 266, 2532, 206, 3674, 5313, 4, 2593, 266, 4783, 3243, 206, 3674], [2, 502, 6026, 3113, 4, 2995, 2432, 5479, 645, 1759, 2759, 1745, 5432, 2987, 1987, 5275, 4, 2656, 2407, 5007, 2330], [2532, 842, 3243, 1998, 3803, 1995, 3820, 1466, 4, 5425, 3066, 4720, 2, 1101, 2, 5492, 5887, 1151, 5690, 5405, 2402], [1494, 4, 4985, 2759, 5432, 5887, 2432, 5479, 645, 3561, 2827, 4, 2759, 5432, 5887, 2432, 5479, 645, 3561, 5653, 401], [5812, 4, 2441, 2556, 4, 2995, 2441, 2556, 4, 5405, 3960, 3002, 1475, 5946, 401, 6026, 3113, 2, 2759, 4396, 2248], [3713, 2, 3719, 3776, 2, 5422, 328

ValueError: Error when checking target: expected dense_4 to have 3 dimensions, but got array with shape (64, 6051)

[401, 919, 5405, 5845, 2, 5848, 2589, 206, 3304, 3746, 2443, 3960, 4, 5848, 2589, 1980, 4, 3688, 2969, 4396, 5848]
[401, 2691, 3337, 2, 3287, 538, 5421, 3722, 2773, 2759, 1626, 4665, 2, 3000, 4198, 5989, 2589, 2182, 433, 658, 6026]
[2901, 4, 5897, 2759, 1624, 5405, 497, 3746, 5405, 1472, 2, 2759, 5432, 3776, 3746, 5405, 4399, 2987, 5828, 4985, 5218]
[4, 912, 2759, 5990, 1987, 5832, 3608, 5469, 5946, 5438, 3776, 4, 2759, 5989, 929, 5690, 5405, 2595, 3746, 2277, 2]
[1996, 334, 5479, 4, 5848, 367, 4426, 3803, 1861, 5007, 907, 3291, 3381, 3746, 5410, 266, 401, 1513, 3339, 5676, 3803]
[2, 2995, 2432, 5479, 645, 3243, 3842, 2, 4170, 3591, 3842, 4, 658, 5887, 2432, 5479, 1708, 2483, 4, 4, 1887]
[658, 2593, 5989, 2589, 1185, 586, 4, 401, 919, 5405, 5845, 2, 5048, 3746, 5399, 2, 6026, 3113, 2, 2759, 5828]
[5405, 5749, 401, 3003, 5972, 2566, 2268, 5405, 5749, 4, 401, 502, 6026, 943, 4778, 2, 3003, 2415, 206, 3304, 3746]
[3430, 4, 6038, 2432, 5479, 4720, 2, 3759, 2, 2405, 2669, 5405, 2628, 3829, 

In [57]:
x = np.zeros((5,5))

In [62]:
[1,2,3,4,5][:-1]

[1, 2, 3, 4]

In [61]:
x

array([[ 1.,  2.,  3.,  4.,  5.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.]])