In [139]:
%load_ext autoreload 
%autoreload 2
%matplotlib inline

from __future__ import print_function
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
import keras.utils
import numpy as np
import random
import sys
import io
import os
import re
import itertools
from collections import Counter

from src.read_data import read_trump_speeches
from src.utils import print_tensorflow_devices
from src.data_generator import DataGenerator
print_tensorflow_devices()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[name: "/cpu:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 11937138602351502381
]


In [11]:
# Parameters
seq_len = 21 # includes next word.

# NN parameters
batch_size = 64

examples_file_loc = 'examples/examples.txt'

In [78]:
speeches = read_trump_speeches('data/speeches.txt')
words = np.unique(speeches)
word_index = dict((c, i) for i, c in enumerate(words))
index_word = dict((i, c) for i, c in enumerate(words))
n_words = len(words)

speeches_indexed = [word_index[x] for x in speeches]
sentence_ranges = [range(i,i+seq_len) for i in range(0,len(speeches)-seq_len,step)]
sentences = [[speeches[y] for y in x] for x in sentence_ranges]
sentences_indexed = [[speeches_indexed[y] for y in x] for x in sentence_ranges]

In [129]:
# Function from keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Function modified from https://github.com/enriqueav/lstm_lyrics/blob/master/lstm_train.py
def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence
    seed = (sentences_indexed_test)[np.random.randint(len(sentences_indexed_test))]

    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        sentence = seed
        examples_file.write('\n----- Diversity:' + str(diversity) + '\n')
        examples_file.write('----- Generating with seed:\n"' + ' '.join([index_word[x] for x in sentence]) + '"\n')

        sentence = sentence.copy()
        full_sentence = sentence.copy()
        x_pred = np.zeros((batch_size, seq_len-1), dtype=np.int)
        
        for i in range(50):
            x_pred[0,] = sentence[:-1]
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = next_index
            sentence = sentence[1:]
            sentence.append(next_word)
            full_sentence.append(next_word)
            
        full_sentence = [index_word[x] for x in full_sentence]
        examples_file.write(' '.join(full_sentence))
    examples_file.write('\n' + '='*80 + '\n\n')
    examples_file.flush()

In [134]:
# Train test split
random.shuffle(sentences_indexed)
train_split = int(0.95*len(sentences_indexed))
sentences_indexed_train = sentences_indexed[:train_split]
sentences_indexed_test = sentences_indexed[train_split:]
print('Train: ' + str(len(sentences_indexed_train)))
print('Test: ' +str(len(sentences_indexed_test)))

Train: 176350
Test: 9282


In [135]:
def get_model():
    print('Build model...')
    model = Sequential()
    model.add(Embedding(n_words, 50, input_length=seq_len-1))
    model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(n_words, activation='softmax'))
    return model

In [141]:
model = get_model()
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
early_stopping = EarlyStopping(monitor='val_acc', patience=5)
callbacks_list = [print_callback, early_stopping]

examples_file = open(examples_file_loc, "w")

model.fit_generator(DataGenerator(sentences_indexed_train, seq_len, n_words, batch_size),
                    steps_per_epoch=int(len(sentences_indexed_train)/batch_size) + 1,
                    epochs=100,
                    callbacks=callbacks_list,
                    validation_data=DataGenerator(sentences_indexed_test, seq_len, n_words, batch_size),
                    validation_steps=int(len(sentences_indexed_test)/batch_size) + 1)

Build model...
Epoch 1/100
  63/2756 [..............................] - ETA: 810s - loss: 6.6540 - acc: 0.0687

KeyboardInterrupt: 