In [19]:
%load_ext autoreload 
%autoreload 2

from __future__ import print_function
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional, Embedding
from keras.optimizers import RMSprop, Adam
from keras.preprocessing.text import Tokenizer
import keras.utils
import numpy as np
import random
import sys
import io
import os
import re
import itertools
from collections import Counter

from src.read_data import read_trump_speeches
from src.utils import print_tensorflow_devices
from src.data_generator import DataGenerator
print_tensorflow_devices()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 14858044814870719890
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3177234432
locality {
  bus_id: 1
  links {
  }
}
incarnation: 708338030393306804
physical_device_desc: "device: 0, name: GeForce GTX 970, pci bus id: 0000:01:00.0, compute capability: 5.2"
]


In [11]:
# Parameters
seq_len = 21 # includes next word.

# NN parameters
batch_size = 128

examples_file_loc = 'examples/examples.txt'

In [12]:
speeches = read_trump_speeches('data/speeches.txt')
words = np.unique(speeches)
word_index = dict((c, i) for i, c in enumerate(words))
index_word = dict((i, c) for i, c in enumerate(words))
n_words = len(words)

speeches_indexed = [word_index[x] for x in speeches]
sentence_ranges = [range(i,i+seq_len) for i in range(0,len(speeches)-seq_len)]
sentences = [[speeches[y] for y in x] for x in sentence_ranges]
sentences_indexed = [[speeches_indexed[y] for y in x] for x in sentence_ranges]

In [13]:
# Function from keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Function modified from https://github.com/enriqueav/lstm_lyrics/blob/master/lstm_train.py
def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence
    seed = (sentences_indexed_test)[np.random.randint(len(sentences_indexed_test))]

    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        sentence = seed
        examples_file.write('\n----- Diversity:' + str(diversity) + '\n')
        examples_file.write('----- Generating with seed:\n"' + ' '.join([index_word[x] for x in sentence]) + '"\n')

        sentence = sentence.copy()
        full_sentence = sentence.copy()
        x_pred = np.zeros((batch_size, seq_len-1), dtype=np.int)
        
        for i in range(50):
            x_pred[0,] = sentence[:-1]
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = next_index
            sentence = sentence[1:]
            sentence.append(next_word)
            full_sentence.append(next_word)
            
        full_sentence = [index_word[x] for x in full_sentence]
        examples_file.write(' '.join(full_sentence))
    examples_file.write('\n' + '='*80 + '\n\n')
    examples_file.flush()

In [23]:
random.shuffle(sentences_indexed)
train_split = int(0.95*len(sentences_indexed))
X = np.zeros((len(sentences_indexed), seq_len-1), dtype=np.int)
y = np.zeros((len(sentences_indexed), n_words), dtype=np.bool)
for i, sentence_idx in enumerate(sentences_indexed):
    X[i,] = sentence_idx[:-1]
    y[i, sentence_idx[-1]] = 1
X_train, y_train = X[:train_split],y[:train_split]
X_test, y_test = X[train_split:],y[train_split:]

In [15]:
def get_model():
    print('Build model...')
    model = Sequential()
    model.add(Embedding(n_words, 50, input_length=seq_len-1))
    model.add(LSTM(100, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(100))
    model.add(Dropout(0.2))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(n_words, activation='softmax'))
    return model

In [None]:
model = get_model()
optimizer = Adam(lr=0.007)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
early_stopping = EarlyStopping(monitor='val_acc', patience=5)
callbacks_list = [print_callback, early_stopping]

examples_file = open(examples_file_loc, "w")

model.fit(X,y,epochs=100, batch_size= 128)
                    # callbacks=callbacks_list)

Build model...
Epoch 1/100
 29184/185632 [===>..........................] - ETA: 2:28 - loss: 6.0055 - acc: 0.0888