In [1]:
%load_ext autoreload 
%autoreload 2

from __future__ import print_function
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping, CSVLogger
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional, Embedding
from keras.optimizers import RMSprop, Adam
from keras.preprocessing.text import Tokenizer
from keras.models import load_model
import keras.utils
import numpy as np
import random
import sys
import io
import os
import re
import itertools
import time
import json
from collections import Counter

from src.read_data import read_trump_speeches
from src.utils import print_tensorflow_devices
from src.data_generator import DataGenerator
from src.examples_generator_callback import ExamplesGeneratorCallback
# Print the devices TensorFlow can see (CPU / GPU) so the run records which hardware was used.
print_tensorflow_devices()

Using TensorFlow backend.


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 7591652797108776567
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3177234432
locality {
  bus_id: 1
  links {
  }
}
incarnation: 10581531973759208939
physical_device_desc: "device: 0, name: GeForce GTX 970, pci bus id: 0000:01:00.0, compute capability: 5.2"
]


In [2]:
# Parameters shared by the data preparation, model definition and training cells.
seq_len = 21 # window length in words: seq_len - 1 input words plus the next word to predict.
batch_size = 128 # batch size handed to the DataGenerator instances and used to derive steps per epoch.
epochs = 50 # upper bound on training epochs; the EarlyStopping callback may end a run sooner.

In [3]:
# Load the corpus. read_trump_speeches is expected to return an indexable sequence of
# word tokens (it is iterated, indexed by position and passed to len() below).
speeches = read_trump_speeches('data/speeches.txt')

# Vocabulary: np.unique returns the sorted, de-duplicated tokens, so the word <-> index
# mappings are stable for a given corpus.
words = np.unique(speeches)
word_index = {word: idx for idx, word in enumerate(words)}
index_word = {idx: word for idx, word in enumerate(words)}
n_words = len(words)

# Index the speeches, then create sentences of length 'seq_len' that we can train the RNN on.
# Windows start at every position (stride 1), so consecutive windows share seq_len - 1 words.
# NOTE(review): the range stops at len(speeches) - seq_len (exclusive), so the very last full
# window is not generated; left as is to keep the sample counts unchanged.
speeches_indexed = [word_index[x] for x in speeches]
sentence_ranges = [range(i, i + seq_len) for i in range(0, len(speeches) - seq_len)]
sentences = [[speeches[y] for y in x] for x in sentence_ranges]
sentences_indexed = [[speeches_indexed[y] for y in x] for x in sentence_ranges]

# Train-test split
# Shuffle with a dedicated, seeded RNG so the split is identical on every
# "Restart & Run All" and does not disturb the global `random` state.
# NOTE(review): because the windows overlap, a shuffled split places near-duplicate
# windows in both sets, so validation accuracy is optimistic. A split by position
# in the corpus would avoid this.
split_seed = 42
random.Random(split_seed).shuffle(sentences_indexed)
train_split = int(0.95 * len(sentences_indexed))
sentences_indexed_train = sentences_indexed[:train_split]
sentences_indexed_test = sentences_indexed[train_split:]
print('Train: ' + str(len(sentences_indexed_train)))
print('Test: ' + str(len(sentences_indexed_test)))

Train: 176350
Test: 9282


In [7]:
def get_model(version):
    """Build (without compiling) one of the bidirectional-LSTM next-word models.

    Every variant has the same skeleton: two stacked bidirectional LSTM layers
    (ReLU activation), each followed by dropout, then a small dense head and a
    softmax layer over the vocabulary. The input shape is
    ``(seq_len - 1, n_words)``; ``seq_len`` and ``n_words`` are read from the
    notebook globals.

    Parameters
    ----------
    version : int
        Which architecture to build:
        1 - LSTM 100/100, dropout 0.5/0.3, Dense(200);
        2 - LSTM 150/150, dropout 0.5/0.5, Dense(250);
        3 - LSTM 200/100, dropout 0.4/0.4, Dense(150) -> Dropout(0.4) -> Dense(150).

    Returns
    -------
    keras.models.Sequential
        The assembled, uncompiled model.

    Raises
    ------
    ValueError
        If ``version`` is not one of the known architectures. Previously an
        unknown version silently returned ``None``, which only surfaced later
        as an attribute error when the caller tried to compile the model.
    """
    # Per version: units of the two LSTM layers, the dropout rate applied after
    # each of them, and the dense head as ('dense', units) / ('dropout', rate) steps.
    architectures = {
        1: {'lstm_units': (100, 100), 'lstm_dropout': (0.5, 0.3),
            'head': [('dense', 200)]},
        2: {'lstm_units': (150, 150), 'lstm_dropout': (0.5, 0.5),
            'head': [('dense', 250)]},
        3: {'lstm_units': (200, 100), 'lstm_dropout': (0.4, 0.4),
            'head': [('dense', 150), ('dropout', 0.4), ('dense', 150)]},
    }

    # Validate before touching Keras so a typo fails immediately and clearly.
    if version not in architectures:
        raise ValueError('Unknown model version {!r}; expected one of {}'.format(
            version, sorted(architectures)))
    spec = architectures[version]

    print('Build model...')
    first_units, second_units = spec['lstm_units']
    first_dropout, second_dropout = spec['lstm_dropout']

    model = Sequential()
    # The first recurrent layer returns the full sequence so the second one can consume it.
    model.add(Bidirectional(LSTM(first_units, activation="relu", return_sequences=True),
                            input_shape=(seq_len-1, n_words)))
    model.add(Dropout(first_dropout))
    model.add(Bidirectional(LSTM(second_units, activation="relu")))
    model.add(Dropout(second_dropout))

    for kind, value in spec['head']:
        if kind == 'dense':
            model.add(Dense(value, activation='relu'))
        else:
            model.add(Dropout(value))

    # One output unit per vocabulary word: a probability distribution over the next word.
    model.add(Dense(n_words, activation='softmax'))
    return model


In [8]:
# File name of a saved model under 'models/' to continue training from.
# None (the default) builds a fresh network for every version.
model_to_be_loaded = None

# Batches needed to see each dataset once: ceil(samples / batch_size), at least 1.
# The previous int(n / batch_size) + 1 scheduled one surplus batch whenever n was an
# exact multiple of batch_size. Both forms give the same value for other sizes.
train_steps = max(1, -(-len(sentences_indexed_train) // batch_size))
validation_steps = max(1, -(-len(sentences_indexed_test) // batch_size))

for version in [1,2,3]:

    # Each run logs into its own timestamped directory under 'log/'.
    model_id = time.strftime("%Y%m%d-%H%M%S")
    log_dir = 'log/' + model_id
    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(log_dir, exist_ok=True)

    if model_to_be_loaded is not None:
        model = load_model('models/' + model_to_be_loaded)
    else:
        model = get_model(version)

    optimizer = Adam(lr=0.001)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # Keep a human-readable summary and the full layer configuration next to the logs
    # so every run can be traced back to its exact architecture.
    with open(log_dir + '/summary.txt','w') as fh:
        # Pass the file handle in as a lambda function to make it callable
        model.summary(print_fn=lambda x: fh.write(x + '\n'))

    with open(log_dir + '/configuration.txt', 'w') as fout:
        json.dump(model.get_config(), fout)

    # NOTE(review): ExamplesGeneratorCallback presumably writes generated sample text to
    # examples.txt during training - confirm in src.examples_generator_callback.
    print_callback = ExamplesGeneratorCallback(sentences_indexed_test, index_word, log_dir + '/examples.txt', seq_len, n_words)
    # Stop a run once validation accuracy has not improved for 5 consecutive epochs.
    # NOTE(review): newer Keras releases report this metric as 'val_accuracy' - confirm
    # against the installed version before upgrading.
    early_stopping = EarlyStopping(monitor='val_acc', patience=5)
    csv_logger = CSVLogger(log_dir + '/loss_log.csv', append=True, separator=';')
    callbacks_list = [print_callback, early_stopping, csv_logger]

    history = model.fit_generator(DataGenerator(sentences_indexed_train, seq_len, n_words, batch_size),
                        steps_per_epoch=train_steps,
                        epochs=epochs,
                        callbacks=callbacks_list,
                        validation_data=DataGenerator(sentences_indexed_test, seq_len, n_words, batch_size),
                        validation_steps=validation_steps)

    # The saved file is named after the time training finished, not after model_id.
    model.save(log_dir + '/' + time.strftime("%Y%m%d-%H%M%S") + '.h5')

Build model...
Epoch 1/50
Epoch 2/50
Epoch 3/50


  preds = np.log(preds) / temperature


Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Build model...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Build model...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50


In [6]:
# Load a previously saved model from disk; this rebinds the notebook-global `model`.
# NOTE(review): this cell's execution count (6) is lower than the cells above it, so it
# was run out of order - re-check that it is still needed on a fresh top-to-bottom run.
model = load_model('models/LSTM256-Dr30-De100-acc0.45-valacc-0.25.h5')

In [None]:
def sample(preds, temperature=1.0):
    """Draw one word index from a probability vector after temperature re-scaling.

    Defined here because the notebook neither defines nor imports `sample`;
    without it this cell raises NameError on a freshly started kernel.

    Parameters
    ----------
    preds : array-like of float
        Probabilities over the vocabulary (the softmax output for one sample).
    temperature : float
        Values below 1 sharpen the distribution (more conservative text),
        values above 1 flatten it (more surprising text).

    Returns
    -------
    int
        Index of the sampled word.
    """
    preds = np.asarray(preds).astype('float64')
    # log(0) = -inf is harmless here (it maps back to probability 0 below),
    # so silence the divide-by-zero warning it would otherwise emit.
    with np.errstate(divide='ignore'):
        preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return int(np.argmax(probas))


prediction_model = '20180831-163242.h5'
model = load_model('models/' + prediction_model)

# Generate text for 10 random seed windows, each at several sampling temperatures.
for example_no in range(10):
    print('\n ------------------------------------------------------------------- \n\n')
    # Randomly pick a seed sequence
    seed = (sentences_indexed)[np.random.randint(len(sentences_indexed))]

    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        # Work on copies so every temperature starts from the same untouched seed.
        sentence = list(seed)
        full_sentence = list(seed)

        for step in range(250):
            # Slide the window: drop the oldest word so seq_len - 1 words feed the model.
            sentence = sentence[1:]
            # One-hot encode the window. Plain `bool` replaces the `np.bool` alias,
            # which was removed from NumPy.
            x_pred = np.zeros((1, seq_len - 1, n_words), dtype=bool)
            for t, w in enumerate(sentence):
                x_pred[0, t, w] = 1

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            sentence.append(next_index)
            full_sentence.append(next_index)

        # Translate indices back to words: the seed followed by the 250 generated words.
        full_sentence = [index_word[x] for x in full_sentence]
        full_sentence = ' '.join(full_sentence)
        print(full_sentence)
        print('\n\n\n')

In [None]:
model.