In [1]:
import tensorflow as tf 
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 3091932945239389015
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3177234432
locality {
  bus_id: 1
  links {
  }
}
incarnation: 2065386910885991521
physical_device_desc: "device: 0, name: GeForce GTX 970, pci bus id: 0000:01:00.0, compute capability: 5.2"
]


In [2]:
import sys
sys.version

'3.5.0 |Continuum Analytics, Inc.| (default, Dec  1 2015, 11:46:22) [MSC v.1900 64 bit (AMD64)]'

In [3]:
"""
Example script to generate text from a corpus of text
--By word--
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
Based on
https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py
20 epochs should be enough to get decent results.
Uses data generator to avoid loading all the test set into memory.
Saves the weights and model every epoch.
"""

from __future__ import print_function
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional
import keras.utils
import numpy as np
import random
import sys
import io
import os
import re
import itertools
from collections import Counter

Using TensorFlow backend.


In [15]:
# Parameters
seq_len = 11 # includes next word.
step = 1
word_freq_threshold = 2

# NN parameters
batch_size = 64

examples_file_loc = 'examples/examples.txt'

In [5]:
# Read file
file = open('data/speeches.txt', 'r',encoding='utf-8-sig') 
speeches = file.read()

# Remove text between brackets, such as (inaudible) or (laughter)
speeches = re.sub("[\(\[].*?[\)\]]", "", speeches)
# Remove the speech introductions
speeches = re.sub(r'SPEECH.+?\n', '', speeches)
# Replace multiple periods with a single one.
speeches = re.sub('\.+','. ',speeches)
# different uses of this character
speeches = re.sub('\'','’',speeches)
# Replace new lines with spaces
speeches = re.sub('\n',' ', speeches)

# Treat the following interpunction characters as separate words, so we can generate them.
speeches = re.sub('\. ',' . ', speeches)
speeches = re.sub(', ',' , ', speeches)
speeches = re.sub('\? ',' ? ', speeches)
speeches = re.sub('! ',' ! ', speeches)
speeches = re.sub('; ',' ; ', speeches)
punc = '.,?!;'

# Keep only this set of characters, replace multiple whitespace with single, and convert to lower case.
speeches = re.sub('[^0-9a-zA-Z\.,\?!;’]+', ' ', speeches)
speeches = re.sub('\s+',' ', speeches)
speeches = speeches.lower()

In [6]:
words = speeches.split(' ')
counts = Counter(words)

# Identify the foribdden words, i.e. words that occur less than a certain threshold.
forbidden_words=[]
for word in list(counts):
    if counts[word] < word_freq_threshold:
        forbidden_words.append(word)
        
# Find the indices of the forbidden words        
forbidden_indices = [[i for i, x in enumerate(words) if x == f_word] for f_word in forbidden_words]
forbidden_indices = list(itertools.chain.from_iterable(forbidden_indices))
forbidden_indices.sort()

# Now, create the ranges of words for the sentences. If a range contains one of the indices in
# forbidden_indices, we omit it from the data.
sentence_ranges = [range(i,i+seq_len-1) for i in range(0,len(words)-seq_len,step)]
s = 0
f = 0
while (s < len(sentence_ranges)) & (f < len(forbidden_indices)):
    if forbidden_indices[f] in sentence_ranges[s]:
        sentence_ranges.pop(s)
    else:
        if max(sentence_ranges[s])>forbidden_indices[f]:
            f+=1
        else:
            s+=1
sentences = [[words[y] for y in x] for x in sentence_ranges]

In [7]:
words = set(words)-set(forbidden_words)

In [8]:
word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))

In [9]:
print('Original sentences: ' + str(len([range(i,i+seq_len-1) for i in range(0,len(words)-seq_len,step)])))
print('Truncated sentences: '+ str(len(sentences)))

Original sentences: 3926
Truncated sentences: 167188


In [10]:
# modified from https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly.html
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, sentences, word_indices, batch_size=32, shuffle=True):
        'Initialization'
        self.sentences = sentences
        self.word_indices = word_indices
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.sentences) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of sentences
        sentences_temp = [self.sentences[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(sentences_temp)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.sentences))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, sentences_temp):
        'Generates data containing batch_size samples' 
        # Initialization
        X = np.zeros((self.batch_size, seq_len-1, len(self.word_indices)), dtype=np.bool)
        y = np.zeros((self.batch_size, len(self.word_indices)), dtype=np.bool)

        # Generate data
        for i, sentence in enumerate(sentences_temp):
            # Generate X                       
            for t, w in enumerate(sentence[:-1]):
                X[i, t, self.word_indices[w]] = 1       
                                           
            y[i, self.word_indices[sentence[-1]]] = 1

        return X, y

In [18]:
# Function from keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Function modified from https://github.com/enriqueav/lstm_lyrics/blob/master/lstm_train.py
def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence
    seed_index = np.random.randint(len(sentences_test))
    seed = (sentences_test)[seed_index]

    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        sentence = seed
        examples_file.write('\n----- Diversity:' + str(diversity) + '\n')
        examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')

        sentence = sentence.copy()
        full_sentence = sentence.copy()

        for i in range(50):
            x_pred = np.zeros((1, seq_len-1, len(word_indices)))
            for t, word in enumerate(sentence):
                x_pred[0, t, word_indices[word]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = indices_word[next_index]
            sentence = sentence[1:]
            sentence.append(next_word)
            full_sentence.append(next_word)
        examples_file.write(' '.join(full_sentence))
    examples_file.write('\n' + '='*80 + '\n\n')
    examples_file.flush()

In [24]:
# Train test split
random.shuffle(sentences)
train_split = int(0.98*len(sentences))
sentences_train = sentences[:train_split]
sentences_test = sentences[train_split:]

In [25]:
# Train test split
random.shuffle(sentences)
train_split = int(0.98*len(sentences))
sentences_train = sentences[:train_split]
sentences_test = sentences[train_split:]

In [26]:
def get_model(dropout=0.2):
    print('Build model...')
    model = Sequential()
    model.add(Bidirectional(LSTM(128), input_shape=(seq_len-1, len(word_indices))))
    if dropout > 0:
        model.add(Dropout(dropout))
    model.add(Dense(len(words)))
    model.add(Activation('softmax'))
    return model

In [27]:
model = get_model()
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
early_stopping = EarlyStopping(monitor='val_acc', patience=5)
callbacks_list = [print_callback, early_stopping]

examples_file = open(examples_file_loc, "w")

model.fit_generator(DataGenerator(sentences_train, word_indices, batch_size),
                    steps_per_epoch=int(len(sentences)/batch_size) + 1,
                    epochs=100,
                    callbacks=callbacks_list,
                    validation_data=DataGenerator(sentences_test, word_indices, batch_size),
                    validation_steps=int(len(sentences_test)/batch_size) + 1)

Build model...
[160983 139822   7253 ... 122265  35627  14083]
[2833 2456 3016 ... 2504 1456 2335]
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
[1992 2757  961 ... 2942 3128 2500]
Epoch 5/100
[ 959  737 2278 ...  705 2676 2399]
Epoch 6/100
[ 806 2438 3218 ...  631 2394 1401]
Epoch 7/100
[ 130 3273  931 ...  982 1775 1119]
Epoch 8/100
[2853 2689 2354 ... 1562 2441 3292]
Epoch 9/100
[1972 1947 2383 ... 1564 2782 2078]
Epoch 10/100
[ 223 1678 3260 ... 1911 2542  738]
Epoch 11/100
[2819 2365  650 ...  131 2756 2628]
Epoch 12/100

Epoch 13/100
[1128 1691 2029 ... 1187 1141 1273]
Epoch 14/100
[1673 1106  438 ...  643 1358 1305]
Epoch 15/100
Epoch 16/100
[ 759 2832 2318 ... 2864  584 3309]


<keras.callbacks.History at 0x218fca069b0>