In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

from time import gmtime, strftime
import os
import re
import pickle
import random
import sys

Using TensorFlow backend.


In [2]:
# load ascii text from file
filename = "data/obama.txt"
raw_text = open(filename).read()

# get rid of any characters other than letters, numbers, 
# and a few special characters
raw_text = re.sub('[^\nA-Za-z0-9 ,.:;?!-]+', '', raw_text)

# convert all text to lowercase
raw_text = raw_text.lower()

n_chars = len(raw_text)
print "length of text:", n_chars
print "text preview:", raw_text[:500]

length of text: 18312
text preview: wherever i go these days, at home or abroad, people ask me the same question: what is happening in the american political system? how has a country that has benefitedperhaps more than any otherfrom immigration, trade and technological innovation suddenly developed a strain of anti-immigrant, anti-innovation protectionism? why have some on the far left and even more on the far right embraced a crude populism that promises a return to a past that is not possible to restoreand that, for most americ


In [3]:
# extract all unique characters in the text
chars = sorted(list(set(raw_text)))
n_vocab = len(chars)
print "number of unique characters found:", n_vocab

# create mapping of characters to integers and back
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

# test our mapping
print 'a', "- maps to ->", char_to_int["a"]
print 25, "- maps to ->", int_to_char[25]

number of unique characters found: 44
a - maps to -> 18
25 - maps to -> h


In [4]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100

inputs = []
outputs = []

for i in range(0, n_chars - seq_length, 1):
    inputs.append(raw_text[i:i + seq_length])
    outputs.append(raw_text[i + seq_length])
    
n_sequences = len(inputs)
print "Total sequences: ", n_sequences

Total sequences:  18212


In [5]:
# Shuffle inputs and targets with one shared permutation so every
# (sequence, next character) pair stays aligned.
# A dedicated, seeded generator keeps the ordering -- and with it the
# tail of the data that validation_split holds out during fit() --
# repeatable across kernel restarts, without touching the global
# `random` state used elsewhere in the notebook.
shuffle_rng = random.Random(42)

# list(...) guarantees a mutable sequence that can be shuffled in place
indeces = list(range(len(inputs)))
shuffle_rng.shuffle(indeces)

inputs = [inputs[x] for x in indeces]
outputs = [outputs[x] for x in indeces]

In [6]:
# sanity check: show the first input sequence after shuffling together
# with the character the model should predict for it
print inputs[0], "-->", outputs[0]

mericans are as a people. we dont begrudge success, we aspire to it and admire those who achieve it. -->  


In [7]:
# create two empty numpy array with the proper dimensions
X = np.zeros((n_sequences, seq_length, n_vocab), dtype=np.bool)
y = np.zeros((n_sequences, n_vocab), dtype=np.bool)

# iterate over the data and build up the X and y data sets
# by setting the appropriate indices to 1 in each one-hot vector
for i, example in enumerate(inputs):
    for t, char in enumerate(example):
        X[i, t, char_to_int[char]] = 1
    y[i, char_to_int[outputs[i]]] = 1
    
print 'X dims -->', X.shape
print 'y dims -->', y.shape

X dims --> (18212, 100, 44)
y dims --> (18212, 44)


In [8]:
# character-level model: a single LSTM layer over the one-hot window,
# dropout, then a softmax over the vocabulary
model = Sequential([
    LSTM(128, return_sequences=False, input_shape=X.shape[1:]),
    Dropout(0.50),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [9]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array, reshaped by a temperature.

    The probabilities are raised to the power 1/temperature and
    renormalised before one index is drawn with np.random.multinomial.
    Temperatures below 1 sharpen the distribution, above 1 flatten it.

    Args:
        preds: 1-D sequence of probabilities.
        temperature: scaling factor; 0 selects the most probable index
            deterministically (the limit of ever smaller temperatures).

    Returns:
        The sampled index (as returned by np.argmax).
    """
    preds = np.asarray(preds).astype('float64')

    # dividing by a zero temperature is undefined; its limit is greedy choice
    if temperature == 0:
        return np.argmax(preds)

    # log(0) = -inf is the intended result for impossible characters, so
    # silence the divide warning rather than letting it clutter the output
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # shift so the largest logit is 0: without this, a small temperature
    # underflows every exp() to 0 and the normalisation becomes 0/0 = nan
    logits -= np.max(logits)

    exp_preds = np.exp(logits)
    probs = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, probs, 1)
    return np.argmax(probas)

In [10]:
def generate(sentence, prediction_length=50, diversity=0.35):
    print '----- diversity:', diversity 

    generated = sentence
    sys.stdout.write(generated)

    # iterate over number of characters requested
    for i in range(prediction_length):
        
        # build up sequence data from current sentence
        x = np.zeros((1, X.shape[1], X.shape[2]))
        for t, char in enumerate(sentence):
            x[0, t, char_to_int[char]] = 1.

        # use trained model to return probability distribution
        # for next character based on input sequence
        preds = model.predict(x, verbose=0)[0]
        
        # use sample() function to sample next character 
        # based on probability distribution and desired diversity
        next_index = sample(preds, diversity)
        
        # convert integer to character
        next_char = int_to_char[next_index]

        # add new character to generated text
        generated += next_char
        
        # delete the first character from beginning of sentance, 
        # and add new caracter to the end. This will form the 
        # input sequence for the next predicted character.
        sentence = sentence[1:] + next_char

        # print results to screen
        sys.stdout.write(next_char)
        sys.stdout.flush()
    print

In [11]:
# checkpoint callback: write the model to `filepath` only when the
# monitored training loss reaches a new minimum
# NOTE(review): the file name starts with '-', which looks like a dropped
# prefix -- confirm. Also consider monitoring 'val_loss', since fit() is
# given a validation split.
filepath = "-basic_LSTM.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=0,
)
callbacks_list = [checkpoint]

In [12]:
epochs = 50
prediction_length = 100

for iteration in range(epochs):
    
    print 'epoch:', iteration + 1, '/', epochs
    model.fit(X, y, validation_split=0.2, batch_size=256, nb_epoch=1, callbacks=callbacks_list)
    
    # get random starting point for seed
    start_index = random.randint(0, len(raw_text) - seq_length - 1)
    # extract seed sequence from raw text
    seed = raw_text[start_index: start_index + seq_length]
    
    print '----- generating with seed:', seed
    
    for diversity in [0.5, 1.2]:
        generate(seed, prediction_length, diversity)

epoch: 1 / 50
Train on 14569 samples, validate on 3643 samples
Epoch 1/1
----- generating with seed: rked by uncertainty and unease. so we have a choiceretreat into old, closed-off economies or press f
----- diversity: 0.5
rked by uncertainty and unease. so we have a choiceretreat into old, closed-off economies or press f i e sr rorn e  rcienst t el  reptetar e e areetr antn neasentt hwo   r epe e os oi  ai  ti io t ttp
----- diversity: 1.2
rked by uncertainty and unease. so we have a choiceretreat into old, closed-off economies or press fzeeotretloe6na pu?jdyie
ukrtnsyeetslausyoauoheglr?anha tlim t edfo
enrwyft2cse
refwapb lertiigd r
oe
epoch: 2 / 50
Train on 14569 samples, validate on 3643 samples
Epoch 1/1
----- generating with seed:  enacted during my administration have increased the share of income received by all other families 
----- diversity: 0.5
 enacted during my administration have increased the share of income received by all other families teeoee  ret iutea  nth etee atr

In [13]:
pickle_file = '-basic_data.pickle'

try:
    f = open(pickle_file, 'wb')
    save = {
        'X': X,
        'y': y,
        'int_to_char': int_to_char,
        'char_to_int': char_to_int,
    }
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
    f.close()
except Exception as e:
    print 'Unable to save data to', pickle_file, ':', e
    raise
    
statinfo = os.stat(pickle_file)
print 'Saved data to', pickle_file
print 'Compressed pickle size:', statinfo.st_size

Saved data to -basic_data.pickle
Compressed pickle size: 80934860
