In [1]:
import sys
import time
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import OneHotEncoder
from tensorflow.contrib.keras.python.keras.utils import np_utils

enc = OneHotEncoder()

def progress_bar(value, endvalue, bar_length=20):
    percent = float(value) / endvalue
    arrow = '-' * int(round(percent * bar_length)-1) + '>'
    spaces = ' ' * (bar_length - len(arrow))
    sys.stdout.write("\rPercent complete: [{0}] {1}%".format(arrow + spaces, int(round(percent * 100))))
    sys.stdout.flush()

In [2]:
# Load and prepare data
with open('data/fellowship_text.txt', 'rb') as f:
    text_data = f.read()
text_data = text_data.decode("utf-8")
text_data.replace('\n', '')

# Get chars and create dictionary lookups
char_list = sorted(list(set(text_data)))
vocab_size = len(char_list)
n_chars = len(text_data)
ix_to_char = {ix:char for ix, char in enumerate(char_list)}
char_to_ix = {char:ix for ix, char in enumerate(char_list)}
print("Vocab size = {} characters".format(len(char_list)))
print("Data size = {} characters".format(n_chars))

def one_hot_encode(idx, vocab_size=95):
    seq = [0] * vocab_size
    seq[idx] = 1
    return seq

# Split into sequences
seq_len = 100
data_x = []
data_y = []
for i in range(0, n_chars - seq_len, 1):
    seq_in = text_data[i:i + seq_len]
    seq_out = text_data[i + seq_len]
    data_x.append([char_to_ix[char] for char in seq_in])
    data_y.append(char_to_ix[seq_out])
    if i % 100000 == 0:
        print("Running for pattern {}".format(i))
n_patterns = len(data_x)
print("Total of {} patterns".format(n_patterns))

# Fit sklearn encoder
enc.fit(data_x[0:300000])
# enc.n_values_

Vocab size = 95 characters
Data size = 984386 characters
Running for pattern 0
Running for pattern 100000
Running for pattern 200000
Running for pattern 300000
Running for pattern 400000
Running for pattern 500000
Running for pattern 600000
Running for pattern 700000
Running for pattern 800000
Running for pattern 900000
Total of 984286 patterns


OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [3]:
# Tensorflow graph
tf.reset_default_graph()
learning_rate = 0.01
b_size = 1000
k_prob = 0.75
cell_size = 64
n_hidden = 64
num_layers = 3

x = tf.placeholder(tf.float32, [None, seq_len, vocab_size])
keep_prob = tf.placeholder(tf.float32)

def lstm_text(x, keep_prob, cell_size, n_hidden, learning_rate, seq_len, num_layers):
    """Build TF computation graph"""
    W_out = tf.get_variable("W_out", [cell_size, vocab_size],
                            initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01))
    b_out = tf.get_variable("b_out", [vocab_size],initializer=tf.constant_initializer(0.01))

    # LSTM
    lstm_cells = [tf.contrib.rnn.BasicLSTMCell(num_units=cell_size) for _ in range(num_layers)]
    stacked_lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells, state_is_tuple=True)
    outputs, state = tf.nn.dynamic_rnn(cell=stacked_lstm, dtype=tf.float32, inputs=x)
    
    # Fully-connected
    outputs = tf.reshape(outputs, [-1, cell_size])
    outputs = tf.matmul(outputs, W_out) + b_out
    outputs = tf.reshape(outputs, [-1, seq_len, vocab_size])
    outputs = outputs[:, :-1, :]
    x_out = x[:, 1:, :]

    # Loss
    acc = tf.cast(tf.equal(tf.round(tf.sigmoid(outputs)), x_out), 'float')
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=x_out, logits=outputs), 
                         reduction_indices=[0, 1])
    opt = tf.train.AdamOptimizer(learning_rate).minimize(tf.reduce_mean(loss))
    return loss, opt, acc

loss, opt, acc = lstm_text(x, keep_prob, cell_size, n_hidden, learning_rate, seq_len, num_layers)

In [None]:
# Run TF session
init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_epochs = 1
n_batches = int(len(data_x) / b_size)
# n_batches = 10
results_dict = {'epoch': [], 'batch': [], 'loss': []}

with tf.Session() as sess:
    sess.run(init)
    
    for epoch in range(n_epochs):
        t0 = time.time()
        print("#-------- Starting epoch {}, running for {} batches".format(epoch+1, n_batches))
        for batch in range(n_batches):
    #         progress_bar(n_batch+1, n_batches)
            sample = data_x[batch*b_size:(batch+1)*b_size]
            sample = np.array([[one_hot_encode(elem) for elem in s] for s in sample])
            l, o, a = sess.run([loss, opt, acc], feed_dict={x: sample, keep_prob: k_prob})
        
            if batch % 50 == 0:
                print("Run for {} batches, loss = {}".format(batch, float(l)))
                results_dict['epoch'].append(epoch+1)
                results_dict['batch'].append(batch)
                results_dict['loss'].append(float(l))
                saver.save(sess, 'models/lstm_text_model')
                
        epoch_time = time.time() - t0
        print("#--------- Finished run for epoch {} in time {}".format(epoch+1, epoch_time))
    
    sess.close()

#-------- Starting epoch 1, running for 984 batches
Run for 0 batches, loss = 4.554446220397949
Run for 50 batches, loss = 3.0754315853118896


In [None]:
# Plot learning curves

In [104]:
# Generate text
tf.reset_default_graph()

des_len = 100
x_first = tf.placeholder(tf.float32, [None, 1, vocab_size])

def test_lstm(x_first, cell_size, vocab_size, des_len):
    """Build computation graph for NN to generate text"""
    W_out = tf.get_variable("W_out", [cell_size, vocab_size],
                            initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01))
    b_out = tf.get_variable("b_out", [vocab_size],initializer=tf.constant_initializer(0.01))
    predicted_probs = []
    
    # First LSTM
    lstm_cells = [tf.contrib.rnn.BasicLSTMCell(num_units=cell_size) for _ in range(num_layers)]
    stacked_lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells, state_is_tuple=True)
    outputs, state = tf.nn.dynamic_rnn(cell=stacked_lstm, dtype=tf.float32, inputs=x_first)

    outputs = tf.reshape(outputs, [-1, cell_size])
    outputs = tf.matmul(outputs, W_out) + b_out
    outputs = tf.reshape(outputs, [-1, 1, vocab_size])
    outputs = tf.nn.softmax(tf.reshape(outputs[:, -1, :], [-1, 1, vocab_size]), dim=-1)
    predicted_probs.append(outputs)
    
    # Feed this output until we've generated des_len characters
    tf.get_variable_scope().reuse_variables()
    for i in range(des_len-1):
        outputs, state = tf.nn.dynamic_rnn(cell=stacked_lstm, dtype=tf.float32, 
                                           inputs=tf.round(outputs), initial_state=state)
        outputs = tf.reshape(outputs, [-1, cell_size])
        outputs = tf.matmul(outputs, W_out) + b_out
        outputs = tf.reshape(outputs, [-1, 1, vocab_size])
        outputs = tf.nn.softmax(tf.reshape(outputs[:, -1, :], [-1, 1, vocab_size]), dim=-1)
        predicted_probs.append(outputs)
        
    return predicted_probs
    
predicted_probs = test_lstm(x_first, cell_size, vocab_size, des_len)

In [110]:
test_char = 't'

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    print("Reloading model...")
    saver = tf.train.import_meta_graph('models/lstm_text_model.meta')
    saver.restore(sess, tf.train.latest_checkpoint('models/'))
    all_vars = tf.get_collection('vars')
    
    test_array = np.array(one_hot_encode(char_to_ix[test_char])).reshape(1, 1, vocab_size)
    pixel_probs = sess.run(predicted_probs, feed_dict={x_first: test_array})

    sess.close()

Reloading model...
INFO:tensorflow:Restoring parameters from models/lstm_text_model


In [121]:
output_string = ''.join([ix_to_char[np.argmax(a)] for a in pixel_probs])
output_string

'HHHFFFFFHHHTTT333333333333333___________________________‚‚‚‚‚‚‚‚‚‚‚‚‚‚‚‚‚‚‚‚_______AA000000000))\n\n\n\n'