# Do It Yourself LSTM with TensorFlow

Based on the post [Understanding LStM Networks](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) by Colah

## 1. Data Preparation

In [1]:
import numpy as np
from pprint import pprint
import datetime

import data_generator

sequence_length = 6

reference_input_data, reference_output_data = data_generator.getSequences(sequence_length)

# data_generator.getSequences(sequence_length) generates all possible combinations of
# the characters '+-0I', so for a sequence length of 6 characters there are a
# a total of 4^6 = 4096 possible combinations. Some Examples:
# '+-+-+-' = 0
# '------' = -6
# '0++000' = 2
# 'I++000' = -2
#
# Those sequences are encoded: Every character is representated by a vector, so the actual
# return value from data_generator.getSequences looks like this:
pprint(reference_input_data[0])

# There is a helper to decode that again:
pprint(data_generator.decodeSequence(reference_input_data[0]))

# The solution for that sequence is:
pprint(reference_output_data[0])

instruction_count = np.array(reference_input_data).shape[2]

array([[1, 0, 0],
       [0, 1, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 0, 0],
       [1, 0, 0]])
'+-++0+'
3


In [2]:
NUM_EXAMPLES = len(reference_input_data) / 4 # we use 1/4 of the data for the training

test_input = reference_input_data[NUM_EXAMPLES:]
test_output = reference_output_data[NUM_EXAMPLES:] # everything beyond NUM_EXAMPLES

train_input = reference_input_data[:NUM_EXAMPLES]
train_output = reference_output_data[:NUM_EXAMPLES]

print("We'll train using " + str(NUM_EXAMPLES) + "/" + str(len(reference_input_data)) + " Examples")

We'll train using 1024/4096 Examples


In [3]:
import tensorflow as tf

data = tf.placeholder(tf.float32, [None, sequence_length, instruction_count], name='data')
target = tf.transpose(tf.placeholder(tf.float32, [None], name='target'))

## 2. LSTM Layer

In [4]:
LSTM_SIZE = 24
FEATURE_SIZE = 3 # Ace of Hearts, Ace of Clubs, King of Spades

In [5]:
def default_weights_and_bias():
    Weights = tf.Variable(tf.truncated_normal([LSTM_SIZE, LSTM_SIZE + FEATURE_SIZE], -0.2, 0.1))
    bias = tf.Variable(tf.constant(0.0, shape = [LSTM_SIZE, 1]))
    
    return Weights, bias

### 2.1 Forget Layer

![](https://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-focus-f.png)

In [6]:
W_f, _ = default_weights_and_bias()

b_f = tf.Variable(tf.constant(1.0, shape = [LSTM_SIZE, 1]))

# The forget layer
#
# Shapes:
#   - W_f: 24x27
#   - ht_minus_1_and_xt: 27x?
#   - b_f: 24x1
#   - f_t: 24x?
def f_t(ht_minus_1_and_xt):
    return tf.sigmoid(tf.matmul(W_f, ht_minus_1_and_xt) + b_f)

### 2.2 New Candidate Conveyor
![](https://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-focus-i.png)

In [7]:
W_i, b_i = default_weights_and_bias()

# Input Gate Layer
#
# Shapes:
#   - W_i: 24x27
#   - ht_minus_1_and_xt: 27x?
#   - b_i: 24x1
#   - i_t: 24x?
def i_t(ht_minus_1_and_xt):
    return tf.sigmoid(tf.matmul(W_i, ht_minus_1_and_xt) + b_i)

W_C, b_c = default_weights_and_bias()

# New Candidates for the Conveyor
#
# Shapes:
#   - W_C: 24x27
#   - ht_minus_1_and_xt: 27x?
#   - b_c: 24x1
#   - candidate_C_t: 24x?
def candidate_C_t(ht_minus_1_and_xt):
    return tf.tanh(tf.matmul(W_C, ht_minus_1_and_xt) + b_c)

### 2.3 Update Conveyor

![](https://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-focus-C.png)

In [8]:
# Updated Conveyor
#
# Shapes:
#   - f_t: 24x?
#   - Conveyor: 24x?
#   - i_t: 24x?
#   - CandidateConveyor: 24x?
def C_t(ht_minus_1_and_xt, Conveyor, CandidateConveyor):
    return f_t(ht_minus_1_and_xt) * Conveyor + i_t(ht_minus_1_and_xt) * CandidateConveyor

### 2.4 Prediction (for current LSTM step)

![](https://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-focus-o.png)

In [9]:
W_o, b_o = default_weights_and_bias()

# Updated Conveyor
#
# Shapes:
#   - W_o: 24x27
#   - b_o: 24x1
#   - ht_minus_1_and_xt: 27x?
#   - FinalConveyor: 24x?
#   - o_t: 24x?
#   - h_t: 24x?
def h_t(ht_minus_1_and_xt, FinalConveyor):
    o_t = tf.sigmoid(tf.matmul(W_o, ht_minus_1_and_xt) + b_o)
    
    return o_t * tf.tanh(FinalConveyor)

### 2.5 The LSTM Cell
![](https://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-chain.png)

In [10]:
def lstm_cell(ht_minus_1_and_Conveyor, xt):
    ht_minus_1, Conveyor = ht_minus_1_and_Conveyor
    
    ht_minus_1_and_xt = tf.transpose(tf.concat([ht_minus_1, xt], 1))
    
    CandidateConveyor = candidate_C_t(ht_minus_1_and_xt)
    
    FinalConveyor = C_t(ht_minus_1_and_xt, Conveyor, CandidateConveyor)
    
    lstm_prediction = tf.transpose(h_t(ht_minus_1_and_xt, FinalConveyor))
    
    return(lstm_prediction, FinalConveyor)

In [11]:
data_length = tf.shape(data)[0]

# This loop gets called once for every "timestep" and obtains one column of the input data
def lstm_loop(last_lstm_prediction, last_state, step):
    lstm_prediction, state = lstm_cell((last_lstm_prediction, last_state), data[:, step, :])
    return lstm_prediction, state, tf.add(step, 1)


initial_Conveyor = tf.zeros([LSTM_SIZE, data_length])
initial_prediction = tf.zeros([data_length, LSTM_SIZE])

timesteps = sequence_length

for_each_time_step = lambda a, b, step: tf.less(step, timesteps)

lstm_prediction, lstm_state, _ = tf.while_loop(for_each_time_step, lstm_loop, (initial_prediction, initial_Conveyor, 0), parallel_iterations=32)

In [12]:
weight = tf.Variable(tf.truncated_normal([LSTM_SIZE, 1]))
bias = tf.Variable(tf.constant(0.1, shape=[1]))

prediction = tf.matmul(lstm_prediction, weight) + bias

## 3. Cost & Optimizing

In [13]:
with tf.name_scope('mean_square_error'):
    mean_square_error = tf.reduce_sum(tf.square(tf.subtract(target, tf.unstack(prediction, axis = 1))))
tf.summary.scalar('mean_square_error', mean_square_error)

<tf.Tensor 'mean_square_error_1:0' shape=() dtype=string>

In [14]:
optimizer = tf.train.AdamOptimizer()
minimize = optimizer.minimize(mean_square_error)

In [15]:
with tf.name_scope('error'):
    with tf.name_scope('mistakes'):
        mistakes = tf.not_equal(target, tf.round(tf.unstack(prediction, axis = 1)))
    with tf.name_scope('error'):
        error = tf.reduce_mean(tf.cast(mistakes, tf.float32))
tf.summary.scalar('error', error)

<tf.Tensor 'error_1:0' shape=() dtype=string>

## 4. Training

In [16]:
sess = tf.InteractiveSession()
merged = tf.summary.merge_all()

date = str(datetime.datetime.now())
train_writer = tf.summary.FileWriter('logs/selfmade_lstm/' + date + '/train', sess.graph)
test_writer = tf.summary.FileWriter('logs/selfmade_lstm/' + date + '/test', sess.graph)

init_op = tf.global_variables_initializer()
sess.run(init_op)

In [17]:
epoch = 4000

for i in range(epoch):
    if (i + 1) % 20 == 0:
        summary, incorrect, mean_squ_err = sess.run([merged, error, mean_square_error], {data: test_input, target: test_output})
        test_writer.add_summary(summary, i)
        
        print('Epoch {:4d} | incorrect {: 3.1f}% | mean squ error {: 3.1f}'.format(i + 1, incorrect * 100, mean_squ_err))
    else:
        summary, acc = sess.run([merged, error], {data: train_input, target: train_output})
        train_writer.add_summary(summary, i)
    
    sess.run(minimize,{data: train_input, target: train_output})

Epoch   20 | incorrect  77.4% | mean squ error  8864.7
Epoch   40 | incorrect  77.2% | mean squ error  8195.1
Epoch   60 | incorrect  73.9% | mean squ error  7601.0
Epoch   80 | incorrect  74.5% | mean squ error  7347.5
Epoch  100 | incorrect  74.3% | mean squ error  7152.0
Epoch  120 | incorrect  74.1% | mean squ error  6922.2
Epoch  140 | incorrect  73.6% | mean squ error  6489.1
Epoch  160 | incorrect  71.7% | mean squ error  5900.1
Epoch  180 | incorrect  70.0% | mean squ error  5317.5
Epoch  200 | incorrect  66.7% | mean squ error  4767.0
Epoch  220 | incorrect  63.7% | mean squ error  4163.0
Epoch  240 | incorrect  60.4% | mean squ error  3602.8
Epoch  260 | incorrect  57.7% | mean squ error  3204.3
Epoch  280 | incorrect  55.6% | mean squ error  2948.7
Epoch  300 | incorrect  54.0% | mean squ error  2746.1
Epoch  320 | incorrect  52.2% | mean squ error  2578.6
Epoch  340 | incorrect  50.9% | mean squ error  2431.7
Epoch  360 | incorrect  49.5% | mean squ error  2296.8
Epoch  380

Epoch 3080 | incorrect  1.7% | mean squ error  169.1
Epoch 3100 | incorrect  1.7% | mean squ error  168.1
Epoch 3120 | incorrect  1.7% | mean squ error  167.1
Epoch 3140 | incorrect  1.7% | mean squ error  166.1
Epoch 3160 | incorrect  1.7% | mean squ error  165.2
Epoch 3180 | incorrect  2.3% | mean squ error  208.6
Epoch 3200 | incorrect  1.7% | mean squ error  163.4
Epoch 3220 | incorrect  1.7% | mean squ error  162.2
Epoch 3240 | incorrect  1.7% | mean squ error  161.2
Epoch 3260 | incorrect  1.7% | mean squ error  160.5
Epoch 3280 | incorrect  1.7% | mean squ error  159.7
Epoch 3300 | incorrect  1.7% | mean squ error  158.8
Epoch 3320 | incorrect  1.7% | mean squ error  157.9
Epoch 3340 | incorrect  1.7% | mean squ error  157.1
Epoch 3360 | incorrect  1.7% | mean squ error  156.2
Epoch 3380 | incorrect  1.7% | mean squ error  155.4
Epoch 3400 | incorrect  1.7% | mean squ error  154.5
Epoch 3420 | incorrect  1.7% | mean squ error  153.7
Epoch 3440 | incorrect  1.7% | mean squ error 

In [18]:
# Test the result
sess.run(prediction, {data: [data_generator.encodeSequence("00-+++")]})

array([[ 1.98095763]], dtype=float32)

In [19]:
sess.close()
train_writer.close()
test_writer.close()
