In [1]:
##################################
#
# Implementation of linear logic recurrent neural network
#
# The architecture is a modified RNN, see the paper "Linear logic and recurrent neural networks".
# Our inputs are sequences of symbols taken from an alphabet of size num_classes. The length
# of the input sequences is N. Our outputs are sequences of length N_out from the same alphabet.
#
# Here "symbol" means a one hot vector.

# The next three lines are recommended by TF
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np
import collections
import six
import math
import time

from tensorflow.python.ops.rnn_cell_impl import _RNNCell as RNNCell
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops.math_ops import sigmoid
from tensorflow.python.ops.math_ops import tanh

# Our libraries
import ntm
import seqhelper
import learnfuncs

In [2]:
##############
# GLOBAL FLAGS
#
# Every tunable of the experiment lives here: data dimensions, training
# schedule, and the sizes of the controller and its external memory.

num_classes = 2
batch_size = 500 # take a smaller batch size (500 works) on Tesla
input_size = num_classes # dimension of the input space I
N = 20 # length of input sequences
N_out = 40 # length of output sequences
training_percent = 0.01 # fraction (0.01 = 1%) of all sequences used for training
epoch = 200 # number of passes over the training set

controller_state_size = 100 # dimension of the internal state space of the controller
memory_address_size = 20 # number of memory locations
memory_content_size = 5 # size of vector stored at a memory location
pattern_ntm_powers1 = [-1,0,1]

use_model = 'ntm'

# state_size is the total length of the flattened recurrent state: the
# controller state, a number of vectors of length memory_address_size
# (presumably address distributions -- confirm against ntm.py), and the
# memory contents themselves.
if use_model == 'ntm':
    state_size = controller_state_size + 2*memory_address_size + memory_address_size * memory_content_size
elif use_model == 'pattern_ntm':
    state_size = controller_state_size + 4*memory_address_size + \
                memory_address_size * memory_content_size + \
                memory_address_size * len(pattern_ntm_powers1)
else:
    # Fail with a clear message instead of a NameError on state_size below
    raise ValueError("Unknown use_model: " + repr(use_model) +
                     " (expected 'ntm' or 'pattern_ntm')")

print("Total state size: " + str(state_size))

Total state size: 240


In [3]:
#######################
# PREPARE TRAINING DATA
#
# Our sequences are of one-hot vectors, which we interpret as follows:
#
# [1.0, 0.0, 0.0] = 0
# [0.0, 1.0, 0.0] = 1
# [0.0, 0.0, 1.0] = 2 etc
#
# We write our sequences and functions referring to sequences of integers,
# and then convert to one-hot vectors for integration with TF.

###########
# COPY TASK
#task = 'copy'
#func_to_learn = learnfuncs.f_identity
###########

##################
# REPEAT COPY TASK
# put n zeros before the 1, for a copy task with n + 1 copies
#task = 'repeat copy'
#pattern = [0,1]
#func_to_learn = lambda s: learnfuncs.f_repetitionpattern(s,pattern)
##################

##############
# PATTERN TASK
task = 'pattern'
pattern = [1,0,0,2,0]

def func_to_learn(s):
    """Map the integer sequence s through learnfuncs.f_repetitionpattern using the module-level pattern."""
    return learnfuncs.f_repetitionpattern(s, pattern)
##############

# Create a shuffled list of all binary sequences of length N
seq_input = seqhelper.shuffled_binary_seqs(N)

# Lookup table indexed by symbol: one_hots[k] is used as the encoding of symbol k
one_hots = seqhelper.one_hot_vectors(num_classes)

# Encode every input sequence as an array with one row per symbol
seq_input_onehot = [
    np.array([one_hots[symbol] for symbol in sequence])
    for sequence in seq_input
]

# Training output: the image of every input sequence under func_to_learn
seq_output = [func_to_learn(sequence) for sequence in seq_input]

# Encode the output sequences the same way as the inputs
seq_output_onehot = [
    np.array([one_hots[symbol] for symbol in sequence])
    for sequence in seq_output
]

# Size of the training set, as a fraction of all generated sequences
NUM_EXAMPLES = int(training_percent * len(seq_input))

# The first NUM_EXAMPLES sequences are used for training; the following
# 2*NUM_EXAMPLES sequences (disjoint from the training set) are used for testing.
train_input = seq_input_onehot[:NUM_EXAMPLES]
train_output = seq_output_onehot[:NUM_EXAMPLES]
test_input = seq_input_onehot[NUM_EXAMPLES:3*NUM_EXAMPLES]
test_output = seq_output_onehot[NUM_EXAMPLES:3*NUM_EXAMPLES]

print("Number of training examples: " + str(NUM_EXAMPLES) + " out of " + str(len(seq_input)) + " sequences.")
print("")

# Show one worked example: the first sequence, its encoding, and its image
worked_example = [
    ("Under the chosen function, the sequence", seq_input[0]),
    ("which is encoded as", seq_input_onehot[0]),
    ("is mapped to", seq_output[0]),
    ("which is encoded as", seq_output_onehot[0]),
]
for caption, value in worked_example:
    print(caption)
    print(value)

#print("")
#print("The first one-hot encoded digit of the first three output sequences")
#print(test_output[0][0])
#print(test_output[1][0])
#print(test_output[2][0])

Number of training examples: 10485 out of 1048576 sequences.

Under the chosen function, the sequence
[0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0]
which is encoded as
[[ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 1.  0.]
 [ 0.  1.]
 [ 0.  1.]
 [ 0.  1.]
 [ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 0.  1.]
 [ 1.  0.]
 [ 0.  1.]
 [ 1.  0.]]
is mapped to
[0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
which is encoded as
[[ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 0.  1.]
 [ 0.  1.]
 [ 0.  1.]
 [ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 0.  1.]
 [ 0.  1.]
 [ 0.  1.]
 [ 0.  1.]
 [ 0.  1.]
 [ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]]


In [4]:
#########################
# Definition of the model

# inputs: N placeholders of shape [None,input_size], one for each position
# in the input sequence. targets: N_out placeholders of the same shape, one
# for each position in the output sequence.
inputs = [tf.placeholder(tf.float32, [None,input_size]) for _ in range(N)]
targets = [tf.placeholder(tf.float32, [None,input_size]) for _ in range(N_out)]

# Build the recurrent cell selected by use_model; state_size is the size of
# the state handed to the cell constructor.
if use_model == 'ntm':
    cell = ntm.NTM(state_size,input_size,controller_state_size,
                   memory_address_size,memory_content_size, [-1,0,1])
elif use_model == 'pattern_ntm':
    cell = ntm.PatternNTM(state_size,input_size,controller_state_size,
                          memory_address_size,memory_content_size, pattern_ntm_powers1, [-1,0,1])
else:
    # Fail with a clear message instead of a NameError on `cell` below
    raise ValueError("Unknown use_model: " + repr(use_model) +
                     " (expected 'ntm' or 'pattern_ntm')")

# NOTE: the initial state and the zero inputs of the second phase are built
# with batch_size rows, so every feed must contain batch_size sequences even
# though the placeholders leave the batch dimension open.
state = cell.zero_state(batch_size, tf.float32)

# Phase 1: feed the input sequence. reuse is False only for the very first
# call and True afterwards (presumably so that all time steps share the
# variables created under the 'NTM' scope -- confirm in ntm.py).
reuse = False

for i in range(N):
    output, state = cell(inputs[i],state,'NTM',reuse)
    reuse = True

# We only start recording the outputs of the controller once we have
# finished feeding in the input. We feed zeros as input in the second phase.
rnn_outputs = []
for i in range(N_out):
    output, state = cell(tf.zeros([batch_size,input_size]),state,'NTM',reuse)
    rnn_outputs.append(output)

# Final fully connected layer
E = tf.Variable(tf.truncated_normal([controller_state_size,input_size]))
F = tf.Variable(tf.constant(0.1, shape=[input_size]))

# prediction is a length N_out list of tensors of shape [None,input_size], where
# the jth row of prediction[d] is, for the jth input sequence in the batch,
# the probability distribution over symbols for the output symbol in position d.
logits = [tf.matmul(rnn_output, E) + F for rnn_output in rnn_outputs]
prediction = [tf.nn.softmax(logit) for logit in logits]

# Use log_softmax on the logits rather than tf.log on the softmax output:
# once a probability underflows to zero, tf.log gives -inf and the loss and
# its gradients become NaN. log_softmax computes the same quantity stably.
ce = [tf.reduce_sum(targets[i] * tf.nn.log_softmax(logits[i])) for i in range(N_out)]

# Total cross entropy, summed over the batch and over all output positions
cross_entropy = -tf.add_n(ce)
optimizer = tf.train.AdamOptimizer()
minimize = optimizer.minimize(cross_entropy)

# errors[d] is the fraction of sequences in the batch whose most likely
# predicted symbol at output position d differs from the target symbol.
mistakes = [tf.not_equal(tf.argmax(targets[i], 1), tf.argmax(prediction[i], 1)) for i in range(N_out)]
errors = [tf.reduce_mean(tf.cast(m, tf.float32)) for m in mistakes]

[<tf.Tensor 'gradients/NTM_59/split_grad/concat:0' shape=(500, 240) dtype=float32>, None, None]
[<tf.Tensor 'gradients/NTM_58/split_grad/concat:0' shape=(500, 240) dtype=float32>, None, None]
[<tf.Tensor 'gradients/NTM_57/split_grad/concat:0' shape=(500, 240) dtype=float32>, None, None]
[<tf.Tensor 'gradients/NTM_56/split_grad/concat:0' shape=(500, 240) dtype=float32>, None, None]
[<tf.Tensor 'gradients/NTM_55/split_grad/concat:0' shape=(500, 240) dtype=float32>, None, None]
[<tf.Tensor 'gradients/NTM_54/split_grad/concat:0' shape=(500, 240) dtype=float32>, None, None]
[<tf.Tensor 'gradients/NTM_53/split_grad/concat:0' shape=(500, 240) dtype=float32>, None, None]
[<tf.Tensor 'gradients/NTM_52/split_grad/concat:0' shape=(500, 240) dtype=float32>, None, None]
[<tf.Tensor 'gradients/NTM_51/split_grad/concat:0' shape=(500, 240) dtype=float32>, None, None]
[<tf.Tensor 'gradients/NTM_50/split_grad/concat:0' shape=(500, 240) dtype=float32>, None, None]
[<tf.Tensor 'gradients/NTM_49/split_grad

In [5]:
# Open a session on the default graph, then run the op that initialises
# every global variable defined so far
sess = tf.Session()
init_op = tf.global_variables_initializer()
sess.run(init_op)

In [None]:
# Display the errors before training

# Evaluate on the first batch_size test examples only
test_input_batch = test_input[:batch_size]
test_output_batch = test_output[:batch_size]

# Placeholder d receives one row per sequence in the batch: the one-hot
# encoding of the dth symbol of that sequence.
feed_dict = {}
for d, in_node in enumerate(inputs):
    feed_dict[in_node] = np.array([sequence[d] for sequence in test_input_batch])

for d, out_node in enumerate(targets):
    feed_dict[out_node] = np.array([sequence[d] for sequence in test_output_batch])

# The first three digits of this should match the printout for the
# first three test output sequences given earlier
#print(sess.run(tf.argmax(targets[0],1),feed_dict))
#print(sess.run(tf.argmax(prediction[0],1),feed_dict))
#print(sess.run(tf.not_equal(tf.argmax(targets[0], 1), tf.argmax(prediction[0], 1)),feed_dict))

# Per-position error rates of the untrained model, followed by their mean
incorrects = sess.run(errors, feed_dict)
print("")
print("The mean of the errors in each digit for the test set:")
print(incorrects)
print("Mean: " + str(np.mean(incorrects)))


The mean of the errors in each digit for the test set:
[0.49599999, 0.51799995, 0.55599999, 0.50800002, 0.54000002, 0.40000001, 0.48199999, 0.52199996, 0.472, 0.60000002, 0.44799998, 0.442, 0.48199999, 0.56200004, 0.55199999, 0.46199998, 0.46600002, 0.55599999, 0.412, 0.63599998, 0.41199997, 0.34399998, 0.56800002, 0.49799997, 0.59999996, 0.50999999, 0.514, 0.45799997, 0.41999999, 0.44999996, 0.42199999, 0.40999997, 0.57799995, 0.53600001, 0.44400001, 0.54000002, 0.52999997, 0.46799999, 0.50800002, 0.48199999]
Mean: 0.4951


In [None]:
pre_train_time = time.time()

# Training
# Only complete batches are used; any leftover training examples are skipped.
no_of_batches = len(train_input) // batch_size
#print("Number of batches: " + str(no_of_batches))

# An annoying thing here is that we cannot use a list as a key in a
# dictionary, so each placeholder is fed individually. The workaround we
# found on StackOverflow here:
# http://stackoverflow.com/questions/33684657/issue-feeding-a-list-into-feed-dict-in-tensorflow)

for epoch_number in range(1, epoch + 1):
    for batch_index in range(no_of_batches):
        start = batch_index * batch_size
        inp = train_input[start:start+batch_size]
        out = train_output[start:start+batch_size]

        feed_dict = {}
        # inp has dimensions [batch_size, N, num_classes]; placeholder d gets
        # the [batch_size, num_classes] slice obtained by fixing the second
        # coordinate to d
        for d, in_node in enumerate(inputs):
            feed_dict[in_node] = np.array([sequence[d] for sequence in inp])

        # Same slicing for the targets, over the N_out output positions
        for d, out_node in enumerate(targets):
            feed_dict[out_node] = np.array([sequence[d] for sequence in out])

        sess.run(minimize, feed_dict)

    # Progress report: error on the most recently fed batch
    current_mean = np.mean(sess.run(errors, feed_dict))
    print("Epoch - " + str(epoch_number) + ", Mean error of final batch in epoch - " + str(current_mean))

elapsed_seconds = time.time() - pre_train_time
print("")
print("It took", elapsed_seconds, "seconds to train.")

Epoch - 1, Mean error of final batch in epoch - 0.45075
Epoch - 2, Mean error of final batch in epoch - 0.4046
Epoch - 3, Mean error of final batch in epoch - 0.38095
Epoch - 4, Mean error of final batch in epoch - 0.34025
Epoch - 5, Mean error of final batch in epoch - 0.3124
Epoch - 6, Mean error of final batch in epoch - 0.3013
Epoch - 7, Mean error of final batch in epoch - 0.2874
Epoch - 8, Mean error of final batch in epoch - 0.28975
Epoch - 9, Mean error of final batch in epoch - 0.27625


In [None]:
# Calculate the error over the test set

# Only complete batches are evaluated, because the graph was built for
# exactly batch_size sequences per run.
no_of_batches = int(len(test_input)/batch_size)
#print("Number of batches: " + str(no_of_batches))

error_means = []
ptr = 0
for j in range(no_of_batches):
    inp = test_input[ptr:ptr+batch_size]
    out = test_output[ptr:ptr+batch_size]
    ptr += batch_size

    # Placeholder d receives the dth symbol of every sequence in the batch
    feed_dict = {}
    for d in range(N):
        feed_dict[inputs[d]] = np.array([inp[k][d] for k in range(batch_size)])

    for d in range(N_out):
        feed_dict[targets[d]] = np.array([out[k][d] for k in range(batch_size)])

    # Mean over output positions of the per-position error rate of this batch
    current_mean = np.mean(sess.run(errors, feed_dict))
    error_means.append(current_mean)

# Report the average over all test batches. Taking the minimum would report
# only the best batch and understate the test error. All batches have the
# same size, so the mean of batch means is the mean over the evaluated examples.
# With no complete batch there is nothing to average, so report NaN.
final_error = np.mean(error_means) if error_means else float('nan')

# The first three digits of this should match the printout for the
# first three test output sequences given earlier
#data = sess.run([tf.argmax(targets[0],1), tf.argmax(prediction[0],1)],feed_dict)

#print("First digits of test outputs (actual)")
#print(data[0])
#print("First digits of test outputs (predicted)")
#print(data[1])

# print the mean of the errors in each digit for the test set.
#incorrects = sess.run(errors, feed_dict)
# print(incorrects)

print("############################")
print("# Summary ")
print("############################")
print("# model = " + use_model)
print("# task = " + task)
print("# training_percent = " + str(training_percent))
print("# epoch = " + str(epoch))
print("# (css,mas,mcs) = (" + str(controller_state_size) + "," + str(memory_address_size) + "," + str(memory_content_size) + ")")
#print("# powers1 = " + str(pattern_ntm_powers1))
print("# number of weights = " + str(ntm.count_number_trainable_params()))
print("# error = " + str(final_error))

# The session is released here, so the cells above cannot be re-run
# without re-creating and re-initialising it first.
sess.close()