In [1]:
##################################
#
# Implementation of linear logic recurrent neural network
#
# The architecture is a modified RNN, see the paper "Linear logic and recurrent neural networks".
# Our inputs are sequences of symbols taken from an alphabet of size num_classes. The length
# of the sequences is N. Our outputs are also sequences of length N from the same alphabet.
#
# Here "symbol" means a one hot vector.

# The next three lines are recommend by TF
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np
import collections
import six
import math
import time
import random

from tensorflow.python.ops.rnn_cell_impl import _RNNCell as RNNCell
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops.math_ops import sigmoid
from tensorflow.python.ops.math_ops import tanh

# Our libraries
import ntm
import seqhelper
import learnfuncs

In [2]:
##############
# GLOBAL FLAGS

use_model             = 'ntm' # ntm, pattern_ntm, pattern_ntm_alt
task                  = 'copy' # copy, repeat copy, pattern
epoch                 = 100 # number of training epochs, default to 200
num_classes           = 258 # number of symbols, INCLUDING initial and terminal symbols
N                     = 5 # length of input sequences for training, default to 20, INCLUDING initial and terminal symbols
Ntest                 = 5 # length of sequences for testing, default to N, INCLUDING initial and terminal symbols
batch_size            = 500 # default to 500 (too large does not fit on GPUs)
controller_state_size = 100 # dimension of the internal state space of the controller, default 100
memory_address_size   = 128 # number of memory locations, default 20
memory_content_size   = 20 # size of vector stored at a memory location, default 5
powers_ring1          = [0,-1,1] # powers of R used on ring 1, default [0,-1,1]
powers_ring2          = [0,-1,1] # powers of R used on ring 2, default [0,-1,1]
model_optimizer       = 'rmsprop' # adam, rmsprop, default to rmsprop
LOG_DIR               = '/tmp/log' # default /tmp/log

training_percent      = 0.01 # percentage used for training, default 0.01
num_training          = 10000 #int(training_percent * (num_classes-2)**N)
num_test              = num_training

init_symbol           = num_classes - 2
term_symbol           = num_classes - 1

##########
# NOTES
#
# 1. Always put the zero power first in powers_ring since the code assumes this is there
# 2. The initial and terminal symbols are always from the end of the list of symbols, so they
# are respectively num_classes - 2 and num_classes - 1. So the number of symbols which are
# not initial or terminal is num_classes - 2

In [3]:
#######################
# SETUP TASKS
#
# Our sequences are of one-hot vectors, which we interpret as follows:
#
# [1.0, 0.0, 0.0] = 0
# [0.0, 1.0, 0.0] = 1
# [0.0, 0.0, 1.0] = 2 etc
#
# We write our sequences and functions referring to sequences of integers,
# and then convert to one-hot vectors for integration with TF.

# Below N_out and Ntest_out are the lengths of the outputs in both the training
# and testing regimes respectively. Since outputs do not include the initial and terminal
# symbols, these default to N - 2 and Ntest - 2 respectively.

###########
# COPY TASK
if( task == 'copy' ):
    func_to_learn = learnfuncs.f_identity
    N_out = N - 2
    Ntest_out = Ntest - 2

##################
# REPEAT COPY TASK
# put n zeros before the 1, for a copy task with n + 1 copies
if( task == 'repeat copy' ):
    pattern = [0,1]
    func_to_learn = lambda s: learnfuncs.f_repetitionpattern(s,pattern)
    N_out = 2 * (N - 2)
    Ntest_out = 2 * (Ntest - 2)

##############
# PATTERN TASK
if( task == 'pattern' ):
    pattern = [1,0,0,2,0]
    func_to_learn = lambda s: learnfuncs.f_repetitionpattern(s,pattern)
    N_out = 2 * (N - 2)
    Ntest_out = 2 * (Ntest - 2)

# Give an example input/output pair
a = [random.randint(0,num_classes-3) for i in range(N)]
fa = func_to_learn(a)

print("Under the chosen function, the sequence")
print(a)
print("is mapped to")
print(fa)

Under the chosen function, the sequence
[208, 60, 96, 155, 210]
is mapped to
[208, 60, 96, 155, 210]


In [4]:
####################
# INITIALISE STATE #
####################

one_hots = seqhelper.one_hot_vectors(num_classes)
input_size = num_classes # dimension of the input space I
state_size = 0

#####
# NTM
if( use_model == 'ntm' ):
    state_size = controller_state_size + 2*memory_address_size + memory_address_size * memory_content_size
    cell = ntm.NTM(state_size,input_size,controller_state_size,memory_address_size,memory_content_size, powers_ring1)
    
    ra = [0.0]*memory_address_size
    ra[0] = 1.0
    batch_address = np.zeros([batch_size,memory_address_size]) + ra
    
    # DEBUG at the moment the read and write addresses are not distributions, i.e. they do not
    # sum to 1, but after one step the gamma sharpening will normalise them. We should probably start
    # with things that sum to 1, though.
    #init_controller_state = tf.truncated_normal([batch_size, controller_state_size], 0.0, 1e-6, dtype=tf.float32)
    init_controller_state = tf.get_variable("init_ccs", shape=[batch_size, controller_state_size], initializer=tf.contrib.layers.xavier_initializer())
    
    init_read_address = tf.constant(batch_address,dtype=tf.float32,shape=[batch_size,memory_address_size]) + \
                       tf.random_uniform([batch_size, memory_address_size], 0.0, 1e-6)
    
    init_write_address = tf.constant(batch_address,dtype=tf.float32,shape=[batch_size,memory_address_size]) + \
                       tf.random_uniform([batch_size, memory_address_size], 0.0, 1e-6)
    
    #init_memory = tf.truncated_normal([batch_size, memory_address_size*memory_content_size], 0.0, 1e-6, dtype=tf.float32)
    init_memory = tf.get_variable("init_mem", shape=[batch_size, memory_address_size*memory_content_size], initializer=tf.contrib.layers.xavier_initializer())
    
    state = tf.concat([init_controller_state,init_read_address,init_write_address,init_memory],1)
    
#############
# PATTERN NTM
if( use_model == 'pattern_ntm' ):
    state_size = controller_state_size + 4*memory_address_size + \
                memory_address_size * memory_content_size + \
                memory_address_size * len(powers_ring1)

    cell = ntm.PatternNTM(state_size,input_size,controller_state_size,
                          memory_address_size,memory_content_size, powers_ring1, powers_ring2)
    
    state = tf.truncated_normal([batch_size, state_size], 0.0, 0.01, dtype=tf.float32)
    
#################
# PATTERN NTM ALT
if( use_model == 'pattern_ntm_alt' ):
    state_size = controller_state_size + 4*memory_address_size + \
                memory_address_size * memory_content_size + \
                memory_address_size * len(powers_ring1)

    cell = ntm.PatternNTM_alt(state_size,input_size,controller_state_size,
                          memory_address_size,memory_content_size, powers_ring1, powers_ring2)
    
    state = tf.truncated_normal([batch_size, state_size], 0.0, 0.01, dtype=tf.float32)


In [None]:
################
# DEFINE MODEL #
################

# inputs, we create N of them, each of shape [None,input_size], one for each position in the sequence
inputs = [tf.placeholder(tf.float32, [None,input_size]) for _ in range(N)]
targets = [tf.placeholder(tf.float32, [None,input_size]) for _ in range(N_out)]

# Used in order to flag that we share weights across iterations.
# Note that the training and test phases use all the same weights.
reuse = False

# Set up training graph
read_addresses = []
write_addresses = []
gamma_writes = []
gamma_reads = []
ss = []

for i in range(N):
    # Store read and write addresses for later logging
    h0, curr_read, curr_write, _ = tf.split(state, [controller_state_size,memory_address_size,memory_address_size,-1], 1)
    read_addresses.append(curr_read[0,:])
    write_addresses.append(curr_write[0,:])
    
    output, state = cell(inputs[i],state,'NTM',reuse)
    
    # DEBUG, getting gammas
    with tf.variable_scope("NTM",reuse=True):
        W_gamma_write = tf.get_variable("W_gamma_write", [controller_state_size,1])
        B_gamma_write = tf.get_variable("B_gamma_write", [])
        gamma_write = 1.0 + tf.nn.relu(tf.matmul(h0,W_gamma_write) + B_gamma_write) # shape [batch_size,1]
        
        W_gamma_read = tf.get_variable("W_gamma_read", [controller_state_size,1])
        B_gamma_read = tf.get_variable("B_gamma_read", [])
        gamma_read = 1.0 + tf.nn.relu(tf.matmul(h0,W_gamma_read) + B_gamma_read) # shape [batch_size,1]
        
        W_s = tf.get_variable("W_s", [controller_state_size,len(powers_ring1)])
        B_s = tf.get_variable("B_s", [len(powers_ring1)])
        s = tf.nn.softmax(tf.matmul(h0,W_s) + B_s) # shape [batch_size,len(powers)]

    gamma_writes.append(gamma_write[0,:])
    gamma_reads.append(gamma_read[0,:])
    ss.append(s[0,:])
    reuse = True

# We only start recording the outputs of the controller once we have
# finished feeding in the input. We feed terminal symbols as input in the second phase.

term_symbol_tensor = tf.constant(np.zeros([batch_size,input_size]) + one_hots[term_symbol],
                                 dtype=tf.float32,
                                 shape=[batch_size,input_size])

rnn_outputs = []
for i in range(N_out):
    output, state = cell(term_symbol_tensor,state,'NTM',reuse)
    rnn_outputs.append(output)

# Final fully connected layer
with tf.variable_scope("final_layer"):
    E = tf.get_variable("E",[controller_state_size,input_size])
    F = tf.get_variable("F",[input_size],initializer=init_ops.constant_initializer(0.0))

# prediction is a length N list of tensors of shape [None,input_size], where
# the jth row of prediction[d] is, for the jth input sequence in the batch,
# the probability distribution over symbols for the output symbol in position d.
logits = [tf.matmul(rnn_output, E) + F for rnn_output in rnn_outputs]
prediction = [tf.nn.softmax(logit) for logit in logits] 
ce = [tf.reduce_sum(targets[i] * tf.log(prediction[i])) for i in range(N_out)]

if( model_optimizer == 'adam' ):
    optimizer = tf.train.AdamOptimizer(1e-4)
elif( model_optimizer == 'rmsprop' ):
    optimizer = tf.train.RMSPropOptimizer(1e-4,decay=0.9,momentum=0.9)

cross_entropy = -tf.add_n(ce)
minimize = optimizer.minimize(cross_entropy)

mistakes = [tf.not_equal(tf.argmax(targets[i], 1), tf.argmax(prediction[i], 1)) for i in range(N_out)]
errors = [tf.reduce_mean(tf.cast(m, tf.float32)) for m in mistakes]

# Summaries
mean_error = tf.scalar_mul(np.true_divide(1,N_out), tf.add_n(errors))
tf.summary.scalar('error', mean_error)

# Initialise the model
init_op = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init_op)

merged_summaries = tf.summary.merge_all()
file_writer = tf.summary.FileWriter(LOG_DIR, sess.graph)

[<tf.Tensor 'gradients/NTM_12/split_grad/concat:0' shape=(500, 2916) dtype=float32>, None, None]
[<tf.Tensor 'gradients/NTM_11/split_grad/concat:0' shape=(500, 2916) dtype=float32>, None, None]
[<tf.Tensor 'gradients/NTM_10/split_grad/concat:0' shape=(500, 2916) dtype=float32>, None, None]
[<tf.Tensor 'gradients/NTM_8/split_grad/concat:0' shape=(500, 2916) dtype=float32>, None, None]
[<tf.Tensor 'gradients/NTM_6/split_grad/concat:0' shape=(500, 2916) dtype=float32>, None, None]
[<tf.Tensor 'gradients/NTM_4/split_grad/concat:0' shape=(500, 2916) dtype=float32>, None, None]
[<tf.Tensor 'gradients/NTM_2/split_grad/concat:0' shape=(500, 2916) dtype=float32>, None, None]
[<tf.Tensor 'gradients/NTM/split_grad/concat:0' shape=(500, 2916) dtype=float32>, None, None]


In [None]:
############
# TRAINING #
############

random.seed()

pre_train_time = time.time()

# Training
no_of_batches = int(num_training/batch_size)

# An annoying thing here is that we cannot use a list as a key in a 
# dictionary. The workaround we found on StackOverflow here:
# http://stackoverflow.com/questions/33684657/issue-feeding-a-list-into-feed-dict-in-tensorflow)

# epoch is a global var
for i in range(epoch):
    for j in range(no_of_batches):
        inp = []
        out = []

        # We sample each batch on the fly from the set of all sequences
        for z in range(batch_size):
            # construct a sequence from 0,...,num_classes - 3 then append initial and terminal symbols
            a = [random.randint(0,num_classes-3) for k in range(N-2)]
            fa = func_to_learn(a)
            a = [init_symbol] + a + [term_symbol]
            a_onehot = [one_hots[e] for e in a]
            fa_onehot = [one_hots[e] for e in fa]
            inp.append(np.array(a_onehot))
            out.append(np.array(fa_onehot))        
        
        feed_dict = {}
        for d in range(N):
            in_node = inputs[d]
            # inp has dimensions [batch_size, N, num_classes] and we want to extract
            # the 2D Tensor of shape [batch_size, num_classes] obtained by setting the
            # second coordinate to d
            ti = []
            for k in range(batch_size):
                ti.append(inp[k][d])
            feed_dict[in_node] = np.array(ti)

        for d in range(N_out):
            out_node = targets[d]
            to = []
            for k in range(batch_size):
                to.append(out[k][d])
            feed_dict[out_node] = np.array(to)
        
        # for the first batch in an epoch, we have some logging
        if( j == 0 and i % 10 == 0 ):
            gamma_reads_val, gamma_writes_val, read_addresses_val, write_addresses_val = sess.run([gamma_reads,gamma_writes,read_addresses,write_addresses],feed_dict)
    
            s = 0
            for r in range(len(write_addresses_val)):
                print("")
                print("Step " + str(s) + " of the RNN run on the first input of first batch of this epoch")
                print("Read gamma  - " + str(gamma_reads_val[r]))
                print("Write gamma - " + str(gamma_writes_val[r]))
                print("w rotations - " + str(ss[r]))
                print("Write address -")
                print(write_addresses_val[r])
                print("Argmax - " + str(write_addresses_val[r].argmax()))
                print("")
                s = s + 1
        
        # Do gradient descent
        summary,_ = sess.run([merged_summaries,minimize], feed_dict)
        
        # Write out TensorBoard logs
        file_writer.add_summary(summary)
    current_mean = np.mean(sess.run(errors, feed_dict))
    print("Epoch - " + str(i+1) + ", Mean error of final batch in epoch - " + str(current_mean))
    
    # DEBUG
    #with tf.variable_scope("NTM",reuse=True):
    #    H = tf.get_variable("H", [controller_state_size,controller_state_size])
    #    print(sess.run(H))

# Write out variables to disk
saver = tf.train.Saver()
save_path = saver.save(sess,"/tmp/model.ckpt")
sess.close()

print("")
print("It took", time.time() - pre_train_time, "seconds to train.")


Step 0 of the RNN run on the first input of first batch of this epoch
Read gamma - [ 1.]
Write gamma - [ 1.05468929]
Write address -
[  1.00000012e+00   1.15296125e-07   7.47976401e-07   6.12190718e-07
   6.41264421e-07   8.66121042e-08   7.83794974e-07   1.38548970e-07
   4.65914724e-07   9.84654093e-07   7.81890890e-07   7.50882634e-07
   5.25193457e-07   4.21482213e-07   5.16507612e-07   2.37697364e-08
   1.19210128e-07   7.50960680e-07   8.96446124e-07   6.12471354e-07
   8.49252558e-07   6.19022273e-07   9.36312915e-07   7.93675554e-07
   2.18329191e-07   6.15960744e-07   6.57255555e-07   2.20934155e-07
   7.91735673e-08   8.83671419e-07   4.25691724e-07   5.51305163e-07
   5.59759712e-07   3.57143165e-07   8.45804209e-07   2.88852561e-07
   2.76220334e-07   5.25542589e-07   1.83848854e-07   1.03063584e-08
   7.42564055e-07   4.50999011e-07   2.01190474e-07   6.91551463e-07
   3.42148184e-07   2.97436486e-07   3.13754441e-07   2.69365898e-07
   6.12639440e-07   6.91537878e-07   7

In [None]:
###########
# TESTING #
###########

# Note that all the weights will be loaded from the saved training session
inputs_test = [tf.placeholder(tf.float32, [None,input_size]) for _ in range(Ntest)]
targets_test = [tf.placeholder(tf.float32, [None,input_size]) for _ in range(Ntest_out)]
state_test = tf.truncated_normal([batch_size, state_size], 0.0, 0.01, dtype=tf.float32)

# Set up test graph
reuse = True
for i in range(Ntest):
    output, state = cell(inputs_test[i],state_test,'NTM',reuse)

rnn_outputs_test = []
for i in range(Ntest_out):
    output, state = cell(tf.zeros([batch_size,input_size]),state_test,'NTM',reuse)
    rnn_outputs_test.append(output)
    
with tf.variable_scope("final_layer",reuse=True):
    E = tf.get_variable("E",[controller_state_size,input_size])
    F = tf.get_variable("F",[input_size],initializer=init_ops.constant_initializer(0.0))

logits_test = [tf.matmul(rnn_output, E) + F for rnn_output in rnn_outputs_test]
prediction_test = [tf.nn.softmax(logit) for logit in logits_test] 
mistakes_test = [tf.not_equal(tf.argmax(targets_test[i], 1), tf.argmax(prediction_test[i], 1)) for i in range(Ntest_out)]
errors_test = [tf.reduce_mean(tf.cast(m, tf.float32)) for m in mistakes_test]

# Restore the weights from training
sess = tf.Session()
saver.restore(sess,save_path)

# DEBUG
#with tf.variable_scope("NTM",reuse=True):
#    H = tf.get_variable("H", [controller_state_size,controller_state_size])
#    print(sess.run(H))

#### RUN TEST ####

no_of_batches = int(num_test/batch_size)
#print("Number of batches: " + str(no_of_batches))

error_means = []
for j in range(no_of_batches):
    inp = []
    out = []

    # We sample each batch on the fly from the set of all sequences
    for z in range(batch_size):
        a = [random.randint(0,num_classes-3) for k in range(Ntest-2)]
        fa = func_to_learn(a)
        a = [init_symbol] + a + [term_symbol]
        a_onehot = [one_hots[e] for e in a]
        fa_onehot = [one_hots[e] for e in fa]
        inp.append(np.array(a_onehot))
        out.append(np.array(fa_onehot))        
        
    feed_dict = {}
    for d in range(Ntest):
        in_node = inputs_test[d]
        ti = []
        for k in range(batch_size):
            ti.append(inp[k][d])
        feed_dict[in_node] = np.array(ti)

    for d in range(Ntest_out):
        out_node = targets_test[d]
        to = []
        for k in range(batch_size):
            to.append(out[k][d])
        feed_dict[out_node] = np.array(to)
            
    current_mean = np.mean(sess.run(errors_test, feed_dict))
    error_means.append(current_mean)
    print("Batch - " + str(j+1) + ", Mean error - " + str(current_mean))

final_error = np.mean(error_means)

# The first three digits of this should match the printout for the
# first three test output sequences given earlier
#data = sess.run([tf.argmax(targets[0],1), tf.argmax(prediction[0],1)],feed_dict)

#print("First digits of test outputs (actual)")
#print(data[0])
#print("First digits of test outputs (predicted)")
#print(data[1])

# print the mean of the errors in each digit for the test set.
#incorrects = sess.run(errors, feed_dict)
# print(incorrects)

print("")        
print("###########")
print("# Summary #")
print("###########")
print("")
print("model         - " + use_model)
print("task name     - " + task)
print("num_classes   - " + str(num_classes))
print("N             - " + str(N))
print("N_out         - " + str(N_out))
print("Ntest         - " + str(Ntest))
print("Ntest_out     - " + str(Ntest_out))
print("ring 1 powers - " + str(powers_ring1))
print("ring 2 powers - " + str(powers_ring2))
print("# epochs      - " + str(epoch))
print("optimizer     - " + str(model_optimizer))
print("# weights     - " + str(ntm.count_number_trainable_params()))
print("(css,mas,mcs) - (" + str(controller_state_size) + "," + str(memory_address_size) + "," + str(memory_content_size) + ")")
print("train percent - " + str(training_percent))
print("num_training  - " + str(num_training) + "/" + str(num_classes**N))
print("num_test      - " + str(num_test) + "/" + str(num_classes**N))
print("")
print("")
print("error         - " + str(final_error))
sess.close()