In [1]:
%load_ext autoreload
%autoreload 2
from nram import *
from opt import *
from gate import *
from data import *
from nnet import *

In [2]:
import numpy as np
import theano

In [3]:
theano.config.optimizer = 'fast_run'

In [4]:
num_registers = 2
num_timesteps = 2
max_int = 10
gates = [one, zero, add, write, read]
gate_names = ["1", "0", "Write", "Read"]

In [5]:
## Goal task: delete the first character from a null-terminated string.

## Task 1: Check whether the first character is a null-terminator.
## Task 3: Shift all characters over until the null terminator
def task_one(max_int, batch_size):
    # Create a random initial memory.
    init_mem = np.random.randint(0, max_int, size=(batch_size, max_int), dtype=np.int32)
    
    # Seed half of it with input zeros.
    seed = np.random.choice([0, 1], size=(batch_size, 1), p=[0.44, 0.56])
    init_mem[:, 0] *= seed[:, 0]
    
    # Generate an output.
    out_mem = init_mem.copy()
    out_mem[:, 1] = np.where(init_mem[:, 0] == 0, 1, 0)
    
    return init_mem, out_mem

## Task 2: Find the null terminator.
def task_two(max_int, batch_size):
    # Create a random initial memory.
    init_mem = np.random.randint(1, max_int, size=(batch_size, max_int), dtype=np.int32)
    
    # Choose where to put the zeros.
    locs = np.random.choice(range(min(num_timesteps - 1, max_int - 1)), size=(batch_size, ))
    init_mem[np.arange(batch_size), locs] = 0
    
    # Generate an output.
    out_mem = init_mem.copy()
    out_mem[np.arange(batch_size), locs + 1] = 1
    
    return init_mem, out_mem

def generate_batch(max_int, batch_size=1000, entropy=0.1, entropy_decay=0.999):
    def make_batch(timestep):
        entropy_weight = entropy * entropy_decay ** timestep
        entropy_weight = 0.0
        init_mem, out_mem = task_one(max_int, batch_size)
        cost_mask = np.ones((batch_size, max_int), dtype=np.int8)
        return encode(init_mem, max_int), out_mem, cost_mask, entropy_weight
    return make_batch

In [6]:
layer_sizes = []
params = list(mlp_weights(num_registers, layer_sizes, gates))

reg_lambda = 0
result = run(gates, num_registers, max_int, num_timesteps, len(layer_sizes), reg_lambda, params)
debug, init_mem, desired_mem, cost_mask, final_mem, final_cost, entropy_weight = result

gradients = theano.grad(final_cost, params)
train = theano.function([init_mem, desired_mem, cost_mask, entropy_weight], [final_cost] + gradients)
predict = theano.function([init_mem], final_mem)

keys = list(debug.keys())
values = [debug[k] for k in keys]
predict_instrumented = theano.function([init_mem, desired_mem, cost_mask, entropy_weight], values)
def predict_debug(*args):
    return dict(zip(keys, predict_instrumented(*args)))

In [869]:
adam_optimize(params, generate_batch(max_int), train)

Cost (t =    0): 	2856.09
Cost (t =  100): 	2636.21
Cost (t =  200): 	2333.77
Cost (t =  300): 	2079.60
Cost (t =  400): 	1802.07
Cost (t =  500): 	1571.54
Cost (t =  600): 	1343.44
Cost (t =  700): 	1143.21
Cost (t =  800): 	982.25
Cost (t =  900): 	865.61
Cost (t = 1000): 	785.96
Cost (t = 1100): 	739.75
Cost (t = 1200): 	715.50
Cost (t = 1300): 	704.61
Cost (t = 1400): 	700.32
Cost (t = 1500): 	697.40
Cost (t = 1600): 	696.27
Cost (t = 1668): 	696.30


In [780]:
b = generate_batch(max_int)
inputs, outputs, mask = b(1500)
percent_correct(predict, inputs, outputs)

10.0

In [785]:
inputs[0, :, :].argmax(axis=1)

array([0, 6, 5, 8, 1, 3, 7, 4, 1, 4])

In [782]:
outputs[0, :]

array([0, 1, 5, 8, 1, 3, 7, 4, 1, 4], dtype=int32)

In [718]:
r = predict_debug(inputs, outputs, mask)

In [732]:
def inspect(debug, sample, num_timesteps, num_registers, gate_names, gates):
    """Utility for inspecting the result of the network."""
    
    def get(name, timestep):
        return debug["%d:%s" % (timestep, name)][sample, :].argmax()
    
    def fmt(i):
        if i in range(num_registers):
            return "R" + str(i + 1)
        else:
            return "G" + str(i - num_registers)
        
    output = ""
    output += "Init: %s\n" % debug["0:gate-mem-0"][sample, :, :].argmax(axis=1)
    for timestep in range(num_timesteps):
        output += "Timestep %d:\n" % timestep
        for r in range(num_registers):
            src = fmt(get("coeff-reg-%d" % r, timestep))
            val = get("reg-%d" % r, timestep)
            output += "\tR%d' = %s\t\t%d\n" % (r, src, val)
        for g, (name, gate) in enumerate(zip(gate_names, gates)):
            src = ", ".join(fmt(get("coeff-gate-%d/%d" % (g, a), timestep)) for a in range(gate.arity))
            val = get("gate-out-%d" % g, timestep)
            output += "\tG%d' = %s(%s)\t\t%d\n" % (g, name, src, val)
        output += "\tComplete -> %.3f\n" % debug["%d:complete" % timestep][sample, 0]
        output += "\tMem -> %s\n" % str(debug["%d:gate-mem-%d" % (timestep, g)][sample, :, :].argmax(axis=1))
    return output

In [759]:
x += 1
print(inspect(r, x, num_timesteps, num_registers, gate_names, gates))

Init: [4 8 8 2 2 7 2 5 9 8]
Timestep 0:
	R0' = G1		0
	R1' = G3		4
	G0' = 1()		1
	G1' = 0()		0
	G2' = Write(G0, G0)		1
	G3' = Read(R2)		4
	Complete -> 0.000
	Mem -> [4 1 8 2 2 7 2 5 9 8]
Timestep 1:
	R0' = G1		0
	R1' = G3		2
	G0' = 1()		1
	G1' = 0()		0
	G2' = Write(G0, G1)		0
	G3' = Read(R2)		2
	Complete -> 0.000
	Mem -> [4 0 8 2 2 7 2 5 9 8]

