In [1]:
import sys
sys.path.append('..')
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import math
import matplotlib.pyplot as plt
import random
import time
import dill as pickle

from IPython.display import clear_output

import dali.core as D
import dali
from dali.data import Lines, Process, DiscoverFiles, BatchBenefactor, IdentityReducer
from dali.data.batch import TranslationBatch
from dali.data.translation import TranslationFiles, TranslationMapper, build_vocabs, iterate_examples
from dali.utils.scoring import bleu, ErrorTracker
from dali.utils import (
    Vocab,
    Solver,
    median_smoothing,
    subsample,
    Throttled,
    pickle_globals,
    unpickle_globals,
)
from dali import beam_search

%matplotlib inline

In [5]:
# Make "gpu" Dali's default device for everything created after this cell.
# NOTE(review): presumably requires a GPU-enabled Dali build — confirm before running on CPU-only machines.
D.config.default_device = "gpu"

In [54]:
def generate_example(item_size, num_items, batch_size = 1):
    """Build one random bit sequence for the copy task.

    Returns a list of ``num_items`` ``D.Mat`` objects. Each one wraps a
    ``(batch_size, item_size)`` array of independent fair coin flips
    (0 or 1) and is requested as ``float32``.
    """
    shape = (batch_size, item_size)
    sequence = []
    for _ in range(num_items):
        bits = np.random.binomial(1, 0.5, shape)
        sequence.append(D.Mat(bits, dtype=np.float32))
    return sequence


def evaluate_copy(model, solver, input_size, max_train_seq_length, batch_size=1):
    """Train ``model`` on the copy task and periodically report test error.

    The model is shown a random bit sequence, then a "start" marker, and
    must reproduce the sequence one item per step while being fed empty
    inputs. Training minimises the summed squared error between the
    sigmoid of the decoded output and the original items.

    Arguments:
    model                -- object exposing initial_states(), activate(input, state),
                            decode(state) and parameters()
    solver               -- object exposing step(params)
    input_size           -- number of bits per sequence item
    max_train_seq_length -- training sequence lengths are drawn uniformly
                            from 1..max_train_seq_length (inclusive)
    batch_size           -- number of sequences per training step; also used
                            to normalise the error and to space out test runs

    Returns the list of per-step training errors, each divided by batch_size.
    Training runs for at most 100000 steps and stops early on Ctrl-C
    (KeyboardInterrupt), running one final test pass before returning.
    """
    # Inputs fed while the model is emitting its answer / asked to start answering.
    # NOTE(review): both have a single row while training batches have
    # batch_size rows; presumably Dali broadcasts them — confirm.
    empty_input      = D.Mat.zeros((1,input_size), constant=True)
    start_prediction = D.Mat.ones((1,input_size), constant=True)

    def test():
        """Print the average number of wrongly copied bits at three lengths."""
        with D.NoBackprop():
            for test_len in [max_train_seq_length // 2, max_train_seq_length, max_train_seq_length * 2]:
                print ("    Testing sequence length %d: " % (test_len,))

                NUM_TRIES = 10

                # Counts bits that differ from the target after rounding,
                # i.e. this is an error count (lower is better).
                num_bits_wrong = 0

                for _ in range(NUM_TRIES):
                    example = generate_example(input_size, test_len)
                    state = model.initial_states()
                    for input_vec in example:
                        state = model.activate(input_vec, state)

                    state = model.activate(start_prediction, state)
                    for input_vec in example:
                        decoded = np.round(model.decode(state).sigmoid().w)
                        num_bits_wrong += np.abs(decoded - input_vec.w).sum()
                        state = model.activate(empty_input, state)

                print('        Cost per sequence: %f (chance: %f)' % (num_bits_wrong / NUM_TRIES,
                                                                      test_len * input_size / 2))

    params = model.parameters()

    # NOTE(review): presumably limits progress printing to about once per second — confirm Throttled's unit.
    not_too_often = Throttled(1)
    errors = []

    # Run the test pass roughly every 10000 processed examples; the max()
    # keeps the modulus non-zero when batch_size exceeds 10000.
    test_every = max(1, 10000 // batch_size)

    for epoch in range(100000):
        try:
            # Use the batch_size argument (not a notebook-level global) so the
            # function behaves the same regardless of kernel state.
            example = generate_example(input_size, random.randint(1, max_train_seq_length), batch_size)
            state = model.initial_states()
            for input_vec in example:
                state = model.activate(input_vec, state)

            state = model.activate(start_prediction, state)
            error = D.Mat.zeros((1,1), constant=True)
            for input_vec in example:
                decoded = model.decode(state).sigmoid()

                error = error + ((decoded - input_vec)**2).sum()
                state = model.activate(empty_input, state)

            (error / batch_size).grad()
            D.Graph.backward()
            solver.step(params)

            errors.append(error.w[0,0] / batch_size)
            if not_too_often.should_i_run():
                # Smooth the reported value over the last (up to) 20 steps.
                recent = errors[-20:]
                recent_error = sum(recent) / len(recent)
                print('Epoch %d (%d processed examples), error: %f' % (epoch, epoch*batch_size, recent_error))

            if epoch % test_every == 0:
                test()
        except KeyboardInterrupt:
            print("Early stopping")
            test()
            break
    return errors
    

In [55]:
class LSTMBaseline(object):
    """Stacked-LSTM baseline with a single decoding layer on top.

    The decoder reads the hidden activations of every LSTM layer joined
    together, so its input width is the sum of all hidden sizes.
    """

    def __init__(self, input_size, hiddens, output_size):
        self.lstm = D.StackedLSTM(input_size, hiddens, memory_feeds_gates=True)
        decoder_inputs = sum(hiddens)
        self.decoder = D.Layer(decoder_inputs, output_size)

    def initial_states(self):
        """Return the LSTM stack's initial state."""
        return self.lstm.initial_states()

    def activate(self, input_vec, previous_state):
        """Advance the LSTM stack by one step and return its new state."""
        next_state = self.lstm.activate(input_vec, previous_state)
        return next_state

    def decode(self, state):
        """Map a state to an output by decoding the joined hidden activations."""
        hidden_per_layer = []
        for layer_state in state:
            hidden_per_layer.append(layer_state.hidden)
        joined = D.MatOps.hstack(hidden_per_layer)
        return self.decoder.activate(joined)

    def parameters(self):
        """Return the LSTM parameters followed by the decoder parameters."""
        lstm_params = self.lstm.parameters()
        decoder_params = self.decoder.parameters()
        return lstm_params + decoder_params

In [60]:
# Copy-task configuration: items are INPUT_SIZE bits wide and training
# sequences contain between 1 and MAX_TRAIN_SEQ_LENGTH items.
INPUT_SIZE           = 8
MAX_TRAIN_SEQ_LENGTH = 10


# Hidden sizes of the three stacked LSTM layers in the baseline model.
HIDDENS  = [256, 256, 256]
# Number of sequences per training batch.
BATCH_SIZE  = 20
# The baseline reproduces its input, so the output size equals INPUT_SIZE.
baseline_model = LSTMBaseline(INPUT_SIZE, HIDDENS, INPUT_SIZE)
# Earlier solver setting (lower learning rate, no clipval), kept for reference:
# solver = Solver("rmsprop", learning_rate=0.00003, decay_rate=0.9, smooth_eps=1e-4)
solver = Solver("rmsprop", learning_rate=0.0002, decay_rate=0.9, smooth_eps=1e-4, clipval=10.0)

# Long-running: trains for up to 100000 steps; interrupt the kernel to stop early.
errors = evaluate_copy(baseline_model, solver, INPUT_SIZE, MAX_TRAIN_SEQ_LENGTH, batch_size=BATCH_SIZE)

Epoch 0 (0 processed examples), error: 2.001773
    Testing sequence length 5: 
        Cost per sequence: 21.000000 (chance: 20.000000)
    Testing sequence length 10: 
        Cost per sequence: 41.600000 (chance: 40.000000)
    Testing sequence length 20: 
        Cost per sequence: 79.000000 (chance: 80.000000)
Epoch 1 (20 processed examples), error: 1.996219
Epoch 8 (160 processed examples), error: 10.667389
Epoch 15 (300 processed examples), error: 11.249374
Epoch 25 (500 processed examples), error: 10.499165
Epoch 34 (680 processed examples), error: 10.001463
Epoch 41 (820 processed examples), error: 10.909890
Epoch 48 (960 processed examples), error: 11.013786
Epoch 55 (1100 processed examples), error: 13.018217
Epoch 63 (1260 processed examples), error: 12.610306
Epoch 72 (1440 processed examples), error: 10.196353
Epoch 79 (1580 processed examples), error: 10.492700
Epoch 86 (1720 processed examples), error: 10.995434
Epoch 96 (1920 processed examples), error: 10.094590
Epoch

In [29]:
def consine_similarity(A,B):
    """Sum the element-wise product of ``A`` and ``B`` along axis 2.

    No division by the operands' norms is performed, so the result is a
    raw dot product along that axis rather than a normalised cosine
    similarity.
    """
    # TODO: replace with the normalizer
    elementwise = A * B
    return elementwise.sum(axis=2)

class NTMAddressing(object):
    """Addressing mechanism that turns controller outputs into memory weights.

    Five small layers read the controller outputs and produce, in order:
    a content key, a key strength, an interpolation gate, a shift
    distribution and a focus (sharpening) exponent.
    """

    def __init__(self, input_sizes, memory_locations, memory_size):
        self.memory_locations = memory_locations
        self.content_key          = D.StackedInputLayer(input_sizes, memory_size)
        self.content_key_strength = D.StackedInputLayer(input_sizes, 1)
        self.interpolation        = D.StackedInputLayer(input_sizes, 1)
        # NOTE(review): the shift is convolved with weights over memory
        # locations, yet this layer is sized by memory_size — confirm the
        # intended width.
        self.shift                = D.StackedInputLayer(input_sizes, memory_size)
        self.focus                = D.StackedInputLayer(input_sizes, 1)

    def address(self, inputs, memory, state):
        """Outputs memory location weights.

        Inputs:
        inputs -- set of vectors controlling the mechanism (e.g. LSTM output)
        memory -- current memory contents, compared against the content key
        state  -- weights from previous timestep.
        """
        # todo - should memory contents be tanhed?
        key                  = self.content_key.activate(inputs).tanh()
        # exp() keeps the key strength strictly positive.
        key_strength         = self.content_key_strength.activate(inputs).exp()
        # todo - make sure the multiplication of similarity (vector) * key strength (scalar) is correctly broadcasted
        content_weights      = D.softmax(D.MatOps.consine_similarity(key, memory) * key_strength)
        # Blend content-based weights with the previous step's weights.
        interpolation_gate   = self.interpolation.activate(inputs).sigmoid()
        interpolated_weights = interpolation_gate * content_weights + (1.0 - interpolation_gate) * state
        shift                = D.softmax(self.shift.activate(inputs))
        shifted_weights      = D.MatOps.circular_convolution(interpolated_weights, shift)
        # The sharpening exponent comes from the dedicated focus layer (not the
        # shift layer); softplus(...) + 1 keeps it above 1.
        focus                = D.MatOps.softplus(self.focus.activate(inputs)) + 1.
        focused_weights      = shifted_weights ** focus
        # Renormalise after sharpening.
        # todo - make sure it's correctly broadcasted
        sum_focused          = focused_weights.sum(axis=2)
        focused_weights      = focused_weights / D.MatOps.broadcast(sum_focused, axis=1, num_replicas=focused_weights.dims(0))
        return focused_weights

    def initial_states(self):
        """Return the initial weights: a 1 x memory_locations matrix."""
        return D.Mat(1, self.memory_locations)

    def parameters(self):
        """Return the parameters of all five addressing layers as one list."""
        res = []
        for layer in (self.content_key,
                      self.content_key_strength,
                      self.interpolation,
                      self.shift,
                      self.focus):
            res.extend(layer.parameters())
        return res

class NTMReadHead(object):
    """Read head: addresses memory and returns the weighted memory content."""

    def __init__(self, input_sizes, memory_locations, memory_size):
        self.addressing = NTMAddressing(input_sizes, memory_locations, memory_size)

    def activate(self, inputs, memory, state):
        """Read from memory.

        Inputs:
        inputs -- vectors controlling the addressing (e.g. controller hiddens)
        memory -- current memory contents
        state  -- this head's weights from the previous timestep

        Returns a (content, weights) pair; the weights are the head's new state.
        """
        weights = self.addressing.address(inputs, memory, state)
        # todo - make sure it is correctly broadcasted
        return (weights.T() * memory).sum(axis=1), weights

    def initial_states(self):
        """Return the addressing mechanism's initial weights."""
        return self.addressing.initial_states()

    def parameters(self):
        """Return the trainable parameters (those of the addressing layers)."""
        return self.addressing.parameters()


class NTMWriteHead(object):
    """Write head: addresses memory, erases part of it and adds new content."""

    def __init__(self, input_sizes, memory_locations, memory_size):
        self.addressing = NTMAddressing(input_sizes, memory_locations, memory_size)
        self.content    = D.StackedInputLayer(input_sizes, memory_size)
        # NOTE(review): the erase layer is sized by memory_locations while the
        # content layer is sized by memory_size — confirm the intended width.
        self.erase      = D.StackedInputLayer(input_sizes, memory_locations)

    def activate(self, inputs, memory, state):
        """Write to memory.

        Inputs:
        inputs -- vectors controlling the head (e.g. controller hiddens)
        memory -- current memory contents
        state  -- this head's weights from the previous timestep

        Returns a (new_memory, weights) pair; the weights are the head's new state.
        """
        weights = self.addressing.address(inputs, memory, state)
        # todo - make sure it is correctly broadcasted
        new_content = self.content.activate(inputs).tanh()
        erase       = self.erase.activate(inputs).sigmoid()
        # Erase step: scale memory down where the head attends and erase is high.
        memory = memory * (1.0 - weights * erase)
        # Add step.
        # todo - make sure it is an outer product
        memory = memory + weights.T() * new_content
        return memory, weights

    def initial_states(self):
        """Return the addressing mechanism's initial weights."""
        return self.addressing.initial_states()

    def parameters(self):
        """Return parameters of the addressing, content and erase layers.

        The content and erase layers must be included, otherwise the solver
        never updates them.
        """
        res = list(self.addressing.parameters())
        res.extend(self.content.parameters())
        res.extend(self.erase.parameters())
        return res
    

In [30]:
class NTM(object):
    """Neural Turing Machine: an LSTM controller with read/write memory heads.

    The model state is a dict with four keys: 'controller' (controller LSTM
    state), 'read_heads' and 'write_heads' (one weight state per head) and
    'memory' (the memory matrix).
    """

    def __init__(self, input_size, hiddens, output_size,
                 num_read_heads, num_write_heads,
                 memory_locations, memory_size):
        self.memory_locations = memory_locations
        self.memory_size      = memory_size

        # The controller sees the external input plus one memory-sized
        # vector per read head.
        controller_inputs = [input_size] + [memory_size for _ in range(num_read_heads)]
        self.controller   = D.StackedLSTM(controller_inputs, hiddens, memory_feeds_gates=True)
        self.decoder      = D.StackedInputLayer(hiddens, output_size)
        self.read_heads   = [NTMReadHead (hiddens, memory_locations, memory_size) for _ in range(num_read_heads) ]
        self.write_heads  = [NTMWriteHead(hiddens, memory_locations, memory_size) for _ in range(num_write_heads)]

    def activate(self, input_vec, state):
        """Run one timestep: read memory, step the controller, write memory.

        Every head is driven through its activate(inputs, memory, state)
        method. Read heads are addressed with the previous controller
        hiddens; write heads with the freshly computed ones. Write heads are
        applied in order, each seeing the memory left by the previous one.

        Returns the new state dict.
        """
        # read
        previous_hiddens     = self.extract_hiddens(state['controller'])
        new_read_heads_state = []
        read_from_memory     = []
        for head, head_state in zip(self.read_heads, state['read_heads']):
            content, new_head_state = head.activate(previous_hiddens, state['memory'], head_state)
            read_from_memory.append(content)
            new_read_heads_state.append(new_head_state)
        # activate
        new_controller_state = self.controller.activate([input_vec] + read_from_memory, state['controller'])
        # write
        new_hiddens = self.extract_hiddens(new_controller_state)
        new_memory = state['memory']
        new_write_heads_state = []
        for head, head_state in zip(self.write_heads, state['write_heads']):
            new_memory, new_head_state = head.activate(new_hiddens, new_memory, head_state)
            new_write_heads_state.append(new_head_state)
        # collect results
        new_state = {
            'controller'           : new_controller_state,
            'read_heads'           : new_read_heads_state,
            'write_heads'          : new_write_heads_state,
            'memory'               : new_memory,
        }
        return new_state

    def decode(self, state):
        """Decode the controller's hidden activations into an output."""
        return self.decoder.activate(self.extract_hiddens(state['controller']))

    def parameters(self):
        """Return parameters of the controller, decoder and every head."""
        res = []
        res.extend(self.controller.parameters())
        res.extend(self.decoder.parameters())
        for head in self.read_heads:
            res.extend(head.parameters())
        for head in self.write_heads:
            res.extend(head.parameters())
        return res

    def extract_hiddens(self, state):
        """Return the list of hidden activations, one per controller layer."""
        return [s.hidden for s in state]

    def initial_states(self):
        """Return the initial state dict with an all-zero memory matrix."""
        controller_initial = self.controller.initial_states()
        return {
            'controller':            controller_initial,
            'read_heads':            [head.initial_states() for head in self.read_heads],
            'write_heads':           [head.initial_states() for head in self.write_heads],
            # Shape is passed as a tuple, matching the other D.Mat.zeros
            # calls in this notebook.
            'memory':                D.Mat.zeros((self.memory_locations, self.memory_size))
        }

In [4]:
# Smoke test: 8-bit input and output, one 100-unit controller layer, one read
# head, one write head, and a memory of 128 locations of size 20.
# Requires the cell defining NTM to have been run first.
x = NTM(8, [100,], 8, 1, 1, 128, 20)

NameError: name 'NTM' is not defined

In [5]:
# Adds up the dimension sizes of every parameter of x.
# NOTE(review): this sums the dims rather than multiplying them, so it is not
# the number of scalar weights — confirm which was intended.
sum([sum(p.dims()) for p in x.parameters()])

NameError: name 'x' is not defined

In [3]:
# Small fixtures for the broadcasting experiments in the following cells:
# A has two rows of three values, B has a single row of three values.
A = D.Mat([[1,1,1],[2,2,2]])
B = D.Mat([[1,2,3]])

In [7]:
# Print both operands, then divide A by B replicated twice along axis 0.
# The displayed result shows each row of A divided element-wise by B's row.
print(A.w)
print(B.w)
A/ D.MatOps.broadcast(B, axis=0, num_replicas=2)

[[ 1.  1.  1.]
 [ 2.  2.  2.]]
[[ 1.  2.  3.]]


[
    [  1.000   0.500   0.333]
    [  2.000   1.000   0.667]
]

In [14]:
# Sum A's underlying array (A.w) over axis 0, giving one total per column.
A.w.sum(axis=0)

array([ 3.,  3.,  3.], dtype=float32)