Numeric LSTM
=====

Esta é a implementação de uma LSTM para memorizar uma representação de números de vários algarismos. Os números são lidos um dígito de cada vez, de modo que a LSTM aprenda uma representação interna para quantidades. Em seguida, uma outra LSTM decodifica a célula de memória, gerando os números originais.

Funcionamento geral
----

1. LSTM encoder codifica a sequência de números.

2. LSTM decoder gera a saída, um algarismo de cada vez (é aplicado um softmax sobre a saída da rede).

Obs:

- Embeddings são compartilhadas, mas o encoder e decoder têm seus próprios parâmetros


In [145]:
import tensorflow as tf
import numpy as np

from __future__ import division

In [146]:
# digits 0-9
encoder_vocab = 10

# digits 0-9 plus the END symbol
decoder_vocab = encoder_vocab + 1

# digits 0-9 plus the GO and END symbols
vocab_size = decoder_vocab + 1

# a number with at most 8 digits, followed by END
output_sequence_size = 9

class Symbol(object):
    """
    Namespace holding the indices of the special (non-digit) symbols
    used in the RNNs.
    """
    # the special symbols take the first two indices after the digits 0-9
    END, GO = 10, 11

Criação do grafo
---

**Código abaixo reseta o grafo**

In [523]:
# Reset the default TensorFlow graph before the graph-building cell below
# (presumably so that the cell can be re-run without name clashes).
from tensorflow.python.framework import ops
ops.reset_default_graph()

In [524]:
# Model hyperparameters, input placeholders and the variables shared by the
# encoder and the decoder.
embedding_size = 300
# the LSTM is given as many units as the embeddings have dimensions
num_lstm_units = embedding_size

# a number with up to 9 digits
input_sequence_size = 9

max_sample_size = 20

# NOTE(review): no `with graph.as_default()` appears in this cell, so the ops
# below presumably end up in the default graph rather than here -- confirm
graph = tf.Graph()

# digit indices laid out as [input_sequence_size, batch_size]
first_term = tf.placeholder(tf.int32, [input_sequence_size, None], 'first_term')
# one length per sequence in the batch
first_term_size = tf.placeholder(tf.int32, [None], 'first_term_size')

# we want to share the embeddings between encoder and decoder, but not all parameters
shape = [vocab_size, embedding_size]
embeddings = tf.Variable(tf.random_uniform(shape, -1.0, 1.0), name='embeddings')

# a single cell object; the code that uses it decides the variable scope
lstm_initializer = tf.random_uniform_initializer(-0.1, 0.1)
lstm_cell = tf.nn.rnn_cell.LSTMCell(num_lstm_units, embedding_size, 
                                    initializer=lstm_initializer)

with tf.variable_scope('output_softmax') as softmax_scope:
    # softmax to map decoder raw output to digits
    shape = [num_lstm_units, decoder_vocab]
    initializer = tf.truncated_normal_initializer(0.0, 0.1)
    softmax_weights = tf.get_variable('weights', shape, tf.float32, initializer)
    
    # the bias has one entry per decoder symbol and starts at zero
    initializer = tf.zeros_initializer([decoder_vocab])
    softmax_bias = tf.get_variable('bias', initializer=initializer)

def generate_rnn_input(sequence_indices, num_time_steps):
    """
    Look up the embeddings of a batch of index sequences and cut the
    result into one tensor per time step.

    :param sequence_indices: tensor of shape [sequence_size, batch_size]
    :param num_time_steps: number of slices to cut along the first
        dimension
    :return: a list of tensors of shape [batch_size, embedding_size]
    """
    embedded = tf.nn.embedding_lookup(embeddings, sequence_indices)
    time_slices = tf.split(0, num_time_steps, embedded)

    rnn_input = []
    for time_slice in time_slices:
        # every slice keeps a leading dimension of size 1; drop it
        rnn_input.append(tf.squeeze(time_slice, [0]))

    return rnn_input
    

# Build the training graph: encode the input digits, then decode them back
# starting from the encoder's final state.
input_1st_term = generate_rnn_input(first_term, input_sequence_size)

with tf.variable_scope('encoder') as encoder_scope:
    # only the final state is needed; the per-step outputs are discarded
    _, state_1st_term = tf.nn.rnn(lstm_cell, input_1st_term, 
                                  sequence_length=first_term_size, dtype=tf.float32)

# create a tensor of 1's with the appropriate size and then multiply it by GO embeddings
ones = tf.ones_like(input_1st_term[0])
embedded_go = tf.nn.embedding_lookup(embeddings, Symbol.GO)
batch_embedded_go = ones * embedded_go

# squeeze only the time axis (as generate_rnn_input does): squeezing every
# size-1 axis would also drop the batch axis when the batch has one sequence
input_as_list = [tf.squeeze(time_step, [0])
                 for time_step in tf.split(0, input_sequence_size, first_term)]
# the decoder reads GO followed by the embedded input digits
decoder_inputs = [batch_embedded_go] + input_1st_term

# the END symbol is just a label; doesn't need embeddings
ones = tf.ones_like(input_as_list[0])
batch_end = ones * Symbol.END
# the labels are the input digits followed by END
decoder_labels = input_as_list + [batch_end]

# label_weights is just used to weight the importance of each class
label_weights = [tf.ones_like(decoder_labels[0], dtype=tf.float32)
                 for _ in decoder_labels]

with tf.variable_scope('decoder') as decoder_scope:
    raw_outputs, _ = tf.nn.seq2seq.rnn_decoder(decoder_inputs, state_1st_term, 
                                               lstm_cell)

def project_output(raw_outputs, return_softmax=False):
    """
    Apply the output layer (weight matrix plus bias) to every time step
    of the decoder output.

    :param raw_outputs: list with one decoder output tensor per time step
    :param return_softmax: if True, return the softmax distributions.
        If False, return the logits
    :return: a list with one tensor per time step
    """
    logits = []
    for step_output in raw_outputs:
        logits.append(tf.nn.xw_plus_b(step_output, softmax_weights, softmax_bias))

    if return_softmax:
        return [tf.nn.softmax(step_logits) for step_logits in logits]

    return logits

# training works on the logits, so no softmax is requested here
output_logits = project_output(raw_outputs, return_softmax=False)

Optimizer
---

In [525]:
# Loss and training op. The learning rate and the L2 constant are
# placeholders, so they must be fed on every run.
learning_rate = tf.placeholder(tf.float32, name='learning_rate')
l2_constant = tf.placeholder(tf.float32, name='l2_constant')

# sequence loss over all decoder steps, plus an L2 penalty that is applied
# to the softmax weights only
labeled_loss = tf.nn.seq2seq.sequence_loss(output_logits, decoder_labels, label_weights)
l2_loss = l2_constant * tf.nn.l2_loss(softmax_weights)
loss = labeled_loss + l2_loss

# not trainable; passed to apply_gradients below
global_step = tf.Variable(0, name='global_step', trainable=False)

# NOTE(review): epsilon=0.1 is an explicit, non-default choice -- confirm it
# is intentional
optimizer = tf.train.AdamOptimizer(learning_rate, epsilon=0.1)
# split the (gradient, variable) pairs so that the gradients can be clipped
# together by their global norm
gradients, v = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 1.25)

train_op = optimizer.apply_gradients(zip(gradients, v), 
                                     global_step=global_step)

Execution of new inputs 
---

Build a single decoder step that can be run on new inputs.

`next_symbol` and `decoder_new_state` should be evaluated once per step inside a loop, feeding each step's results back in as the next step's inputs.

In [526]:
# Single decoder step, reusing the decoder variables created for training.
# The encoder part of the training graph is used as it is; only the decoder
# is rebuilt here, so that it can be fed one symbol at a time.
digit_step = tf.placeholder(tf.int32, [None], 'digit_step')
decoder_step_state = tf.placeholder(tf.float32, [None, lstm_cell.state_size], 
                                    'decoder_step_state')

# embed the input digits
decoder_step_input = [tf.nn.embedding_lookup(embeddings, digit_step)]

with tf.variable_scope(decoder_scope) as exec_time_decoder:
    # reuse the decoder variables instead of creating new ones
    exec_time_decoder.reuse_variables()
    decoder_step_output, decoder_new_state = tf.nn.seq2seq.rnn_decoder(decoder_step_input, 
                                                                       decoder_step_state, lstm_cell)
    
    # same output projection as in training; the most likely symbol is picked
    step_logits = tf.nn.xw_plus_b(decoder_step_output[0], softmax_weights, softmax_bias)
    next_symbol = tf.argmax(step_logits, 1)
    

In [469]:
def generate_sequences(array_size, sequence_size, batch_size):
    """
    Generate a batch of random digit sequences as an array.

    All sequences have the same size. The array is filled with
    END symbols as necessary.

    :param array_size: the array size expected by the encoder/decoder
    :param sequence_size: the number of items (digits) actually
        contained in the sequences. After that many entries, the array
        is filled with the END symbol.
    :param batch_size: number of sequences
    :return: integer array of shape (array_size, batch_size)
    """
    dims = (array_size, batch_size)
    # randint replaces the deprecated random_integers; its upper bound is
    # exclusive, hence 10 to draw the digits 0-9
    sequences = np.random.randint(0, 10, dims)
    sequences[sequence_size:] = Symbol.END

    return sequences


def run_network(input_sequence, sequence_size, max_steps=None):
    """
    Encode a batch of sequences and decode it greedily, one symbol at
    a time, until every sequence has produced the END symbol.

    :param input_sequence: array of shape (array_size, batch_size) with
        the encoder input
    :param sequence_size: the size of each sequence in the batch
    :param max_steps: maximum number of decoder steps kept in the answer.
        It bounds the loop when the decoder never produces END for some
        sequence. Defaults to `max_sample_size`.
    :return: a numpy array with one row per decoder step and one column
        per sequence. The step in which the last sequence finishes is not
        included. Columns of sequences that finished earlier still hold
        whatever the decoder produced afterwards.
    """
    if max_steps is None:
        max_steps = max_sample_size

    batch_size = input_sequence.shape[1]
    current_symbol = [Symbol.GO] * batch_size

    encoder_feeds = {first_term: input_sequence,
                     first_term_size: sequence_size}
    hidden_state = sess.run(state_1st_term, feed_dict=encoder_feeds)

    # this array controls which sequences have already been finished by the
    # decoder, i.e., for which ones it already produced the END symbol
    sequences_done = np.zeros(batch_size, dtype=bool)
    answer = []
    while len(answer) < max_steps:
        decoder_feeds = {decoder_step_state: hidden_state,
                         digit_step: current_symbol}

        fetches = sess.run([next_symbol, decoder_new_state],
                           feed_dict=decoder_feeds)
        current_symbol, hidden_state = fetches

        # accumulate with "or": a sequence that produced END once stays
        # done even if the decoder produces other symbols later
        sequences_done |= (current_symbol == Symbol.END)

        if sequences_done.all():
            break

        answer.append(current_symbol)

    if not answer:
        # every sequence finished on the very first step; np.vstack would
        # raise on an empty list (int64 presumably matches tf.argmax)
        return np.empty((0, batch_size), dtype=np.int64)

    return np.vstack(answer)

Generate train and validation sets 
---

We could just keep generating random data to train and evaluate the models, but using a predefined set allows us to compare different models over a common baseline.

In [470]:
def remove_duplicates(x):
    """
    Return a copy of the array x in which every distinct column appears
    only once. The columns of the result are in lexicographic order
    (np.lexsort, so the last row is the primary key).
    """
    column_order = np.lexsort(x)
    sorted_columns = x[:, column_order]

    # after sorting, equal columns are adjacent; a column repeats its left
    # neighbour when the absolute differences between them add up to zero
    neighbour_gap = np.abs(np.diff(sorted_columns, axis=1)).sum(axis=0)

    # the first column is always kept
    keep = np.ones(x.shape[1], dtype=bool)
    keep[1:] = neighbour_gap != 0

    return sorted_columns[:, keep]
    

def shuffle_data_and_sizes(data, sizes):
    """
    Shuffle, in place, the columns of a dataset and the array with its
    sizes, starting both shuffles from the same RNG state.
    """
    saved_state = np.random.get_state()

    # data.T is a view, so shuffling its rows reorders the columns of data
    for target in (data.T, sizes):
        # rewind the generator so that both shuffles start from one state
        np.random.set_state(saved_state)
        np.random.shuffle(target)
    
    
def generate_dataset(array_size, num_sequences, return_sizes=True):
    """
    Generate one dataset of random digit sequences as a 2-dim numpy array.

    Each column is one sequence: its first rows hold random digits (0-9)
    and the remaining ones are filled with the END symbol. Sizes go from
    1 to `array_size`, and the share of sequences with a given size grows
    exponentially with that size. Columns come out grouped by increasing
    size.

    :param array_size: the array size expected by the network
    :param num_sequences: the total number of sequences (columns) in the result
    :param return_sizes: if True, returns a tuple with the dataset and
        a 1-d array with the size of each sequence
    :return: the dataset, or a tuple (dataset, sizes) if `return_sizes`
    """
    # randint replaces the deprecated random_integers; its upper bound is
    # exclusive, hence 10 to draw the digits 0-9
    data = np.random.randint(0, 10, (array_size, num_sequences))
    # the builtin int replaces the removed alias np.int
    seq_sizes = np.empty(num_sequences, dtype=int)

    possible_sizes = np.arange(1, array_size + 1)
    exps = np.exp(possible_sizes)
    proportions = exps / np.sum(exps) * num_sequences
    # rounding up makes the groups cover every column; any excess falls
    # past the end of the arrays and is cut off by the slices below
    proportions = np.ceil(proportions).astype(int)

    last_idx = 0
    for i, prop in enumerate(proportions, 1):
        until_idx = last_idx + prop

        # sequences in this group have i digits; the rest is END
        data[i:, last_idx:until_idx] = Symbol.END
        seq_sizes[last_idx:until_idx] = i

        last_idx = until_idx

    if return_sizes:
        return (data, seq_sizes)

    return data

In [452]:
def get_accuracy(data, sizes, ignore_end=True):
    """
    Get the prediction accuracy on the supplied data.

    The network answer is compared position by position with `data`.

    :param data: array of shape (array_size, num_sequences) with the
        input sequences, which are also the expected output
    :param sizes: the size of each sequence in `data`
    :param ignore_end: if True, ignore the positions where `data` has
        the END symbol
    :return: the proportion of the considered positions in which the
        answer is equal to `data`
    """
    answer = run_network(data, sizes)
    num_rows = len(data)

    if len(answer) > num_rows:
        # if the answer is longer than it should, truncate it
        answer = answer[:num_rows]
    elif len(answer) < num_rows:
        # if the answer is shorter, pad it with a value that matches no
        # symbol, so that the missing predictions count as errors
        missing_rows = num_rows - len(answer)
        padding = np.full((missing_rows, data.shape[1]), -1,
                          dtype=answer.dtype)
        answer = np.vstack([answer, padding])

    hits = answer == data

    if ignore_end:
        hits = hits[data != Symbol.END]

    # the mean of a boolean array is the proportion of hits
    return np.mean(hits)
    

In [533]:
# accuracy on the validation set, with ignore_end left at its default
get_accuracy(valid_set, valid_sizes)

0.80806278986799862

Training
----

In [527]:
# Generate the data and split it into a training and a validation set
train_size = 32000
valid_size = 1000

total_data = train_size + valid_size
data, sizes = generate_dataset(input_sequence_size, total_data)
# shuffle before splitting, keeping each size paired with its sequence
shuffle_data_and_sizes(data, sizes)

# NOTE: duplicate sequences are not removed. remove_duplicates() would first
# have to be changed to account for the sizes, and only about 5 duplicates
# were found in 32k sequences.
train_set = data[:, :train_size]
valid_set = data[:, train_size:]
train_sizes = sizes[:train_size]
valid_sizes = sizes[train_size:]

n_train = train_set.shape[1]
n_valid = valid_set.shape[1]
# print() with a single argument behaves the same in Python 2 and 3
print('Training with %d sequences; %d for validation' % (n_train, n_valid))


Training with 32000 sequences; 1000 for validation


Set some important variables
---

In [531]:
# training configuration
batch_size = 32
num_epochs = 1
report_interval = 100
save_path = '../checkpoints/basic-memorizer.dat'

# only whole batches are used
num_batches = n_train // batch_size

# running sum of the batch losses since the last report
accumulated_loss = 0

# session with freshly initialized variables
sess = tf.Session()
sess.run(tf.initialize_all_variables())

saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=1)

Actual training
---

In [519]:


# Training loop: the training loss and the validation accuracy are reported
# every `report_interval` batches.
for epoch_num in range(num_epochs):
    # loop for epochs - each one goes through the whole dataset
    shuffle_data_and_sizes(train_set, train_sizes)
    last_batch_idx = 0

    for batch_num in range(num_batches):
        batch_idx = last_batch_idx + batch_size
        batch = train_set[:, last_batch_idx:batch_idx]
        # not named `sizes`, so that the array with the sizes of the whole
        # dataset is not overwritten
        batch_sizes = train_sizes[last_batch_idx:batch_idx]
        last_batch_idx = batch_idx

        feeds = {first_term: batch,
                 first_term_size: batch_sizes,
                 l2_constant: 0.0001,
                 learning_rate: 0.1}

        _, loss_value = sess.run([train_op, loss], feed_dict=feeds)
        accumulated_loss += loss_value

        if (batch_num + 1) % report_interval == 0:
            print('Epoch %d, batch %d' % (epoch_num + 1, batch_num + 1))
            avg_loss = accumulated_loss / report_interval
            print('Train loss: %.5f' % avg_loss)
            accumulated_loss = 0

            valid_acc = get_accuracy(valid_set, valid_sizes)
            print('Validation accuracy: %f' % valid_acc)

    print('')

# the session is left open: the cells below still use it
# sess.close()

Epoch 1, batch 100
Train loss: 2.00596
Validation accuracy: 0.223626
Epoch 1, batch 200
Train loss: 1.09714
Validation accuracy: 0.383359
Epoch 1, batch 300
Train loss: 0.69080
Validation accuracy: 0.480153
Epoch 1, batch 400
Train loss: 0.55724
Validation accuracy: 0.576350
Epoch 1, batch 500
Train loss: 0.47874
Validation accuracy: 0.614734
Epoch 1, batch 600
Train loss: 0.41278
Validation accuracy: 0.630111
Epoch 1, batch 700
Train loss: 0.36679
Validation accuracy: 0.728931
Epoch 1, batch 800
Train loss: 0.30472
Validation accuracy: 0.752414
Epoch 1, batch 900
Train loss: 0.27869
Validation accuracy: 0.776851
Epoch 1, batch 1000
Train loss: 0.23319
Validation accuracy: 0.807963



Saving
---

In [520]:
# save the trainable variables to `save_path`, keeping a single checkpoint
# NOTE(review): an identical Saver is already created in the setup cell;
# this one replaces it -- confirm that creating it twice is needed
saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=1)
saver.save(sess, save_path)

'../checkpoints/basic-memorizer.dat'

Loading
---

In [532]:
# load the variables saved at `save_path` into the current session
saver.restore(sess, save_path)

In [516]:
# list the names of the trainable variables (these are the ones given to the Saver)
[v.name for v in tf.trainable_variables()]

[u'embeddings:0',
 u'output_softmax/weights:0',
 u'output_softmax/bias:0',
 u'encoder/RNN/LSTMCell/W_0:0',
 u'encoder/RNN/LSTMCell/B:0',
 u'decoder/rnn_decoder/LSTMCell/W_0:0',
 u'decoder/rnn_decoder/LSTMCell/B:0',
 u'Variable:0']

In [509]:
# Scratch cell: attempt to get the encoder LSTM weights by name.
# NOTE(review): variable reuse is not enabled here (the line below is
# commented out), so this presumably does not return the existing
# variable -- confirm
#tf.get_variable_scope().reuse_variables()
x = tf.get_variable('encoder/RNN/LSTMCell/W_0')

# with tf.variable_scope(encoder_scope, reuse=True) as scope:
#     with tf.variable_scope

In [420]:
# accuracy on the validation set, with ignore_end left at its default
get_accuracy(valid_set, valid_sizes)

0.21284425451092118