Numeric LSTM
=====

Esta é a implementação de uma LSTM para memorizar uma representação de números de vários algarismos. Os números são lidos um dígito de cada vez, de modo que a LSTM aprenda uma representação interna para quantidades. Em seguida, outra LSTM decodifica a célula de memória, gerando os números originais.

Funcionamento geral
----

1. LSTM encoder codifica a sequência de números.

2. LSTM decoder gera a saída, um algarismo de cada vez (é aplicado um softmax sobre a saída da rede)

Obs:

- Os embeddings são compartilhados, mas o encoder e o decoder têm seus próprios parâmetros.


In [139]:
import tensorflow as tf
import numpy as np

from __future__ import division

In [2]:
# longest decoder output: a number with at most 8 digits, followed by END
# NOTE(review): input_sequence_size is set to 9 further down in this file,
# which would need 10 output steps -- confirm which value is intended
output_sequence_size = 9

# the encoder only sees the digits 0-9
encoder_vocab = 10

# the decoder can emit every digit plus the END symbol
decoder_vocab = encoder_vocab + 1

# the embedding table covers the digits plus the END and GO symbols
vocab_size = decoder_vocab + 1

class Symbol(object):
    """
    Namespace holding the indices of the special (non-digit) symbols
    used by the RNNs. The digits 0-9 are represented by their own value.
    """
    # index of the symbol whose embedding is the first decoder input
    GO = 11
    # index of the label that closes every decoded number
    END = 10

Criação do grafo
---

**O código abaixo reseta o grafo**

In [140]:
# clear the default graph before (re)building the model in the cells below
from tensorflow.python.framework import ops
ops.reset_default_graph()

In [141]:
embedding_size = 300
num_lstm_units = embedding_size

# a number with up to 9 digits
input_sequence_size = 9

# upper bound for the number of symbols generated when decoding new inputs
max_sample_size = 20

# NOTE(review): this graph is never made the default one in the code visible
# here, so the ops below are presumably added to the default graph -- confirm
graph = tf.Graph()

# digit indices, one row per time step: [input_sequence_size, batch_size]
first_term = tf.placeholder(tf.int32, [input_sequence_size, None], 'first_term')
# one size per item in the batch; passed to the encoder as sequence_length
first_term_size = tf.placeholder(tf.int32, [None], 'first_term_size')

# we want to share the embeddings between encoder and decoder, but not all parameters
shape = [vocab_size, embedding_size]
embeddings = tf.Variable(tf.random_uniform(shape, -1.0, 1.0), name='embeddings')

lstm_initializer = tf.random_uniform_initializer(-0.1, 0.1)
lstm_cell = tf.nn.rnn_cell.LSTMCell(num_lstm_units, embedding_size,
                                    initializer=lstm_initializer)

with tf.variable_scope('output_softmax') as softmax_scope:
    # softmax to map decoder raw output to digits
    shape = [num_lstm_units, decoder_vocab]
    softmax_weights = tf.Variable(tf.truncated_normal(shape, 0.0, 0.1))
    # one bias per output class: the size has to match the number of columns
    # of softmax_weights (decoder_vocab), not the size of the embedding table
    softmax_bias = tf.Variable(tf.zeros([decoder_vocab]))

def generate_rnn_input(sequence_indices, num_time_steps):
    """
    Look up the embeddings of a batch of symbol indices and break the
    result into one tensor per time step, as expected by the RNN functions.

    :param sequence_indices: tensor of shape [sequence_size, batch_size]
    :param num_time_steps: number of pieces the first dimension is split into
    :return: a list with num_time_steps tensors of shape
        [batch_size, embedding_size]
    """
    embedded = tf.nn.embedding_lookup(embeddings, sequence_indices)
    per_step = tf.split(0, num_time_steps, embedded)

    # each piece still has a leading dimension of size 1; drop only that one
    return [tf.squeeze(piece, [0]) for piece in per_step]
    

# embedded encoder input: a list with one tensor per time step
input_1st_term = generate_rnn_input(first_term, input_sequence_size)

with tf.variable_scope('encoder') as encoder_scope:
    _, state_1st_term = tf.nn.rnn(lstm_cell, input_1st_term,
                                  sequence_length=first_term_size, dtype=tf.float32)

# create a tensor of 1's with the appropriate size and then multiply it by GO embeddings
ones = tf.ones_like(input_1st_term[0])
embedded_go = tf.nn.embedding_lookup(embeddings, Symbol.GO)
batch_embedded_go = ones * embedded_go

# the raw digit indices of each time step, used as decoder labels.
# Only the time axis is squeezed: a squeeze without an explicit axis would
# also remove the batch dimension when the batch has a single item.
input_as_list = [tf.squeeze(time_step, [0])
                 for time_step in tf.split(0, input_sequence_size, first_term)]

# the decoder is fed the GO symbol followed by the embedded input digits
decoder_inputs = [batch_embedded_go] + input_1st_term

# the END symbol is just a label; doesn't need embeddings
ones = tf.ones_like(input_as_list[0])
batch_end = ones * Symbol.END
# NOTE(review): the labels always contain all input_sequence_size digits,
# while the encoder is given sequence_length=first_term_size -- confirm that
# ignoring first_term_size here is intended
decoder_labels = input_as_list + [batch_end]

# label_weights is just used to weight the importance of each class
label_weights = [tf.ones_like(decoder_labels[0], dtype=tf.float32)
                 for _ in decoder_labels]

with tf.variable_scope('decoder') as decoder_scope:
    raw_outputs, _ = tf.nn.seq2seq.rnn_decoder(decoder_inputs, state_1st_term,
                                               lstm_cell)

def project_output(raw_outputs, return_softmax=False):
    """
    Project each decoder output with the softmax weights and bias.

    :param raw_outputs: list with one decoder output tensor per time step
    :param return_softmax: if True, return the softmax distributions. If
        False, return the logits
    :return: a list with one tensor per time step
    """
    logits = [tf.nn.xw_plus_b(step_output, softmax_weights, softmax_bias)
              for step_output in raw_outputs]

    if return_softmax:
        return [tf.nn.softmax(step) for step in logits]

    return logits

# training uses the logits (not the softmax) of every decoder time step
output_logits = project_output(raw_outputs, return_softmax=False)
loss = tf.nn.seq2seq.sequence_loss(
    output_logits, decoder_labels, label_weights)


Optimizer
---

In [142]:
learning_rate = 0.2

# step counter incremented by apply_gradients. It is not a model parameter,
# so it is kept out of the trainable variables the optimizer differentiates.
global_step = tf.Variable(0, trainable=False)

optimizer = tf.train.AdamOptimizer(learning_rate, epsilon=0.1)

# split the (gradient, variable) pairs so the gradients can be clipped together
gradients, v = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 1.25)

train_op = optimizer.apply_gradients(zip(gradients, v),
                                     global_step=global_step)

Execution of new inputs 
---

Run the decoder for a single step.

`next_symbol` and `decoder_new_state` should be called for each step inside a loop.

In [143]:
# Graph for decoding new inputs one symbol at a time.
# we use the same intermediate results of the training part until the
# encoder hidden state

# symbol fed to the decoder in this step, one index per item in the batch
digit_step = tf.placeholder(tf.int32, [None], 'digit_step')
# decoder state coming from the previous step
decoder_step_state = tf.placeholder(tf.float32, [None, lstm_cell.state_size], 
                                    'decoder_step_state')

# embed the input digits (a list with a single time step)
decoder_step_input = [tf.nn.embedding_lookup(embeddings, digit_step)]

# re-enter the training decoder scope and reuse its variables, so this
# single step decoder does not create parameters of its own
with tf.variable_scope(decoder_scope) as exec_time_decoder:
    exec_time_decoder.reuse_variables()
    decoder_step_output, decoder_new_state = tf.nn.seq2seq.rnn_decoder(decoder_step_input, 
                                                                       decoder_step_state, lstm_cell)
    
    # same projection used for training; the generated symbol is the class
    # with the highest logit
    step_logits = tf.nn.xw_plus_b(decoder_step_output[0], softmax_weights, softmax_bias)
    next_symbol = tf.argmax(step_logits, 1)
    

In [120]:
def generate_sequences(sequence_size, batch_size):
    """
    Generate random sequences of digits as an array.

    All sequences have the same size.

    :param sequence_size: the number of items (digits) actually
        contained in the sequences
    :param batch_size: number of sequences
    :return: integer array of shape (sequence_size, batch_size) with
        values drawn uniformly from 0 to 9
    """
    dims = (sequence_size, batch_size)

    # randint replaces the deprecated random_integers; its upper bound is
    # exclusive, hence 10 to be able to draw the digit 9
    return np.random.randint(0, 10, dims)


def extract_answer(input_sequence, sequence_size, max_steps=None):
    """
    Encode the input sequences and run the decoder step by step, collecting
    the symbols it generates.

    Decoding stops when every sequence in the batch has produced the END
    symbol or after max_steps steps, whichever comes first. The step in which
    the last sequence finishes is not included in the result. Sequences that
    finished earlier keep receiving whatever the decoder generates in the
    following steps.

    :param input_sequence: array of digits with shape
        [input_sequence_size, batch_size]
    :param sequence_size: the sizes fed to first_term_size, one per
        sequence in the batch
    :param max_steps: maximum number of decoder steps; defaults to
        max_sample_size. Avoids looping forever when the decoder never
        generates END for some sequence.
    :return: a list with one array of batch_size symbols per decoder step
    """
    if max_steps is None:
        max_steps = max_sample_size

    answer = []
    batch_size = input_sequence.shape[1]
    current_symbol = [Symbol.GO] * batch_size

    encoder_feeds = {first_term: input_sequence,
                     first_term_size: sequence_size}
    hidden_state = sess.run([state_1st_term], feed_dict=encoder_feeds)[0]

    # this array controls which sequences have already been finished by the
    # decoder, i.e., for which ones it already produced the END symbol
    # (the builtin bool is used because the np.bool alias was removed from numpy)
    sequences_done = np.zeros(batch_size, dtype=bool)
    for _ in range(max_steps):
        # the symbol and state produced in one step are the input of the next
        decoder_feeds = {decoder_step_state: hidden_state,
                         digit_step: current_symbol}

        fetches = sess.run([next_symbol, decoder_new_state],
                           feed_dict=decoder_feeds)
        current_symbol, hidden_state = fetches

        sequences_done |= (current_symbol == Symbol.END)

        if sequences_done.all():
            break

        answer.append(current_symbol)

    return answer

In [127]:
# every validation sequence uses all the digits of the input
sequence_sizes = np.array([input_sequence_size] * batch_size)
# one row per decoder step, one column per sequence
answer = np.array(extract_answer(random_valid, sequence_sizes))

In [135]:
# display the shape of the validation batch
random_valid.shape

(9, 64)

In [138]:
# fraction of positions in which the decoded symbol equals the input digit
# (a float thanks to the `from __future__ import division` at the top)
np.sum(answer == random_valid) / answer.size

0.80381944444444442

Execução
----

In [None]:
sess = tf.Session()
sess.run(tf.initialize_all_variables())
batch_size = 64

# loss summed over the batches seen since the last report
accumulated_loss = 0
report_interval = 50

for i in range(5000):

    # a single size for all items in the input
    sequences = generate_sequences(input_sequence_size, batch_size)
    # randint replaces the deprecated random_integers; its upper bound is
    # exclusive, hence the + 1 to keep input_sequence_size as a possible size
    sequence_size = np.random.randint(1, input_sequence_size + 1)
    sequence_size_array = np.array([sequence_size] * batch_size)

    feeds = {first_term: sequences, first_term_size: sequence_size_array}

    _, loss_value = sess.run([train_op, loss], feed_dict=feeds)
    accumulated_loss += loss_value

    # each iteration processes one batch; report the average loss of the
    # last report_interval batches
    if (i + 1) % report_interval == 0:
        print('Epoch %d' % (i + 1))
        avg_loss = accumulated_loss / report_interval
        print('Loss: %.5f' % avg_loss)
        accumulated_loss = 0

#     random_valid = generate_sequences(input_sequence_size, batch_size)
#     valid_feeds = {first_term: random_valid, first_term_size: input_sequence_size}
#     np_encoded_state = sess.run([sample_encoder_state], feed_dict=test_feeds)[0]



# sess.close()

Epoch 50
Loss: 2.23064
Epoch 100
Loss: 1.90105
Epoch 150
Loss: 1.71675
Epoch 200
Loss: 1.59983
Epoch 250
Loss: 1.44190
Epoch 300

In [41]:
# random batch of full-size sequences used to evaluate the decoder
random_valid = generate_sequences(input_sequence_size, batch_size)

In [277]:
# display the current value of sequence_size
sequence_size

2