Numeric LSTM
=====

Essa é a implementação de uma LSTM memorizar uma representação de números de vários algarismos. Números são lidos um dígito de cada vez, de modo que a LSTM aprenda uma representação interna para quantidades. Em seguida, uma outra LSTM decodifica a célula de memória, gerando os números originais.

Funcionamento geral
----

1. LSTM encoder codifica a sequência de números.

2. LSTM decoder gera a saída, um algarismo de cada vez (é aplicado um softmax sobre a saída da rede)

Obs:

- Embeddings são compartilhadas, mas o encoder e decoder têm seus próprios parâmetros


In [1]:
import tensorflow as tf
import numpy as np

In [19]:
# a number with 8 digits most and END
output_sequence_size = 9

# digits 0-9
encoder_vocab = 10

# digits 0-9, GO and END symbols
decoder_vocab = 12

# digits 0-9, GO and END symbols
vocab_size = 12

class Symbol(object):
    """
    Placeholder class for values used in the RNNs.
    """
    END = 10
    GO = 11

Criação do grafo
---

**Código abaixo reseta o grafo**

In [271]:
from tensorflow.python.framework import ops
ops.reset_default_graph()

In [272]:
embedding_size = 50
num_lstm_units = embedding_size

# a number with 8 digits plus at least one END symbol
input_sequence_size = 9

max_sample_size = 20

graph = tf.Graph()

# both terms are inputs to the encoder
first_term = tf.placeholder(tf.int32, [input_sequence_size, None], 'first_term')
second_term = tf.placeholder(tf.int32, [input_sequence_size, None], 'second_term')

first_term_size = tf.placeholder(tf.int32, [None], 'first_term_size')
second_term_size = tf.placeholder(tf.int32, [None], 'second_term_size')
#decoder_data = tf.placeholder(tf.int32, [output_sequence_size, None], 'results_%d' % i)

# we want to share the embeddings between encoder and decoder, but not all parameters
shape = [vocab_size, embedding_size]
embeddings = tf.Variable(tf.random_uniform(shape, -1.0, 1.0), name='embeddings')

lstm_initializer = tf.random_uniform_initializer(-0.1, 0.1)
lstm_cell = tf.nn.rnn_cell.LSTMCell(num_lstm_units, embedding_size, 
                                    initializer=lstm_initializer)

with tf.variable_scope('output_softmax') as softmax_scope:
    # softmax to map decoder raw output to digits
    shape = [num_lstm_units, vocab_size]
    softmax_weights = tf.Variable(tf.truncated_normal(shape, 0.0, 0.1))
    softmax_bias = tf.Variable(tf.zeros([vocab_size]))    

def generate_rnn_input(sequence_indices, num_time_steps):
    """
    Generate the input to the RNN from a tensor of shape [sequence_size, batch_size]
    Return a list of tensors of shape [batch_size, embedding_size]
    """
    embedded_sequence =  tf.nn.embedding_lookup(embeddings, sequence_indices)
    return [tf.squeeze(time_step, [0]) 
            for time_step in tf.split(0, num_time_steps, embedded_sequence)]
    

input_1st_term = generate_rnn_input(first_term, input_sequence_size)

with tf.variable_scope('encoder') as encoder_scope:
    _, state_1st_term = tf.nn.rnn(lstm_cell, input_1st_term, 
                                  sequence_length=first_term_size, dtype=tf.float32)
    #scope.reuse_variables()
    #_, state_2nd_term = tf.nn.rnn(lstm_cell, input_2nd_term, dtype=tf.float32)

# create a tensor of 1's with the batch size and then multiply it by the GO embedding
embedded_go = tf.nn.embedding_lookup(embeddings, Symbol.GO)
batch_go = tf.ones_like(input_1st_term[0]) * embedded_go

input_as_list = [tf.squeeze(time_step)
                 for time_step in tf.split(0, input_sequence_size, first_term)]

decoder_inputs = [batch_go] + input_1st_term[:-1]
decoder_labels = input_as_list

# label_weights is just used to weight the importance of each class
label_weights = [tf.ones_like(decoder_labels[0], dtype=tf.float32)
                 for _ in decoder_labels]

with tf.variable_scope('decoder') as decoder_scope:
    raw_outputs, _ = tf.nn.seq2seq.rnn_decoder(decoder_inputs, state_1st_term, 
                                               lstm_cell)

def project_output(raw_outputs, return_softmax=False):
    """
    Multiply the raw_outputs by a weight matrix, add a bias and return the
    softmax distribution or the logits.
    
    :param return_softmax: if True, return the softmaxes. If False, return
        the logits
    """
    output_logits = [tf.nn.xw_plus_b(time_step, softmax_weights, softmax_bias)
                     for time_step in raw_outputs]
    if not return_softmax:
        return output_logits
    
    output_softmax = [tf.nn.softmax(time_step) for time_step in output_logits]
    return output_softmax

output_logits = project_output(raw_outputs, False)
loss = tf.nn.seq2seq.sequence_loss(output_logits, decoder_labels, label_weights)

# evaluation
sample_input = tf.placeholder(tf.int32, [max_sample_size, 1], name='sample_input')
sample_size = tf.placeholder(tf.int32, 1, name='sample_size')

#embedded_sample = tf.nn.embedding_lookup(embeddings, sample_input)
#embedded_sample_list = [tf.squeeze(time_step, [0]) 
#                        for time_step in tf.split(0, max_sample_size, embedded_sample)]
sample_encoder_list_input = generate_rnn_input(sample_input, max_sample_size)

with tf.variable_scope(encoder_scope, reuse=True):
    _, sample_encoder_state = tf.nn.rnn(lstm_cell, sample_encoder_list_input, 
                                sequence_length=sample_size, dtype=tf.float32)

#sample_decoder_go = tf.ones_like(sample_encoder_input[0]) * embedded_go
sample_decoder_state = tf.placeholder(tf.float32, [1, lstm_cell.state_size], 
                                      name='sample_decoder_state')
sample_decoder_input = tf.placeholder(tf.int32, [1, 1], name='sample_decoder_input')
sample_decoder_list_input = generate_rnn_input(sample_decoder_input, 1)

with tf.variable_scope(decoder_scope, reuse=True):
    # in principle we could call the lstm_cell directly, since it's only one time step; 
    # however, its weights are defined under the scope of the rnn_decoder. 
    # So, calling the decoder is simpler
    raw_sample_outputs, new_decoder_state = tf.nn.seq2seq.rnn_decoder(sample_decoder_list_input, 
                                                                      sample_decoder_state, 
                                                                      lstm_cell)

sample_softmax = project_output(raw_sample_outputs, True)


Optimizer
---

In [273]:
learning_rate = 0.1

global_step = tf.Variable(0)

optimizer = tf.train.AdamOptimizer(learning_rate, epsilon=0.1)
gradients, v = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 1.25)

train_op = optimizer.apply_gradients(zip(gradients, v), 
                                     global_step=global_step)

Execução
----

In [283]:
sess = tf.Session()
sess.run(tf.initialize_all_variables())
batch_size = 64

for i in range(2000):
    
    sequence = np.random.random_integers(0, 9, (input_sequence_size, batch_size))
    sequence_size = np.random.random_integers(1, input_sequence_size)
    sequence[sequence_size:, :] = Symbol.END
    sequence_size = np.array([sequence_size] * batch_size)
    
    feeds = {first_term: sequence, first_term_size: sequence_size}

    _, loss_value = sess.run([train_op, loss], feed_dict=feeds)
    
    if (i + 1) % 20 == 0:
        loss_value /= sequence_size[0]
        print('Epoch %d' % (i + 1))
        print('Loss: %.5f' % loss_value)

sample_sequence = np.random.random_integers(0, 9, (20, 1))
test_feeds = {sample_input: sample_sequence, sample_size: [6]}
np_encoded_state = sess.run([sample_encoder_state], feed_dict=test_feeds)[0]

chosen_digit = [Symbol.GO]
answer = []

while len(answer) < max_sample_size:
    decoder_feeds = {sample_decoder_input: [chosen_digit],
                     sample_decoder_state: np_encoded_state}
    np_softmax, np_encoded_state = sess.run([sample_softmax[0], new_decoder_state],
                                             feed_dict=decoder_feeds)
    chosen_digit = np_softmax.argmax(1)
    
    if chosen_digit == Symbol.END:
        break
    else:
        answer.append(chosen_digit)
    

sess.close()

Epoch 20
Loss: 2.04619
Epoch 40
Loss: 1.02354
Epoch 60
Loss: 2.11340
Epoch 80
Loss: 0.52669
Epoch 100
Loss: 1.21095
Epoch 120
Loss: 2.45261
Epoch 140
Loss: 2.27681
Epoch 160
Loss: 1.69539
Epoch 180
Loss: 1.92853
Epoch 200
Loss: 1.63474
Epoch 220
Loss: 0.55691
Epoch 240
Loss: 0.52502
Epoch 260
Loss: 1.59250
Epoch 280
Loss: 0.70361
Epoch 300
Loss: 0.66664
Epoch 320
Loss: 0.61588
Epoch 340
Loss: 0.33416
Epoch 360
Loss: 0.47599
Epoch 380
Loss: 1.78752
Epoch 400
Loss: 1.75068
Epoch 420
Loss: 0.50206
Epoch 440
Loss: 0.45052
Epoch 460
Loss: 0.25359
Epoch 480
Loss: 1.02665
Epoch 500
Loss: 0.79841
Epoch 520
Loss: 0.08948
Epoch 540
Loss: 0.45765
Epoch 560
Loss: 0.43004
Epoch 580
Loss: 0.64954
Epoch 600
Loss: 0.06184
Epoch 620
Loss: 0.01829
Epoch 640
Loss: 0.23490
Epoch 660
Loss: 0.20860
Epoch 680
Loss: 0.11726
Epoch 700
Loss: 0.70820
Epoch 720
Loss: 0.09541
Epoch 740
Loss: 0.04367
Epoch 760
Loss: 0.15559
Epoch 780
Loss: 1.44031
Epoch 800
Loss: 0.03408
Epoch 820
Loss: 0.03890
Epoch 840
Loss: 1.24

In [286]:
print(sample_sequence[:, 0])

[4 3 1 6 7 5 8 1 1 1 9 3 7 6 8 1 3 0 9 6]


In [288]:
print(answer)

[array([4]), array([3]), array([1]), array([6]), array([7]), array([5])]


In [277]:
sequence_size

2