In [1]:
import random

In [2]:
def generate_equations(allowed_operators, dataset_size, min_value, max_value):
    """Generates pairs of equations and solutions to them.
    
       Each equation has a form of two integers with an operator in between.
       Each solution is an integer with the result of the operaion.
    
        allowed_operators: list of strings, allowed operators.
        dataset_size: an integer, number of equations to be generated.
        min_value: an integer, min value of each operand.
        max_value: an integer, max value of each operand.

        result: a list of tuples of strings (equation, solution).
    """
    sample = []
    allowed_operators[1]
    for _ in range(dataset_size):
        input_= (str(random.randint(min_value, max_value)) + allowed_operators[random.randint(0,len(allowed_operators)-1)] + str(random.randint(min_value, max_value)))
        output_= str(eval(input_))
        sample.append((input_,output_))
    return sample


def test_generate_equations():
    allowed_operators = ['+', '-']
    dataset_size = 10
    for (input_, output_) in generate_equations(allowed_operators, dataset_size, 0, 100):
        if not (type(input_) is str and type(output_) is str):
            return "Both parts should be strings."
        if eval(input_) != int(output_):
            return "The (equation: {!r}, solution: {!r}) pair is incorrect.".format(input_, output_)
    return "Tests passed."

In [3]:
print(test_generate_equations())

Tests passed.


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
allowed_operators = ['+', '-']
dataset_size = 100000
data = generate_equations(allowed_operators, dataset_size, min_value=0, max_value=9999)

train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

In [6]:
word2id = {symbol:i for i, symbol in enumerate('^$#+-1234567890')}
id2word = {i:symbol for symbol, i in word2id.items()}

In [7]:
start_symbol = '^'
end_symbol = '$'
padding_symbol = '#'

In [8]:
def sentence_to_ids(sentence, word2id, padded_len):
    """ Converts a sequence of symbols to a padded sequence of their ids.
    
      sentence: a string, input/output sequence of symbols.
      word2id: a dict, a mapping from original symbols to ids.
      padded_len: an integer, a desirable length of the sequence.

      result: a tuple of (a list of ids, an actual length of sentence).
    """
    
    sent_ids = [word2id[char] for char in sentence]
    if(len(sentence)<padded_len):
        sent_ids.append(word2id['$'])
        for x in range(padded_len-len(sentence)-1):
            sent_ids.append(word2id['#'])
        return sent_ids,len(sentence)+1    
    sent_ids[padded_len-1]=word2id['$']
    sent_len = padded_len
    
    return sent_ids, sent_len


def test_sentence_to_ids():
    sentences = [("123+123", 7), ("123+123", 8), ("123+123", 10)]
    expected_output = [([5, 6, 7, 3, 5, 6, 1], 7), 
                       ([5, 6, 7, 3, 5, 6, 7, 1], 8), 
                       ([5, 6, 7, 3, 5, 6, 7, 1, 2, 2], 8)] 
    for (sentence, padded_len), (sentence_ids, expected_length) in zip(sentences, expected_output):
        output, length = sentence_to_ids(sentence, word2id, padded_len)
        if output != sentence_ids:
            return("Convertion of '{}' for padded_len={} to {} is incorrect.".format(
                sentence, padded_len, output))
        if length != expected_length:
            return("Convertion of '{}' for padded_len={} has incorrect actual length {}.".format(
                sentence, padded_len, length))
    return("Tests passed.")

In [9]:
print(test_sentence_to_ids())

Tests passed.


In [10]:
def ids_to_sentence(ids, id2word):
    """ Converts a sequence of ids to a sequence of symbols.
    
          ids: a list, indices for the padded sequence.
          id2word:  a dict, a mapping from ids to original symbols.

          result: a list of symbols.
    """
 
    return [id2word[i] for i in ids]

In [11]:
def batch_to_ids(sentences, word2id, max_len):
    """Prepares batches of indices. 
    
       Sequences are padded to match the longest sequence in the batch,
       if it's longer than max_len, then max_len is used instead.

        sentences: a list of strings, original sequences.
        word2id: a dict, a mapping from original symbols to ids.
        max_len: an integer, max len of sequences allowed.

        result: a list of lists of ids, a list of actual lengths.
    """
    
    max_len_in_batch = min(max(len(s) for s in sentences) + 1, max_len)
    batch_ids, batch_ids_len = [], []
    for sentence in sentences:
        ids, ids_len = sentence_to_ids(sentence, word2id, max_len_in_batch)
        batch_ids.append(ids)
        batch_ids_len.append(ids_len)
    return batch_ids, batch_ids_len

In [12]:
def generate_batches(samples, batch_size=64):
    X, Y = [], []
    for i, (x, y) in enumerate(samples, 1):
        X.append(x)
        Y.append(y)
        if i % batch_size == 0:
            yield X, Y
            X, Y = [], []
    if X and Y:
        yield X, Y

In [13]:
sentences = train_set[0]
ids, sent_lens = batch_to_ids(sentences, word2id, max_len=10)
print('Input:', sentences)
print('Ids: {}\nSentences lengths: {}'.format(ids, sent_lens))

Input: ('4289-8224', '-3935')
Ids: [[8, 6, 12, 13, 4, 12, 6, 6, 8, 1], [4, 7, 13, 7, 9, 1, 2, 2, 2, 2]]
Sentences lengths: [10, 6]


In [14]:
import tensorflow as tf

In [15]:
class Seq2SeqModel(object):
    pass

In [41]:
def declare_placeholders(self):
    """Specifies placeholders for the model."""
    
    # Placeholders for input and its actual lengths.
    self.input_batch = tf.placeholder(shape=(None, None), dtype=tf.int32, name='input_batch')
    self.input_batch_lengths = tf.placeholder(shape=(None, ), dtype=tf.int32, name='input_batch_lengths')
    
    # Placeholders for groundtruth and its actual lengths.
    self.ground_truth = tf.placeholder(shape=(None, None), dtype=tf.int32, name='ground_truth')
    self.ground_truth_lengths = tf.placeholder(shape=(None, ), dtype=tf.int32, name='ground_truth_lengths')
        
    self.dropout_ph = tf.placeholder_with_default(tf.cast(1.0, tf.float32), shape=[])
    self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[])

In [42]:
Seq2SeqModel.__declare_placeholders = classmethod(declare_placeholders)

In [18]:
def create_embeddings(self, vocab_size, embeddings_size):
    """Specifies embeddings layer and embeds an input batch."""
     
    random_initializer = tf.random_uniform((vocab_size, embeddings_size), -1.0, 1.0)
    self.embeddings = tf.Variable(initial_value=random_initializer,name="embeddings_random_initialized",dtype=tf.float32)
    
    # Perform embeddings lookup for self.input_batch. 
    self.input_batch_embedded = tf.nn.embedding_lookup(self.embeddings,self.input_batch)

In [19]:
Seq2SeqModel.__create_embeddings = classmethod(create_embeddings)

In [20]:
def build_encoder(self, hidden_size):
    """Specifies encoder architecture and computes its output."""
    
    # Create GRUCell with dropout.
    encoder_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.GRUCell(num_units=hidden_size)
                                                  ,input_keep_prob=self.dropout_ph
                                                  ,dtype=tf.float32)
    
    # Create RNN with the predefined cell.
    _, self.final_encoder_state = tf.nn.dynamic_rnn(
    encoder_cell,
    self.input_batch_embedded,
    sequence_length=self.input_batch_lengths,
    dtype=tf.float32
    
)

In [21]:
Seq2SeqModel.__build_encoder = classmethod(build_encoder)

In [22]:
def build_decoder(self, hidden_size, vocab_size, max_iter, start_symbol_id, end_symbol_id):
    """Specifies decoder architecture and computes the output.
    
        Uses different helpers:
          - for train: feeding ground truth
          - for inference: feeding generated output

        As a result, self.train_outputs and self.infer_outputs are created. 
        Each of them contains two fields:
          rnn_output (predicted logits)
          sample_id (predictions).

    """
    
    # Use start symbols as the decoder inputs at the first time step.
    batch_size = tf.shape(self.input_batch)[0]
    start_tokens = tf.fill([batch_size], start_symbol_id)
    ground_truth_as_input = tf.concat([tf.expand_dims(start_tokens, 1), self.ground_truth], 1)
    
    # Use the embedding layer defined before to lookup embedings for ground_truth_as_input. 
    self.ground_truth_embedded = tf.nn.embedding_lookup(self.embeddings, ground_truth_as_input)
     
    # Create TrainingHelper for the train stage.
    train_helper = tf.contrib.seq2seq.TrainingHelper(self.ground_truth_embedded, 
                                                     self.ground_truth_lengths)
    
    # Create GreedyEmbeddingHelper for the inference stage.
    # You should provide the embedding layer, start_tokens and index of the end symbol.
    infer_helper =tf.contrib.seq2seq.GreedyEmbeddingHelper(self.embeddings, start_tokens, end_symbol_id)
    
  
    def decode(helper, scope, reuse=None):
        """Creates decoder and return the results of the decoding with a given helper."""
        
        with tf.variable_scope(scope, reuse=reuse):
            # Create GRUCell with dropout. Do not forget to set the reuse flag properly.
            decoder_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.GRUCell(num_units=hidden_size,reuse=reuse)
                                                  ,input_keep_prob=self.dropout_ph
                                                  ,dtype=tf.float32)
            
            # Create a projection wrapper.
            decoder_cell = tf.contrib.rnn.OutputProjectionWrapper(decoder_cell, vocab_size, reuse=reuse)
            
            # Create BasicDecoder, pass the defined cell, a helper, and initial state.
            # The initial state should be equal to the final state of the encoder!
            decoder = tf.contrib.seq2seq.BasicDecoder(cell=decoder_cell, helper=helper, initial_state=self.final_encoder_state)
            
            # The first returning argument of dynamic_decode contains two fields:
            #   rnn_output (predicted logits)
            #   sample_id (predictions)
            outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=decoder, maximum_iterations=max_iter, 
                                                              output_time_major=False, impute_finished=True)

            return outputs
        
    self.train_outputs = decode(train_helper, 'decode')
    self.infer_outputs = decode(infer_helper, 'decode', reuse=True)

In [23]:
Seq2SeqModel.__build_decoder = classmethod(build_decoder)

In [24]:
def compute_loss(self):
    """Computes sequence loss (masked cross-entopy loss with logits)."""
    
    weights = tf.cast(tf.sequence_mask(self.ground_truth_lengths), dtype=tf.float32)
    
    self.loss = tf.contrib.seq2seq.sequence_loss(self.train_outputs.rnn_output,self.ground_truth,weights)

In [25]:
Seq2SeqModel.__compute_loss = classmethod(compute_loss)

In [44]:
def perform_optimization(self):
    """Specifies train_op that optimizes self.loss."""
    
    self.train_op =tf.contrib.layers.optimize_loss(
        loss=self.loss,
        optimizer='Adam',
        learning_rate=self.learning_rate_ph,
        clip_gradients=1.0,
        global_step=tf.train.get_global_step()
        )


In [45]:
Seq2SeqModel.__perform_optimization = classmethod(perform_optimization)

In [28]:
def init_model(self, vocab_size, embeddings_size, hidden_size, 
               max_iter, start_symbol_id, end_symbol_id, padding_symbol_id):
    
    self.__declare_placeholders()
    self.__create_embeddings(vocab_size, embeddings_size)
    self.__build_encoder(hidden_size)
    self.__build_decoder(hidden_size, vocab_size, max_iter, start_symbol_id, end_symbol_id)
    
    # Compute loss and back-propagate.
    self.__compute_loss()
    self.__perform_optimization()
    
    # Get predictions for evaluation.
    self.train_predictions = self.train_outputs.sample_id
    self.infer_predictions = self.infer_outputs.sample_id

In [29]:
Seq2SeqModel.__init__ = classmethod(init_model)

In [30]:
def train_on_batch(self, session, X, X_seq_len, Y, Y_seq_len, learning_rate, dropout_keep_probability):
    feed_dict = {
            self.input_batch: X,
            self.input_batch_lengths: X_seq_len,
            self.ground_truth: Y,
            self.ground_truth_lengths: Y_seq_len,
            self.learning_rate_ph: learning_rate,
            self.dropout_ph: dropout_keep_probability
        }
    pred, loss, _ = session.run([
            self.train_predictions,
            self.loss,
            self.train_op], feed_dict=feed_dict)
    return pred, loss

In [31]:
Seq2SeqModel.train_on_batch = classmethod(train_on_batch)

In [32]:
def predict_for_batch(self, session, X, X_seq_len):
    feed_dict = {
            self.input_batch: X,
            self.input_batch_lengths: X_seq_len
    }
    pred = session.run([
            self.infer_predictions
        ], feed_dict=feed_dict)[0]
    return pred

def predict_for_batch_with_loss(self, session, X, X_seq_len, Y, Y_seq_len):
    feed_dict = {
            self.input_batch: X,
            self.input_batch_lengths: X_seq_len,
            self.ground_truth: Y,
            self.ground_truth_lengths: Y_seq_len
    }
    pred, loss = session.run([
            self.infer_predictions,
            self.loss,
        ], feed_dict=feed_dict)
    return pred, loss

In [33]:
Seq2SeqModel.predict_for_batch = classmethod(predict_for_batch)
Seq2SeqModel.predict_for_batch_with_loss = classmethod(predict_for_batch_with_loss)

In [46]:
tf.reset_default_graph()

model = model = Seq2SeqModel(
    vocab_size = len(word2id),
    embeddings_size=20,
    max_iter=7,
    hidden_size=512,
    start_symbol_id=word2id['^'],
    end_symbol_id=word2id['$'],
    padding_symbol_id = word2id['#']
)

batch_size = 128
n_epochs = 10
learning_rate = 0.001
dropout_keep_probability = 0.5
max_len = 20

n_step = int(len(train_set) / batch_size)

In [47]:
session = tf.Session()
session.run(tf.global_variables_initializer())
            
invalid_number_prediction_counts = []
all_model_predictions = []
all_ground_truth = []

print('Start training... \n')
for epoch in range(n_epochs):  
    random.shuffle(train_set)
    random.shuffle(test_set)
    
    print('Train: epoch', epoch + 1)
    for n_iter, (X_batch, Y_batch) in enumerate(generate_batches(train_set, batch_size=batch_size)):
        X_ids, X_sent_lens = batch_to_ids(X_batch, word2id, max_len=max_len)
        Y_ids, Y_sent_lens = batch_to_ids(Y_batch, word2id, max_len=max_len)
        # prepare the data (X_batch and Y_batch) for training
        # using function batch_to_ids
        predictions, loss = model.train_on_batch(session,X_ids, X_sent_lens,Y_ids,Y_sent_lens,learning_rate,dropout_keep_probability)
        
        if n_iter % 200 == 0:
            print("Epoch: [%d/%d], step: [%d/%d], loss: %f" % (epoch + 1, n_epochs, n_iter + 1, n_step, loss))
                
    X_sent, Y_sent = next(generate_batches(test_set, batch_size=batch_size))
    X, X_sent_lens = batch_to_ids(X_sent, word2id, max_len=max_len)
    Y, Y_sent_lens = batch_to_ids(Y_sent, word2id, max_len=max_len)
    # prepare test data (X_sent and Y_sent) for predicting 
    # quality and computing value of the loss function
    # using function batch_to_ids
    
    predictions, loss = model.predict_for_batch_with_loss(session,X,X_sent_lens,Y,Y_sent_lens)
    print('Test: epoch', epoch + 1, 'loss:', loss,)
    for x, y, p  in list(zip(X, Y, predictions))[:3]:
        print('X:',''.join(ids_to_sentence(x, id2word)))
        print('Y:',''.join(ids_to_sentence(y, id2word)))
        print('O:',''.join(ids_to_sentence(p, id2word)))
        print('')

    model_predictions = []
    ground_truth = []
    invalid_number_prediction_count = 0
    # For the whole test set calculate ground-truth values (as integer numbers)
    # and prediction values (also as integers) to calculate metrics.
    # If generated by model number is not correct (e.g. '1-1'), 
    # increase invalid_number_prediction_count and don't append this and corresponding
    # ground-truth value to the arrays.
    for X_batch, Y_batch in generate_batches(test_set, batch_size=batch_size):
        y_sent = ''.join(ids_to_sentence(y, id2word))
        y_sent = y_sent[:y_sent.find('$')]
        p_sent = ''.join(ids_to_sentence(p, id2word))
        p_sent = p_sent[:p_sent.find('$')]
        if p_sent.isdigit() or (p_sent.startswith('-') and p_sent[1:].isdigit()):
            model_predictions.append(int(p_sent))
            ground_truth.append(int(y_sent))
        else:
            invalid_number_prediction_count += 1
    
    all_model_predictions.append(model_predictions)
    all_ground_truth.append(ground_truth)
    invalid_number_prediction_counts.append(invalid_number_prediction_count)
            
print('\n...training finished.')

Start training... 

Train: epoch 1
Epoch: [1/10], step: [1/625], loss: 2.734513
Epoch: [1/10], step: [201/625], loss: 1.795438
Epoch: [1/10], step: [401/625], loss: 1.749122
Epoch: [1/10], step: [601/625], loss: 1.678461
Test: epoch 1 loss: 1.6372898
X: 3764-5447$
Y: -1683$
O: -189$^

X: 3036-1590$
Y: 1446$#
O: 2899$^

X: 9373+8004$
Y: 17377$
O: 17700$

Train: epoch 2
Epoch: [2/10], step: [1/625], loss: 1.687203
Epoch: [2/10], step: [201/625], loss: 1.565951
Epoch: [2/10], step: [401/625], loss: 1.526545
Epoch: [2/10], step: [601/625], loss: 1.480577
Test: epoch 2 loss: 1.4597286
X: 3195+5885$
Y: 9080$#
O: 9706$^

X: 3667+6634$
Y: 10301$
O: 10166$

X: 8477+9961$
Y: 18438$
O: 17177$

Train: epoch 3
Epoch: [3/10], step: [1/625], loss: 1.491660
Epoch: [3/10], step: [201/625], loss: 1.469961
Epoch: [3/10], step: [401/625], loss: 1.468264
Epoch: [3/10], step: [601/625], loss: 1.382922
Test: epoch 3 loss: 1.348758
X: 1452-6156$
Y: -4704$
O: -4823$

X: 4056-2062$
Y: 1994$#
O: 2233$^

X: 3965+

In [48]:
from sklearn.metrics import mean_absolute_error

In [49]:
for i, (gts, predictions, invalid_number_prediction_count) in enumerate(zip(all_ground_truth,
                                                                            all_model_predictions,
                                                                            invalid_number_prediction_counts), 1):
    mae = mean_absolute_error(gts, predictions)
    print("Epoch: %i, MAE: %f, Invalid numbers: %i" % (i, mae, invalid_number_prediction_count))

Epoch: 1, MAE: 323.000000, Invalid numbers: 0
Epoch: 2, MAE: 1261.000000, Invalid numbers: 0
Epoch: 3, MAE: 735.000000, Invalid numbers: 0
Epoch: 4, MAE: 359.000000, Invalid numbers: 0
Epoch: 5, MAE: 434.000000, Invalid numbers: 0
Epoch: 6, MAE: 42.000000, Invalid numbers: 0
Epoch: 7, MAE: 57.000000, Invalid numbers: 0
Epoch: 8, MAE: 9.000000, Invalid numbers: 0
Epoch: 9, MAE: 22.000000, Invalid numbers: 0
Epoch: 10, MAE: 18.000000, Invalid numbers: 0
