From 15b14a87c5abe5046933a3870149d2a8910c7bd4 Mon Sep 17 00:00:00 2001
From: Denny Britz
Date: Sat, 10 Dec 2016 20:30:39 -0800
Subject: [PATCH] Enforce styling with YAPF

---
 .style.yapf                           |   4 +
 pylintrc                              |   6 +-
 seq2seq/decoders/attention.py         |  35 ++++---
 seq2seq/decoders/attention_decoder.py |  65 ++++++----
 seq2seq/decoders/basic_decoder.py     |  37 ++++---
 seq2seq/decoders/decoder_base.py      | 125 ++++++++++++---------
 seq2seq/encoders/rnn_encoder.py       |  27 ++---
 seq2seq/graph_module.py               |  13 +--
 seq2seq/inputs.py                     |  82 +++++++++------
 seq2seq/losses.py                     |   7 +-
 seq2seq/models/attention_seq2seq.py   |  58 ++++++-----
 seq2seq/models/basic_seq2seq.py       |  59 ++++++-----
 seq2seq/models/model_base.py          | 121 ++++++++++++----------
 seq2seq/scripts/generate_examples.py  |  19 ++--
 seq2seq/scripts/generate_toy_data.py  |  59 +++++++----
 seq2seq/scripts/generate_vocab.py     |  48 ++++++---
 seq2seq/scripts/profile.py            |  31 +++---
 seq2seq/test/attention_test.py        |  11 +-
 seq2seq/test/decoder_test.py          |  71 +++++++------
 seq2seq/test/hparams_parser_test.py   |  21 ++--
 seq2seq/test/inputs_test.py           |  19 ++--
 seq2seq/test/losses_test.py           |  11 +-
 seq2seq/test/models_test.py           | 141 ++++++++++++++++----------
 seq2seq/test/rnn_encoder_test.py      |  60 ++++++-----
 seq2seq/test/utils.py                 |   2 +
 seq2seq/training/featurizers.py       |  62 +++++++----
 seq2seq/training/hooks.py             |  64 +++++++-----
 seq2seq/training/hparams_parser.py    |   1 +
 seq2seq/training/train.py             |  63 ++++++------
 seq2seq/training/utils.py             |  92 ++++++++++-------
 30 files changed, 850 insertions(+), 564 deletions(-)
 create mode 100644 .style.yapf

diff --git a/.style.yapf b/.style.yapf
new file mode 100644
index 00000000..f499f526
--- /dev/null
+++ b/.style.yapf
@@ -0,0 +1,4 @@
+[style]
+based_on_style = google
+indent_width = 2
+column_limit = 80
\ No newline at end of file

diff --git a/pylintrc b/pylintrc
index 1632c696..3d8e3792 100644
--- a/pylintrc
+++ b/pylintrc
@@ -193,7 +193,7 @@ max-nested-blocks=5
[FORMAT]

# Maximum number of characters on a single line.
-max-line-length=100
+max-line-length=80

# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$

@@ -216,7 +216,7 @@ max-module-lines=1000
indent-string='  '

# Number of spaces of indent required inside a hanging or continued line.
-indent-after-paren=2
+indent-after-paren=4

# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=

@@ -238,7 +238,7 @@ notes=FIXME,XXX,TODO
[SIMILARITIES]

# Minimum lines number of a similarity.
-min-similarity-lines=4
+min-similarity-lines=10

# Ignore comments when computing similarities.
ignore-comments=yes

diff --git a/seq2seq/decoders/attention.py b/seq2seq/decoders/attention.py
index 909af343..7fceb02f 100644
--- a/seq2seq/decoders/attention.py
+++ b/seq2seq/decoders/attention.py
@@ -22,16 +22,19 @@ def _build(self, state, inputs):
    """Computes attention scores and outputs.

    Args:
-      state: The state based on which to calculate attention scores. In seq2seq this is typically
-        the current state of the decoder. A tensor of shape `[B, ...]`
+      state: The state based on which to calculate attention scores.
+        In seq2seq this is typically the current state of the decoder.
+        A tensor of shape `[B, ...]`
      inputs: The elements to compute attention *over*. In seq2seq this is
-        typically the sequence of encoder outputs. A tensor of shape `[B, T, input_dim]`
+        typically the sequence of encoder outputs.
+        A tensor of shape `[B, T, input_dim]`

    Returns:
      A tuple `(scores, context)`.
-      `scores` is vector of length `T` where each element is the normalized "score" of
-        the corresponding `inputs` element.
-      `context` is the final attention layer output corresponding to the weighted inputs.
+      `scores` is a vector of length `T` where each element is the
+        normalized "score" of the corresponding `inputs` element.
+      `context` is the final attention layer output corresponding to
+        the weighted inputs.
+        A tensor of shape `[B, input_dim]`.
    """
    batch_size, inputs_timesteps, _ = tf.unpack(tf.shape(inputs))
    inputs_dim = inputs.get_shape().as_list()[-1]

@@ -40,25 +43,31 @@ def _build(self, state, inputs):
    # Fully connected layers to transform both inputs and state
    # into a tensor with `num_units` units
    inputs_att = tf.contrib.layers.fully_connected(
-        inputs=inputs, num_outputs=self.num_units, activation_fn=None, scope="inputs_att")
+        inputs=inputs,
+        num_outputs=self.num_units,
+        activation_fn=None,
+        scope="inputs_att")
    state_att = tf.contrib.layers.fully_connected(
-        inputs=state, num_outputs=self.num_units, activation_fn=None, scope="state_att")
+        inputs=state,
+        num_outputs=self.num_units,
+        activation_fn=None,
+        scope="state_att")

    # Take the dot product of state for each time step in inputs
    # Result: A tensor of shape [B, T]
    inputs_att_flat = tf.reshape(inputs_att, [-1, self.num_units])
    state_att_flat = tf.reshape(
-        tf.tile(state_att, [1, inputs_timesteps]),
-        [inputs_timesteps * batch_size, self.num_units])
+        tf.tile(state_att, [1, inputs_timesteps]),
+        [inputs_timesteps * batch_size, self.num_units])
    scores = tf.batch_matmul(
-        tf.expand_dims(inputs_att_flat, 1),
-        tf.expand_dims(state_att_flat, 2))
+        tf.expand_dims(inputs_att_flat, 1), tf.expand_dims(state_att_flat, 2))
    scores = tf.reshape(scores, [batch_size, inputs_timesteps], name="scores")

    # Normalize the scores
    scores_normalized = tf.nn.softmax(scores, name="scores_normalized")

-    # Calculate the weighted average of the attention inputs according to the scores
+    # Calculate the weighted average of the attention inputs
+    # according to the scores
    context = tf.expand_dims(scores_normalized, 2) * inputs
    context = tf.reduce_sum(context, 1, name="context")
    context.set_shape([None, inputs_dim])
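For orientation, the scoring in `AttentionLayer._build` above reduces to a dot product between the projected decoder state and each projected encoder output, a softmax over time, and a weighted sum of the raw inputs. A minimal NumPy sketch of the same computation (shapes and names are illustrative, not part of the patch):

import numpy as np

def attention(state_att, inputs_att, inputs):
  # state_att: [B, num_units], inputs_att: [B, T, num_units], inputs: [B, T, D]
  scores = np.einsum("bu,btu->bt", state_att, inputs_att)  # dot products
  scores = np.exp(scores - scores.max(axis=1, keepdims=True))
  scores /= scores.sum(axis=1, keepdims=True)              # softmax over T
  context = np.einsum("bt,btd->bd", scores, inputs)        # weighted average
  return scores, context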
diff --git a/seq2seq/decoders/attention_decoder.py b/seq2seq/decoders/attention_decoder.py
index 532d40b7..46968708 100644
--- a/seq2seq/decoders/attention_decoder.py
+++ b/seq2seq/decoders/attention_decoder.py
@@ -6,28 +6,41 @@
import tensorflow as tf
from seq2seq.decoders import DecoderBase, DecoderOutput, DecoderStepOutput

+
class AttentionDecoderOutput(
    namedtuple("DecoderOutput", ["logits", "predictions", "attention_scores"])):
  """Augmented decoder output that also includes the attention scores.
  """
  pass

+
class AttentionDecoder(DecoderBase):
  """An RNN Decoder that uses attention over an input sequence.

  Args:
    cell: An instance of `tf.nn.rnn_cell.RNNCell`
-    vocab_size: Output vocabulary size, i.e. number of units in the softmax layer
-    attention_inputs: The sequence to take attentio over. A tensor of shaoe `[B, T, ...]`.
-    attention_fn: The attention function to use. This function map from `(state, inputs)` to
-      `(attention_scores, attention_context)`.
+    vocab_size: Output vocabulary size, i.e. number of units
+      in the softmax layer
+    attention_inputs: The sequence to take attention over.
+      A tensor of shape `[B, T, ...]`.
+    attention_fn: The attention function to use. This function maps from
+      `(state, inputs)` to `(attention_scores, attention_context)`.
      For an example, see `seq2seq.decoder.attention.AttentionLayer`.
-    max_decode_length: Maximum length for decoding steps for each example of shape `[B]`.
-    prediction_fn: Optional. A function that generates a predictions of shape `[B]` from a logits
-      of shape `[B, vocab_size]`. By default, this is argmax.
+    max_decode_length: Maximum length for decoding steps
+      for each example of shape `[B]`.
+    prediction_fn: Optional. A function that generates predictions
+      of shape `[B]` from logits of shape `[B, vocab_size]`.
+      By default, this is argmax.
  """
-  def __init__(self, cell, vocab_size, attention_inputs, attention_fn, max_decode_length,
-               prediction_fn=None, name="attention_decoder"):
+
+  def __init__(self,
+               cell,
+               vocab_size,
+               attention_inputs,
+               attention_fn,
+               max_decode_length,
+               prediction_fn=None,
+               name="attention_decoder"):
    super(AttentionDecoder, self).__init__(cell, max_decode_length, name)
    self.vocab_size = vocab_size
    self.prediction_fn = prediction_fn
@@ -40,7 +53,8 @@
  @staticmethod
  def _pack_outputs(outputs_ta, final_loop_state):
-    logits, predictions = DecoderBase._pack_outputs(outputs_ta, final_loop_state)
+    logits, predictions = DecoderBase._pack_outputs(outputs_ta,
+                                                    final_loop_state)
    attention_scores = tf.transpose(final_loop_state.pack(), [1, 0, 2])
    return AttentionDecoderOutput(logits, predictions, attention_scores)

  def _step(self, time_, cell_output, cell_state, loop_state, next_input_fn):
    initial_call = (cell_output is None)

    if initial_call:
-      cell_output = tf.zeros([tf.shape(self.attention_inputs)[0], self.cell.output_size])
+      cell_output = tf.zeros(
+          [tf.shape(self.attention_inputs)[0], self.cell.output_size])
      # Initialize the TensorArray that will hold the attention scores
-      next_loop_state = tf.TensorArray(dtype=tf.float32, size=1, dynamic_size=True)
+      next_loop_state = tf.TensorArray(
+          dtype=tf.float32, size=1, dynamic_size=True)

    # Compute attention
-    att_scores, attention_context = self.attention_fn(cell_output, self.attention_inputs)
+    att_scores, attention_context = self.attention_fn(cell_output,
+                                                      self.attention_inputs)

    # In the first step the attention vector is set to all zeros
    if initial_call:
@@ -64,22 +81,26 @@
    # Softmax computation
    softmax_input = tf.concat(1, [cell_output, attention_context])
    logits = tf.contrib.layers.fully_connected(
-        inputs=softmax_input, num_outputs=self.vocab_size, activation_fn=None, scope="logits")
+        inputs=softmax_input,
+        num_outputs=self.vocab_size,
+        activation_fn=None,
+        scope="logits")
    predictions = self.prediction_fn(logits)
    outputs = DecoderOutput(logits, predictions)

    if initial_call:
      outputs = DecoderOutput(
-          logits=tf.zeros([self.vocab_size]),
-          predictions=tf.zeros([], dtype=tf.int64))
+          logits=tf.zeros([self.vocab_size]),
+          predictions=tf.zeros(
+              [], dtype=tf.int64))

    # Append the attention context to the inputs
-    next_input = next_input_fn(
-        time_, (None if initial_call else cell_output), cell_state, loop_state, outputs)
+    next_input = next_input_fn(time_, (None if initial_call else cell_output),
+                               cell_state, loop_state, outputs)
    next_input = tf.concat(1, [next_input, attention_context])

    return DecoderStepOutput(
-        outputs=outputs,
-        next_input=next_input,
-        next_cell_state=cell_state,
-        next_loop_state=next_loop_state)
+        outputs=outputs,
+        next_input=next_input,
+        next_cell_state=cell_state,
+        next_loop_state=next_loop_state)
diff --git a/seq2seq/decoders/basic_decoder.py b/seq2seq/decoders/basic_decoder.py
index 5df70d22..6eeaff92 100644
--- a/seq2seq/decoders/basic_decoder.py
+++ b/seq2seq/decoders/basic_decoder.py
@@ -5,17 +5,27 @@
import tensorflow as tf
from seq2seq.decoders import DecoderBase, DecoderOutput, DecoderStepOutput

+
class BasicDecoder(DecoderBase):
-  """A simple RNN decoder that performed a softmax operations on the cell output.
+  """Simple RNN decoder that performs a softmax operation on the cell output.

  Args:
    cell: An instance of `tf.nn.rnn_cell.RNNCell`
-    vocab_size: Output vocabulary size, i.e. number of units in the softmax layer
-    max_decode_length: Maximum length for decoding steps for each example of shape `[B]`.
-    prediction_fn: Optional. A function that generates a predictions of shape `[B]` from a logits
-      of shape `[B, vocab_size]`. By default, this is argmax.
+    vocab_size: Output vocabulary size, i.e. number of units
+      in the softmax layer
+    max_decode_length: Maximum length for decoding steps for each example
+      of shape `[B]`.
+    prediction_fn: Optional. A function that generates predictions
+      of shape `[B]` from logits of shape `[B, vocab_size]`.
+      By default, this is argmax.
  """
-  def __init__(self, cell, vocab_size, max_decode_length, prediction_fn=None, name="basic_decoder"):
+
+  def __init__(self,
+               cell,
+               vocab_size,
+               max_decode_length,
+               prediction_fn=None,
+               name="basic_decoder"):
    super(BasicDecoder, self).__init__(cell, max_decode_length, name)
    self.vocab_size = vocab_size
    self.prediction_fn = prediction_fn
@@ -31,12 +41,13 @@ def _step(self, time_, cell_output, cell_state, loop_state, next_input_fn):
      cell_output = tf.zeros([1, self.cell.output_size])

    logits = tf.contrib.layers.fully_connected(
-        inputs=cell_output, num_outputs=self.vocab_size, activation_fn=None)
+        inputs=cell_output, num_outputs=self.vocab_size, activation_fn=None)

    if initial_call:
      outputs = DecoderOutput(
-          logits=tf.zeros([self.vocab_size]),
-          predictions=tf.zeros([], dtype=tf.int64))
+          logits=tf.zeros([self.vocab_size]),
+          predictions=tf.zeros(
+              [], dtype=tf.int64))
    else:
      predictions = self.prediction_fn(logits)
      outputs = DecoderOutput(logits, predictions)

    next_input = next_input_fn(time_, (None if initial_call else cell_output),
                               cell_state, loop_state, outputs)
    return DecoderStepOutput(
-        outputs=outputs,
-        next_input=next_input,
-        next_cell_state=cell_state,
-        next_loop_state=None)
+        outputs=outputs,
+        next_input=next_input,
+        next_cell_state=cell_state,
+        next_loop_state=None)
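As the decoder tests later in this patch show, a decoder is constructed once and then called with an input function, an initial state, and sequence lengths; the call returns `(outputs, final_state, final_loop_state)`. A minimal usage sketch mirroring `decoder_test.py` (tensor sizes are illustrative; `FixedDecoderInputs` is defined in decoder_base.py below):

import tensorflow as tf
from seq2seq.decoders import BasicDecoder, FixedDecoderInputs

cell = tf.nn.rnn_cell.GRUCell(64)
decoder_fn = BasicDecoder(cell=cell, vocab_size=30, max_decode_length=40)

inputs = tf.random_normal([32, 40, 64])          # [B, T, input_depth]
seq_length = tf.ones(32, dtype=tf.int32) * 40    # [B]
decoder_input_fn = FixedDecoderInputs(inputs, seq_length)

decoder_output, final_state, _ = decoder_fn(
    decoder_input_fn, cell.zero_state(32, dtype=tf.float32), seq_length)
# decoder_output.logits: [B, T, vocab_size]; decoder_output.predictions: [B, T]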
""" pass -class DecoderStepOutput(namedtuple( - "DecoderStepOutput", ["outputs", "next_cell_state", "next_input", "next_loop_state"])): + +class DecoderStepOutput( + namedtuple( + "DecoderStepOutput", + ["outputs", "next_cell_state", "next_input", "next_loop_state"])): """Output of a decoder step to be used with Tensorflow's `raw_rnn`. """ @@ -27,7 +30,13 @@ class RNNStep(GraphModule): """ A Wrapper around `raw_rnn`. """ - def __init__(self, step_fn, input_fn, initial_state, sequence_length, name="rnn_step"): + + def __init__(self, + step_fn, + input_fn, + initial_state, + sequence_length, + name="rnn_step"): super(RNNStep, self).__init__(name) self.step_fn = step_fn self.input_fn = input_fn @@ -38,7 +47,8 @@ def _build(self, time_, cell_output, cell_state, loop_state): if cell_output is None: cell_state = self.initial_state - step_output = self.step_fn(time_, cell_output, cell_state, loop_state, self.input_fn) + step_output = self.step_fn(time_, cell_output, cell_state, loop_state, + self.input_fn) assert isinstance(step_output, DecoderStepOutput), \ "Step output must be an isntance of DecoderStepOutput" @@ -47,19 +57,24 @@ def _build(self, time_, cell_output, cell_state, loop_state): else: elements_finished = (time_ >= self.sequence_length) - return (elements_finished, step_output.next_input, step_output.next_cell_state, - step_output.outputs, step_output.next_loop_state) + return (elements_finished, step_output.next_input, + step_output.next_cell_state, step_output.outputs, + step_output.next_loop_state) class FixedDecoderInputs(GraphModule): - """An operation that feeds fixed inputs to a decoder, also known as "teacher forcing". + """An operation that feeds fixed inputs to a decoder, + also known as "teacher forcing". Args: - inputs: The inputs to feed to the decoder. A tensor of shape `[B, T, ...]`. At each time - step T, one slice of shape `[B, ...]` is fed to the decoder. - sequence_length: A tensor of shape `[B]` that specifies the sequence length for each example. + inputs: The inputs to feed to the decoder. + A tensor of shape `[B, T, ...]`. At each time step T, one slice + of shape `[B, ...]` is fed to the decoder. + sequence_length: A tensor of shape `[B]` that specifies the + sequence length for each example. """ + def __init__(self, inputs, sequence_length, name="fixed_decoder_inputs"): super(FixedDecoderInputs, self).__init__(name) self.inputs = inputs @@ -67,13 +82,15 @@ def __init__(self, inputs, sequence_length, name="fixed_decoder_inputs"): with self.variable_scope(): self.inputs_ta = tf.TensorArray( - dtype=self.inputs.dtype, size=tf.shape(self.inputs)[1], name="inputs_ta") - self.inputs_ta = self.inputs_ta.unpack(tf.transpose(self.inputs, [1, 0, 2])) + dtype=self.inputs.dtype, + size=tf.shape(self.inputs)[1], + name="inputs_ta") + self.inputs_ta = self.inputs_ta.unpack( + tf.transpose(self.inputs, [1, 0, 2])) self.max_seq_len = tf.reduce_max(sequence_length, name="max_seq_len") self.batch_size = tf.identity(tf.shape(inputs)[0], name="batch_size") self.input_dim = tf.identity(tf.shape(inputs)[-1], name="input_dim") - def _build(self, time_, *args): """Returns the input for the given time step. @@ -81,29 +98,32 @@ def _build(self, time_, *args): time_: An int32 scalar Returns: - A tensor of shape `[B, ...]`. When `time_` is past the maximum sequence length - a zero tensor is fed as input for performance purposes. + A tensor of shape `[B, ...]`. When `time_` is past the maximum + sequence length a zero tensor is fed as input for performance purposes. 
""" all_finished = (time_ >= self.max_seq_len) next_input = tf.cond( - all_finished, - lambda: tf.zeros([self.batch_size, self.input_dim], dtype=tf.float32), - lambda: self.inputs_ta.read(time_)) + all_finished, + lambda: tf.zeros([self.batch_size, self.input_dim], dtype=tf.float32), + lambda: self.inputs_ta.read(time_)) next_input.set_shape([None, self.inputs.get_shape().as_list()[-1]]) return next_input class DynamicDecoderInputs(GraphModule): - """An operation that feeds dynamic inputs to a decoder according to some arbitrary - function that creates a new input from the decoder output at the current step, e.g. - `embed(argmax(logits))`. + """An operation that feeds dynamic inputs to a decoder according to some + arbitrary function that creates a new input from the decoder output at + the current step, e.g. `embed(argmax(logits))`. Args: - initial_inputs: An input to feed at the first time step. A tensor of shape `[B, ...]`. - make_input_fn: A function that mapes from `(decoder_output) -> next_input`, where - `next_input` must be a Tensor of shape `[B, ...]`. + initial_inputs: An input to feed at the first time step. + A tensor of shape `[B, ...]`. + make_input_fn: A function that mapes from `(decoder_output) -> next_input`, + where `next_input` must be a Tensor of shape `[B, ...]`. """ - def __init__(self, initial_inputs, make_input_fn, name="fixed_decoder_inputs"): + + def __init__(self, initial_inputs, make_input_fn, + name="fixed_decoder_inputs"): super(DynamicDecoderInputs, self).__init__(name) self.initial_inputs = initial_inputs self.make_input_fn = make_input_fn @@ -117,7 +137,6 @@ def _build(self, _time_, cell_output, _cell_state, _loop_state, step_output): return next_input - class DecoderBase(GraphModule): """Base class for RNN decoders. @@ -125,6 +144,7 @@ class DecoderBase(GraphModule): cell: An instance of ` tf.nn.rnn_cell.RNNCell` name: A name for this module """ + def __init__(self, cell, max_decode_length, name): super(DecoderBase, self).__init__(name) self.cell = cell @@ -132,9 +152,9 @@ def __init__(self, cell, max_decode_length, name): def _step(self, time, cell_output, cell_state, loop_state, next_input_fn): """ - This function maps from the decoder state to the outputs of the current time step - and the state of the next step. This is where the actual decoding logic should be implemented - by subclasses. + This function maps from the decoder state to the outputs of the current + time step and the state of the next step. This is where the actual decoding + logic should be implemented by subclasses. The arguments to this function follow those of `tf.nn.raw_rnn`. Refer to its documentation for further explanation. @@ -146,19 +166,22 @@ def _step(self, time, cell_output, cell_state, loop_state, next_input_fn): cell_state: The state result of applying the cell function to the input. A tensor of shape `[B, cell.state_size]`. This may also be a tuple depending on which type of cell is being used. - loop_state: An optional tuple that can be used to pass state through time steps. - The shape of this is defined by the subclass. - next_input_fn: A function that generates the next input, e.g. an instance of - `FixedDecoderInputs` or `DynamicDecoderInputs`. + loop_state: An optional tuple that can be used to pass state through + time steps. The shape of this is defined by the subclass. + next_input_fn: A function that generates the next input, e.g. an + instance of `FixedDecoderInputs` or `DynamicDecoderInputs`. 
    Returns:
      A `DecoderStepOutput` tuple, where:

-      outputs: The RNN output at this time step. A tuple with logits and predictions
-      next_cell_state: The cell state for the next iteration. In most cases this is
-        simply the passed in `cell_state`. A tensor of shape `[B, cell.state_size]`.
-      next_input: The input to the next time step. A tensor of shape `[B, ...]`
-      next_loop_state: A new loop state of the same type/shape as the passed in `loop_state`.
+      outputs: The RNN output at this time step. A tuple.
+      next_cell_state: The cell state for the next iteration. In most cases
+        this is simply the passed in `cell_state`.
+        A tensor of shape `[B, cell.state_size]`.
+      next_input: The input to the next time step.
+        A tensor of shape `[B, ...]`
+      next_loop_state: A new loop state of the same type/shape
+        as the passed in `loop_state`.
    """
    raise NotImplementedError

@@ -167,19 +190,21 @@ def _pack_outputs(outputs_ta, _final_loop_state):
    """Transposes outputs from time-major to batch-major.
    """
    logits = tf.transpose(outputs_ta.logits.pack(), [1, 0, 2], name="logits")
-    predictions = tf.transpose(outputs_ta.predictions.pack(), [1, 0], name="predictions")
+    predictions = tf.transpose(
+        outputs_ta.predictions.pack(), [1, 0], name="predictions")
    return DecoderOutput(logits=logits, predictions=predictions)

-
  def _build(self, input_fn, initial_state, sequence_length):
    if sequence_length is None:
      sequence_length = self.max_decode_length

    rnn_loop_fn = RNNStep(
-        step_fn=self._step,
-        input_fn=input_fn,
-        initial_state=initial_state,
-        sequence_length=tf.minimum(sequence_length, self.max_decode_length))
-
-    outputs_ta, final_state, final_loop_state = tf.nn.raw_rnn(self.cell, rnn_loop_fn)
-    return self._pack_outputs(outputs_ta, final_loop_state), final_state, final_loop_state
+        step_fn=self._step,
+        input_fn=input_fn,
+        initial_state=initial_state,
+        sequence_length=tf.minimum(sequence_length, self.max_decode_length))
+
+    outputs_ta, final_state, final_loop_state = tf.nn.raw_rnn(self.cell,
+                                                              rnn_loop_fn)
+    return self._pack_outputs(outputs_ta,
+                              final_loop_state), final_state, final_loop_state
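`RNNStep` adapts `_step` to the `loop_fn` contract of `tf.nn.raw_rnn`: each call returns `(elements_finished, next_input, next_cell_state, emit_output, next_loop_state)`, and the first call (where `cell_output is None`) must supply the initial state and input. A self-contained sketch of that contract, adapted from the `raw_rnn` documentation (names and sizes are illustrative):

import tensorflow as tf

cell = tf.nn.rnn_cell.GRUCell(8)
batch_size, max_time, input_depth = 4, 5, 8
inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_time)
inputs_ta = inputs_ta.unpack(
    tf.random_normal([max_time, batch_size, input_depth]))  # time-major
sequence_length = tf.constant([max_time] * batch_size)

def loop_fn(time, cell_output, cell_state, loop_state):
  emit_output = cell_output  # None on the first call; raw_rnn infers structure
  if cell_output is None:    # time == 0: supply the initial state
    next_cell_state = cell.zero_state(batch_size, tf.float32)
  else:
    next_cell_state = cell_state
  elements_finished = (time >= sequence_length)
  next_input = tf.cond(
      tf.reduce_all(elements_finished),
      lambda: tf.zeros([batch_size, input_depth], dtype=tf.float32),
      lambda: inputs_ta.read(time))
  return (elements_finished, next_input, next_cell_state, emit_output, None)

outputs_ta, final_state, _ = tf.nn.raw_rnn(cell, loop_fn)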
diff --git a/seq2seq/encoders/rnn_encoder.py b/seq2seq/encoders/rnn_encoder.py
index 772a9501..9c3be8f7 100644
--- a/seq2seq/encoders/rnn_encoder.py
+++ b/seq2seq/encoders/rnn_encoder.py
@@ -6,7 +6,9 @@
import tensorflow as tf
from seq2seq import GraphModule

-RNNEncoderOutput = collections.namedtuple("RNNEncoderOutput", ["outputs", "final_state"])
+RNNEncoderOutput = collections.namedtuple("RNNEncoderOutput",
+                                          ["outputs", "final_state"])
+

class UnidirectionalRNNEncoder(GraphModule):
  """
@@ -24,11 +26,11 @@ def __init__(self, cell, name="forward_rnn_encoder"):
  def _build(self, inputs, sequence_length, **kwargs):
    outputs, state = tf.nn.dynamic_rnn(
-        cell=self.cell,
-        inputs=inputs,
-        sequence_length=sequence_length,
-        dtype=tf.float32,
-        **kwargs)
+        cell=self.cell,
+        inputs=inputs,
+        sequence_length=sequence_length,
+        dtype=tf.float32,
+        **kwargs)
    return RNNEncoderOutput(outputs=outputs, final_state=state)

@@ -42,18 +44,19 @@
class BidirectionalRNNEncoder(GraphModule):
  """
    cell: An instance of tf.nn.rnn_cell.RNNCell
    name: A name for the encoder
  """

+
  def __init__(self, cell, name="bidi_rnn_encoder"):
    super(BidirectionalRNNEncoder, self).__init__(name)
    self.cell = cell

  def _build(self, inputs, sequence_length, **kwargs):
    outputs, states = tf.nn.bidirectional_dynamic_rnn(
-        cell_fw=self.cell,
-        cell_bw=self.cell,
-        inputs=inputs,
-        sequence_length=sequence_length,
-        dtype=tf.float32,
-        **kwargs)
+        cell_fw=self.cell,
+        cell_bw=self.cell,
+        inputs=inputs,
+        sequence_length=sequence_length,
+        dtype=tf.float32,
+        **kwargs)

    # Concatenate outputs and states of the forward and backward RNNs
    outputs_concat = tf.concat(2, outputs)

diff --git a/seq2seq/graph_module.py b/seq2seq/graph_module.py
index a32f2510..62f27a8c 100644
--- a/seq2seq/graph_module.py
+++ b/seq2seq/graph_module.py
@@ -5,14 +5,15 @@
import tensorflow as tf

+
class GraphModule(object):
  """
-  A convenience base class that makes it easy to share and access variables in the graph.
-  Each insance of this class creates its own set of variables, but each subsequent execution
-  of an instance will re-use its variables.
+  Convenience class that makes it easy to share variables.
+  Each instance of this class creates its own set of variables, but
+  each subsequent execution of an instance will re-use its variables.

-  Graph components that define variables should inherit from this class and implement their
-  logic in the `_build` method.
+  Graph components that define variables should inherit from this class
+  and implement their logic in the `_build` method.
  """

  def __init__(self, name):
@@ -23,7 +24,7 @@ def __init__(self, name):
      name: Name of this module. Used for `tf.make_template`.
    """
    self._template = tf.make_template(name, self._build, create_scope_now_=True)
-    # Docstrings for the class should be equal to the docstring for the _build method
+    # Docstrings for the class should be the docstring for the _build method
    self.__doc__ = self._build.__doc__
    # pylint: disable=E1101
    self.__call__.__func__.__doc__ = self._build.__doc__
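`GraphModule` relies on `tf.make_template` for variable sharing: the first call of an instance creates its variables, and every later call reuses them. A minimal sketch of the behavior subclasses inherit (the `Linear` module is illustrative, not part of the repo; it assumes the usual call-to-template dispatch shown in this file):

import tensorflow as tf
from seq2seq import GraphModule

class Linear(GraphModule):
  def __init__(self, num_units, name="linear"):
    super(Linear, self).__init__(name)
    self.num_units = num_units

  def _build(self, inputs):
    # Variables are created on the first call and reused afterwards.
    return tf.contrib.layers.fully_connected(
        inputs=inputs, num_outputs=self.num_units, activation_fn=None)

layer = Linear(16)
out_a = layer(tf.random_normal([8, 32]))  # creates the weight variables
out_b = layer(tf.random_normal([8, 32]))  # reuses the same variables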
""" @@ -65,23 +69,28 @@ def create_vocabulary_lookup_table(filename, default_value=None, name=None): tf.logging.info("Creating vocabulary lookup table of size %d", vocab_size) table_init = tf.contrib.lookup.TextFileIdTableInitializer( - filename, vocab_size=vocab_size) + filename, vocab_size=vocab_size) reverse_table_init = tf.contrib.lookup.TextFileInitializer( - filename=filename, - key_dtype=tf.int64, - key_index=tf.contrib.lookup.TextFileIndex.LINE_NUMBER, - value_dtype=tf.string, - value_index=tf.contrib.lookup.TextFileIndex.WHOLE_LINE, - vocab_size=vocab_size) - - vocab_to_id_table = tf.contrib.lookup.HashTable(table_init, default_value, name=name) - id_to_vocab_table = tf.contrib.lookup.HashTable(reverse_table_init, "UNK", name=name) + filename=filename, + key_dtype=tf.int64, + key_index=tf.contrib.lookup.TextFileIndex.LINE_NUMBER, + value_dtype=tf.string, + value_index=tf.contrib.lookup.TextFileIndex.WHOLE_LINE, + vocab_size=vocab_size) + + vocab_to_id_table = tf.contrib.lookup.HashTable( + table_init, default_value, name=name) + id_to_vocab_table = tf.contrib.lookup.HashTable( + reverse_table_init, "UNK", name=name) return vocab_to_id_table, id_to_vocab_table, vocab_size -def make_data_provider(data_sources, reader=tf.TFRecordReader, num_samples=None, **kwargs): +def make_data_provider(data_sources, + reader=tf.TFRecordReader, + num_samples=None, + **kwargs): """ Creates a TF Slim DatasetDataProvider for a list of input files. @@ -96,32 +105,38 @@ def make_data_provider(data_sources, reader=tf.TFRecordReader, num_samples=None, """ keys_to_features = { - "pair_id": tf.FixedLenFeature([], dtype=tf.string), - "source_len": tf.FixedLenFeature([], dtype=tf.int64), - "target_len": tf.FixedLenFeature([], dtype=tf.int64), - "source_tokens": tf.VarLenFeature(tf.string), - "target_tokens": tf.VarLenFeature(tf.string) + "pair_id": tf.FixedLenFeature( + [], dtype=tf.string), + "source_len": tf.FixedLenFeature( + [], dtype=tf.int64), + "target_len": tf.FixedLenFeature( + [], dtype=tf.int64), + "source_tokens": tf.VarLenFeature(tf.string), + "target_tokens": tf.VarLenFeature(tf.string) } items_to_handlers = { - "pair_id": tf.contrib.slim.tfexample_decoder.Tensor("pair_id"), - "source_len": tf.contrib.slim.tfexample_decoder.Tensor("source_len"), - "target_len": tf.contrib.slim.tfexample_decoder.Tensor("target_len"), - "source_tokens": tf.contrib.slim.tfexample_decoder.Tensor("source_tokens", default_value=""), - "target_tokens": tf.contrib.slim.tfexample_decoder.Tensor("target_tokens", default_value="") + "pair_id": tf.contrib.slim.tfexample_decoder.Tensor("pair_id"), + "source_len": tf.contrib.slim.tfexample_decoder.Tensor("source_len"), + "target_len": tf.contrib.slim.tfexample_decoder.Tensor("target_len"), + "source_tokens": tf.contrib.slim.tfexample_decoder.Tensor( + "source_tokens", default_value=""), + "target_tokens": tf.contrib.slim.tfexample_decoder.Tensor( + "target_tokens", default_value="") } decoder = tf.contrib.slim.tfexample_decoder.TFExampleDecoder( - keys_to_features, items_to_handlers) + keys_to_features, items_to_handlers) dataset = tf.contrib.slim.dataset.Dataset( - data_sources=data_sources, - reader=reader, - decoder=decoder, - num_samples=num_samples, - items_to_descriptions={}) + data_sources=data_sources, + reader=reader, + decoder=decoder, + num_samples=num_samples, + items_to_descriptions={}) - return tf.contrib.slim.dataset_data_provider.DatasetDataProvider(dataset, **kwargs) + return tf.contrib.slim.dataset_data_provider.DatasetDataProvider(dataset, + **kwargs) def 
def read_from_data_provider(data_provider):
@@ -131,7 +146,8 @@
    data_provider: A DataProvider instance

  Returns:
-    A dictionary of tensors corresponding to all features defined by the DataProvider
+    A dictionary of tensors corresponding to all features
+    defined by the DataProvider
  """
  item_values = data_provider.get(list(data_provider.list_items()))
  items_dict = dict(zip(data_provider.list_items(), item_values))

diff --git a/seq2seq/losses.py b/seq2seq/losses.py
index 3a9af36b..60844f19 100644
--- a/seq2seq/losses.py
+++ b/seq2seq/losses.py
@@ -3,6 +3,7 @@
import tensorflow as tf

+
def cross_entropy_sequence_loss(logits, targets, sequence_length):
  """Calculates the per-example cross-entropy loss for a sequence of logits
  and masks out all losses past the sequence length.
@@ -10,7 +11,8 @@
  Args:
    logits: Logits of shape `[B, T, vocab_size]`
    targets: Target classes of shape `[B, T]`
-    sequence_length: An int32 tensor of shape `[B]` corresponding to the length of each input
+    sequence_length: An int32 tensor of shape `[B]` corresponding
+      to the length of each input

  Returns:
    A tensor of shape [B, T] that contains the loss per example, per time step.
@@ -19,7 +21,8 @@
  losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, targets)

  # Mask out the losses we don't care about
-  loss_mask = tf.sequence_mask(tf.to_int32(sequence_length), tf.to_int32(tf.shape(targets)[1]))
+  loss_mask = tf.sequence_mask(
+      tf.to_int32(sequence_length), tf.to_int32(tf.shape(targets)[1]))
  losses = losses * tf.to_float(loss_mask)

  return losses
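The masking in `cross_entropy_sequence_loss` zeroes out the loss at positions past each example's length. A small worked example (values are illustrative):

import tensorflow as tf

# For lengths [2, 3] and T = 3, tf.sequence_mask yields
#   [[True, True, False],
#    [True, True, True ]]
# so losses at padded positions are multiplied by zero.
losses = tf.constant([[0.5, 0.7, 0.9],
                      [0.4, 0.6, 0.8]])
mask = tf.sequence_mask(tf.constant([2, 3]), 3)  # [B, T] boolean
masked = losses * tf.to_float(mask)              # [[0.5, 0.7, 0.0],
                                                 #  [0.4, 0.6, 0.8]]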
diff --git a/seq2seq/models/attention_seq2seq.py b/seq2seq/models/attention_seq2seq.py
index d03cfa9a..64bc03af 100644
--- a/seq2seq/models/attention_seq2seq.py
+++ b/seq2seq/models/attention_seq2seq.py
@@ -9,52 +9,62 @@
from seq2seq.training import utils as training_utils
from seq2seq.models.model_base import Seq2SeqBase

+
class AttentionSeq2Seq(Seq2SeqBase):
  """Sequence2Sequence model with attention mechanism.

  Args:
-    source_vocab_info: An instance of `seq2seq.inputs.VocabInfo` for the source vocabulary
-    target_vocab_info: An instance of `seq2seq.inputs.VocabInfo` for the target vocabulary
+    source_vocab_info: An instance of `seq2seq.inputs.VocabInfo`
+      for the source vocabulary
+    target_vocab_info: An instance of `seq2seq.inputs.VocabInfo`
+      for the target vocabulary
    params: A dictionary of hyperparameters
  """

-  def __init__(self, source_vocab_info, target_vocab_info, params, name="att_seq2seq"):
-    super(AttentionSeq2Seq, self).__init__(source_vocab_info, target_vocab_info, params, name)
+  def __init__(self,
+               source_vocab_info,
+               target_vocab_info,
+               params,
+               name="att_seq2seq"):
+    super(AttentionSeq2Seq, self).__init__(source_vocab_info, target_vocab_info,
+                                           params, name)

  @staticmethod
  def default_params():
    params = Seq2SeqBase.default_params().copy()
    params.update({
-        "attention.dim": 128,
-        "rnn_cell.type": "LSTMCell",
-        "rnn_cell.num_units": 128,
-        "rnn_cell.dropout_input_keep_prob": 1.0,
-        "rnn_cell.dropout_output_keep_prob": 1.0,
-        "rnn_cell.num_layers": 1
+        "attention.dim": 128,
+        "rnn_cell.type": "LSTMCell",
+        "rnn_cell.num_units": 128,
+        "rnn_cell.dropout_input_keep_prob": 1.0,
+        "rnn_cell.dropout_output_keep_prob": 1.0,
+        "rnn_cell.num_layers": 1
    })
    return params

-  def encode_decode(self, source, source_len, decoder_input_fn, target_len, labels=None):
+  def encode_decode(self, source, source_len, decoder_input_fn, target_len):
    encoder_cell = training_utils.get_rnn_cell(
-        cell_type=self.params["rnn_cell.type"],
-        num_units=self.params["rnn_cell.num_units"],
-        num_layers=self.params["rnn_cell.num_layers"],
-        dropout_input_keep_prob=self.params["rnn_cell.dropout_input_keep_prob"],
-        dropout_output_keep_prob=self.params["rnn_cell.dropout_output_keep_prob"])
+        cell_type=self.params["rnn_cell.type"],
+        num_units=self.params["rnn_cell.num_units"],
+        num_layers=self.params["rnn_cell.num_layers"],
+        dropout_input_keep_prob=self.params["rnn_cell.dropout_input_keep_prob"],
+        dropout_output_keep_prob=self.params[
+            "rnn_cell.dropout_output_keep_prob"])
    encoder_fn = encoders.BidirectionalRNNEncoder(encoder_cell)
    encoder_output = encoder_fn(source, source_len)

    decoder_cell = encoder_cell
    decoder_fn = decoders.AttentionDecoder(
-        cell=decoder_cell,
-        vocab_size=self.target_vocab_info.total_size,
-        attention_inputs=encoder_output.outputs,
-        attention_fn=decoders.AttentionLayer(self.params["attention.dim"]),
-        max_decode_length=self.params["target.max_seq_len"])
+        cell=decoder_cell,
+        vocab_size=self.target_vocab_info.total_size,
+        attention_inputs=encoder_output.outputs,
+        attention_fn=decoders.AttentionLayer(self.params["attention.dim"]),
+        max_decode_length=self.params["target.max_seq_len"])

    decoder_output, _, _ = decoder_fn(
-        input_fn=decoder_input_fn,
-        initial_state=decoder_cell.zero_state(tf.shape(source_len)[0], dtype=tf.float32),
-        sequence_length=target_len)
+        input_fn=decoder_input_fn,
+        initial_state=decoder_cell.zero_state(
+            tf.shape(source_len)[0], dtype=tf.float32),
+        sequence_length=target_len)

    return decoder_output

diff --git a/seq2seq/models/basic_seq2seq.py b/seq2seq/models/basic_seq2seq.py
index c549ec50..a030e2a6 100644
--- a/seq2seq/models/basic_seq2seq.py
+++ b/seq2seq/models/basic_seq2seq.py
@@ -7,53 +7,64 @@
import seq2seq.decoders as decoders
from seq2seq.models.model_base import Seq2SeqBase

+
class BasicSeq2Seq(Seq2SeqBase):
The last encoder - state is used to initialize the decoder and thus both must share the same type of RNN cell. + """Basic Sequence2Sequence model with a unidirectional encoder and decoder. + The last encoder state is used to initialize the decoder and thus both + must share the same type of RNN cell. Args: - source_vocab_info: An instance of `seq2seq.inputs.VocabInfo` for the source vocabulary - target_vocab_info: An instance of `seq2seq.inputs.VocabInfo` for the target vocabulary + source_vocab_info: An instance of `seq2seq.inputs.VocabInfo` + for the source vocabulary + target_vocab_info: An instance of `seq2seq.inputs.VocabInfo` + for the target vocabulary params: A dictionary of hyperparameters """ - def __init__(self, source_vocab_info, target_vocab_info, params, name="basic_seq2seq"): - super(BasicSeq2Seq, self).__init__(source_vocab_info, target_vocab_info, params, name) + def __init__(self, + source_vocab_info, + target_vocab_info, + params, + name="basic_seq2seq"): + super(BasicSeq2Seq, self).__init__(source_vocab_info, target_vocab_info, + params, name) @staticmethod def default_params(): params = Seq2SeqBase.default_params().copy() params.update({ - "rnn_cell.type": "LSTMCell", - "rnn_cell.num_units": 128, - "rnn_cell.dropout_input_keep_prob": 1.0, - "rnn_cell.dropout_output_keep_prob": 1.0, - "rnn_cell.num_layers": 1 + "rnn_cell.type": "LSTMCell", + "rnn_cell.num_units": 128, + "rnn_cell.dropout_input_keep_prob": 1.0, + "rnn_cell.dropout_output_keep_prob": 1.0, + "rnn_cell.num_layers": 1 }) return params - def encode_decode(self, source, source_len, decoder_input_fn, target_len, labels=None): + def encode_decode(self, source, source_len, decoder_input_fn, target_len): # Create Encoder encoder_cell = training.utils.get_rnn_cell( - cell_type=self.params["rnn_cell.type"], - num_units=self.params["rnn_cell.num_units"], - num_layers=self.params["rnn_cell.num_layers"], - dropout_input_keep_prob=self.params["rnn_cell.dropout_input_keep_prob"], - dropout_output_keep_prob=self.params["rnn_cell.dropout_output_keep_prob"]) + cell_type=self.params["rnn_cell.type"], + num_units=self.params["rnn_cell.num_units"], + num_layers=self.params["rnn_cell.num_layers"], + dropout_input_keep_prob=self.params["rnn_cell.dropout_input_keep_prob"], + dropout_output_keep_prob=self.params[ + "rnn_cell.dropout_output_keep_prob"]) encoder_fn = encoders.UnidirectionalRNNEncoder(encoder_cell) encoder_output = encoder_fn(source, source_len) # Create Decoder - # Because we pass the state between encoder and decoder we must use the same cell + # Because we pass the state between encoder and decoder we must + # use the same cell type decoder_cell = encoder_cell decoder_fn = decoders.BasicDecoder( - cell=decoder_cell, - vocab_size=self.target_vocab_info.total_size, - max_decode_length=self.params["target.max_seq_len"]) + cell=decoder_cell, + vocab_size=self.target_vocab_info.total_size, + max_decode_length=self.params["target.max_seq_len"]) decoder_output, _, _ = decoder_fn( - input_fn=decoder_input_fn, - initial_state=encoder_output.final_state, - sequence_length=target_len) + input_fn=decoder_input_fn, + initial_state=encoder_output.final_state, + sequence_length=target_len) return decoder_output diff --git a/seq2seq/models/model_base.py b/seq2seq/models/model_base.py index 970ab2cb..8dc1a87d 100644 --- a/seq2seq/models/model_base.py +++ b/seq2seq/models/model_base.py @@ -6,6 +6,7 @@ from seq2seq import losses as seq2seq_losses from seq2seq.training import featurizers + class ModelBase(object): """Abstract base 
  """Abstract base class for models.
@@ -13,6 +14,7 @@
    params: A dictionary of hyperparameter values
    name: A name for this model to be used as a variable scope
  """

+
  def __init__(self, params, name):
    self.name = name
    self.params = params
@@ -35,8 +37,8 @@ def __call__(self, features, labels, params, mode):
    return self._build(features, labels, params, mode)

  def _build(self, features, labels, params, mode):
-    """Subclasses should implement this method. See the model_fn documentation in
-    tf.contrib.learn.Estimator class for a more detailed explanation.
+    """Subclasses should implement this method. See the `model_fn` documentation
+    in tf.contrib.learn.Estimator class for a more detailed explanation.
    """
    raise NotImplementedError

@@ -48,6 +50,7 @@ class Seq2SeqBase(ModelBase):
  It's mostly used to define the output size of the decoder.
  Maybe we can somehow put it in the features?
  """

+
  def __init__(self, source_vocab_info, target_vocab_info, params, name):
    super(Seq2SeqBase, self).__init__(params, name)
    self.source_vocab_info = source_vocab_info
@@ -55,117 +58,127 @@ def __init__(self, source_vocab_info, target_vocab_info, params, name):

  def create_featurizer(self):
    return featurizers.Seq2SeqFeaturizer(
-        source_vocab_info=self.source_vocab_info,
-        target_vocab_info=self.target_vocab_info,
-        max_seq_len_source=self.params["source.max_seq_len"],
-        max_seq_len_target=self.params["target.max_seq_len"])
+        source_vocab_info=self.source_vocab_info,
+        target_vocab_info=self.target_vocab_info,
+        max_seq_len_source=self.params["source.max_seq_len"],
+        max_seq_len_target=self.params["target.max_seq_len"])

  @staticmethod
  def default_params():
    return {
-        "source.max_seq_len": 40,
-        "target.max_seq_len": 40,
-        "embedding.dim": 100,
-        "optimizer.name": "Adam",
-        "optimizer.learning_rate": 1e-4,
-        "optimizer.clip_gradients": 5.0,
+        "source.max_seq_len": 40,
+        "target.max_seq_len": 40,
+        "embedding.dim": 100,
+        "optimizer.name": "Adam",
+        "optimizer.learning_rate": 1e-4,
+        "optimizer.clip_gradients": 5.0,
    }

-  def encode_decode(self, source, source_len, decoder_input_fn, target_len, labels=None):
+  def encode_decode(self, source, source_len, decoder_input_fn, target_len):
    """Should be implemented by child classes"""
    raise NotImplementedError

-  def _create_predictions(self, features, labels, decoder_output, log_perplexities=None):
+  def _create_predictions(self,
+                          features,
+                          labels,
+                          decoder_output,
+                          log_perplexities=None):
    """Creates the dictionary of predictions that is returned by the model.
""" predictions = { - "logits": decoder_output.logits, - "predictions": decoder_output.predictions, + "logits": decoder_output.logits, + "predictions": decoder_output.predictions, } if log_perplexities is not None: predictions["log_perplexities"] = log_perplexities return predictions - def _build(self, features, labels, params, mode): # Create embedddings source_embedding = tf.get_variable( - "source_embedding", [self.source_vocab_info.total_size, self.params["embedding.dim"]]) + "source_embedding", + [self.source_vocab_info.total_size, self.params["embedding.dim"]]) target_embedding = tf.get_variable( - "target_embedding", [self.target_vocab_info.total_size, self.params["embedding.dim"]]) + "target_embedding", + [self.target_vocab_info.total_size, self.params["embedding.dim"]]) # Embed source - source_embedded = tf.nn.embedding_lookup(source_embedding, features["source_ids"]) + source_embedded = tf.nn.embedding_lookup(source_embedding, + features["source_ids"]) # Graph used for inference if mode == tf.contrib.learn.ModeKeys.INFER: target_start_id = self.target_vocab_info.special_vocab.SEQUENCE_START # Embed the "SEQUENCE_START" token initial_input = tf.nn.embedding_lookup( - target_embedding, tf.ones_like(features["source_len"]) * target_start_id) + target_embedding, + tf.ones_like(features["source_len"]) * target_start_id) # Use the embedded prediction as the input to the next time step decoder_input_fn_infer = decoders.DynamicDecoderInputs( - initial_inputs=initial_input, - make_input_fn=lambda x: tf.nn.embedding_lookup(target_embedding, x.predictions)) + initial_inputs=initial_input, + make_input_fn=lambda x: tf.nn.embedding_lookup( + target_embedding, x.predictions) + ) # Decode decoder_output, _ = self.encode_decode( - source=source_embedded, - source_len=features["source_len"], - decoder_input_fn=decoder_input_fn_infer, - target_len=self.params["target.max_seq_len"]) + source=source_embedded, + source_len=features["source_len"], + decoder_input_fn=decoder_input_fn_infer, + target_len=self.params["target.max_seq_len"]) predictions = self._create_predictions( - features=features, - labels=-labels, - decoder_output=decoder_output) + features=features, labels=-labels, decoder_output=decoder_output) return predictions, None, None # Embed target - target_embedded = tf.nn.embedding_lookup(target_embedding, labels["target_ids"]) + target_embedded = tf.nn.embedding_lookup(target_embedding, + labels["target_ids"]) # During training/eval, we have labels and use them for teacher forcing # We don't feed the last SEQUENCE_END token decoder_input_fn_train = decoders.FixedDecoderInputs( - inputs=target_embedded[:, :-1], - sequence_length=labels["target_len"] - 1) + inputs=target_embedded[:, :-1], + sequence_length=labels["target_len"] - 1) decoder_output = self.encode_decode( - source=source_embedded, - source_len=features["source_len"], - decoder_input_fn=decoder_input_fn_train, - target_len=labels["target_len"]) + source=source_embedded, + source_len=features["source_len"], + decoder_input_fn=decoder_input_fn_train, + target_len=labels["target_len"]) - # TODO: For a long sequence the logits are a huge [B * T, vocab_size] matrix - # which can lead to OOM errors on a GPU. Fixing this is TODO, maybe we can use map_fn - # or slice the logits to max(sequence_length). Should benchmark this. + # TODO: For a long sequence logits are a huge [B * T, vocab_size] matrix + # which can lead to OOM errors on a GPU. Fixing this is TODO, maybe we + # can use map_fn or slice the logits to max(sequence_length). 
+    # Should benchmark this.

    # Calculate loss per example-timestep of shape [B, T]
    losses = seq2seq_losses.cross_entropy_sequence_loss(
-        logits=decoder_output.logits[:, :-1, :],
-        targets=labels["target_ids"][:, 1:],
-        sequence_length=labels["target_len"] - 1)
+        logits=decoder_output.logits[:, :-1, :],
+        targets=labels["target_ids"][:, 1:],
+        sequence_length=labels["target_len"] - 1)

    # Calculate per-example losses of shape [B]
    log_perplexities = tf.div(tf.reduce_sum(
-        losses, reduction_indices=1), tf.to_float(labels["target_len"] - 1))
+        losses, reduction_indices=1),
+                              tf.to_float(labels["target_len"] - 1))

    loss = tf.reduce_mean(log_perplexities)

    train_op = tf.contrib.layers.optimize_loss(
-        loss=loss,
-        global_step=tf.contrib.framework.get_global_step(),
-        learning_rate=self.params["optimizer.learning_rate"],
-        clip_gradients=self.params["optimizer.clip_gradients"],
-        optimizer=self.params["optimizer.name"],
-        summaries=tf.contrib.layers.optimizers.OPTIMIZER_SUMMARIES)
+        loss=loss,
+        global_step=tf.contrib.framework.get_global_step(),
+        learning_rate=self.params["optimizer.learning_rate"],
+        clip_gradients=self.params["optimizer.clip_gradients"],
+        optimizer=self.params["optimizer.name"],
+        summaries=tf.contrib.layers.optimizers.OPTIMIZER_SUMMARIES)

    if mode == tf.contrib.learn.ModeKeys.EVAL:
      train_op = None

    predictions = self._create_predictions(
-        features=features,
-        labels=labels,
-        decoder_output=decoder_output,
-        log_perplexities=log_perplexities)
+        features=features,
+        labels=labels,
+        decoder_output=decoder_output,
+        log_perplexities=log_perplexities)

    # We add "useful" tensors to the graph collection so that we
    # can easily find them in our hooks/monitors.
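Subclasses extend hyperparameters by copying the parent's `default_params()` and updating the result, as `BasicSeq2Seq` and `AttentionSeq2Seq` do above; user-supplied values can be layered over those defaults the same way. A usage sketch (the vocab-info objects would come from `seq2seq.inputs.get_vocab_info`; the infer branch above returns `(predictions, None, None)`, suggesting a `(predictions, loss, train_op)` convention for the model call):

from seq2seq.models import AttentionSeq2Seq

params = AttentionSeq2Seq.default_params()
params.update({"rnn_cell.num_units": 256, "attention.dim": 256})
model = AttentionSeq2Seq(source_vocab_info, target_vocab_info, params)
# Calling model(features, labels, params, mode) then builds the graph
# for the given tf.contrib.learn mode (TRAIN, EVAL, or INFER).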
""" #pylint: disable=unused-argument - examples = generate_examples( - FLAGS.source_file, FLAGS.target_file) + examples = generate_examples(FLAGS.source_file, FLAGS.target_file) write_tfrecords(examples, FLAGS.output_file) diff --git a/seq2seq/scripts/generate_toy_data.py b/seq2seq/scripts/generate_toy_data.py index 3c17cdf2..cf19040a 100755 --- a/seq2seq/scripts/generate_toy_data.py +++ b/seq2seq/scripts/generate_toy_data.py @@ -1,5 +1,4 @@ #! /usr/bin/env python - """ Functions to generate various toy datasets. """ @@ -10,24 +9,35 @@ from sklearn.cross_validation import train_test_split PARSER = argparse.ArgumentParser(description="Generates toy datasets.") -PARSER.add_argument("--vocab_size", type=int, default=100, - help="size of the vocabulary") -PARSER.add_argument("--num_examples", type=int, default=10000, - help="number of examples") -PARSER.add_argument("--min_len", type=int, default=5, - help="minimum sequence length") -PARSER.add_argument("--max_len", type=int, default=40, - help="maximum sequence length") -PARSER.add_argument("--dev_split", type=float, default=0.1, - help="Fraction of data to use for the dev set") -PARSER.add_argument("--type", type=str, default="copy", choices=["copy", "reverse"], - help="Type of dataet to generate. One of \"copy\" or \"reverse\"") -PARSER.add_argument("--output_dir", type=str, - help="path to the output directory", required=True) +PARSER.add_argument( + "--vocab_size", type=int, default=100, help="size of the vocabulary") +PARSER.add_argument( + "--num_examples", type=int, default=10000, help="number of examples") +PARSER.add_argument( + "--min_len", type=int, default=5, help="minimum sequence length") +PARSER.add_argument( + "--max_len", type=int, default=40, help="maximum sequence length") +PARSER.add_argument( + "--dev_split", + type=float, + default=0.1, + help="Fraction of data to use for the dev set") +PARSER.add_argument( + "--type", + type=str, + default="copy", + choices=["copy", "reverse"], + help="Type of dataet to generate. One of \"copy\" or \"reverse\"") +PARSER.add_argument( + "--output_dir", + type=str, + help="path to the output directory", + required=True) ARGS = PARSER.parse_args() VOCABULARY = list([str(x) for x in range(ARGS.vocab_size)]) + def make_copy(num_examples, min_len, max_len): """ Generates a dataset where the target is equal to the source. @@ -43,10 +53,12 @@ def make_copy(num_examples, min_len, max_len): """ for _ in range(num_examples): turn_length = np.random.choice(np.arange(min_len, max_len + 1)) - source_tokens = np.random.choice(list(VOCABULARY), size=turn_length, replace=True) + source_tokens = np.random.choice( + list(VOCABULARY), size=turn_length, replace=True) target_tokens = source_tokens yield " ".join(source_tokens), " ".join(target_tokens) + def make_reverse(num_examples, min_len, max_len): """ Generates a dataset where the target is equal to the source reversed. 
diff --git a/seq2seq/scripts/generate_toy_data.py b/seq2seq/scripts/generate_toy_data.py
index 3c17cdf2..cf19040a 100755
--- a/seq2seq/scripts/generate_toy_data.py
+++ b/seq2seq/scripts/generate_toy_data.py
@@ -1,5 +1,4 @@
#! /usr/bin/env python
-
"""
Functions to generate various toy datasets.
"""
@@ -10,24 +9,35 @@
from sklearn.cross_validation import train_test_split

PARSER = argparse.ArgumentParser(description="Generates toy datasets.")
-PARSER.add_argument("--vocab_size", type=int, default=100,
-                    help="size of the vocabulary")
-PARSER.add_argument("--num_examples", type=int, default=10000,
-                    help="number of examples")
-PARSER.add_argument("--min_len", type=int, default=5,
-                    help="minimum sequence length")
-PARSER.add_argument("--max_len", type=int, default=40,
-                    help="maximum sequence length")
-PARSER.add_argument("--dev_split", type=float, default=0.1,
-                    help="Fraction of data to use for the dev set")
-PARSER.add_argument("--type", type=str, default="copy", choices=["copy", "reverse"],
-                    help="Type of dataet to generate. One of \"copy\" or \"reverse\"")
-PARSER.add_argument("--output_dir", type=str,
-                    help="path to the output directory", required=True)
+PARSER.add_argument(
+    "--vocab_size", type=int, default=100, help="size of the vocabulary")
+PARSER.add_argument(
+    "--num_examples", type=int, default=10000, help="number of examples")
+PARSER.add_argument(
+    "--min_len", type=int, default=5, help="minimum sequence length")
+PARSER.add_argument(
+    "--max_len", type=int, default=40, help="maximum sequence length")
+PARSER.add_argument(
+    "--dev_split",
+    type=float,
+    default=0.1,
+    help="Fraction of data to use for the dev set")
+PARSER.add_argument(
+    "--type",
+    type=str,
+    default="copy",
+    choices=["copy", "reverse"],
+    help="Type of dataset to generate. One of \"copy\" or \"reverse\"")
+PARSER.add_argument(
+    "--output_dir",
+    type=str,
+    help="path to the output directory",
+    required=True)
ARGS = PARSER.parse_args()

VOCABULARY = list([str(x) for x in range(ARGS.vocab_size)])

+
def make_copy(num_examples, min_len, max_len):
  """
  Generates a dataset where the target is equal to the source.
  """
  for _ in range(num_examples):
    turn_length = np.random.choice(np.arange(min_len, max_len + 1))
-    source_tokens = np.random.choice(list(VOCABULARY), size=turn_length, replace=True)
+    source_tokens = np.random.choice(
+        list(VOCABULARY), size=turn_length, replace=True)
    target_tokens = source_tokens
    yield " ".join(source_tokens), " ".join(target_tokens)

+
def make_reverse(num_examples, min_len, max_len):
  """
  Generates a dataset where the target is equal to the source reversed.
@@ -62,10 +74,12 @@
  """
  for _ in range(num_examples):
    turn_length = np.random.choice(np.arange(min_len, max_len + 1))
-    source_tokens = np.random.choice(list(VOCABULARY), size=turn_length, replace=True)
+    source_tokens = np.random.choice(
+        list(VOCABULARY), size=turn_length, replace=True)
    target_tokens = source_tokens[::-1]
    yield " ".join(source_tokens), " ".join(target_tokens)

+
def write_parallel_text(sources, targets, output_prefix):
  """
  Writes two files where each line corresponds to one example
@@ -90,6 +104,7 @@
      target_file.write(record + "\n")
    print("Wrote {}".format(target_filename))

+
def main():
  """Main function"""

@@ -100,16 +115,20 @@
  # Generate dataset
  examples = list(generate_fn(ARGS.num_examples, ARGS.min_len, ARGS.max_len))
-  examples_train, examples_dev = train_test_split(examples, test_size=ARGS.dev_split)
+  examples_train, examples_dev = train_test_split(
+      examples, test_size=ARGS.dev_split)
  os.makedirs(ARGS.output_dir, exist_ok=True)

  # Write train data
  train_sources, train_targets = zip(*examples_train)
-  write_parallel_text(train_sources, train_targets, os.path.join(ARGS.output_dir, "train"))
+  write_parallel_text(train_sources, train_targets,
+                      os.path.join(ARGS.output_dir, "train"))

  # Write dev data
  dev_sources, dev_targets = list(zip(*examples_dev))
-  write_parallel_text(dev_sources, dev_targets, os.path.join(ARGS.output_dir, "dev"))
+  write_parallel_text(dev_sources, dev_targets,
+                      os.path.join(ARGS.output_dir, "dev"))

+
if __name__ == "__main__":
  main()

diff --git a/seq2seq/scripts/generate_vocab.py b/seq2seq/scripts/generate_vocab.py
index a8434a5a..e26e3224 100755
--- a/seq2seq/scripts/generate_vocab.py
+++ b/seq2seq/scripts/generate_vocab.py
@@ -1,6 +1,5 @@
#! /usr/bin/env python
#pylint: disable=invalid-name
-
"""
Generate vocabulary for a tokenized text file.
""" @@ -8,15 +7,32 @@ import argparse import collections -parser = argparse.ArgumentParser(description="Generate vocabulary for a tokenized text file.") -parser.add_argument("--input_file", type=str, help="path to the input file", required=True) -parser.add_argument("--output_file", type=str, help="path to the vocabulary file", required=True) -parser.add_argument("--min_frequency", dest="min_frequency", type=int, default=0, - help="Minimum frequency of a word to be included in the vocabulary.") -parser.add_argument("--max_vocab_size", dest="max_vocab_size", type=int, - help="Maximum number of words in the vocabulary") -parser.add_argument("--downcase", dest="downcase", type=bool, - help="If set to true, downcase all text before processing.", default=False) +parser = argparse.ArgumentParser( + description="Generate vocabulary for a tokenized text file.") +parser.add_argument( + "--input_file", type=str, help="path to the input file", required=True) +parser.add_argument( + "--output_file", + type=str, + help="path to the vocabulary file", + required=True) +parser.add_argument( + "--min_frequency", + dest="min_frequency", + type=int, + default=0, + help="Minimum frequency of a word to be included in the vocabulary.") +parser.add_argument( + "--max_vocab_size", + dest="max_vocab_size", + type=int, + help="Maximum number of words in the vocabulary") +parser.add_argument( + "--downcase", + dest="downcase", + type=bool, + help="If set to true, downcase all text before processing.", + default=False) args = parser.parse_args() @@ -35,14 +51,17 @@ # Filter words below the frequency threshold if args.min_frequency > 0: - filtered_words = [(w, c) for w, c in cnt.most_common() if c > args.min_frequency] + filtered_words = [(w, c) for w, c in cnt.most_common() + if c > args.min_frequency] cnt = collections.Counter(dict(filtered_words)) -print("Found {} unique words with frequency > {}.".format(len(cnt), args.min_frequency)) +print("Found {} unique words with frequency > {}.".format( + len(cnt), args.min_frequency)) # Sort words by 1. frequency 2. lexically to break ties word_with_counts = cnt.most_common() -word_with_counts = sorted(word_with_counts, key=lambda x: (x[1], x[0]), reverse=True) +word_with_counts = sorted( + word_with_counts, key=lambda x: (x[1], x[0]), reverse=True) # Take only max-vocab if args.max_vocab_size is not None: @@ -52,4 +71,5 @@ for word, count in word_with_counts: f.write("{}\n".format(word)) -print("Wrote vocab of size {}: {}".format(len(word_with_counts), args.output_file)) +print("Wrote vocab of size {}: {}".format( + len(word_with_counts), args.output_file)) diff --git a/seq2seq/scripts/profile.py b/seq2seq/scripts/profile.py index b746ba30..3034d441 100755 --- a/seq2seq/scripts/profile.py +++ b/seq2seq/scripts/profile.py @@ -1,5 +1,4 @@ #! /usr/bin/env python - """ Script to generates model profiling information """ @@ -18,6 +17,7 @@ FLAGS = tf.flags.FLAGS + def load_metadata(model_dir): """Loads RunMetadata, Graph and OpLog from files """ @@ -58,9 +58,9 @@ def load_metadata(model_dir): def merge_default_with_oplog(graph, op_log=None, run_meta=None): - """Monkeypatch. There currently is a bug in tfprof_logger._merge_default_with_oplog that - prevents it from being used with Python 3. So we override the method manually until the fix - comes in. + """Monkeypatch. There currently is a bug in tfprof_logger that + prevents it from being used with Python 3. So we override the method + manually until the fix comes in. 
""" tmp_op_log = tfprof_log_pb2.OpLog() # pylint: disable=W0212 @@ -93,6 +93,7 @@ def param_analysis_options(output_dir): options["dump_to_file"] = os.path.join(output_dir, "params.txt") return "scope", options + def micro_anaylsis_options(output_dir): """Options for microsecond analysis """ @@ -105,6 +106,7 @@ def micro_anaylsis_options(output_dir): options["dump_to_file"] = os.path.join(output_dir, "micro.txt") return "graph", options + def flops_analysis_options(output_dir): """Options for FLOPS analysis """ @@ -117,6 +119,7 @@ def flops_analysis_options(output_dir): options["dump_to_file"] = os.path.join(output_dir, "flops.txt") return "scope", options + def device_analysis_options(output_dir): """Options for device placement analysis """ @@ -128,6 +131,7 @@ def device_analysis_options(output_dir): options["dump_to_file"] = os.path.join(output_dir, "device.txt") return "scope", options + def main(_argv): """Main functions. Runs all anaylses.""" # pylint: disable=W0212 @@ -140,22 +144,23 @@ def main(_argv): run_meta, graph, op_log = load_metadata(FLAGS.model_dir) param_arguments = [ - param_analysis_options(output_dir), - micro_anaylsis_options(output_dir), - flops_analysis_options(output_dir), - device_analysis_options(output_dir), + param_analysis_options(output_dir), + micro_anaylsis_options(output_dir), + flops_analysis_options(output_dir), + device_analysis_options(output_dir), ] for tfprof_cmd, params in param_arguments: model_analyzer.print_model_analysis( - graph=graph, - run_meta=run_meta, - op_log=op_log, - tfprof_cmd=tfprof_cmd, - tfprof_options=params) + graph=graph, + run_meta=run_meta, + op_log=op_log, + tfprof_cmd=tfprof_cmd, + tfprof_options=params) if params["dump_to_file"] != "": print("Wrote {}".format(params["dump_to_file"])) + if __name__ == '__main__': tf.app.run() diff --git a/seq2seq/test/attention_test.py b/seq2seq/test/attention_test.py index 2e4712b8..dfc5632d 100644 --- a/seq2seq/test/attention_test.py +++ b/seq2seq/test/attention_test.py @@ -7,10 +7,12 @@ from seq2seq.decoders.attention import AttentionLayer + class AttentionLayerTest(tf.test.TestCase): """ Tests the AttentionLayer module. """ + def setUp(self): super(AttentionLayerTest, self).setUp() tf.logging.set_verbosity(tf.logging.INFO) @@ -30,12 +32,15 @@ def test_shape(self): with self.test_session() as sess: sess.run(tf.global_variables_initializer()) feed_dict = {} - feed_dict[inputs_pl] = np.random.randn(self.batch_size, self.seq_len, self.input_dim) + feed_dict[inputs_pl] = np.random.randn(self.batch_size, self.seq_len, + self.input_dim) feed_dict[state_pl] = np.random.randn(self.batch_size, self.state_dim) scores_, context_ = sess.run([scores, context], feed_dict) - np.testing.assert_array_equal(scores_.shape, [self.batch_size, self.seq_len]) - np.testing.assert_array_equal(context_.shape, [self.batch_size, self.input_dim]) + np.testing.assert_array_equal(scores_.shape, + [self.batch_size, self.seq_len]) + np.testing.assert_array_equal(context_.shape, + [self.batch_size, self.input_dim]) # Scores should sum to 1 scores_sum = np.sum(scores_, axis=1) diff --git a/seq2seq/test/decoder_test.py b/seq2seq/test/decoder_test.py index 71d77349..5e41559c 100644 --- a/seq2seq/test/decoder_test.py +++ b/seq2seq/test/decoder_test.py @@ -8,6 +8,7 @@ from seq2seq.decoders import BasicDecoder, AttentionDecoder, AttentionLayer from seq2seq.decoders import FixedDecoderInputs, DynamicDecoderInputs + class DecoderTests(object): """ A collection of decoder tests. 
This class should be inherited together with @@ -31,13 +32,15 @@ def create_decoder(self): raise NotImplementedError def test_with_fixed_inputs(self): - inputs = tf.random_normal([self.batch_size, self.sequence_length, self.input_depth]) + inputs = tf.random_normal( + [self.batch_size, self.sequence_length, self.input_depth]) seq_length = tf.ones(self.batch_size, dtype=tf.int32) * self.sequence_length initial_state = self.cell.zero_state(self.batch_size, dtype=tf.float32) decoder_input_fn = FixedDecoderInputs(inputs, seq_length) decoder_fn = self.create_decoder() - decoder_output, _, _ = decoder_fn(decoder_input_fn, initial_state, seq_length) + decoder_output, _, _ = decoder_fn(decoder_input_fn, initial_state, + seq_length) #pylint: disable=E1101 with self.test_session() as sess: @@ -45,26 +48,28 @@ def test_with_fixed_inputs(self): decoder_output_ = sess.run(decoder_output) np.testing.assert_array_equal( - decoder_output_.logits.shape, - [self.batch_size, self.sequence_length, self.vocab_size]) - np.testing.assert_array_equal( - decoder_output_.predictions.shape, - [self.batch_size, self.sequence_length]) + decoder_output_.logits.shape, + [self.batch_size, self.sequence_length, self.vocab_size]) + np.testing.assert_array_equal(decoder_output_.predictions.shape, + [self.batch_size, self.sequence_length]) return decoder_output_ - def test_gradients(self): - inputs = tf.random_normal([self.batch_size, self.sequence_length, self.input_depth]) + inputs = tf.random_normal( + [self.batch_size, self.sequence_length, self.input_depth]) seq_length = tf.ones(self.batch_size, dtype=tf.int32) * self.sequence_length initial_state = self.cell.zero_state(self.batch_size, dtype=tf.float32) - labels = np.random.randint(0, self.vocab_size, [self.batch_size, self.sequence_length]) + labels = np.random.randint(0, self.vocab_size, + [self.batch_size, self.sequence_length]) decoder_input_fn = FixedDecoderInputs(inputs, seq_length) decoder_fn = self.create_decoder() - decoder_output, _, _ = decoder_fn(decoder_input_fn, initial_state, seq_length) + decoder_output, _, _ = decoder_fn(decoder_input_fn, initial_state, + seq_length) - losses = tf.nn.sparse_softmax_cross_entropy_with_logits(decoder_output.logits, labels) + losses = tf.nn.sparse_softmax_cross_entropy_with_logits( + decoder_output.logits, labels) optimizer = tf.train.AdamOptimizer(learning_rate=0.001) grads_and_vars = optimizer.compute_gradients(tf.reduce_mean(losses)) @@ -78,7 +83,6 @@ def test_gradients(self): return grads_and_vars_ - def test_with_dynamic_inputs(self): initial_input = tf.random_normal([self.batch_size, self.input_depth]) seq_length = tf.ones(self.batch_size, dtype=tf.int32) * self.sequence_length @@ -92,7 +96,8 @@ def make_input_fn(step_output): decoder_input_fn = DynamicDecoderInputs(initial_input, make_input_fn) decoder_fn = self.create_decoder() - decoder_output, _, _ = decoder_fn(decoder_input_fn, initial_state, seq_length) + decoder_output, _, _ = decoder_fn(decoder_input_fn, initial_state, + seq_length) #pylint: disable=E1101 with self.test_session() as sess: @@ -100,11 +105,10 @@ def make_input_fn(step_output): decoder_output_ = sess.run(decoder_output) np.testing.assert_array_equal( - decoder_output_.logits.shape, - [self.batch_size, self.sequence_length, self.vocab_size]) - np.testing.assert_array_equal( - decoder_output_.predictions.shape, - [self.batch_size, self.sequence_length]) + decoder_output_.logits.shape, + [self.batch_size, self.sequence_length, self.vocab_size]) + 
np.testing.assert_array_equal(decoder_output_.predictions.shape, + [self.batch_size, self.sequence_length]) return decoder_output @@ -112,6 +116,7 @@ def make_input_fn(step_output): class BasicDecoderTest(tf.test.TestCase, DecoderTests): """Tests the `BasicDecoder` class. """ + def setUp(self): tf.test.TestCase.setUp(self) tf.logging.set_verbosity(tf.logging.INFO) @@ -119,14 +124,15 @@ def setUp(self): def create_decoder(self): return BasicDecoder( - cell=self.cell, - vocab_size=self.vocab_size, - max_decode_length=self.max_decode_length) + cell=self.cell, + vocab_size=self.vocab_size, + max_decode_length=self.max_decode_length) class AttentionDecoderTest(tf.test.TestCase, DecoderTests): """Tests the `AttentionDecoder` class. """ + def setUp(self): tf.test.TestCase.setUp(self) tf.logging.set_verbosity(tf.logging.INFO) @@ -134,28 +140,29 @@ def setUp(self): self.attention_dim = 64 self.input_seq_len = 10 self.attention_inputs = tf.convert_to_tensor( - np.random.randn(self.batch_size, self.input_seq_len, 32), - dtype=tf.float32) + np.random.randn(self.batch_size, self.input_seq_len, 32), + dtype=tf.float32) def create_decoder(self): attention_fn = AttentionLayer(self.attention_dim) return AttentionDecoder( - cell=self.cell, - vocab_size=self.vocab_size, - attention_inputs=self.attention_inputs, - attention_fn=attention_fn, - max_decode_length=self.max_decode_length) + cell=self.cell, + vocab_size=self.vocab_size, + attention_inputs=self.attention_inputs, + attention_fn=attention_fn, + max_decode_length=self.max_decode_length) def test_attention_scores(self): decoder_output_ = self.test_with_fixed_inputs() np.testing.assert_array_equal( - decoder_output_.attention_scores.shape, - [self.batch_size, self.sequence_length, self.input_seq_len]) + decoder_output_.attention_scores.shape, + [self.batch_size, self.sequence_length, self.input_seq_len]) # Make sure the attention scores sum to 1 for each step scores_sum = np.sum(decoder_output_.attention_scores, axis=2) np.testing.assert_array_almost_equal( - scores_sum, np.ones([self.batch_size, self.sequence_length])) + scores_sum, np.ones([self.batch_size, self.sequence_length])) + if __name__ == "__main__": tf.test.main() diff --git a/seq2seq/test/hparams_parser_test.py b/seq2seq/test/hparams_parser_test.py index f1d786f3..3db9d0a2 100644 --- a/seq2seq/test/hparams_parser_test.py +++ b/seq2seq/test/hparams_parser_test.py @@ -5,16 +5,17 @@ import unittest from seq2seq.training import HParamsParser + class HParamsParserTest(unittest.TestCase): """Test for HParamsParser class. 
""" def test_parse(self): default_params = { - "rnn_dim": 128, - "num_layers": 2, - "rnn_cell_type": "LSTM", - "dropout": 0.8 + "rnn_dim": 128, + "num_layers": 2, + "rnn_cell_type": "LSTM", + "dropout": 0.8 } parser = HParamsParser(default_params) final_params = parser.parse("rnn_dim=256,rnn_cell_type=GRU,dropout=0.77") @@ -25,17 +26,19 @@ def test_parse(self): def test_parse_with_newlines(self): default_params = { - "rnn_dim": 128, - "num_layers": 2, - "rnn_cell_type": "LSTM", - "dropout": 0.8 + "rnn_dim": 128, + "num_layers": 2, + "rnn_cell_type": "LSTM", + "dropout": 0.8 } parser = HParamsParser(default_params) - final_params = parser.parse("\n".join(["rnn_dim=256,", "rnn_cell_type=GRU,", "dropout=0.77"])) + final_params = parser.parse("\n".join( + ["rnn_dim=256,", "rnn_cell_type=GRU,", "dropout=0.77"])) self.assertEqual(final_params["rnn_dim"], 256) self.assertEqual(final_params["rnn_cell_type"], "GRU") self.assertEqual(final_params["dropout"], 0.77) self.assertEqual(final_params["num_layers"], 2) + if __name__ == '__main__': unittest.main() diff --git a/seq2seq/test/inputs_test.py b/seq2seq/test/inputs_test.py index cba282e2..f78954a6 100644 --- a/seq2seq/test/inputs_test.py +++ b/seq2seq/test/inputs_test.py @@ -2,13 +2,13 @@ Unit tests for input-related operations. """ - import tensorflow as tf import numpy as np from seq2seq import inputs from seq2seq.test import utils as test_utils + class VocabInfoTest(tf.test.TestCase): """Tests VocabInfo class""" @@ -31,6 +31,7 @@ def test_vocab_info(self): self.assertEqual(vocab_info.special_vocab.SEQUENCE_END, 5) self.assertEqual(vocab_info.total_size, 6) + class ReadFromDataProviderTest(tf.test.TestCase): """ Tests Data Provider operations. @@ -41,7 +42,8 @@ def setUp(self): tf.logging.set_verbosity(tf.logging.INFO) def test_read_from_data_provider(self): - file = test_utils.create_temp_tfrecords(source="Hello World .", target="Bye") + file = test_utils.create_temp_tfrecords( + source="Hello World .", target="Bye") data_provider = inputs.make_data_provider([file.name], num_epochs=5) features = inputs.read_from_data_provider(data_provider) @@ -53,7 +55,8 @@ def test_read_from_data_provider(self): self.assertEqual(res["source_len"], 3) self.assertEqual(res["target_len"], 1) - np.testing.assert_array_equal(res["source_tokens"].astype("U"), ["Hello", "World", "."]) + np.testing.assert_array_equal(res["source_tokens"].astype("U"), + ["Hello", "World", "."]) np.testing.assert_array_equal(res["target_tokens"].astype("U"), ["Bye"]) @@ -84,13 +87,17 @@ def test_lookup_table(self): sess.run(tf.local_variables_initializer()) sess.run(tf.initialize_all_tables()) - ids = vocab_to_id_table.lookup(tf.convert_to_tensor(["Hello", ".", "Bye", "??", "xxx"])) + ids = vocab_to_id_table.lookup( + tf.convert_to_tensor(["Hello", ".", "Bye", "??", "xxx"])) ids = sess.run(ids) np.testing.assert_array_equal(ids, [0, 1, 2, 3, 3]) - words = id_to_vocab_table.lookup(tf.convert_to_tensor([0, 1, 2, 3], dtype=tf.int64)) + words = id_to_vocab_table.lookup( + tf.convert_to_tensor( + [0, 1, 2, 3], dtype=tf.int64)) words = sess.run(words) - np.testing.assert_array_equal(words.astype("U"), ["Hello", ".", "Bye", "UNK"]) + np.testing.assert_array_equal( + words.astype("U"), ["Hello", ".", "Bye", "UNK"]) if __name__ == "__main__": diff --git a/seq2seq/test/losses_test.py b/seq2seq/test/losses_test.py index cc115b53..9ea64324 100644 --- a/seq2seq/test/losses_test.py +++ b/seq2seq/test/losses_test.py @@ -6,6 +6,7 @@ import tensorflow as tf import numpy as np + class 
CrossEntropySequenceLossTest(tf.test.TestCase):
  """
  Test for `seq2seq.losses.sequence_mask`.
@@ -19,11 +20,14 @@ def setUp(self):
    self.vocab_size = 50
 
  def test_op(self):
-    logits = np.random.randn(self.batch_size, self.sequence_length, self.vocab_size)
+    logits = np.random.randn(self.batch_size, self.sequence_length,
+                             self.vocab_size)
    logits = logits.astype(np.float32)
    sequence_length = np.array([1, 2, 3, 4])
-    targets = np.random.randint(0, self.vocab_size, [self.batch_size, self.sequence_length])
-    losses = seq2seq_losses.cross_entropy_sequence_loss(logits, targets, sequence_length)
+    targets = np.random.randint(0, self.vocab_size,
+                                [self.batch_size, self.sequence_length])
+    losses = seq2seq_losses.cross_entropy_sequence_loss(logits, targets,
+                                                        sequence_length)
    with self.test_session() as sess:
      losses_ = sess.run(losses)
@@ -38,5 +42,6 @@
  np.testing.assert_array_equal(losses_[1, 2:], np.zeros_like(losses_[1, 2:]))
  np.testing.assert_array_equal(losses_[2, 3:], np.zeros_like(losses_[2, 3:]))
 
+
if __name__ == "__main__":
  tf.test.main()
diff --git a/seq2seq/test/models_test.py b/seq2seq/test/models_test.py
index c2cfc49c..9c57d455 100644
--- a/seq2seq/test/models_test.py
+++ b/seq2seq/test/models_test.py
@@ -14,6 +14,7 @@
import tensorflow as tf
import numpy as np
 
+
class EncoderDecoderTests(tf.test.TestCase):
  """Base class for EncoderDecoder tests. Tests for specific classes should
  inherit from this and tf.test.TestCase.
@@ -36,34 +37,44 @@ def tearDown(self):
    self.vocab_file.close()
 
  def create_model(self):
-    """Creates the model class to be tested. Subclasses must implement this method.
+    """Creates model class to be tested. Subclasses must implement this method.
    """
    self.skipTest("Base module should not be tested.")
 
  def _create_example(self):
    """Creates example data for a test"""
-    source = np.random.randn(self.batch_size, self.max_decode_length, self.input_depth)
+    source = np.random.randn(self.batch_size, self.max_decode_length,
+                             self.input_depth)
    source_len = np.random.randint(0, self.max_decode_length, [self.batch_size])
-    target_len = np.random.randint(0, self.max_decode_length * 2, [self.batch_size])
-    target = np.random.randn(self.batch_size, np.max(target_len), self.input_depth)
-    labels = np.random.randint(0, self.vocab_size, [self.batch_size, np.max(target_len) - 1])
-
-    example_ = namedtuple("Example", ["source", "source_len", "target", "target_len", "labels"])
+    target_len = np.random.randint(0, self.max_decode_length * 2,
+                                   [self.batch_size])
+    target = np.random.randn(self.batch_size,
+                             np.max(target_len), self.input_depth)
+    labels = np.random.randint(0, self.vocab_size,
+                               [self.batch_size, np.max(target_len) - 1])
+
+    example_ = namedtuple(
+        "Example", ["source", "source_len", "target", "target_len", "labels"])
    return example_(source, source_len, target, target_len, labels)
 
  def test_forward_pass(self):
    """Tests model forward pass by checking the shape of the outputs."""
    ex = self._create_example()
    decoder_input_fn = FixedDecoderInputs(
-        inputs=tf.convert_to_tensor(ex.target, dtype=tf.float32),
-        sequence_length=tf.convert_to_tensor(ex.target_len, dtype=tf.int32))
+        inputs=tf.convert_to_tensor(
+            ex.target, dtype=tf.float32),
+        sequence_length=tf.convert_to_tensor(
+            ex.target_len, dtype=tf.int32))
 
    model = self.create_model()
    decoder_output = model.encode_decode(
-        source=tf.convert_to_tensor(ex.source, dtype=tf.float32),
-        source_len=tf.convert_to_tensor(ex.source_len, dtype=tf.int32),
-        decoder_input_fn=decoder_input_fn,
-        
target_len=tf.convert_to_tensor(ex.target_len, dtype=tf.int32)) + source=tf.convert_to_tensor( + ex.source, dtype=tf.float32), + source_len=tf.convert_to_tensor( + ex.source_len, dtype=tf.int32), + decoder_input_fn=decoder_input_fn, + target_len=tf.convert_to_tensor( + ex.target_len, dtype=tf.int32)) with self.test_session() as sess: sess.run(tf.global_variables_initializer()) @@ -73,13 +84,13 @@ def test_forward_pass(self): expected_decode_len = np.minimum(ex.target_len, max_decode_length) # Assert shapes are correct + np.testing.assert_array_equal(decoder_output_.logits.shape, [ + self.batch_size, np.max(expected_decode_len), + model.target_vocab_info.total_size + ]) np.testing.assert_array_equal( - decoder_output_.logits.shape, - [self.batch_size, np.max(expected_decode_len), model.target_vocab_info.total_size]) - np.testing.assert_array_equal( - decoder_output_.predictions.shape, - [self.batch_size, np.max(expected_decode_len)]) - + decoder_output_.predictions.shape, + [self.batch_size, np.max(expected_decode_len)]) def test_inference(self): """Tests model inference by feeding dynamic inputs based on an embedding @@ -87,54 +98,65 @@ def test_inference(self): model = self.create_model() ex = self._create_example() - embeddings = tf.get_variable("W_embed", [model.target_vocab_info.total_size, self.input_depth]) + embeddings = tf.get_variable( + "W_embed", [model.target_vocab_info.total_size, self.input_depth]) + def make_input_fn(step_output): """Looks up the predictions in the embeddings. """ return tf.nn.embedding_lookup(embeddings, step_output.predictions) decoder_input_fn = DynamicDecoderInputs( - initial_inputs=tf.zeros([self.batch_size, self.input_depth], dtype=tf.float32), - make_input_fn=make_input_fn) + initial_inputs=tf.zeros( + [self.batch_size, self.input_depth], dtype=tf.float32), + make_input_fn=make_input_fn) decoder_output = model.encode_decode( - source=tf.convert_to_tensor(ex.source, dtype=tf.float32), - source_len=tf.convert_to_tensor(ex.source_len, dtype=tf.int32), - decoder_input_fn=decoder_input_fn, - target_len=self.max_decode_length) + source=tf.convert_to_tensor( + ex.source, dtype=tf.float32), + source_len=tf.convert_to_tensor( + ex.source_len, dtype=tf.int32), + decoder_input_fn=decoder_input_fn, + target_len=self.max_decode_length) with self.test_session() as sess: sess.run(tf.global_variables_initializer()) decoder_output_ = sess.run(decoder_output) # Assert shapes are correct - np.testing.assert_array_equal( - decoder_output_.logits.shape, - [self.batch_size, self.max_decode_length, model.target_vocab_info.total_size]) - np.testing.assert_array_equal( - decoder_output_.predictions.shape, - [self.batch_size, self.max_decode_length]) + np.testing.assert_array_equal(decoder_output_.logits.shape, [ + self.batch_size, self.max_decode_length, + model.target_vocab_info.total_size + ]) + np.testing.assert_array_equal(decoder_output_.predictions.shape, + [self.batch_size, self.max_decode_length]) def test_gradients(self): """Ensures the parameter gradients can be computed and are not NaN """ ex = self._create_example() decoder_input_fn = FixedDecoderInputs( - inputs=tf.convert_to_tensor(ex.target, dtype=tf.float32), - sequence_length=tf.convert_to_tensor(ex.target_len, dtype=tf.int32)) + inputs=tf.convert_to_tensor( + ex.target, dtype=tf.float32), + sequence_length=tf.convert_to_tensor( + ex.target_len, dtype=tf.int32)) model = self.create_model() decoder_output = model.encode_decode( - source=tf.convert_to_tensor(ex.source, dtype=tf.float32), - 
source_len=tf.convert_to_tensor(ex.source_len, dtype=tf.int32), - decoder_input_fn=decoder_input_fn, - target_len=tf.convert_to_tensor(ex.target_len, dtype=tf.int32)) + source=tf.convert_to_tensor( + ex.source, dtype=tf.float32), + source_len=tf.convert_to_tensor( + ex.source_len, dtype=tf.int32), + decoder_input_fn=decoder_input_fn, + target_len=tf.convert_to_tensor( + ex.target_len, dtype=tf.int32)) # Get a loss to optimize losses = seq2seq_losses.cross_entropy_sequence_loss( - logits=decoder_output.logits, - targets=tf.ones_like(decoder_output.predictions), - sequence_length=tf.convert_to_tensor(ex.target_len, dtype=tf.int32)) + logits=decoder_output.logits, + targets=tf.ones_like(decoder_output.predictions), + sequence_length=tf.convert_to_tensor( + ex.target_len, dtype=tf.int32)) mean_loss = tf.reduce_mean(losses) optimizer = tf.train.AdamOptimizer() @@ -154,15 +176,18 @@ def test_pipeline(self): target_len = self.max_decode_length + 10 source = " ".join(np.random.choice(self.vocab_list, source_len)) target = " ".join(np.random.choice(self.vocab_list, target_len)) - tfrecords_file = test_utils.create_temp_tfrecords(source=source, target=target) + tfrecords_file = test_utils.create_temp_tfrecords( + source=source, target=target) # Build model graph model = self.create_model() featurizer = model.create_featurizer() data_provider = lambda: inputs.make_data_provider([tfrecords_file.name]) - input_fn = training_utils.create_input_fn(data_provider, featurizer, self.batch_size) + input_fn = training_utils.create_input_fn(data_provider, featurizer, + self.batch_size) features, labels = input_fn() - predictions, loss, train_op = model(features, labels, None, tf.contrib.learn.ModeKeys.TRAIN) + predictions, loss, train_op = model(features, labels, None, + tf.contrib.learn.ModeKeys.TRAIN) with self.test_session() as sess: sess.run(tf.global_variables_initializer()) @@ -176,31 +201,34 @@ def test_pipeline(self): max_decode_length = model.params["target.max_seq_len"] expected_decode_len = np.minimum(target_len + 1, max_decode_length) - np.testing.assert_array_equal( - predictions_["logits"].shape, - [self.batch_size, expected_decode_len, model.target_vocab_info.total_size]) - np.testing.assert_array_equal( - predictions_["predictions"].shape, - [self.batch_size, expected_decode_len]) + np.testing.assert_array_equal(predictions_["logits"].shape, [ + self.batch_size, expected_decode_len, model.target_vocab_info.total_size + ]) + np.testing.assert_array_equal(predictions_["predictions"].shape, + [self.batch_size, expected_decode_len]) self.assertFalse(np.isnan(loss_)) tfrecords_file.close() + class TestBasicSeq2Seq(EncoderDecoderTests): """Tests the seq2seq.models.BasicSeq2Seq model. """ + def setUp(self): super(TestBasicSeq2Seq, self).setUp() def create_model(self): return BasicSeq2Seq( - source_vocab_info=self.vocab_info, - target_vocab_info=self.vocab_info, - params=BasicSeq2Seq.default_params()) + source_vocab_info=self.vocab_info, + target_vocab_info=self.vocab_info, + params=BasicSeq2Seq.default_params()) + class TestAttentionSeq2Seq(EncoderDecoderTests): """Tests the seq2seq.models.AttentionSeq2Seq model. 
""" + def setUp(self): super(TestAttentionSeq2Seq, self).setUp() self.encoder_rnn_cell = tf.nn.rnn_cell.LSTMCell(32) @@ -209,9 +237,10 @@ def setUp(self): def create_model(self): return AttentionSeq2Seq( - source_vocab_info=self.vocab_info, - target_vocab_info=self.vocab_info, - params=AttentionSeq2Seq.default_params()) + source_vocab_info=self.vocab_info, + target_vocab_info=self.vocab_info, + params=AttentionSeq2Seq.default_params()) + if __name__ == "__main__": tf.test.main() diff --git a/seq2seq/test/rnn_encoder_test.py b/seq2seq/test/rnn_encoder_test.py index 8d6bd82d..9e93553e 100644 --- a/seq2seq/test/rnn_encoder_test.py +++ b/seq2seq/test/rnn_encoder_test.py @@ -5,7 +5,8 @@ import tensorflow as tf import numpy as np -from seq2seq.encoders.rnn_encoder import UnidirectionalRNNEncoder, BidirectionalRNNEncoder +from seq2seq.encoders import rnn_encoder + class UnidirectionalRNNEncoderTest(tf.test.TestCase): """ @@ -21,10 +22,12 @@ def setUp(self): self.cell = tf.nn.rnn_cell.LSTMCell(32) def test_encode(self): - inputs = tf.random_normal([self.batch_size, self.sequence_length, self.input_depth]) - example_length = tf.ones(self.batch_size, dtype=tf.int32) * self.sequence_length + inputs = tf.random_normal( + [self.batch_size, self.sequence_length, self.input_depth]) + example_length = tf.ones( + self.batch_size, dtype=tf.int32) * self.sequence_length - encode_fn = UnidirectionalRNNEncoder(self.cell) + encode_fn = rnn_encoder.UnidirectionalRNNEncoder(self.cell) encoder_output = encode_fn(inputs, example_length) with self.test_session() as sess: @@ -32,12 +35,14 @@ def test_encode(self): encoder_output_ = sess.run(encoder_output) np.testing.assert_array_equal( - encoder_output_.outputs.shape, [self.batch_size, self.sequence_length, self.cell.output_size]) - self.assertIsInstance(encoder_output_.final_state, tf.nn.rnn_cell.LSTMStateTuple) - np.testing.assert_array_equal( - encoder_output_.final_state.h.shape, [self.batch_size, self.cell.output_size]) - np.testing.assert_array_equal( - encoder_output_.final_state.c.shape, [self.batch_size, self.cell.output_size]) + encoder_output_.outputs.shape, + [self.batch_size, self.sequence_length, self.cell.output_size]) + self.assertIsInstance(encoder_output_.final_state, + tf.nn.rnn_cell.LSTMStateTuple) + np.testing.assert_array_equal(encoder_output_.final_state.h.shape, + [self.batch_size, self.cell.output_size]) + np.testing.assert_array_equal(encoder_output_.final_state.c.shape, + [self.batch_size, self.cell.output_size]) class BidirectionalRNNEncoderTest(tf.test.TestCase): @@ -54,10 +59,12 @@ def setUp(self): self.cell = tf.nn.rnn_cell.LSTMCell(32) def test_encode(self): - inputs = tf.random_normal([self.batch_size, self.sequence_length, self.input_depth]) - example_length = tf.ones(self.batch_size, dtype=tf.int32) * self.sequence_length + inputs = tf.random_normal( + [self.batch_size, self.sequence_length, self.input_depth]) + example_length = tf.ones( + self.batch_size, dtype=tf.int32) * self.sequence_length - encode_fn = BidirectionalRNNEncoder(self.cell) + encode_fn = rnn_encoder.BidirectionalRNNEncoder(self.cell) encoder_output = encode_fn(inputs, example_length) with self.test_session() as sess: @@ -65,19 +72,22 @@ def test_encode(self): encoder_output_ = sess.run(encoder_output) np.testing.assert_array_equal( - encoder_output_.outputs.shape, - [self.batch_size, self.sequence_length, self.cell.output_size*2]) + encoder_output_.outputs.shape, + [self.batch_size, self.sequence_length, self.cell.output_size * 2]) + + 
self.assertIsInstance(encoder_output_.final_state[0],
+                          tf.nn.rnn_cell.LSTMStateTuple)
+    self.assertIsInstance(encoder_output_.final_state[1],
+                          tf.nn.rnn_cell.LSTMStateTuple)
+    np.testing.assert_array_equal(encoder_output_.final_state[0].h.shape,
+                                  [self.batch_size, self.cell.output_size])
+    np.testing.assert_array_equal(encoder_output_.final_state[0].c.shape,
+                                  [self.batch_size, self.cell.output_size])
+    np.testing.assert_array_equal(encoder_output_.final_state[1].h.shape,
+                                  [self.batch_size, self.cell.output_size])
+    np.testing.assert_array_equal(encoder_output_.final_state[1].c.shape,
+                                  [self.batch_size, self.cell.output_size])
 
-    self.assertIsInstance(encoder_output_.final_state[0], tf.nn.rnn_cell.LSTMStateTuple)
-    self.assertIsInstance(encoder_output_.final_state[1], tf.nn.rnn_cell.LSTMStateTuple)
-    np.testing.assert_array_equal(
-        encoder_output_.final_state[0].h.shape, [self.batch_size, self.cell.output_size])
-    np.testing.assert_array_equal(
-        encoder_output_.final_state[0].c.shape, [self.batch_size, self.cell.output_size])
-    np.testing.assert_array_equal(
-        encoder_output_.final_state[1].h.shape, [self.batch_size, self.cell.output_size])
-    np.testing.assert_array_equal(
-        encoder_output_.final_state[1].c.shape, [self.batch_size, self.cell.output_size])
 
if __name__ == "__main__":
  tf.test.main()
diff --git a/seq2seq/test/utils.py b/seq2seq/test/utils.py
index fb4e3f36..d5eab8e5 100644
--- a/seq2seq/test/utils.py
+++ b/seq2seq/test/utils.py
@@ -5,6 +5,7 @@
from seq2seq.scripts import generate_examples
from seq2seq import inputs
 
+
def create_temp_tfrecords(source, target):
  """
  Creates a temporary TFRecords file.
@@ -51,6 +52,7 @@ def create_next_input_fn_for_test(source, target):
    A function that reads from a temporary file
  """
  file = create_temp_tfrecords(source, target)
+
  def next_input_fn():
    """
    The input function that is returned.
diff --git a/seq2seq/training/featurizers.py b/seq2seq/training/featurizers.py
index 7a6de041..18e1a366 100644
--- a/seq2seq/training/featurizers.py
+++ b/seq2seq/training/featurizers.py
@@ -6,9 +6,11 @@
from seq2seq import inputs
from seq2seq.graph_module import GraphModule
 
+
class Seq2SeqFeaturizer(GraphModule):
-  """Takes raw tensors read from a TFRecods file and transforms them into feature and labels
-  dictionaries that can be fed to model functions. In particular, this featurizer:
+  """Takes raw tensors read from a TFRecords file and transforms them into
+  feature and labels dictionaries that can be fed to model functions.
+  In particular, this featurizer:
 
  - Creates vocabulary lookup tables for source and target vocab
  - Converts tokens into vocabulary ids
@@ -21,8 +23,12 @@ class Seq2SeqFeaturizer(GraphModule):
    target_vocab_info: a `seq2seq.inputs.VocabInfo` for the target vocab
  """
 
-  def __init__(self, source_vocab_info, target_vocab_info,
-               max_seq_len_source=None, max_seq_len_target=None, name="sequence_input"):
+  def __init__(self,
+               source_vocab_info,
+               target_vocab_info,
+               max_seq_len_source=None,
+               max_seq_len_target=None,
+               name="sequence_input"):
    super(Seq2SeqFeaturizer, self).__init__(name)
    self.source_vocab_info = source_vocab_info
    self.target_vocab_info = target_vocab_info
@@ -32,8 +38,8 @@ def __init__(self, source_vocab_info, target_vocab_info,
  def _build(self, input_dict):
    output_dict = input_dict.copy()
 
-    # TODO: Ideally we also should have the "special vocabulary" in our lookup table.
-    # How to best do this? Maybe create a temporary files that appends the special vocab?
+    # TODO: Ideally we should have the "special vocabulary" in our lookup table.
+    # How to best do this? Create a temporary file with the special vocab?
 
    # Create vocabulary lookup for source
    source_vocab_to_id, source_id_to_vocab, _ = \
@@ -52,43 +58,55 @@ def _build(self, input_dict):
      tf.add_to_collection("target_id_to_vocab", target_id_to_vocab)
 
    if self.max_seq_len_source is not None:
-      output_dict["source_tokens"] = output_dict["source_tokens"][:self.max_seq_len_source - 1]
-      output_dict["source_len"] = tf.minimum(output_dict["source_len"], self.max_seq_len_source - 1)
+      output_dict["source_tokens"] = output_dict[
+          "source_tokens"][:self.max_seq_len_source - 1]
+      output_dict["source_len"] = tf.minimum(output_dict["source_len"],
+                                             self.max_seq_len_source - 1)
    if self.max_seq_len_target is not None:
-      output_dict["target_tokens"] = output_dict["target_tokens"][:self.max_seq_len_target - 2]
-      output_dict["target_len"] = tf.minimum(output_dict["target_len"], self.max_seq_len_target - 2)
+      output_dict["target_tokens"] = output_dict[
+          "target_tokens"][:self.max_seq_len_target - 2]
+      output_dict["target_len"] = tf.minimum(output_dict["target_len"],
+                                             self.max_seq_len_target - 2)
 
    # Look up the source and target in the vocabulary
-    output_dict["source_ids"] = source_vocab_to_id.lookup(output_dict["source_tokens"])
-    output_dict["target_ids"] = target_vocab_to_id.lookup(output_dict["target_tokens"])
+    output_dict["source_ids"] = source_vocab_to_id.lookup(output_dict[
+        "source_tokens"])
+    output_dict["target_ids"] = target_vocab_to_id.lookup(output_dict[
+        "target_tokens"])
 
    # Append SEQUENCE_END token to the source
-    output_dict["source_ids"] = tf.concat(
-        0, [output_dict["source_ids"], [self.source_vocab_info.special_vocab.SEQUENCE_END]])
+    output_dict["source_ids"] = tf.concat(0, [
+        output_dict["source_ids"],
+        [self.source_vocab_info.special_vocab.SEQUENCE_END]
+    ])
    output_dict["source_tokens"] = tf.concat(
-      0, [output_dict["source_tokens"], ["SEQUENCE_END"]])
+        0, [output_dict["source_tokens"], ["SEQUENCE_END"]])
    output_dict["source_len"] += 1
 
    # Prepend SEQUENCE_START token to the target
    output_dict["target_ids"] = tf.concat(
-      0, [[self.target_vocab_info.special_vocab.SEQUENCE_START], output_dict["target_ids"]])
+        0, [[self.target_vocab_info.special_vocab.SEQUENCE_START],
+            output_dict["target_ids"]])
    output_dict["target_tokens"] = tf.concat(
-      0, [["SEQUENCE_START"], output_dict["target_tokens"]])
+        0, [["SEQUENCE_START"], output_dict["target_tokens"]])
    output_dict["target_len"] += 1
 
    # Append SEQUENCE_END token to the target
-    output_dict["target_ids"] = tf.concat(
-        0, [output_dict["target_ids"], [self.target_vocab_info.special_vocab.SEQUENCE_END]])
+    output_dict["target_ids"] = tf.concat(0, [
+        output_dict["target_ids"],
+        [self.target_vocab_info.special_vocab.SEQUENCE_END]
+    ])
    output_dict["target_tokens"] = tf.concat(
-      0, [output_dict["target_tokens"], ["SEQUENCE_END"]])
+        0, [output_dict["target_tokens"], ["SEQUENCE_END"]])
    output_dict["target_len"] += 1
 
    # Cast to int32
    output_dict["source_len"] = tf.to_int32(output_dict["source_len"])
    output_dict["target_len"] = tf.to_int32(output_dict["target_len"])
    output_dict["target_start_id"] = tf.to_int32(
-      self.target_vocab_info.special_vocab.SEQUENCE_START)
-    output_dict["target_end_id"] = tf.to_int32(self.target_vocab_info.special_vocab.SEQUENCE_END)
+        self.target_vocab_info.special_vocab.SEQUENCE_START)
+    output_dict["target_end_id"] = tf.to_int32(
+        self.target_vocab_info.special_vocab.SEQUENCE_END)
 
    # Add summaries
tf.summary.histogram("source_len", output_dict["source_len"])
diff --git a/seq2seq/training/hooks.py b/seq2seq/training/hooks.py
index a6fea895..b7d36cab 100644
--- a/seq2seq/training/hooks.py
+++ b/seq2seq/training/hooks.py
@@ -7,15 +7,18 @@
from tensorflow.contrib.learn import basic_session_run_hooks, session_run_hook
from tensorflow.python.client import timeline
 
-class SecondOrStepTimer(basic_session_run_hooks._SecondOrStepTimer):
+
+class SecondOrStepTimer(basic_session_run_hooks._SecondOrStepTimer):
  """Helper class to count both seconds and steps.
  """
  pass
 
 
class MetadataCaptureHook(session_run_hook.SessionRunHook):
-  """A hook to capture metadata for a single step. Useful for performance debugging.
-  It performs a full trace and saves run_metadata and Chrome timeline information to a file.
+  """A hook to capture metadata for a single step.
+  Useful for performance debugging. It performs a full trace and saves
+  run_metadata and Chrome timeline information to a file.
 
  Args:
    output_dir: Directory to write file(s) to
@@ -57,9 +60,9 @@ def after_run(self, _run_context, run_values):
 
    # Save tfprof op log
    tf.contrib.tfprof.tfprof_logger.write_op_log(
-      graph=tf.get_default_graph(),
-      log_dir=self.output_dir,
-      run_meta=run_values.run_metadata)
+        graph=tf.get_default_graph(),
+        log_dir=self.output_dir,
+        run_meta=run_values.run_metadata)
    tf.logging.info("Saved op log to %s", self.output_dir)
    self._iter += 1
 
@@ -69,15 +72,18 @@ class TrainSampleHook(session_run_hook.SessionRunHook):
  """Occasionally samples predictions from the training run and prints them.
 
  Args:
-    every_n_secs: Sample predictions every N seconds. If set, `every_n_steps` must be None.
-    every_n_steps: Sample predictions every N steps. If set, `every_n_secs` must be None.
+    every_n_secs: Sample predictions every N seconds.
+      If set, `every_n_steps` must be None.
+    every_n_steps: Sample predictions every N steps.
+      If set, `every_n_secs` must be None.
  """
 
  #pylint: disable=missing-docstring
 
  def __init__(self, every_n_secs=None, every_n_steps=None):
    super(TrainSampleHook, self).__init__()
-    self._timer = SecondOrStepTimer(every_secs=every_n_secs, every_steps=every_n_steps)
+    self._timer = SecondOrStepTimer(
+        every_secs=every_n_secs, every_steps=every_n_steps)
    self.predictions_dict = {}
    self.features_dict = {}
    self.labels_dict = {}
@@ -90,25 +96,29 @@ def begin(self):
    self._iter_count = 0
    # TODO: Is there a nicer way?
# See https://github.com/dennybritz/seq2seq/issues/21 - self.predictions_dict = dict(zip( - tf.get_collection("model_output_keys"), - tf.get_collection("model_output_values"))) - self.features_dict = dict(zip( - tf.get_collection("features_keys"), - tf.get_collection("features_values"))) - self.labels_dict = dict(zip( - tf.get_collection("labels_keys"), - tf.get_collection("labels_values"))) + self.predictions_dict = dict( + zip( + tf.get_collection("model_output_keys"), + tf.get_collection("model_output_values"))) + self.features_dict = dict( + zip( + tf.get_collection("features_keys"), + tf.get_collection("features_values"))) + self.labels_dict = dict( + zip( + tf.get_collection("labels_keys"), tf.get_collection( + "labels_values"))) self.target_id_to_vocab = tf.get_collection("target_id_to_vocab")[0] - self.predicted_words = self.target_id_to_vocab.lookup(self.predictions_dict["predictions"]) + self.predicted_words = self.target_id_to_vocab.lookup(self.predictions_dict[ + "predictions"]) def before_run(self, _run_context): self._should_trigger = self._timer.should_trigger_for_step(self._iter_count) if self._should_trigger: fetches = { - "predicted_words": self.predicted_words, - "target_words": self.labels_dict["target_tokens"], - "target_len": self.labels_dict["target_len"] + "predicted_words": self.predicted_words, + "target_words": self.labels_dict["target_tokens"], + "target_len": self.labels_dict["target_len"] } return session_run_hook.SessionRunArgs(fetches) return None @@ -121,7 +131,9 @@ def after_run(self, _run_context, run_values): # Convert dict of lists to list of dicts result_dict = run_values.results - result_dicts = [dict(zip(result_dict, t)) for t in zip(*result_dict.values())] + result_dicts = [ + dict(zip(result_dict, t)) for t in zip(*result_dict.values()) + ] # Print results tf.logging.info("Sampling Predictions (Prediction followed by Target)") @@ -136,13 +148,13 @@ def after_run(self, _run_context, run_values): self._timer.update_last_triggered_step(self._iter_count) - class PrintModelAnalysisHook(session_run_hook.SessionRunHook): - """A SessionRunHook that writes the parameters of the model to a file and stdout. + """Writes the parameters of the model to a file and stdout. Args: filename: The file path to write the model analysis to. 
""" + #pylint: disable=missing-docstring def __init__(self, filename=None): self.filename = filename @@ -152,7 +164,7 @@ def begin(self): opts = tf.contrib.tfprof.model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS opts['dump_to_file'] = os.path.abspath(self.filename) tf.contrib.tfprof.model_analyzer.print_model_analysis( - tf.get_default_graph(), tfprof_options=opts) + tf.get_default_graph(), tfprof_options=opts) # Print the model analysis with open(self.filename, "r") as file: diff --git a/seq2seq/training/hparams_parser.py b/seq2seq/training/hparams_parser.py index 60fd3c9c..fa5cdcf1 100644 --- a/seq2seq/training/hparams_parser.py +++ b/seq2seq/training/hparams_parser.py @@ -3,6 +3,7 @@ import argparse + class HParamsParser(object): """Pases a comma-separated string of hyperaprameters """ diff --git a/seq2seq/training/train.py b/seq2seq/training/train.py index 696ea15c..90d38e39 100755 --- a/seq2seq/training/train.py +++ b/seq2seq/training/train.py @@ -25,28 +25,30 @@ tf.flags.DEFINE_string("hparams", None, "overwrite hyperparameter values") tf.flags.DEFINE_string("model", "BasicSeq2Seq", "model class") tf.flags.DEFINE_string("output_dir", None, "directory to write to") -tf.flags.DEFINE_integer("save_checkpoints_secs", 300, "save checkpoint every N seconds") +tf.flags.DEFINE_integer("save_checkpoints_secs", 300, + "save checkpoint every N seconds") tf.flags.DEFINE_string("schedule", None, """Estimator function to call, defaults to train_and_evaluate for local run""") - - tf.flags.DEFINE_integer("train_steps", None, "maximum number of training steps") tf.flags.DEFINE_integer("eval_steps", 100, "maxmum number of eval steps") -tf.flags.DEFINE_integer("eval_every_n_steps", 1000, "evaluate after this many training steps") -tf.flags.DEFINE_integer("sample_every_n_steps", 500, "sample training predictions every N steps") +tf.flags.DEFINE_integer("eval_every_n_steps", 1000, + "evaluate after this many training steps") +tf.flags.DEFINE_integer("sample_every_n_steps", 500, + "sample training predictions every N steps") FLAGS = tf.flags.FLAGS tf.logging.set_verbosity(tf.logging.INFO) + def create_experiment(output_dir): """ Creates a new Experiment instance. Args: - output_dir: Will be used as the output directory for model checkpoints and summaries. + output_dir: Output directory for model checkpoints and summaries. 
""" # Load vocabulary info @@ -74,9 +76,9 @@ def create_experiment(output_dir): # Create model model = model_class( - source_vocab_info=source_vocab_info, - target_vocab_info=target_vocab_info, - params=hparams) + source_vocab_info=source_vocab_info, + target_vocab_info=target_vocab_info, + params=hparams) featurizer = model.create_featurizer() bucket_boundaries = None @@ -85,48 +87,49 @@ def create_experiment(output_dir): # Create input functions train_input_fn = training_utils.create_input_fn( - train_data_provider, featurizer, FLAGS.batch_size, bucket_boundaries=bucket_boundaries) - eval_input_fn = training_utils.create_input_fn( - dev_data_provider, featurizer, FLAGS.batch_size) + train_data_provider, + featurizer, + FLAGS.batch_size, + bucket_boundaries=bucket_boundaries) + eval_input_fn = training_utils.create_input_fn(dev_data_provider, featurizer, + FLAGS.batch_size) def model_fn(features, labels, params, mode): """Builds the model graph""" return model(features, labels, params, mode) estimator = tf.contrib.learn.estimator.Estimator( - model_fn=model_fn, - model_dir=output_dir) + model_fn=model_fn, model_dir=output_dir) # Create training Hooks - # validation_monitor = tf.contrib.learn.monitors.ValidationMonitor( - # input_fn=eval_input_fn, eval_steps=FLAGS.eval_steps, every_n_steps=FLAGS.eval_every_n_steps) model_analysis_hook = hooks.PrintModelAnalysisHook( - filename=os.path.join(estimator.model_dir, "model_analysis.txt")) + filename=os.path.join(estimator.model_dir, "model_analysis.txt")) train_sample_hook = hooks.TrainSampleHook( - every_n_steps=FLAGS.sample_every_n_steps) + every_n_steps=FLAGS.sample_every_n_steps) metadata_hook = hooks.MetadataCaptureHook( - output_dir=os.path.join(estimator.model_dir, "metadata"), step=10) + output_dir=os.path.join(estimator.model_dir, "metadata"), step=10) train_monitors = [model_analysis_hook, train_sample_hook, metadata_hook] experiment = tf.contrib.learn.experiment.Experiment( - estimator=estimator, - train_input_fn=train_input_fn, - eval_input_fn=eval_input_fn, - min_eval_frequency=FLAGS.eval_every_n_steps, - train_steps=FLAGS.train_steps, - eval_steps=FLAGS.eval_steps, - train_monitors=train_monitors) + estimator=estimator, + train_input_fn=train_input_fn, + eval_input_fn=eval_input_fn, + min_eval_frequency=FLAGS.eval_every_n_steps, + train_steps=FLAGS.train_steps, + eval_steps=FLAGS.eval_steps, + train_monitors=train_monitors) return experiment + def main(_argv): """The entrypoint for the script""" if not FLAGS.output_dir: FLAGS.output_dir = tempfile.mkdtemp() - learn_runner.run( - experiment_fn=create_experiment, - output_dir=FLAGS.output_dir, - schedule=FLAGS.schedule) + learn_runner.run(experiment_fn=create_experiment, + output_dir=FLAGS.output_dir, + schedule=FLAGS.schedule) + if __name__ == "__main__": tf.app.run() diff --git a/seq2seq/training/utils.py b/seq2seq/training/utils.py index 6998e165..88441b1a 100644 --- a/seq2seq/training/utils.py +++ b/seq2seq/training/utils.py @@ -1,19 +1,27 @@ """Miscellaneous training utility functions. """ -from seq2seq.inputs import read_from_data_provider +from seq2seq.inputs import read_from_data_provider import tensorflow as tf -def get_rnn_cell(cell_type, num_units, num_layers=1, dropout_input_keep_prob=1.0, + +def get_rnn_cell(cell_type, + num_units, + num_layers=1, + dropout_input_keep_prob=1.0, dropout_output_keep_prob=1.0): """Creates a new RNN Cell. Args: - cell_type: A cell lass name defined in `tf.nn.rnn_cell`, e.g. 
`LSTMCell` or `GRUCell`
+    cell_type: A cell class name defined in `tf.nn.rnn_cell`,
+      e.g. `LSTMCell` or `GRUCell`
    num_units: Number of cell units
-    num_layers: Number of layers. The cell will be wrapped with `tf.nn.rnn_cell.MultiRNNCell`
-    dropout_input_keep_prob: Dropout keep probability applied to the input of cell *at each layer*
-    dropout_output_keep_prob: Dropout keep probability applied to the output of cell *at each layer*
+    num_layers: Number of layers. The cell will be wrapped with
+      `tf.nn.rnn_cell.MultiRNNCell`
+    dropout_input_keep_prob: Dropout keep probability applied
+      to the input of cell *at each layer*
+    dropout_output_keep_prob: Dropout keep probability applied
+      to the output of cell *at each layer*
 
  Returns:
    An instance of `tf.nn.rnn_cell.RNNCell`.
@@ -24,9 +32,9 @@
 
  if dropout_input_keep_prob < 1.0 or dropout_output_keep_prob < 1.0:
    cell = tf.nn.rnn_cell.DropoutWrapper(
-      cell=cell,
-      input_keep_prob=dropout_input_keep_prob,
-      output_keep_prob=dropout_output_keep_prob)
+        cell=cell,
+        input_keep_prob=dropout_input_keep_prob,
+        output_keep_prob=dropout_output_keep_prob)
 
  if num_layers > 1:
    cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers)
@@ -34,23 +42,30 @@
 
  return cell
 
 
-def create_input_fn(data_provider_fn, featurizer_fn, batch_size, bucket_boundaries=None):
+def create_input_fn(data_provider_fn,
+                    featurizer_fn,
+                    batch_size,
+                    bucket_boundaries=None):
  """Creates an input function that can be used with tf.learn estimators.
-  Note that you must pass "factory funcitons" for both the data provider and featurizer
-  to ensure that everything will be created as part of the same graph.
+  Note that you must pass "factory functions" for both the data provider and
+  featurizer to ensure that everything will be created in the same graph.
 
  Args:
-    data_provider_fn: A function that creates a data provider instance to read from.
+    data_provider_fn: Function that creates a data provider to read from.
      An instance of `tf.contrib.slim.data_provider.DataProvider`.
-    featurizer_fn: A function taht creates a featurizer function which takes tensors
-      returned by the data provider and transfroms them into a (features, labels) tuple.
-    batch_size: Create batches of this size. A queue to hold a reasonable number of batches in
-      memory is created.
-    bucket_boundaries: int list, increasing non-negative numbers. If None, no bucket is performed.
+    featurizer_fn: A function that creates a featurizer function
+      which takes tensors returned by the data provider and transforms them
+      into a (features, labels) tuple.
+    batch_size: Create batches of this size. A queue to hold a
+      reasonable number of batches in memory is created.
+    bucket_boundaries: int list, increasing non-negative numbers.
+      If None, no bucketing is performed.
 
  Returns:
-    An input function that returns (feature_batch, labels_batch) tuples when called.
+    An input function that returns `(feature_batch, labels_batch)`
+      tuples when called.
  """
+
  def input_fn():
    """Creates features and labels.
""" @@ -65,36 +80,35 @@ def input_fn(): if bucket_boundaries: bucket_num, batch = tf.contrib.training.bucket_by_sequence_length( - input_length=features_and_labels["source_len"], - bucket_boundaries=bucket_boundaries, - tensors=features_and_labels, - batch_size=batch_size, - keep_input=features_and_labels["target_len"] >= 1, - dynamic_pad=True, - capacity=5000 + 16 * batch_size, - name="bucket_queue") + input_length=features_and_labels["source_len"], + bucket_boundaries=bucket_boundaries, + tensors=features_and_labels, + batch_size=batch_size, + keep_input=features_and_labels["target_len"] >= 1, + dynamic_pad=True, + capacity=5000 + 16 * batch_size, + name="bucket_queue") tf.summary.histogram("buckets", bucket_num) else: # Filter out examples with target_len < 1 - slice_end = tf.cond( - features_and_labels["target_len"] >= 1, - lambda: tf.constant(1), - lambda: tf.constant(0)) + slice_end = tf.cond(features_and_labels["target_len"] >= 1, + lambda: tf.constant(1), lambda: tf.constant(0)) features_and_labels = { - k: tf.expand_dims(v, 0)[0:slice_end] - for k, v in features_and_labels.items() + k: tf.expand_dims(v, 0)[0:slice_end] + for k, v in features_and_labels.items() } batch = tf.train.batch( - tensors=features_and_labels, - enqueue_many=True, - batch_size=batch_size, - dynamic_pad=True, - capacity=5000 + 16 * batch_size, - name="batch_queue") + tensors=features_and_labels, + enqueue_many=True, + batch_size=batch_size, + dynamic_pad=True, + capacity=5000 + 16 * batch_size, + name="batch_queue") # Separate features and labels again features_batch = {k: batch[k] for k in feature_keys} labels_batch = {k: batch[k] for k in label_keys} return features_batch, labels_batch + return input_fn