### setup

In [1]:
import os
import random
import codecs
import time

In [2]:
import numpy as np

In [3]:
import tensorflow as tf
from tensorflow.python.platform import gfile
from tensorflow.core.protobuf import saver_pb2

In [4]:
tf.app.flags.DEFINE_float("learning_rate", 0.5, "Learning rate.")
tf.app.flags.DEFINE_float("learning_rate_decay_factor", 0.99,
                          "Learning rate decays by this much.")
tf.app.flags.DEFINE_float("max_gradient_norm", 5.0,
                          "Clip gradients to this norm.")
tf.app.flags.DEFINE_integer("batch_size", 64,
                            "Batch size to use during training.")
tf.app.flags.DEFINE_integer("size", 64, "Size of each model layer.")
tf.app.flags.DEFINE_integer("num_layers", 2, "Number of layers in the model.")
tf.app.flags.DEFINE_string("model", None, "Training directory.")
tf.app.flags.DEFINE_integer("steps_per_checkpoint", 200,
                            "How many training steps to do per checkpoint.")
tf.app.flags.DEFINE_boolean("interactive", False,
                            "Set to True for interactive decoding.")
tf.app.flags.DEFINE_string("evaluate", "", "Count word error rate for file.")
tf.app.flags.DEFINE_string("decode", "", "Decode file.")
tf.app.flags.DEFINE_string("output", "", "Decoding result file.")
tf.app.flags.DEFINE_string("train", "", "Train dictionary.")
tf.app.flags.DEFINE_string("valid", "", "Development dictionary.")
tf.app.flags.DEFINE_string("test", "", "Test dictionary.")
tf.app.flags.DEFINE_integer("max_steps", 0,
                            "How many training steps to do until stop training"
                            " (0: no limit).")
tf.app.flags.DEFINE_boolean("reinit", False,
                            "Set to True for training from scratch.")
tf.app.flags.DEFINE_string(
    "optimizer", "sgd", "Optimizer type: sgd, adam, rms-prop. Default: sgd.")

FLAGS = tf.app.flags.FLAGS

In [5]:
# Special vocabulary symbols - we always put them at the start.
_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_UNK = "_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

### model

In [6]:
class Seq2SeqModel(object):
    """Sequence-to-sequence model with attention and for multiple buckets.

    This class implements a multi-layer recurrent neural network as encoder,
    and an attention-based decoder. This is the same as the model described in
    this paper: http://arxiv.org/abs/1412.7449 - please look there for details,
    or into the seq2seq library for complete model implementation.
    This class also allows to use GRU cells in addition to LSTM cells, and
    sampled softmax to handle large output vocabulary size. A single-layer
    version of this model, but with bi-directional encoder, was presented in
      http://arxiv.org/abs/1409.0473
    and sampled softmax is described in Section 3 of the following paper.
      http://arxiv.org/abs/1412.2007
    """

    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=512,
                 forward_only=False,
                 optimizer="sgd",
                 dtype=tf.float32):
        """Create the model.

        Args:
          source_vocab_size: size of the source vocabulary.
          target_vocab_size: size of the target vocabulary.
          buckets: a list of pairs (I, O), where I specifies maximum input length
            that will be processed in that bucket, and O specifies maximum output
            length. Training instances that have inputs longer than I or outputs
            longer than O will be pushed to the next bucket and padded accordingly.
            We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
          size: number of units in each layer of the model.
          num_layers: number of layers in the model.
          max_gradient_norm: gradients will be clipped to maximally this norm.
          batch_size: the size of the batches used during training;
            the model construction is independent of batch_size, so it can be
            changed after initialization if this is convenient, e.g., for decoding.
          learning_rate: learning rate to start with.
          learning_rate_decay_factor: decay learning rate by this much when needed.
          use_lstm: if true, we use LSTM cells instead of GRU cells.
          num_samples: number of samples for sampled softmax.
          forward_only: if set, we do not construct the backward pass in the model.
          dtype: the data type to use to store internal variables.
        """
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(
            float(learning_rate), trainable=False, dtype=dtype)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)
        self.optimizer = optimizer

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary
        # size.
        if num_samples > 0 and num_samples < self.target_vocab_size:
            w_t = tf.get_variable(
                "proj_w", [self.target_vocab_size, size], dtype=dtype)
            w = tf.transpose(w_t)
            b = tf.get_variable(
                "proj_b", [self.target_vocab_size], dtype=dtype)
            output_projection = (w, b)

            def sampled_loss(inputs, labels):
                labels = tf.reshape(labels, [-1, 1])
                # We need to compute the sampled_softmax_loss using 32bit floats to
                # avoid numerical instabilities.
                local_w_t = tf.cast(w_t, tf.float32)
                local_b = tf.cast(b, tf.float32)
                local_inputs = tf.cast(inputs, tf.float32)
                return tf.cast(
                    tf.nn.sampled_softmax_loss(local_w_t, local_b, local_inputs, labels,
                                               num_samples, self.target_vocab_size),
                    dtype)
            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        single_cell = tf.contrib.rnn.core_rnn_cell.GRUCell(size)
        if use_lstm:
            single_cell = tf.contrib.rnn.core_rnn_cell.BasicLSTMCell(size)
        cell = single_cell
        if num_layers > 1:
            cell = tf.contrib.rnn.core_rnn_cell.MultiRNNCell([single_cell] *
                                                             num_layers)

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=size,
                output_projection=output_projection,
                feed_previous=do_decode,
                dtype=dtype)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in range(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                      name="encoder{0}".format(i)))
        for i in range(buckets[-1][1] + 1):
            self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                      name="decoder{0}".format(i)))
            self.target_weights.append(tf.placeholder(dtype, shape=[None],
                                                      name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = [self.decoder_inputs[i + 1]
                   for i in range(len(self.decoder_inputs) - 1)]

        # Training outputs and losses.
        if forward_only:
            self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
                self.encoder_inputs, self.decoder_inputs, targets,
                self.target_weights, buckets, lambda x, y: seq2seq_f(
                    x, y, True),
                softmax_loss_function=softmax_loss_function)
            # If we use output projection, we need to project outputs for
            # decoding.
            if output_projection is not None:
                for b in range(len(buckets)):
                    self.outputs[b] = [
                        tf.matmul(
                            output, output_projection[0]) + output_projection[1]
                        for output in self.outputs[b]
                    ]
        else:
            self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
                self.encoder_inputs, self.decoder_inputs, targets,
                self.target_weights, buckets,
                lambda x, y: seq2seq_f(x, y, False),
                softmax_loss_function=softmax_loss_function)

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            if self.optimizer == 'sgd':
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            elif self.optimizer == 'adam':
                opt = tf.train.AdamOptimizer(self.learning_rate)
            elif self.optimizer == 'rms-prop':
                opt = tf.train.RMSPropOptimizer(self.learning_rate)
            else:
                raise ValueError('No such type of optimizer: %s' %
                                 self.optimizer)
            for b in range(len(buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                                 max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(opt.apply_gradients(
                    zip(clipped_gradients, params), global_step=self.global_step))

        self.saver = tf.train.Saver(tf.global_variables())

    def step(self, session, encoder_inputs, decoder_inputs, target_weights,
             bucket_id, forward_only):
        """Run a step of the model feeding the given inputs.

        Args:
          session: tensorflow session to use.
          encoder_inputs: list of numpy int vectors to feed as encoder inputs.
          decoder_inputs: list of numpy int vectors to feed as decoder inputs.
          target_weights: list of numpy float vectors to feed as target weights.
          bucket_id: which bucket of the model to use.
          forward_only: whether to do the backward step or only forward.

        Returns:
          A triple consisting of gradient norm (or None if we did not do backward),
          average perplexity, and the outputs.

        Raises:
          ValueError: if length of encoder_inputs, decoder_inputs, or
            target_weights disagrees with bucket size for the specified bucket_id.
        """

        # Check if the sizes match.
        encoder_size, decoder_size = self.buckets[bucket_id]
        if len(encoder_inputs) != encoder_size:
            raise ValueError("Encoder length must be equal to the one in bucket,"
                             " %d != %d." % (len(encoder_inputs), encoder_size))
        if len(decoder_inputs) != decoder_size:
            raise ValueError("Decoder length must be equal to the one in bucket,"
                             " %d != %d." % (len(decoder_inputs), decoder_size))
        if len(target_weights) != decoder_size:
            raise ValueError("Weights length must be equal to the one in bucket,"
                             " %d != %d." % (len(target_weights), decoder_size))

        # Input feed: encoder inputs, decoder inputs, target_weights, as
        # provided.
        input_feed = {}
        for l in range(encoder_size):
            input_feed[self.encoder_inputs[l].name] = encoder_inputs[l]
        for l in range(decoder_size):
            input_feed[self.decoder_inputs[l].name] = decoder_inputs[l]
            input_feed[self.target_weights[l].name] = target_weights[l]

        # Since our targets are decoder inputs shifted by one, we need one
        # more.
        last_target = self.decoder_inputs[decoder_size].name
        input_feed[last_target] = np.zeros([len(encoder_inputs[0])],
                                           dtype=np.int32)

        # Output feed: depends on whether we do a backward step or not.
        if not forward_only:
            output_feed = [self.updates[bucket_id],  # Update Op that does SGD.
                           self.gradient_norms[bucket_id],  # Gradient norm.
                           self.losses[bucket_id]]  # Loss for this batch.
        else:
            output_feed = [self.losses[bucket_id]]  # Loss for this batch.
            for l in range(decoder_size):  # Output logits.
                output_feed.append(self.outputs[bucket_id][l])

        outputs = session.run(output_feed, input_feed)
        if not forward_only:
            # Gradient norm, loss, no outputs.
            return outputs[1], outputs[2], None
        else:
            # No gradient norm, loss, outputs.
            return None, outputs[0], outputs[1:]

    def get_batch(self, data, bucket_id):
        """Get a random batch of data from the specified bucket, prepare for step.

        To feed data in step(..) it must be a list of batch-major vectors, while
        data here contains single length-major cases. So the main logic of this
        function is to re-index data cases to be in the proper format for feeding.

        Args:
          data: a tuple of size len(self.buckets) in which each element contains
            lists of pairs of input and output data that we use to create a batch.
          bucket_id: integer, which bucket to get the batch for.

        Returns:
          The triple (encoder_inputs, decoder_inputs, target_weights) for
          the constructed batch that has the proper format to call step(...) later.
        """
        encoder_size, decoder_size = self.buckets[bucket_id]
        encoder_inputs, decoder_inputs = [], []

        # Get a random batch of encoder and decoder inputs from data,
        # pad them if needed, reverse encoder inputs and add GO to decoder.
        for _ in range(self.batch_size):
            encoder_input, decoder_input = random.choice(data[bucket_id])

            # Encoder inputs are padded and then reversed.
            encoder_pad = [PAD_ID] * (encoder_size - len(encoder_input))
            encoder_inputs.append(list(reversed(encoder_input + encoder_pad)))

            # Decoder inputs get an extra "GO" symbol, and are padded then.
            decoder_pad_size = decoder_size - len(decoder_input) - 1
            decoder_inputs.append([GO_ID] + decoder_input +
                                  [PAD_ID] * decoder_pad_size)
        return self.__create_batch_major_vecs(encoder_size, decoder_size,
                                              encoder_inputs, decoder_inputs)

    def get_eval_set_batch(self, data, bucket_id, from_row_idx):
        """Get a batch from data with rows started with from_row_idx.

        To feed data in step(..) it must be a list of batch-major vectors, while
        data here contains single length-major cases. So the main logic of this
        function is to re-index data cases to be in the proper format for feeding.

        Args:
          data: a tuple of size len(self.buckets) in which each element contains
            lists of pairs of input and output data that we use to create a batch.
          bucket_id: integer, which bucket to get the batch for.

        Returns:
          The triple (encoder_inputs, decoder_inputs, target_weights) for
          the constructed batch that has the proper format to call step(...) later.
        """
        encoder_size, decoder_size = self.buckets[bucket_id]
        encoder_inputs, decoder_inputs = [], []
        batch_row_idx = 0

        # Get a batch of encoder and decoder inputs from data,
        # pad them if needed, reverse encoder inputs and add GO to decoder.
        while (from_row_idx + batch_row_idx < len(data[bucket_id])
               and batch_row_idx < self.batch_size):
            encoder_input, decoder_input =\
                data[bucket_id][from_row_idx + batch_row_idx]

            # Encoder inputs are padded and then reversed.
            encoder_pad = [PAD_ID] * (encoder_size - len(encoder_input))
            encoder_inputs.append(list(reversed(encoder_input + encoder_pad)))

            # Decoder inputs get an extra "GO" symbol, and are padded then.
            decoder_pad_size = decoder_size - len(decoder_input) - 1
            decoder_inputs.append([GO_ID] + decoder_input +
                                  [PAD_ID] * decoder_pad_size)
            batch_row_idx += 1
        return self.__create_batch_major_vecs(encoder_size,
                                              decoder_size,
                                              encoder_inputs,
                                              decoder_inputs)

    def __create_batch_major_vecs(self, encoder_size, decoder_size,
                                  encoder_inputs, decoder_inputs):
        # Now we create batch-major vectors from the data selected above.
        batch_encoder_inputs, batch_decoder_inputs, batch_weights = [], [], []

        # Batch encoder inputs are just re-indexed encoder_inputs.
        for length_idx in range(encoder_size):
            batch_encoder_inputs.append(
                np.array([encoder_inputs[batch_idx][length_idx]
                          for batch_idx in range(len(encoder_inputs))],
                         dtype=np.int32))

        # Batch decoder inputs are re-indexed decoder_inputs, we create
        # weights.
        for length_idx in range(decoder_size):
            batch_decoder_inputs.append(
                np.array([decoder_inputs[batch_idx][length_idx]
                          for batch_idx in range(len(encoder_inputs))],
                         dtype=np.int32))

            # Create target_weights to be 0 for targets that are padding.
            batch_weight = np.ones(len(encoder_inputs), dtype=np.float32)
            for batch_idx in range(len(encoder_inputs)):
                # We set weight to 0 if the corresponding target is a PAD symbol.
                # The corresponding target is decoder_input shifted by 1
                # forward.
                if length_idx < decoder_size - 1:
                    target = decoder_inputs[batch_idx][length_idx + 1]
                if length_idx == decoder_size - 1 or target == PAD_ID:
                    batch_weight[batch_idx] = 0.0
            batch_weights.append(batch_weight)
        return batch_encoder_inputs, batch_decoder_inputs, batch_weights

In [7]:
class G2PModel(object):
    """Grapheme-to-Phoneme translation model class.

    Constructor parameters (for training mode only):
      train_lines: Train dictionary;
      valid_lines: Development dictionary;
      test_lines: Test dictionary.

    Attributes:
      gr_vocab: Grapheme vocabulary;
      ph_vocab: Phoneme vocabulary;
      train_set: Training buckets: words and sounds are mapped to ids;
      valid_set: Validation buckets: words and sounds are mapped to ids;
      session: Tensorflow session;
      model: Tensorflow Seq2Seq model for G2PModel object.
      train: Train method.
      interactive: Interactive decode method;
      evaluate: Word-Error-Rate counting method;
      decode: Decode file method.
    """
    # We use a number of buckets and pad to the closest one for efficiency.
    # See seq2seq_model.Seq2SeqModel for details of how they work.
    _BUCKETS = [(5, 10), (10, 15), (40, 50)]

    def __init__(self, model_dir):
        """Initialize model directory."""
        self.model_dir = model_dir

    def load_decode_model(self):
        """Load G2P model and initialize or load parameters in session."""
        if not os.path.exists(os.path.join(self.model_dir, 'checkpoint')):
            raise RuntimeError("Model not found in %s" % self.model_dir)

        self.batch_size = 1  # We decode one word at a time.
        # Load model parameters.
        num_layers, size = load_params(self.model_dir)
        # Load vocabularies
        print("Loading vocabularies from %s" % self.model_dir)
        self.gr_vocab = load_vocabulary(
            os.path.join(self.model_dir, "vocab.grapheme"))
        self.ph_vocab = load_vocabulary(
            os.path.join(self.model_dir, "vocab.phoneme"))

        self.rev_ph_vocab =\
            load_vocabulary(os.path.join(self.model_dir, "vocab.phoneme"),
                            reverse=True)

        self.session = tf.Session()

        # Restore model.
        print("Creating %d layers of %d units." % (num_layers, size))
        self.model = Seq2SeqModel(
            len(self.gr_vocab),
            len(self.ph_vocab),
            self._BUCKETS,
            size,
            num_layers,
            0,
            self.batch_size,
            0,
            0,
            forward_only=True)
        self.model.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
        # Check for saved models and restore them.
        print("Reading model parameters from %s" % self.model_dir)
        self.model.saver.restore(self.session,
                                 os.path.join(self.model_dir, "model"))

    def __put_into_buckets(self, source, target):
        """Put data from source and target into buckets.

        Args:
          source: data with ids for graphemes;
          target: data with ids for phonemes;
            it must be aligned with the source data: n-th line contains the desired
            output for n-th line from the source.

        Returns:
          data_set: a list of length len(_BUCKETS); data_set[n] contains a list of
            (source, target) pairs read from the provided data that fit
            into the n-th bucket, i.e., such that len(source) < _BUCKETS[n][0] and
            len(target) < _BUCKETS[n][1]; source and target are lists of ids.
        """

        # By default unk to unk
        data_set = [[[[4], [4]]] for _ in self._BUCKETS]

        for source_ids, target_ids in zip(source, target):
            target_ids.append(EOS_ID)
            for bucket_id, (source_size,
                            target_size) in enumerate(self._BUCKETS):
                if len(source_ids) < source_size and len(
                        target_ids) < target_size:
                    data_set[bucket_id].append([source_ids, target_ids])
                    break
        return data_set

    def prepare_data(self, train_path, valid_path, test_path):
        """Prepare train/validation/test sets. Create or load vocabularies."""
        # Prepare data.
        print("Preparing G2P data")
        train_gr_ids, train_ph_ids, valid_gr_ids, valid_ph_ids, self.gr_vocab,\
            self.ph_vocab, self.test_lines =\
            prepare_g2p_data(self.model_dir, train_path, valid_path,
                             test_path)
        # Read data into buckets and compute their sizes.
        print("Reading development and training data.")
        self.valid_set = self.__put_into_buckets(valid_gr_ids, valid_ph_ids)
        self.train_set = self.__put_into_buckets(train_gr_ids, train_ph_ids)

        self.rev_ph_vocab = dict([(x, y)
                                  for (y, x) in enumerate(self.ph_vocab)])

    def __prepare_model(self, params):
        """Prepare G2P model for training."""

        self.params = params

        self.session = tf.Session()

        # Prepare model.
        print("Creating model with parameters:")
        print(params)
        self.model = Seq2SeqModel(
            len(self.gr_vocab),
            len(self.ph_vocab),
            self._BUCKETS,
            self.params.size,
            self.params.num_layers,
            self.params.max_gradient_norm,
            self.params.batch_size,
            self.params.learning_rate,
            self.params.lr_decay_factor,
            forward_only=False,
            optimizer=self.params.optimizer)
        self.model.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

    def load_train_model(self, params):
        """Load G2P model for continuing train."""
        # Check for saved model.
        if not os.path.exists(os.path.join(self.model_dir, 'checkpoint')):
            raise RuntimeError("Model not found in %s" % self.model_dir)

        # Load model parameters.
        params.num_layers, params.size = load_params(self.model_dir)

        # Prepare data and G2P Model.
        self.__prepare_model(params)

        # Restore model.
        print("Reading model parameters from %s" % self.model_dir)
        self.model.saver.restore(self.session,
                                 os.path.join(self.model_dir, "model"))

    def create_train_model(self, params):
        """Create G2P model for train from scratch."""
        # Save model parameters.
        save_params(params.num_layers, params.size, self.model_dir)

        # Prepare data and G2P Model
        self.__prepare_model(params)

        print("Created model with fresh parameters.")
        self.session.run(tf.global_variables_initializer())

    def train(self):
        """Train a gr->ph translation model using G2P data."""

        train_bucket_sizes = [
            len(self.train_set[b]) for b in range(len(self._BUCKETS))
        ]
        train_total_size = float(sum(train_bucket_sizes))
        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
        # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
        # the size if i-th training bucket, as used later.
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in range(len(train_bucket_sizes))
        ]

        # This is the training loop.
        step_time, train_loss = 0.0, 0.0
        current_step, num_iter_wo_improve = 0, 0
        prev_train_losses, prev_valid_losses = [], []
        num_iter_cover_train = int(
            sum(train_bucket_sizes) / self.params.batch_size /
            self.params.steps_per_checkpoint)
        while (self.params.max_steps == 0 or
               self.model.global_step.eval(self.session) <=
               self.params.max_steps):
            # Get a batch and make a step.
            start_time = time.time()
            step_loss = self.__calc_step_loss(train_buckets_scale)
            step_time += (time.time() - start_time) / \
                self.params.steps_per_checkpoint
            train_loss += step_loss / self.params.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save checkpoint, print statistics, and run
            # evals.
            if current_step % self.params.steps_per_checkpoint == 0:
                # Print statistics for the previous steps.
                train_ppx = math.exp(
                    train_loss) if train_loss < 300 else float('inf')
                print(
                    "global step %d learning rate %.4f step-time %.2f perplexity "
                    "%.2f" % (self.model.global_step.eval(self.session),
                              self.model.learning_rate.eval(self.session),
                              step_time, train_ppx))
                eval_loss = self.__calc_eval_loss()
                eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float(
                    'inf')
                print("  eval: perplexity %.2f" % (eval_ppx))
                # Decrease learning rate if no improvement was seen on train set
                # over last 3 times.
                if (len(prev_train_losses) > 2 and
                        train_loss > max(prev_train_losses[-3:])):
                    self.session.run(self.model.learning_rate_decay_op)

                if (len(prev_valid_losses) > 0 and
                        eval_loss <= min(prev_valid_losses)):
                    # Save checkpoint and zero timer and loss.
                    self.model.saver.save(
                        self.session,
                        os.path.join(self.model_dir, "model"),
                        write_meta_graph=False)

                if (len(prev_valid_losses) > 0 and
                        eval_loss >= min(prev_valid_losses)):
                    num_iter_wo_improve += 1
                else:
                    num_iter_wo_improve = 0

                if num_iter_wo_improve > num_iter_cover_train * 2:
                    print(
                        "No improvement over last %d times. Training will stop after %d"
                        "iterations if no improvement was seen." %
                        (num_iter_wo_improve,
                         num_iter_cover_train - num_iter_wo_improve))

                # Stop train if no improvement was seen on validation set
                # over last 3 epochs.
                if num_iter_wo_improve > num_iter_cover_train * 3:
                    break

                prev_train_losses.append(train_loss)
                prev_valid_losses.append(eval_loss)
                step_time, train_loss = 0.0, 0.0

        print('Training done.')
        with tf.Graph().as_default():
            g2p_model_eval = G2PModel(self.model_dir)
            g2p_model_eval.load_decode_model()
            g2p_model_eval.evaluate(self.test_lines)

    def __calc_step_loss(self, train_buckets_scale):
        """Choose a bucket according to data distribution. We pick a random number
        in [0, 1] and use the corresponding interval in train_buckets_scale.
        """
        random_number_01 = np.random.random_sample()
        bucket_id = min([
            i for i in range(len(train_buckets_scale))
            if train_buckets_scale[i] > random_number_01
        ])

        # Get a batch and make a step.
        encoder_inputs, decoder_inputs, target_weights = self.model.get_batch(
            self.train_set, bucket_id)
        _, step_loss, _ = self.model.step(self.session, encoder_inputs,
                                          decoder_inputs, target_weights,
                                          bucket_id, False)
        return step_loss

    def __calc_eval_loss(self):
        """Run evals on development set and print their perplexity.
        """
        eval_loss, num_iter_total = 0.0, 0.0
        for bucket_id in range(len(self._BUCKETS)):
            num_iter_cover_valid = int(
                math.ceil(
                    len(self.valid_set[bucket_id]) / self.params.batch_size))
            num_iter_total += num_iter_cover_valid
            for batch_id in range(num_iter_cover_valid):
                encoder_inputs, decoder_inputs, target_weights =\
                    self.model.get_eval_set_batch(self.valid_set, bucket_id,
                                                  batch_id * self.params.batch_size)
                _, eval_batch_loss, _ = self.model.step(
                    self.session, encoder_inputs, decoder_inputs,
                    target_weights, bucket_id, True)
                eval_loss += eval_batch_loss
        eval_loss = eval_loss / \
            num_iter_total if num_iter_total > 0 else float('inf')
        return eval_loss

    def decode_word(self, word):
        """Decode input word to sequence of phonemes.

        Args:
          word: input word;

        Returns:
          phonemes: decoded phoneme sequence for input word;
        """
        # Check if all graphemes attended in vocabulary
        gr_absent = [gr for gr in word if gr not in self.gr_vocab]
        if gr_absent:
            print("Symbols '%s' are not in vocabulary" %
                  "','".join(gr_absent).encode('utf-8'))
            return ""

        # Get token-ids for the input word.
        token_ids = [self.gr_vocab.get(s, UNK_ID) for s in word]
        # Which bucket does it belong to?
        bucket_id = min([
            b for b in range(len(self._BUCKETS))
            if self._BUCKETS[b][0] > len(token_ids)
        ])
        # Get a 1-element batch to feed the word to the model.
        encoder_inputs, decoder_inputs, target_weights = self.model.get_batch({
            bucket_id: [(token_ids, [])]
        }, bucket_id)
        # Get output logits for the word.
        _, _, output_logits = self.model.step(self.session, encoder_inputs,
                                              decoder_inputs, target_weights,
                                              bucket_id, True)
        # This is a greedy decoder - outputs are just argmaxes of
        # output_logits.
        outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
        # If there is an EOS symbol in outputs, cut them at that point.
        if EOS_ID in outputs:
            outputs = outputs[:outputs.index(EOS_ID)]
        # Phoneme sequence corresponding to outputs.
        return " ".join([self.rev_ph_vocab[output] for output in outputs])

    def interactive(self):
        """Decode word from standard input.
        """
        while True:
            try:
                word = input("> ")
                if not issubclass(type(word), str):
                    word = str(word, encoding='utf-8', errors='replace')
            except EOFError:
                break
            if not word:
                break
            print(self.decode_word(word))

    def calc_error(self, dictionary):
        """Calculate a number of prediction errors.
        """
        errors = 0
        for word, pronunciations in dictionary.items():
            hyp = self.decode_word(word)
            if hyp not in pronunciations:
                errors += 1
        return errors

    def evaluate(self, test_lines):
        """Calculate and print out word error rate (WER) and Accuracy
           on test sample.

        Args:
          test_lines: List of test dictionary. Each element of list must be String
                    containing word and its pronounciation (e.g., "word W ER D");
        """
        test_dic = collect_pronunciations(test_lines)

        if len(test_dic) < 1:
            print("Test dictionary is empty")
            return

        print('Beginning calculation word error rate (WER) on test sample.')
        errors = self.calc_error(test_dic)

        print("Words: %d" % len(test_dic))
        print("Errors: %d" % errors)
        print("WER: %.3f" % (float(errors) / len(test_dic)))
        print("Accuracy: %.3f" % float(1 - (errors / len(test_dic))))

    def decode(self, decode_lines, output_file=None):
        """Decode words from file.

        Returns:
          if [--output output_file] pointed out, write decoded word sequences in
          this file. Otherwise, print decoded words in standard output.
        """
        phoneme_lines = []

        # Decode from input file.
        if output_file:
            for word in decode_lines:
                word = word.strip()
                phonemes = self.decode_word(word)
                output_file.write(word)
                output_file.write(' ')
                output_file.write(phonemes)
                output_file.write('\n')
                phoneme_lines.append(phonemes)
            output_file.close()
        else:
            for word in decode_lines:
                word = word.strip()
                phonemes = self.decode_word(word)
                print(word + ' ' + phonemes)
                phoneme_lines.append(phonemes)
        return phoneme_lines

### utils

In [8]:
class TrainingParams(object):
    """Class with training parameters."""

    def __init__(self, flags=None):
        if flags:
            self.learning_rate = flags.learning_rate
            self.lr_decay_factor = flags.learning_rate_decay_factor
            self.max_gradient_norm = flags.max_gradient_norm
            self.batch_size = flags.batch_size
            self.size = flags.size
            self.num_layers = flags.num_layers
            self.steps_per_checkpoint = flags.steps_per_checkpoint
            self.max_steps = flags.max_steps
            self.optimizer = flags.optimizer
        else:
            self.learning_rate = 0.5
            self.lr_decay_factor = 0.99
            self.max_gradient_norm = 5.0
            self.batch_size = 64
            self.size = 64
            self.num_layers = 2
            self.steps_per_checkpoint = 200
            self.max_steps = 0
            self.optimizer = "sgd"

    def __str__(self):
        return ("Learning rate:        {}\n"
                "LR decay factor:      {}\n"
                "Max gradient norm:    {}\n"
                "Batch size:           {}\n"
                "Size of layer:        {}\n"
                "Number of layers:     {}\n"
                "Steps per checkpoint: {}\n"
                "Max steps:            {}\n"
                "Optimizer:            {}\n").format(
            self.learning_rate,
            self.lr_decay_factor,
            self.max_gradient_norm,
            self.batch_size,
            self.size,
            self.num_layers,
            self.steps_per_checkpoint,
            self.max_steps,
            self.optimizer)

In [9]:
def create_vocabulary(data):
    """Create vocabulary from input data.
    Input data is assumed to contain one word per line.

    Args:
      data: word list that will be used to create vocabulary.

    Rerurn:
      vocab: vocabulary dictionary. In this dictionary keys are symbols
             and values are their indexes.
    """
    vocab = {}
    for line in data:
        for item in line:
            if item in vocab:
                vocab[item] += 1
            else:
                vocab[item] = 1
    vocab_list = _START_VOCAB + sorted(vocab)
    vocab = dict([(x, y) for (y, x) in enumerate(vocab_list)])
    return vocab


def save_vocabulary(vocab, vocabulary_path):
    """Save vocabulary file in vocabulary_path.
    We write vocabulary to vocabulary_path in a one-token-per-line format,
    so that later token in the first line gets id=0, second line gets id=1,
    and so on.

    Args:
      vocab: vocabulary dictionary.
      vocabulary_path: path where the vocabulary will be created.

    """
    print("Creating vocabulary %s" % (vocabulary_path))
    with codecs.open(vocabulary_path, "w", "utf-8") as vocab_file:
        for symbol in sorted(vocab, key=vocab.get):
            vocab_file.write(symbol + '\n')


def load_vocabulary(vocabulary_path, reverse=False):
    """Load vocabulary from file.
    We assume the vocabulary is stored one-item-per-line, so a file:
      d
      c
    will result in a vocabulary {"d": 0, "c": 1}, and this function may
    also return the reversed-vocabulary [0, 1].

    Args:
      vocabulary_path: path to the file containing the vocabulary.
      reverse: flag managing what type of vocabulary to return.

    Returns:
      the vocabulary (a dictionary mapping string to integers), or
      if set reverse to True the reversed vocabulary (a list, which reverses
      the vocabulary mapping).

    Raises:
      ValueError: if the provided vocabulary_path does not exist.
    """
    rev_vocab = []
    with codecs.open(vocabulary_path, "r", "utf-8") as vocab_file:
        rev_vocab.extend(vocab_file.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    if reverse:
        return rev_vocab
    else:
        return dict([(x, y) for (y, x) in enumerate(rev_vocab)])


def save_params(num_layers, size, model_dir):
    """Save model parameters in model_dir directory.

    Returns:
      num_layers: Number of layers in the model;
      size: Size of each model layer.
    """
    # Save model's architecture
    with open(os.path.join(model_dir, "model.params"), 'w') as param_file:
        param_file.write("num_layers:" + str(num_layers) + "\n")
        param_file.write("size:" + str(size))


def load_params(model_path):
    """Load parameters from 'model.params' file.

    Returns:
      num_layers: Number of layers in the model;
      size: Size of each model layer.
    """
    # Checking model's architecture for decode processes.
    if gfile.Exists(os.path.join(model_path, "model.params")):
        with open(os.path.join(model_path, "model.params")) as f:
            params = f.readlines()
        for line in params:
            split_line = line.strip().split(":")
            if split_line[0] == "num_layers":
                num_layers = int(split_line[1])
            if split_line[0] == "size":
                size = int(split_line[1])
    return num_layers, size


def symbols_to_ids(symbols, vocab):
    """Turn symbols into ids sequence using given vocabulary file.

    Args:
      symbols: input symbols sequence;
      vocab: vocabulary (a dictionary mapping string to integers).

    Returns:
      ids: output sequence of ids.
    """
    ids = [vocab.get(s, UNK_ID) for s in symbols]
    return ids


def split_to_grapheme_phoneme(inp_dictionary):
    """Split input dictionary into two separate lists with graphemes and phonemes.

    Args:
      inp_dictionary: input dictionary.
    """
    graphemes, phonemes = [], []
    for line in inp_dictionary:
        split_line = line.strip().split()
        if len(split_line) > 1:
            graphemes.append(list(split_line[0]))
            phonemes.append(split_line[1:])
    return graphemes, phonemes


def collect_pronunciations(dic_lines):
    '''Create dictionary mapping word to its different pronounciations.
    '''
    dic = {}
    for line in dic_lines:
        lst = line.strip().split()
        if len(lst) > 1:
            if lst[0] not in dic:
                dic[lst[0]] = [" ".join(lst[1:])]
            else:
                dic[lst[0]].append(" ".join(lst[1:]))
        elif len(lst) == 1:
            print("WARNING: No phonemes for word '%s' line ignored" % (lst[0]))
    return dic


def split_dictionary(train_path, valid_path=None, test_path=None):
    """Split source dictionary to train, validation and test sets.
    """
    with codecs.open(train_path, "r", "utf-8") as f:
        source_dic = f.readlines()
    train_dic, valid_dic, test_dic = [], [], []
    if valid_path:
        with codecs.open(valid_path, "r", "utf-8") as f:
            valid_dic = f.readlines()
    if test_path:
        with codecs.open(test_path, "r", "utf-8") as f:
            test_dic = f.readlines()

    dic = collect_pronunciations(source_dic)

    # Split dictionary to train, validation and test (if not assigned).
    for i, word in enumerate(dic):
        for pronunciations in dic[word]:
            if i % 20 == 0 and not valid_path:
                valid_dic.append(word + ' ' + pronunciations)
            elif (i % 20 == 1 or i % 20 == 2) and not test_path:
                test_dic.append(word + ' ' + pronunciations)
            else:
                train_dic.append(word + ' ' + pronunciations)
    return train_dic, valid_dic, test_dic


def prepare_g2p_data(model_dir, train_path, valid_path, test_path):
    """Create vocabularies into model_dir, create ids data lists.

    Args:
      model_dir: directory in which the data sets will be stored;
      train_path: path to training dictionary;
      valid_path: path to validation dictionary;
      test_path: path to test dictionary.

    Returns:
      A tuple of 6 elements:
        (1) Sequence of ids for Grapheme training data-set,
        (2) Sequence of ids for Phoneme training data-set,
        (3) Sequence of ids for Grapheme development data-set,
        (4) Sequence of ids for Phoneme development data-set,
        (5) Grapheme vocabulary,
        (6) Phoneme vocabulary.
    """
    # Create train, validation and test sets.
    train_dic, valid_dic, test_dic = split_dictionary(train_path, valid_path,
                                                      test_path)
    # Split dictionaries into two separate lists with graphemes and phonemes.
    train_gr, train_ph = split_to_grapheme_phoneme(train_dic)
    valid_gr, valid_ph = split_to_grapheme_phoneme(valid_dic)

    # Load/Create vocabularies.
    if (model_dir
        and os.path.exists(os.path.join(model_dir, "vocab.grapheme"))
            and os.path.exists(os.path.join(model_dir, "vocab.phoneme"))):
        print("Loading vocabularies from %s" % model_dir)
        ph_vocab = load_vocabulary(os.path.join(model_dir, "vocab.phoneme"))
        gr_vocab = load_vocabulary(os.path.join(model_dir, "vocab.grapheme"))

    else:
        ph_vocab = create_vocabulary(train_ph)
        gr_vocab = create_vocabulary(train_gr)

        if model_dir:
            os.makedirs(model_dir)
            save_vocabulary(ph_vocab, os.path.join(model_dir, "vocab.phoneme"))
            save_vocabulary(gr_vocab, os.path.join(
                model_dir, "vocab.grapheme"))

    # Create ids for the training data.
    train_ph_ids = [symbols_to_ids(line, ph_vocab) for line in train_ph]
    train_gr_ids = [symbols_to_ids(line, gr_vocab) for line in train_gr]
    valid_ph_ids = [symbols_to_ids(line, ph_vocab) for line in valid_ph]
    valid_gr_ids = [symbols_to_ids(line, gr_vocab) for line in valid_gr]

    return (train_gr_ids, train_ph_ids,
            valid_gr_ids, valid_ph_ids,
            gr_vocab, ph_vocab,
            test_dic)

### train

In [10]:
def main(FLAGS):
    """Main function.
    """
    with tf.Graph().as_default():
        if not FLAGS.model:
            raise RuntimeError("Model directory not specified.")
        g2p_model = G2PModel(FLAGS.model)
        if FLAGS.train:
            g2p_params = TrainingParams(FLAGS)
            g2p_model.prepare_data(FLAGS.train, FLAGS.valid, FLAGS.test)
            if (not os.path.exists(os.path.join(FLAGS.model,
                                                "model.data-00000-of-00001"))
                    or FLAGS.reinit):
                g2p_model.create_train_model(g2p_params)
            else:
                g2p_model.load_train_model(g2p_params)
            g2p_model.train()
        else:
            g2p_model.load_decode_model()
            if FLAGS.decode:
                decode_lines = codecs.open(
                    FLAGS.decode, "r", "utf-8").readlines()
                output_file = None
                if FLAGS.output:
                    output_file = codecs.open(FLAGS.output, "w", "utf-8")
                g2p_model.decode(decode_lines, output_file)
            elif FLAGS.interactive:
                g2p_model.interactive()
            elif FLAGS.evaluate:
                test_lines = codecs.open(
                    FLAGS.evaluate, "r", "utf-8").readlines()
                g2p_model.evaluate(test_lines)

### prepare

In [11]:
if not os.path.isfile("g2p-seq2seq-cmudict.tar.gz"):
    URL = "https://sourceforge.net/projects/cmusphinx/files/G2P%20Models/g2p-seq2seq-cmudict.tar.gz/download"
    !wget -O g2p-seq2seq-cmudict.tar.gz $URL
    !tar xf g2p-seq2seq-cmudict.tar.gz

In [12]:
FLAGS.interactive = True
FLAGS.model = "g2p-seq2seq-cmudict"

### run

In [13]:
main(FLAGS)

Loading vocabularies from g2p-seq2seq-cmudict
Creating 2 layers of 512 units.
Reading model parameters from g2p-seq2seq-cmudict
> hello
HH EH L OW
> model
M AA D AH L
> do it
Symbols 'b' '' are not in vocabulary

> do
D UW
> it
AH T
> hehehe
HH EH HH IY
> hohohohohhohohoh
HH AA HH OW HH OW
> exit
EH K S AH T
> exit()
Symbols 'b"(',')"' are not in vocabulary

> 
