# Google Neural Machine Translation

In this notebook, we are going to train Google NMT on the IWSLT 2015 English-Vietnamese
dataset. The building process includes four steps: 1) load and process the dataset, 2)
create the sampler and DataLoader, 3) build the model, and 4) write the training epochs.

## Load MXNet and Gluon

In [1]:
import warnings
warnings.filterwarnings('ignore')

import argparse
import time
import random
import os
import io
import logging
import numpy as np
import mxnet as mx
from mxnet import gluon
import gluonnlp as nlp
import nmt

## Hyper-parameters

In [2]:
# Seed every random number generator in use so that runs are repeatable.
np.random.seed(100)
random.seed(100)
mx.random.seed(10000)

# Device on which the model is initialized and the mini-batches are placed.
ctx = mx.gpu(0)

# ---- dataset ----
dataset = 'IWSLT2015'
src_lang = 'en'
tgt_lang = 'vi'
# Maximum number of tokens kept per training sentence (longer ones are clipped).
src_max_len = 50
tgt_max_len = 50

# ---- model ----
num_hidden = 512     # hidden size, also used as the embedding size
num_layers = 2       # number of RNN layers
num_bi_layers = 1    # how many of those layers are bidirectional
dropout = 0.2

# ---- training ----
batch_size = 128
test_batch_size = 32
num_buckets = 5          # number of length buckets used by the samplers
epochs = 1
clip = 5                 # threshold passed to the global gradient-norm clipping
lr = 0.001
lr_update_factor = 0.5   # factor the learning rate is multiplied by when decayed
log_interval = 10        # number of batches between two training log messages
save_dir = 'gnmt_en_vi_u512'

# ---- testing (beam search) ----
beam_size = 10
lp_alpha = 1.0   # `alpha` of the beam search scorer
lp_k = 5         # `K` of the beam search scorer

nmt.utils.logging_config(save_dir)

All Logs will be saved to gnmt_en_vi_u512/<ipython-input-2-4699ac3a1bfb>.log


'gnmt_en_vi_u512'

## Load and Preprocess Dataset

The following shows how to process the dataset and cache the processed dataset
for future use. The processing steps include: 1) clip the source and target
sequences, 2) split the string input to a list of tokens, 3) map the
string token into its integer index in the vocabulary, and 4) append end-of-sentence (EOS) token to source
sentence and add BOS and EOS tokens to target sentence.

In [3]:
def cache_dataset(dataset, prefix):
    """Cache a processed dataset as a single ``.npz`` file.

    All source sequences (and, separately, all target sequences) are
    concatenated into one flat array. The cumulative sequence lengths are
    stored next to them so that the individual sequences can be recovered
    later by slicing the flat arrays.

    Parameters
    ----------
    dataset : gluon.data.SimpleDataset
        Dataset whose samples are ``(src, tgt)`` pairs of 1-D arrays.
    prefix : str
        Name (without the ``.npz`` extension) of the cache file that is
        written inside ``nmt._constants.CACHE_PATH``.
    """
    # `exist_ok` avoids the race between a separate existence check and the
    # directory creation.
    os.makedirs(nmt._constants.CACHE_PATH, exist_ok=True)
    # Walk the dataset once instead of four times; this matters when the
    # dataset computes its samples lazily.
    src_seqs, tgt_seqs = [], []
    for sample in dataset:
        src_seqs.append(sample[0])
        tgt_seqs.append(sample[1])
    src_data = np.concatenate(src_seqs)
    tgt_data = np.concatenate(tgt_seqs)
    # The leading 0 makes cumlen[i]:cumlen[i + 1] the slice of the i-th sequence.
    src_cumlen = np.cumsum([0] + [len(seq) for seq in src_seqs])
    tgt_cumlen = np.cumsum([0] + [len(seq) for seq in tgt_seqs])
    np.savez(os.path.join(nmt._constants.CACHE_PATH, prefix + '.npz'),
             src_data=src_data, tgt_data=tgt_data,
             src_cumlen=src_cumlen, tgt_cumlen=tgt_cumlen)


def _split_by_cumlen(flat_data, cumlen):
    """Cut a flat array back into the sequences delimited by ``cumlen``.

    The result is a 1-D object array holding one slice per sequence. It is
    filled element by element because building an array directly from
    slices of different lengths is rejected by recent NumPy releases.
    """
    pieces = np.empty(len(cumlen) - 1, dtype=object)
    for i, (low, high) in enumerate(zip(cumlen[:-1], cumlen[1:])):
        pieces[i] = flat_data[low:high]
    return pieces


def load_cached_dataset(prefix):
    """Load a dataset previously written by ``cache_dataset``.

    Parameters
    ----------
    prefix : str
        Name (without the ``.npz`` extension) of the cache file inside
        ``nmt._constants.CACHE_PATH``.

    Returns
    -------
    gluon.data.ArrayDataset or None
        Dataset of ``(src, tgt)`` sequence pairs, or ``None`` when the cache
        file does not exist.
    """
    cached_file_path = os.path.join(nmt._constants.CACHE_PATH, prefix + '.npz')
    if not os.path.exists(cached_file_path):
        return None
    print('Load cached data from {}'.format(cached_file_path))
    # The object returned by np.load keeps the archive open; the context
    # manager closes it once the four arrays have been read into memory.
    with np.load(cached_file_path) as npz_data:
        src_data, tgt_data, src_cumlen, tgt_cumlen = [
            npz_data[n] for n in ['src_data', 'tgt_data', 'src_cumlen', 'tgt_cumlen']]
    return gluon.data.ArrayDataset(_split_by_cumlen(src_data, src_cumlen),
                                   _split_by_cumlen(tgt_data, tgt_cumlen))


class TrainValDataTransform(object):
    """Turn a raw sentence pair into a pair of integer index arrays.

    Both sentences are split on whitespace and, when the corresponding
    maximum length is positive, clipped to that many tokens. The tokens are
    mapped to vocabulary indices. EOS is appended to the source sentence;
    the target sentence is wrapped in BOS ... EOS.

    Parameters
    ----------
    src_vocab : Vocab
        Vocabulary of the source language.
    tgt_vocab : Vocab
        Vocabulary of the target language.
    src_max_len : int
        Maximum number of source tokens kept; no clipping if not positive.
    tgt_max_len : int
        Maximum number of target tokens kept; no clipping if not positive.
    """

    def __init__(self, src_vocab, tgt_vocab, src_max_len, tgt_max_len):
        self._src_vocab, self._tgt_vocab = src_vocab, tgt_vocab
        self._src_max_len, self._tgt_max_len = src_max_len, tgt_max_len

    @staticmethod
    def _clip(tokens, max_len):
        """Keep the first `max_len` tokens, or all of them if `max_len` <= 0."""
        return tokens[:max_len] if max_len > 0 else tokens

    def __call__(self, src, tgt):
        src_vocab, tgt_vocab = self._src_vocab, self._tgt_vocab
        src_ids = src_vocab[self._clip(src.split(), self._src_max_len)]
        tgt_ids = tgt_vocab[self._clip(tgt.split(), self._tgt_max_len)]
        # The special tokens are added after clipping, so they are never cut off.
        src_ids.append(src_vocab[src_vocab.eos_token])
        tgt_ids.insert(0, tgt_vocab[tgt_vocab.bos_token])
        tgt_ids.append(tgt_vocab[tgt_vocab.eos_token])
        return np.array(src_ids, dtype=np.int32), np.array(tgt_ids, dtype=np.int32)


def process_dataset(dataset, src_vocab, tgt_vocab, src_max_len=-1, tgt_max_len=-1):
    """Apply ``TrainValDataTransform`` to every sample and report the time taken.

    Parameters
    ----------
    dataset : Dataset
        Raw dataset of ``(src, tgt)`` sentence strings.
    src_vocab : Vocab
    tgt_vocab : Vocab
    src_max_len : int, default -1
        Maximum source length; the default disables clipping.
    tgt_max_len : int, default -1
        Maximum target length; the default disables clipping.

    Returns
    -------
    Dataset
        The eagerly transformed dataset (``lazy=False``).
    """
    tic = time.time()
    to_indices = TrainValDataTransform(src_vocab, tgt_vocab, src_max_len, tgt_max_len)
    processed = dataset.transform(to_indices, lazy=False)
    elapsed = time.time() - tic
    print('Processing time spent: {}'.format(elapsed))
    return processed


def load_translation_data(dataset, src_lang='en', tgt_lang='vi'):
    """Load the translation dataset, preprocessing and caching it if needed.

    NOTE: `dataset` is accepted but not consulted; the IWSLT2015 corpus is
    always loaded. The clipping lengths are read from the module-level
    `src_max_len` and `tgt_max_len` and are applied to the training split
    only; validation and test sentences are not clipped.

    Parameters
    ----------
    dataset : str
    src_lang : str, default 'en'
    tgt_lang : str, default 'vi'

    Returns
    -------
    data_train_processed : Dataset
        The preprocessed training sentence pairs
    data_val_processed : Dataset
        The preprocessed validation sentence pairs
    data_test_processed : Dataset
        The preprocessed test sentence pairs
    val_tgt_sentences : list
        The target sentences in the validation set
    test_tgt_sentences : list
        The target sentences in the test set
    src_vocab : Vocab
        Vocabulary of the source language
    tgt_vocab : Vocab
        Vocabulary of the target language
    """
    common_prefix = 'IWSLT2015_{}_{}_{}_{}'.format(src_lang, tgt_lang,
                                                   src_max_len, tgt_max_len)
    data_train = nlp.data.IWSLT2015('train', src_lang=src_lang, tgt_lang=tgt_lang)
    data_val = nlp.data.IWSLT2015('val', src_lang=src_lang, tgt_lang=tgt_lang)
    data_test = nlp.data.IWSLT2015('test', src_lang=src_lang, tgt_lang=tgt_lang)
    src_vocab, tgt_vocab = data_train.src_vocab, data_train.tgt_vocab

    def _load_or_process(raw_data, cache_name, *max_lens):
        """Return the cached dataset, or process `raw_data` and cache the result."""
        processed = load_cached_dataset(cache_name)
        # Test against None explicitly: the truth value of a dataset depends
        # on its length, which is unrelated to whether the cache was found.
        if processed is None:
            processed = process_dataset(raw_data, src_vocab, tgt_vocab, *max_lens)
            cache_dataset(processed, cache_name)
        return processed

    def _fetch_tgt_sentence(src, tgt):
        """Return the tokenized reference translation of a sentence pair."""
        return tgt.split()

    data_train_processed = _load_or_process(data_train, common_prefix + '_train',
                                            src_max_len, tgt_max_len)
    data_val_processed = _load_or_process(data_val, common_prefix + '_val')
    data_test_processed = _load_or_process(data_test, common_prefix + '_test')
    val_tgt_sentences = list(data_val.transform(_fetch_tgt_sentence))
    test_tgt_sentences = list(data_test.transform(_fetch_tgt_sentence))
    return data_train_processed, data_val_processed, data_test_processed, \
           val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab


def get_data_lengths(dataset):
    """Return a list with the ``(source length, target length)`` of every sample."""
    def _pair_lengths(src, tgt):
        return len(src), len(tgt)

    return list(dataset.transform(_pair_lengths))


# Load (or build and cache) the three splits together with the reference
# translations and the vocabularies.
(data_train, data_val, data_test,
 val_tgt_sentences, test_tgt_sentences,
 src_vocab, tgt_vocab) = load_translation_data(dataset=dataset,
                                               src_lang=src_lang,
                                               tgt_lang=tgt_lang)

# The (source, target) lengths drive the bucketing samplers; they are taken
# before the extra fields are attached to the samples below.
data_train_lengths = get_data_lengths(data_train)
data_val_lengths = get_data_lengths(data_val)
data_test_lengths = get_data_lengths(data_test)

# Write the reference translations, one whitespace-joined sentence per line.
for gt_name, gt_sentences in (('val_gt.txt', val_tgt_sentences),
                              ('test_gt.txt', test_tgt_sentences)):
    with io.open(os.path.join(save_dir, gt_name), 'w', encoding='utf-8') as of:
        of.writelines(' '.join(ele) + '\n' for ele in gt_sentences)


def _with_lengths(src, tgt):
    """Attach the source and target lengths to a training sample."""
    return src, tgt, len(src), len(tgt)


data_train = data_train.transform(_with_lengths, lazy=False)
# Validation and test samples also carry their position in the dataset.
data_val = gluon.data.SimpleDataset(
    [(src, tgt, len(src), len(tgt), idx) for idx, (src, tgt) in enumerate(data_val)])
data_test = gluon.data.SimpleDataset(
    [(src, tgt, len(src), len(tgt), idx) for idx, (src, tgt) in enumerate(data_test)])

Downloading /var/lib/jenkins/workspace/gluon-nlp-gpu-py3@2/tests/data/datasets/iwslt2015/iwslt15.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/iwslt2015/iwslt15.zip...


Processing time spent: 5.375916004180908


Processing time spent: 0.06687569618225098
Processing time spent: 0.06173110008239746


## Create Sampler and DataLoader

Now, we have obtained `data_train`, `data_val`, and `data_test`. The next step
is to construct the samplers and DataLoaders. We first construct the batchify
functions, which pad and stack sequences to form mini-batches.

In [4]:
# Shorthand for the batchify helpers used below.
_batchify = nlp.data.batchify

# One batchify function per sample field, in order:
# source sequence, target sequence, source length, target length.
train_batchify_fn = _batchify.Tuple(
    _batchify.Pad(),
    _batchify.Pad(),
    _batchify.Stack(dtype='float32'),
    _batchify.Stack(dtype='float32'))

# Validation/test samples have a fifth field (the sample index), which is
# stacked as well.
test_batchify_fn = _batchify.Tuple(
    _batchify.Pad(),
    _batchify.Pad(),
    _batchify.Stack(dtype='float32'),
    _batchify.Stack(dtype='float32'),
    _batchify.Stack())

We can then construct bucketing samplers, which generate batches by grouping
sequences with similar lengths. Here, the bucketing scheme is empirically determined.

In [5]:
# Training: shuffled batches, with bucket widths following an exponential scheme.
bucket_scheme = nlp.data.ExpWidthBucket(bucket_len_step=1.2)
train_batch_sampler = nlp.data.FixedBucketSampler(
    lengths=data_train_lengths,
    batch_size=batch_size,
    num_buckets=num_buckets,
    shuffle=True,
    bucket_scheme=bucket_scheme)
logging.info('Train Batch Sampler:\n{}'.format(train_batch_sampler.stats()))

# Validation and test use the same settings: no shuffling, evaluation batch size.
eval_sampler_kwargs = dict(batch_size=test_batch_size,
                           num_buckets=num_buckets,
                           shuffle=False)
val_batch_sampler = nlp.data.FixedBucketSampler(lengths=data_val_lengths,
                                                **eval_sampler_kwargs)
logging.info('Valid Batch Sampler:\n{}'.format(val_batch_sampler.stats()))
test_batch_sampler = nlp.data.FixedBucketSampler(lengths=data_test_lengths,
                                                 **eval_sampler_kwargs)
logging.info('Test Batch Sampler:\n{}'.format(test_batch_sampler.stats()))

2019-06-27 00:07:01,404 - root - Train Batch Sampler:
FixedBucketSampler:
  sample_num=133166, batch_num=1043
  key=[(9, 10), (16, 17), (26, 27), (37, 38), (51, 52)]
  cnt=[11414, 34897, 37760, 23480, 25615]
  batch_size=[128, 128, 128, 128, 128]


2019-06-27 00:07:01,407 - root - Valid Batch Sampler:
FixedBucketSampler:
  sample_num=1553, batch_num=52
  key=[(22, 28), (40, 52), (58, 76), (76, 100), (94, 124)]
  cnt=[1037, 432, 67, 10, 7]
  batch_size=[32, 32, 32, 32, 32]


2019-06-27 00:07:01,410 - root - Test Batch Sampler:
FixedBucketSampler:
  sample_num=1268, batch_num=42
  key=[(23, 29), (43, 53), (63, 77), (83, 101), (103, 125)]
  cnt=[770, 381, 84, 26, 7]
  batch_size=[32, 32, 32, 32, 32]


Given the samplers, we can create DataLoader, which is iterable.

In [6]:
# Each loader pairs a dataset with its bucketing sampler and the batchify
# function matching the number of fields in its samples.
train_data_loader = gluon.data.DataLoader(
    data_train,
    batch_sampler=train_batch_sampler,
    batchify_fn=train_batchify_fn,
    num_workers=4)

# Validation and test samples share the five-field batchify function.
eval_loader_kwargs = dict(batchify_fn=test_batchify_fn, num_workers=4)
val_data_loader = gluon.data.DataLoader(
    data_val, batch_sampler=val_batch_sampler, **eval_loader_kwargs)
test_data_loader = gluon.data.DataLoader(
    data_test, batch_sampler=test_batch_sampler, **eval_loader_kwargs)

## Build GNMT Model

After obtaining DataLoader, we can build the model. The GNMT encoder and decoder
can be easily constructed by calling `get_gnmt_encoder_decoder` function. Then, we
feed the encoder and decoder to `NMTModel` to construct the GNMT model.
`model.hybridize` allows computation to be done using the symbolic backend.

In [7]:
# Shared hybridization option for the model and the loss.
static_alloc = True

# Encoder/decoder pair of the GNMT architecture.
encoder, decoder = nmt.gnmt.get_gnmt_encoder_decoder(
    hidden_size=num_hidden,
    dropout=dropout,
    num_layers=num_layers,
    num_bi_layers=num_bi_layers)

# Full translation model; the embedding size equals the hidden size.
model = nlp.model.translation.NMTModel(
    src_vocab=src_vocab,
    tgt_vocab=tgt_vocab,
    encoder=encoder,
    decoder=decoder,
    embed_size=num_hidden,
    prefix='gnmt_')
model.initialize(init=mx.init.Uniform(0.1), ctx=ctx)
model.hybridize(static_alloc=static_alloc)
logging.info(model)

# Due to the paddings, we need to mask out the losses corresponding to padding tokens.
loss_function = nlp.loss.MaskedSoftmaxCELoss()
loss_function.hybridize(static_alloc=static_alloc)

2019-06-27 00:07:10,628 - root - NMTModel(
  (encoder): GNMTEncoder(
    (dropout_layer): Dropout(p = 0.2, axes=())
    (rnn_cells): HybridSequential(
      (0): BidirectionalCell(forward=LSTMCell(None -> 2048), backward=LSTMCell(None -> 2048))
      (1): LSTMCell(None -> 2048)
    )
  )
  (decoder): GNMTDecoder(
    (attention_cell): DotProductAttentionCell(
      (_dropout_layer): Dropout(p = 0.0, axes=())
      (_proj_query): Dense(None -> 512, linear)
    )
    (dropout_layer): Dropout(p = 0.2, axes=())
    (rnn_cells): HybridSequential(
      (0): LSTMCell(None -> 2048)
      (1): LSTMCell(None -> 2048)
    )
  )
  (src_embed): HybridSequential(
    (0): Embedding(17191 -> 512, float32)
    (1): Dropout(p = 0.0, axes=())
  )
  (tgt_embed): HybridSequential(
    (0): Embedding(7709 -> 512, float32)
    (1): Dropout(p = 0.0, axes=())
  )
  (tgt_proj): Dense(None -> 7709, linear)
)


We also build the beam search translator.

In [8]:
# Scorer used to rank the beam search candidates.
beam_scorer = nlp.model.BeamSearchScorer(alpha=lp_alpha, K=lp_k)
# Generated translations may be up to 100 tokens longer than `tgt_max_len`.
translator = nmt.translation.BeamSearchTranslator(
    model=model,
    beam_size=beam_size,
    scorer=beam_scorer,
    max_length=tgt_max_len + 100)
logging.info('Use beam_size={}, alpha={}, K={}'.format(beam_size, lp_alpha, lp_k))

2019-06-27 00:07:10,726 - root - Use beam_size=10, alpha=1.0, K=5


We define the evaluation function as follows. The `evaluate` function uses the beam
search translator to generate outputs for the validation and testing datasets.

In [9]:
def evaluate(data_loader):
    """Compute the loss and the beam search translations for a data loader.

    Parameters
    ----------
    data_loader : gluon.data.DataLoader
        Loader yielding ``(src_seq, tgt_seq, src_valid_length,
        tgt_valid_length, inst_ids)`` batches.

    Returns
    -------
    avg_loss : float
        Average of the batch losses, each weighted by the batch's number of
        decoding steps (padded target length minus one).
    real_translation_out : list of list of str
        The best translation of every sample, ordered by sample index.
    """
    weighted_loss_sum = 0.0
    total_steps = 0
    hypotheses = []
    instance_ids = []
    for batch in data_loader:
        src_seq, tgt_seq, src_valid_length, tgt_valid_length, inst_ids = batch
        src_seq = src_seq.as_in_context(ctx)
        tgt_seq = tgt_seq.as_in_context(ctx)
        src_valid_length = src_valid_length.as_in_context(ctx)
        tgt_valid_length = tgt_valid_length.as_in_context(ctx)
        # Loss: the model reads the target without its last token and is
        # scored against the target without its first token.
        num_steps = tgt_seq.shape[1] - 1
        out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
        batch_loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean().asscalar()
        instance_ids.extend(inst_ids.asnumpy().astype(np.int32).tolist())
        weighted_loss_sum += batch_loss * num_steps
        total_steps += num_steps
        # Translation: keep only the first candidate of every beam.
        samples, _, sample_valid_length = translator.translate(
            src_seq=src_seq, src_valid_length=src_valid_length)
        best_samples = samples[:, 0, :].asnumpy()
        best_lengths = sample_valid_length[:, 0].asnumpy()
        for token_ids, length in zip(best_samples, best_lengths):
            # Drop the first token and the last valid token of the sample.
            hypotheses.append(
                [tgt_vocab.idx_to_token[idx] for idx in token_ids[1:(length - 1)]])
    avg_loss = weighted_loss_sum / total_steps
    # Batches do not follow the dataset order; put every translation back
    # at the position given by its sample index.
    real_translation_out = [None] * len(instance_ids)
    for position, sentence in zip(instance_ids, hypotheses):
        real_translation_out[position] = sentence
    return avg_loss, real_translation_out


def write_sentences(sentences, file_path):
    """Write tokenized sentences to a UTF-8 text file, one sentence per line.

    Parameters
    ----------
    sentences : iterable of list of str
        Every sentence is a list of tokens; the tokens are joined by spaces.
    file_path : str
        Path of the file to (over)write.
    """
    with io.open(file_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(' '.join(tokens) + '\n' for tokens in sentences)

## Training Epochs

Before entering the training stage, we need to create a trainer for updating the
parameters. In the following example, we create a trainer that uses the Adam
optimizer.

In [10]:
# Adam optimizer over all model parameters, starting from the configured learning rate.
optimizer_params = {'learning_rate': lr}
trainer = gluon.Trainer(model.collect_params(), 'adam', optimizer_params)

We can then write the training loop. During training, we evaluate on the validation and testing datasets every epoch, and record the
parameters that give the highest BLEU score on the validation dataset. Before
performing the forward and backward passes, we first use the `as_in_context` function to copy
the mini-batch to the GPU. The statement `with mx.autograd.record()` tells the Gluon
backend to compute the gradients for the part inside the block.

In [11]:
# Train for `epochs` epochs. After every epoch the model is evaluated on the
# validation and test sets, the translations are written to `save_dir`, and the
# parameters are saved whenever the validation BLEU score improves.
best_valid_bleu = 0.0
for epoch_id in range(epochs):
    # Running statistics; they are reset after every log message.
    log_avg_loss = 0
    log_avg_gnorm = 0
    log_wc = 0
    log_start_time = time.time()
    for batch_id, (src_seq, tgt_seq, src_valid_length, tgt_valid_length)\
            in enumerate(train_data_loader):
        # Copy the mini-batch to the training device.
        src_seq = src_seq.as_in_context(ctx)
        tgt_seq = tgt_seq.as_in_context(ctx)
        src_valid_length = src_valid_length.as_in_context(ctx)
        tgt_valid_length = tgt_valid_length.as_in_context(ctx)
        with mx.autograd.record():
            # The model reads the target without its last token and is scored
            # against the target without its first token.
            out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
            loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean()
            # Rescale the mean over the padded length by the ratio of the
            # padded length to the mean valid length of the batch.
            loss = loss * (tgt_seq.shape[1] - 1) / (tgt_valid_length - 1).mean()
            loss.backward()
        grads = [p.grad(ctx) for p in model.collect_params().values()]
        gnorm = gluon.utils.clip_global_norm(grads, clip)
        trainer.step(1)
        src_wc = src_valid_length.sum().asscalar()
        tgt_wc = (tgt_valid_length - 1).sum().asscalar()
        step_loss = loss.asscalar()
        log_avg_loss += step_loss
        log_avg_gnorm += gnorm
        log_wc += src_wc + tgt_wc
        if (batch_id + 1) % log_interval == 0:
            wps = log_wc / (time.time() - log_start_time)
            logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f}, gnorm={:.4f}, '
                         'throughput={:.2f}K wps, wc={:.2f}K'
                         .format(epoch_id, batch_id + 1, len(train_data_loader),
                                 log_avg_loss / log_interval,
                                 np.exp(log_avg_loss / log_interval),
                                 log_avg_gnorm / log_interval,
                                 wps / 1000, log_wc / 1000))
            log_start_time = time.time()
            log_avg_loss = 0
            log_avg_gnorm = 0
            log_wc = 0
    valid_loss, valid_translation_out = evaluate(val_data_loader)
    valid_bleu_score, _, _, _, _ = nmt.bleu.compute_bleu([val_tgt_sentences], valid_translation_out)
    logging.info('[Epoch {}] valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}'
                 .format(epoch_id, valid_loss, np.exp(valid_loss), valid_bleu_score * 100))
    test_loss, test_translation_out = evaluate(test_data_loader)
    test_bleu_score, _, _, _, _ = nmt.bleu.compute_bleu([test_tgt_sentences], test_translation_out)
    logging.info('[Epoch {}] test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}'
                 .format(epoch_id, test_loss, np.exp(test_loss), test_bleu_score * 100))
    # Format only the file name: formatting the joined path would also
    # interpret any braces contained in `save_dir`.
    write_sentences(valid_translation_out,
                    os.path.join(save_dir, 'epoch{:d}_valid_out.txt'.format(epoch_id)))
    write_sentences(test_translation_out,
                    os.path.join(save_dir, 'epoch{:d}_test_out.txt'.format(epoch_id)))
    if valid_bleu_score > best_valid_bleu:
        best_valid_bleu = valid_bleu_score
        save_path = os.path.join(save_dir, 'valid_best.params')
        logging.info('Save best parameters to {}'.format(save_path))
        model.save_parameters(save_path)
    # Decay the learning rate after every epoch once two thirds of the
    # epochs (rounded down) have been completed.
    if epoch_id + 1 >= (epochs * 2) // 3:
        new_lr = trainer.learning_rate * lr_update_factor
        logging.info('Learning rate change to {}'.format(new_lr))
        trainer.set_learning_rate(new_lr)

2019-06-27 00:07:15,877 - root - [Epoch 0 Batch 10/1043] loss=7.7375, ppl=2292.6586, gnorm=1.4907, throughput=10.79K wps, wc=54.27K


2019-06-27 00:07:18,550 - root - [Epoch 0 Batch 20/1043] loss=6.3590, ppl=577.6408, gnorm=1.5744, throughput=20.18K wps, wc=50.20K


2019-06-27 00:07:21,838 - root - [Epoch 0 Batch 30/1043] loss=6.3708, ppl=584.5346, gnorm=0.8043, throughput=20.65K wps, wc=67.78K


2019-06-27 00:07:24,862 - root - [Epoch 0 Batch 40/1043] loss=6.1792, ppl=482.5820, gnorm=0.6213, throughput=20.91K wps, wc=63.19K


2019-06-27 00:07:27,934 - root - [Epoch 0 Batch 50/1043] loss=6.1871, ppl=486.4455, gnorm=0.4034, throughput=20.18K wps, wc=61.93K


2019-06-27 00:07:30,781 - root - [Epoch 0 Batch 60/1043] loss=6.1060, ppl=448.5210, gnorm=0.6787, throughput=20.81K wps, wc=59.19K


2019-06-27 00:07:34,186 - root - [Epoch 0 Batch 70/1043] loss=6.1569, ppl=471.9777, gnorm=0.4684, throughput=21.45K wps, wc=72.99K


2019-06-27 00:07:37,342 - root - [Epoch 0 Batch 80/1043] loss=6.0705, ppl=432.9092, gnorm=0.4163, throughput=20.49K wps, wc=64.58K


2019-06-27 00:07:39,998 - root - [Epoch 0 Batch 90/1043] loss=5.9389, ppl=379.4997, gnorm=0.3626, throughput=20.00K wps, wc=53.02K


2019-06-27 00:07:42,745 - root - [Epoch 0 Batch 100/1043] loss=5.8775, ppl=356.9234, gnorm=0.4090, throughput=21.65K wps, wc=59.42K


2019-06-27 00:07:45,672 - root - [Epoch 0 Batch 110/1043] loss=5.8723, ppl=355.0713, gnorm=0.3617, throughput=22.40K wps, wc=65.50K


2019-06-27 00:07:48,628 - root - [Epoch 0 Batch 120/1043] loss=5.8694, ppl=354.0462, gnorm=0.3479, throughput=19.78K wps, wc=58.43K


2019-06-27 00:07:52,035 - root - [Epoch 0 Batch 130/1043] loss=5.9339, ppl=377.6277, gnorm=0.3579, throughput=17.45K wps, wc=59.39K


2019-06-27 00:07:54,998 - root - [Epoch 0 Batch 140/1043] loss=5.8808, ppl=358.1109, gnorm=0.2855, throughput=20.67K wps, wc=61.18K


2019-06-27 00:07:57,846 - root - [Epoch 0 Batch 150/1043] loss=5.8124, ppl=334.4147, gnorm=0.2918, throughput=19.81K wps, wc=56.34K


2019-06-27 00:08:00,927 - root - [Epoch 0 Batch 160/1043] loss=5.7398, ppl=311.0072, gnorm=0.3737, throughput=18.82K wps, wc=57.93K


2019-06-27 00:08:03,896 - root - [Epoch 0 Batch 170/1043] loss=5.7608, ppl=317.6178, gnorm=0.3004, throughput=21.72K wps, wc=64.42K


2019-06-27 00:08:06,147 - root - [Epoch 0 Batch 180/1043] loss=5.4566, ppl=234.2920, gnorm=0.3406, throughput=19.71K wps, wc=44.31K


2019-06-27 00:08:09,172 - root - [Epoch 0 Batch 190/1043] loss=5.6080, ppl=272.6107, gnorm=0.3578, throughput=20.66K wps, wc=62.45K


2019-06-27 00:08:11,945 - root - [Epoch 0 Batch 200/1043] loss=5.5012, ppl=244.9762, gnorm=0.3381, throughput=19.59K wps, wc=54.24K


2019-06-27 00:08:14,550 - root - [Epoch 0 Batch 210/1043] loss=5.3199, ppl=204.3678, gnorm=0.4236, throughput=20.24K wps, wc=52.68K


2019-06-27 00:08:16,975 - root - [Epoch 0 Batch 220/1043] loss=5.3090, ppl=202.1534, gnorm=0.3807, throughput=20.85K wps, wc=50.47K


2019-06-27 00:08:20,035 - root - [Epoch 0 Batch 230/1043] loss=5.3090, ppl=202.1443, gnorm=0.4230, throughput=20.16K wps, wc=61.65K


2019-06-27 00:08:22,727 - root - [Epoch 0 Batch 240/1043] loss=5.3017, ppl=200.6791, gnorm=0.3476, throughput=21.69K wps, wc=58.32K


2019-06-27 00:08:26,094 - root - [Epoch 0 Batch 250/1043] loss=5.4215, ppl=226.2137, gnorm=0.2882, throughput=21.06K wps, wc=70.84K


2019-06-27 00:08:28,930 - root - [Epoch 0 Batch 260/1043] loss=5.2739, ppl=195.1772, gnorm=0.3596, throughput=21.26K wps, wc=60.22K


2019-06-27 00:08:32,152 - root - [Epoch 0 Batch 270/1043] loss=5.3786, ppl=216.7152, gnorm=0.2841, throughput=22.66K wps, wc=72.96K


2019-06-27 00:08:35,199 - root - [Epoch 0 Batch 280/1043] loss=5.2528, ppl=191.0951, gnorm=0.2658, throughput=19.99K wps, wc=60.80K


2019-06-27 00:08:37,725 - root - [Epoch 0 Batch 290/1043] loss=4.9769, ppl=145.0176, gnorm=0.3470, throughput=18.15K wps, wc=45.79K


2019-06-27 00:08:40,539 - root - [Epoch 0 Batch 300/1043] loss=5.0869, ppl=161.8902, gnorm=0.3500, throughput=21.01K wps, wc=59.05K


2019-06-27 00:08:43,563 - root - [Epoch 0 Batch 310/1043] loss=5.1068, ppl=165.1394, gnorm=0.3009, throughput=20.39K wps, wc=61.58K


2019-06-27 00:08:46,201 - root - [Epoch 0 Batch 320/1043] loss=4.9811, ppl=145.6305, gnorm=0.3154, throughput=20.16K wps, wc=53.10K


2019-06-27 00:08:49,139 - root - [Epoch 0 Batch 330/1043] loss=5.0422, ppl=154.8146, gnorm=0.3015, throughput=20.94K wps, wc=61.37K


2019-06-27 00:08:51,977 - root - [Epoch 0 Batch 340/1043] loss=5.0655, ppl=158.4552, gnorm=0.2632, throughput=20.06K wps, wc=56.88K


2019-06-27 00:08:54,645 - root - [Epoch 0 Batch 350/1043] loss=4.9246, ppl=137.6404, gnorm=0.3163, throughput=20.60K wps, wc=54.86K


2019-06-27 00:08:57,510 - root - [Epoch 0 Batch 360/1043] loss=5.0421, ppl=154.7988, gnorm=0.2850, throughput=22.55K wps, wc=64.55K


2019-06-27 00:09:00,589 - root - [Epoch 0 Batch 370/1043] loss=4.9064, ppl=135.1580, gnorm=0.2956, throughput=21.78K wps, wc=66.97K


2019-06-27 00:09:03,343 - root - [Epoch 0 Batch 380/1043] loss=4.8172, ppl=123.6220, gnorm=0.3044, throughput=19.21K wps, wc=52.79K


2019-06-27 00:09:05,976 - root - [Epoch 0 Batch 390/1043] loss=4.7893, ppl=120.2213, gnorm=0.3350, throughput=19.36K wps, wc=50.94K


2019-06-27 00:09:08,424 - root - [Epoch 0 Batch 400/1043] loss=4.6555, ppl=105.1638, gnorm=0.3907, throughput=19.72K wps, wc=48.22K


2019-06-27 00:09:10,901 - root - [Epoch 0 Batch 410/1043] loss=4.8339, ppl=125.6981, gnorm=0.3242, throughput=19.51K wps, wc=48.27K


2019-06-27 00:09:13,765 - root - [Epoch 0 Batch 420/1043] loss=4.8519, ppl=127.9888, gnorm=0.3055, throughput=19.62K wps, wc=56.14K


2019-06-27 00:09:17,052 - root - [Epoch 0 Batch 430/1043] loss=4.8984, ppl=134.0705, gnorm=0.2895, throughput=21.11K wps, wc=69.33K


2019-06-27 00:09:20,013 - root - [Epoch 0 Batch 440/1043] loss=4.7876, ppl=120.0098, gnorm=0.2734, throughput=22.68K wps, wc=67.08K


2019-06-27 00:09:22,586 - root - [Epoch 0 Batch 450/1043] loss=4.6916, ppl=109.0252, gnorm=0.3080, throughput=20.88K wps, wc=53.68K


2019-06-27 00:09:25,063 - root - [Epoch 0 Batch 460/1043] loss=4.4769, ppl=87.9586, gnorm=0.3489, throughput=20.38K wps, wc=50.38K


2019-06-27 00:09:28,053 - root - [Epoch 0 Batch 470/1043] loss=4.7561, ppl=116.2918, gnorm=0.2942, throughput=20.32K wps, wc=60.70K


2019-06-27 00:09:30,475 - root - [Epoch 0 Batch 480/1043] loss=4.3967, ppl=81.1856, gnorm=0.3453, throughput=22.34K wps, wc=54.04K


2019-06-27 00:09:32,701 - root - [Epoch 0 Batch 490/1043] loss=4.5094, ppl=90.8703, gnorm=0.4202, throughput=20.83K wps, wc=46.32K


2019-06-27 00:09:35,108 - root - [Epoch 0 Batch 500/1043] loss=4.5656, ppl=96.1205, gnorm=0.3105, throughput=20.18K wps, wc=48.55K


2019-06-27 00:09:37,246 - root - [Epoch 0 Batch 510/1043] loss=4.3288, ppl=75.8559, gnorm=0.3366, throughput=19.49K wps, wc=41.62K


2019-06-27 00:09:39,319 - root - [Epoch 0 Batch 520/1043] loss=4.2394, ppl=69.3683, gnorm=0.3773, throughput=19.31K wps, wc=39.98K


2019-06-27 00:09:42,146 - root - [Epoch 0 Batch 530/1043] loss=4.6554, ppl=105.1503, gnorm=0.3106, throughput=20.74K wps, wc=58.58K


2019-06-27 00:09:44,566 - root - [Epoch 0 Batch 540/1043] loss=4.5210, ppl=91.9245, gnorm=0.3208, throughput=20.97K wps, wc=50.72K


2019-06-27 00:09:47,482 - root - [Epoch 0 Batch 550/1043] loss=4.5447, ppl=94.1324, gnorm=0.3214, throughput=21.61K wps, wc=62.95K


2019-06-27 00:09:49,749 - root - [Epoch 0 Batch 560/1043] loss=4.3290, ppl=75.8656, gnorm=0.3394, throughput=20.38K wps, wc=46.13K


2019-06-27 00:09:52,521 - root - [Epoch 0 Batch 570/1043] loss=4.3493, ppl=77.4252, gnorm=0.3009, throughput=22.07K wps, wc=61.12K


2019-06-27 00:09:55,029 - root - [Epoch 0 Batch 580/1043] loss=4.3643, ppl=78.5975, gnorm=0.3134, throughput=22.13K wps, wc=55.43K


2019-06-27 00:09:58,126 - root - [Epoch 0 Batch 590/1043] loss=4.5888, ppl=98.3770, gnorm=0.2748, throughput=23.89K wps, wc=73.93K


2019-06-27 00:10:00,659 - root - [Epoch 0 Batch 600/1043] loss=4.4959, ppl=89.6519, gnorm=0.2772, throughput=22.04K wps, wc=55.80K


2019-06-27 00:10:03,092 - root - [Epoch 0 Batch 610/1043] loss=4.3174, ppl=74.9904, gnorm=0.3306, throughput=21.51K wps, wc=52.28K


2019-06-27 00:10:06,252 - root - [Epoch 0 Batch 620/1043] loss=4.5597, ppl=95.5579, gnorm=0.2581, throughput=22.92K wps, wc=72.39K


2019-06-27 00:10:08,034 - root - [Epoch 0 Batch 630/1043] loss=4.0495, ppl=57.3701, gnorm=0.3413, throughput=19.34K wps, wc=34.44K


2019-06-27 00:10:10,518 - root - [Epoch 0 Batch 640/1043] loss=4.3356, ppl=76.3704, gnorm=0.3157, throughput=23.10K wps, wc=57.32K


2019-06-27 00:10:13,542 - root - [Epoch 0 Batch 650/1043] loss=4.4767, ppl=87.9457, gnorm=0.2783, throughput=21.99K wps, wc=66.42K


2019-06-27 00:10:15,669 - root - [Epoch 0 Batch 660/1043] loss=4.1965, ppl=66.4513, gnorm=0.3497, throughput=20.90K wps, wc=44.42K


2019-06-27 00:10:18,980 - root - [Epoch 0 Batch 670/1043] loss=4.6035, ppl=99.8329, gnorm=0.2592, throughput=23.48K wps, wc=77.68K


2019-06-27 00:10:21,669 - root - [Epoch 0 Batch 680/1043] loss=4.4282, ppl=83.7837, gnorm=0.2850, throughput=21.97K wps, wc=59.02K


2019-06-27 00:10:23,997 - root - [Epoch 0 Batch 690/1043] loss=4.2100, ppl=67.3599, gnorm=0.3212, throughput=21.74K wps, wc=50.54K


2019-06-27 00:10:26,418 - root - [Epoch 0 Batch 700/1043] loss=4.2986, ppl=73.5946, gnorm=0.3053, throughput=21.69K wps, wc=52.45K


2019-06-27 00:10:28,419 - root - [Epoch 0 Batch 710/1043] loss=4.1867, ppl=65.8024, gnorm=0.3388, throughput=20.67K wps, wc=41.32K


2019-06-27 00:10:30,725 - root - [Epoch 0 Batch 720/1043] loss=4.2639, ppl=71.0879, gnorm=0.3010, throughput=21.74K wps, wc=50.09K


2019-06-27 00:10:33,270 - root - [Epoch 0 Batch 730/1043] loss=4.1942, ppl=66.3012, gnorm=0.3244, throughput=20.47K wps, wc=52.02K


2019-06-27 00:10:36,032 - root - [Epoch 0 Batch 740/1043] loss=4.3311, ppl=76.0264, gnorm=0.2705, throughput=21.69K wps, wc=59.86K


2019-06-27 00:10:38,592 - root - [Epoch 0 Batch 750/1043] loss=4.1912, ppl=66.0998, gnorm=0.2909, throughput=20.87K wps, wc=53.35K


2019-06-27 00:10:41,619 - root - [Epoch 0 Batch 760/1043] loss=4.2881, ppl=72.8249, gnorm=0.2956, throughput=23.18K wps, wc=70.09K


2019-06-27 00:10:43,929 - root - [Epoch 0 Batch 770/1043] loss=4.1163, ppl=61.3322, gnorm=0.3184, throughput=18.73K wps, wc=43.21K


2019-06-27 00:10:47,341 - root - [Epoch 0 Batch 780/1043] loss=4.3126, ppl=74.6316, gnorm=0.2868, throughput=22.26K wps, wc=75.93K


2019-06-27 00:10:49,706 - root - [Epoch 0 Batch 790/1043] loss=4.1421, ppl=62.9344, gnorm=0.3124, throughput=19.83K wps, wc=46.81K


2019-06-27 00:10:52,456 - root - [Epoch 0 Batch 800/1043] loss=4.2065, ppl=67.1220, gnorm=0.3203, throughput=21.38K wps, wc=58.72K


2019-06-27 00:10:55,154 - root - [Epoch 0 Batch 810/1043] loss=4.0828, ppl=59.3103, gnorm=0.3072, throughput=21.30K wps, wc=57.38K


2019-06-27 00:10:57,881 - root - [Epoch 0 Batch 820/1043] loss=3.9687, ppl=52.9133, gnorm=0.3280, throughput=21.48K wps, wc=58.52K


2019-06-27 00:11:00,641 - root - [Epoch 0 Batch 830/1043] loss=4.1222, ppl=61.6933, gnorm=0.3378, throughput=20.76K wps, wc=57.24K


2019-06-27 00:11:03,151 - root - [Epoch 0 Batch 840/1043] loss=4.0870, ppl=59.5630, gnorm=0.3098, throughput=20.97K wps, wc=52.57K


2019-06-27 00:11:06,283 - root - [Epoch 0 Batch 850/1043] loss=4.1362, ppl=62.5645, gnorm=0.2948, throughput=20.49K wps, wc=64.14K


2019-06-27 00:11:08,865 - root - [Epoch 0 Batch 860/1043] loss=4.1289, ppl=62.1117, gnorm=0.2949, throughput=21.20K wps, wc=54.64K


2019-06-27 00:11:11,793 - root - [Epoch 0 Batch 870/1043] loss=4.1557, ppl=63.7995, gnorm=0.3035, throughput=22.50K wps, wc=65.81K


2019-06-27 00:11:14,319 - root - [Epoch 0 Batch 880/1043] loss=4.0874, ppl=59.5845, gnorm=0.2924, throughput=19.94K wps, wc=50.27K


2019-06-27 00:11:16,907 - root - [Epoch 0 Batch 890/1043] loss=4.1518, ppl=63.5505, gnorm=0.2808, throughput=21.70K wps, wc=56.11K


2019-06-27 00:11:19,768 - root - [Epoch 0 Batch 900/1043] loss=4.1750, ppl=65.0426, gnorm=0.2873, throughput=21.32K wps, wc=60.91K


2019-06-27 00:11:22,188 - root - [Epoch 0 Batch 910/1043] loss=3.9803, ppl=53.5334, gnorm=0.3152, throughput=21.36K wps, wc=51.65K


2019-06-27 00:11:24,957 - root - [Epoch 0 Batch 920/1043] loss=4.1676, ppl=64.5620, gnorm=0.2911, throughput=21.89K wps, wc=60.52K


2019-06-27 00:11:27,247 - root - [Epoch 0 Batch 930/1043] loss=3.9696, ppl=52.9634, gnorm=0.3005, throughput=19.03K wps, wc=43.51K


2019-06-27 00:11:29,616 - root - [Epoch 0 Batch 940/1043] loss=3.9258, ppl=50.6934, gnorm=0.3263, throughput=21.00K wps, wc=49.71K


2019-06-27 00:11:32,727 - root - [Epoch 0 Batch 950/1043] loss=4.2013, ppl=66.7743, gnorm=0.2856, throughput=22.81K wps, wc=70.92K


2019-06-27 00:11:36,016 - root - [Epoch 0 Batch 960/1043] loss=4.1524, ppl=63.5859, gnorm=0.2733, throughput=22.53K wps, wc=74.06K


2019-06-27 00:11:38,165 - root - [Epoch 0 Batch 970/1043] loss=3.8736, ppl=48.1166, gnorm=0.3670, throughput=20.51K wps, wc=44.03K


2019-06-27 00:11:41,501 - root - [Epoch 0 Batch 980/1043] loss=4.1779, ppl=65.2289, gnorm=0.2841, throughput=21.81K wps, wc=72.73K


2019-06-27 00:11:44,488 - root - [Epoch 0 Batch 990/1043] loss=4.0953, ppl=60.0574, gnorm=0.3038, throughput=21.91K wps, wc=65.39K


2019-06-27 00:11:56,178 - root - [Epoch 0 Batch 1000/1043] loss=3.9916, ppl=54.1427, gnorm=0.3146, throughput=4.76K wps, wc=55.65K


2019-06-27 00:11:59,350 - root - [Epoch 0 Batch 1010/1043] loss=4.0738, ppl=58.7818, gnorm=0.2859, throughput=20.71K wps, wc=65.61K


2019-06-27 00:12:01,588 - root - [Epoch 0 Batch 1020/1043] loss=3.8215, ppl=45.6737, gnorm=0.3306, throughput=20.45K wps, wc=45.72K


2019-06-27 00:12:04,313 - root - [Epoch 0 Batch 1030/1043] loss=3.9710, ppl=53.0402, gnorm=0.2997, throughput=20.56K wps, wc=55.97K


2019-06-27 00:12:06,833 - root - [Epoch 0 Batch 1040/1043] loss=3.9365, ppl=51.2394, gnorm=0.3537, throughput=21.16K wps, wc=53.27K


2019-06-27 00:12:43,896 - root - [Epoch 0] valid Loss=2.8391, valid ppl=17.1007, valid bleu=3.03


2019-06-27 00:13:05,255 - root - [Epoch 0] test Loss=2.9800, test ppl=19.6873, test bleu=2.94


2019-06-27 00:13:05,269 - root - Save best parameters to gnmt_en_vi_u512/valid_best.params


2019-06-27 00:13:05,575 - root - Learning rate change to 0.0005


## Summary
In this notebook, we have shown how to train a GNMT model on the IWSLT 2015 English-Vietnamese dataset using the GluonNLP toolkit.
The complete training script can be found [here](https://github.com/dmlc/gluon-nlp/blob/master/scripts/machine_translation/train_gnmt.py).
The command to reproduce the result can be found on the [machine translation page](http://gluon-nlp.mxnet.io/model_zoo/machine_translation/index.html).