# Google NMT on IWSLT 2015 English-Vietnamese Translation

In this notebook, we are going to train Google NMT on the IWSLT 2015 English-Vietnamese dataset. The building process includes: 1) loading and processing the dataset, 2) creating the sampler and DataLoader, 3) building the model, and 4) writing the training epochs.

## Load MXNet and Gluon

In [1]:
import argparse
import time
import random
import os
import io
import logging
import numpy as np
import mxnet as mx
from mxnet import gluon
from mxnet.gluon.data import ArrayDataset, SimpleDataset
from mxnet.gluon.data import DataLoader
import gluonnlp.data.batchify as btf
from gluonnlp.data import ExpWidthBucket, FixedBucketSampler, IWSLT2015
from gluonnlp.model import BeamSearchScorer
from scripts.nmt.gnmt import get_gnmt_encoder_decoder
from scripts.nmt.translation import NMTModel, BeamSearchTranslator
from scripts.nmt.loss import SoftmaxCEMaskedLoss
from scripts.nmt.utils import logging_config
from scripts.nmt.bleu import compute_bleu
import scripts.nmt._constants as _C

## Hyper-parameters

In [2]:
# Seed the NumPy, Python `random`, and MXNet random number generators.
np.random.seed(100)
random.seed(100)
mx.random.seed(10000)
# Device on which the model is initialized and to which mini-batches are copied.
ctx = mx.gpu(0)

# parameters for dataset
dataset = 'IWSLT2015'
src_lang = 'en'
tgt_lang = 'vi'
# Maximum number of tokens kept per training sentence (applied before the
# EOS/BOS tokens are added); also part of the cache file name.
src_max_len = 50
tgt_max_len = 50

# parameters for model
# `num_hidden` is used both as the encoder/decoder hidden size and as the
# embedding size of the NMT model.
num_hidden = 512
num_layers = 2
num_bi_layers = 1
dropout = 0.2

# parameters for training
batch_size = 128          # batch size of the training sampler
test_batch_size = 32      # batch size of the validation and test samplers
num_buckets = 5           # number of length buckets used by every sampler
epochs = 2
clip = 5                  # threshold passed to gluon.utils.clip_global_norm
lr = 0.001                # initial learning rate of the Adam trainer
lr_update_factor = 0.5    # multiplier applied to the learning rate when it is decayed
log_interval = 10         # log training statistics every `log_interval` batches
save_dir = 'gnmt_en_vi_u512'  # directory for references, translations and parameters

# parameters for testing
beam_size = 10
# `lp_alpha` and `lp_k` are passed to BeamSearchScorer as `alpha` and `K`.
lp_alpha = 1.0
lp_k = 5

# Set up logging; the log file is written under `save_dir`.
logging_config(save_dir)

All Logs will be saved to gnmt_en_vi_u512/<ipython-input-2-c89b0965bf79>.log


'gnmt_en_vi_u512'

## Load and Preprocess Dataset 

The following shows how to process the dataset and cache the processed dataset for future use. The processing steps include: 1) clipping the source and target sequences, 2) splitting each input string into a list of tokens, 3) mapping each string token to its index in the vocabulary, and 4) appending the EOS token to the source sentence and adding the BOS and EOS tokens to the target sentence.

In [3]:
def cache_dataset(dataset, prefix):
    """Cache a processed dataset as ``<prefix>.npz`` under ``_C.CACHE_PATH``.

    The first and second field of every sample are collected and saved under
    the keys ``src_data`` and ``tgt_data`` respectively. The cache directory
    is created if it does not exist yet.

    Parameters
    ----------
    dataset : SimpleDataset
        Iterable of samples whose first two fields are the source and the
        target sequence.
    prefix : str
        Name of the cache file, without the ``.npz`` extension.
    """
    def _stack_sequences(sequences):
        # Sequences of equal length stack into a regular array, exactly as a
        # plain ``np.array`` call does.
        if len({len(seq) for seq in sequences}) <= 1:
            return np.array(sequences)
        # Sequences of different lengths: recent NumPy releases raise a
        # ValueError when asked to build such an array implicitly, so build
        # the 1-D object array (one entry per sample) explicitly.
        ragged = np.empty(len(sequences), dtype=object)
        for i, seq in enumerate(sequences):
            ragged[i] = seq
        return ragged

    if not os.path.exists(_C.CACHE_PATH):
        os.makedirs(_C.CACHE_PATH)
    src_data = _stack_sequences([ele[0] for ele in dataset])
    tgt_data = _stack_sequences([ele[1] for ele in dataset])
    np.savez(os.path.join(_C.CACHE_PATH, prefix + '.npz'), src_data=src_data, tgt_data=tgt_data)


def load_cached_dataset(prefix):
    """Load the dataset cached as ``<prefix>.npz`` under ``_C.CACHE_PATH``.

    Parameters
    ----------
    prefix : str
        Name of the cache file, without the ``.npz`` extension.

    Returns
    -------
    ArrayDataset or None
        Dataset built from the ``src_data`` and ``tgt_data`` arrays of the
        cache file, or None when the cache file does not exist.
    """
    cached_file_path = os.path.join(_C.CACHE_PATH, prefix + '.npz')
    if not os.path.exists(cached_file_path):
        return None
    print('Load cached data from {}'.format(cached_file_path))
    # Variable-length sequences are stored as object arrays, which NumPy only
    # loads with allow_pickle=True.
    # NOTE: unpickling is unsafe on untrusted files; only load cache files
    # that were written by this program.
    # The context manager closes the underlying .npz archive once both arrays
    # have been read into memory.
    with np.load(cached_file_path, allow_pickle=True) as dat:
        src_data = np.array(dat['src_data'])
        tgt_data = np.array(dat['tgt_data'])
    return ArrayDataset(src_data, tgt_data)


class TrainValDataTransform(object):
    """Turn a raw (source, target) sentence pair into arrays of token indices.

    Both sentences are split on whitespace and, when the corresponding
    maximum length is positive, clipped to that many tokens. The tokens are
    mapped to indices through the vocabularies. EOS is appended to the
    source; BOS is prepended and EOS appended to the target.

    Parameters
    ----------
    src_vocab : Vocab
        Vocabulary of the source language.
    tgt_vocab : Vocab
        Vocabulary of the target language.
    src_max_len : int
        Maximum number of source tokens kept; no clipping when <= 0.
    tgt_max_len : int
        Maximum number of target tokens kept; no clipping when <= 0.
    """
    def __init__(self, src_vocab, tgt_vocab, src_max_len, tgt_max_len):
        self._src_vocab, self._tgt_vocab = src_vocab, tgt_vocab
        self._src_max_len, self._tgt_max_len = src_max_len, tgt_max_len

    def __call__(self, src, tgt):
        # Tokenize and optionally clip each side.
        src_tokens = src.split()
        if self._src_max_len > 0:
            src_tokens = src_tokens[:self._src_max_len]
        tgt_tokens = tgt.split()
        if self._tgt_max_len > 0:
            tgt_tokens = tgt_tokens[:self._tgt_max_len]

        # Map tokens to indices, then attach the special tokens.
        src_ids = self._src_vocab[src_tokens]
        tgt_ids = self._tgt_vocab[tgt_tokens]
        src_ids = src_ids + [self._src_vocab[self._src_vocab.eos_token]]
        tgt_ids = ([self._tgt_vocab[self._tgt_vocab.bos_token]]
                   + tgt_ids
                   + [self._tgt_vocab[self._tgt_vocab.eos_token]])
        return np.array(src_ids, dtype=np.int32), np.array(tgt_ids, dtype=np.int32)


def process_dataset(dataset, src_vocab, tgt_vocab, src_max_len=-1, tgt_max_len=-1):
    """Apply TrainValDataTransform to every sample and report the time taken.

    Parameters
    ----------
    dataset : Dataset
        Dataset of raw (source, target) sentence pairs.
    src_vocab : Vocab
    tgt_vocab : Vocab
    src_max_len : int, default -1
        Forwarded to TrainValDataTransform.
    tgt_max_len : int, default -1
        Forwarded to TrainValDataTransform.

    Returns
    -------
    dataset_processed
        Result of ``dataset.transform(..., lazy=False)``.
    """
    tic = time.time()
    to_indices = TrainValDataTransform(src_vocab, tgt_vocab, src_max_len, tgt_max_len)
    # lazy=False: transform all samples now rather than on access.
    dataset_processed = dataset.transform(to_indices, lazy=False)
    elapsed = time.time() - tic
    print('Processing Time spent: {}'.format(elapsed))
    return dataset_processed


def _load_or_process(raw_data, cache_prefix, src_vocab, tgt_vocab,
                     src_max_len=-1, tgt_max_len=-1):
    """Return the cached processed split, processing and caching it on a miss."""
    processed = load_cached_dataset(cache_prefix)
    # Compare with None explicitly: an empty cached dataset is falsy but is
    # still a cache hit.
    if processed is None:
        processed = process_dataset(raw_data, src_vocab, tgt_vocab,
                                    src_max_len, tgt_max_len)
        cache_dataset(processed, cache_prefix)
    return processed


def load_translation_data(dataset, src_lang='en', tgt_lang='vi'):
    """Load the IWSLT2015 translation dataset, using cached processed splits
    when they are available.

    Only the training split is clipped to the module-level ``src_max_len`` and
    ``tgt_max_len``; the validation and test splits are processed without
    clipping. Both values are nevertheless part of every cache file name.

    Parameters
    ----------
    dataset : str
        Currently unused: the IWSLT2015 dataset is always loaded.
    src_lang : str, default 'en'
    tgt_lang : str, default 'vi'

    Returns
    -------
    data_train_processed : Dataset
        Processed training split.
    data_val_processed : Dataset
        Processed validation split.
    data_test_processed : Dataset
        Processed test split.
    val_tgt_sentences : list of list of str
        Whitespace-tokenized target sentences of the validation split.
    test_tgt_sentences : list of list of str
        Whitespace-tokenized target sentences of the test split.
    src_vocab : Vocab
        Source vocabulary of the training split.
    tgt_vocab : Vocab
        Target vocabulary of the training split.
    """
    common_prefix = 'IWSLT2015_{}_{}_{}_{}'.format(src_lang, tgt_lang,
                                                   src_max_len, tgt_max_len)
    data_train = IWSLT2015('train', src_lang=src_lang, tgt_lang=tgt_lang)
    data_val = IWSLT2015('val', src_lang=src_lang, tgt_lang=tgt_lang)
    data_test = IWSLT2015('test', src_lang=src_lang, tgt_lang=tgt_lang)
    src_vocab, tgt_vocab = data_train.src_vocab, data_train.tgt_vocab

    data_train_processed = _load_or_process(data_train, common_prefix + '_train',
                                            src_vocab, tgt_vocab,
                                            src_max_len, tgt_max_len)
    data_val_processed = _load_or_process(data_val, common_prefix + '_val',
                                          src_vocab, tgt_vocab)
    data_test_processed = _load_or_process(data_test, common_prefix + '_test',
                                           src_vocab, tgt_vocab)

    def fetch_tgt_sentence(src, tgt):
        # Keep only the tokenized reference translation.
        return tgt.split()

    val_tgt_sentences = list(data_val.transform(fetch_tgt_sentence))
    test_tgt_sentences = list(data_test.transform(fetch_tgt_sentence))
    return data_train_processed, data_val_processed, data_test_processed, \
           val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab


def get_data_lengths(dataset):
    """Return a list with one ``(len(src), len(tgt))`` pair per sample."""
    def _pair_lengths(src, tgt):
        return len(src), len(tgt)

    lengths = dataset.transform(_pair_lengths)
    return list(lengths)


# Load the processed splits, the tokenized reference sentences and both
# vocabularies.
(data_train, data_val, data_test,
 val_tgt_sentences, test_tgt_sentences,
 src_vocab, tgt_vocab) = load_translation_data(dataset=dataset,
                                               src_lang=src_lang,
                                               tgt_lang=tgt_lang)

# Per-sample (source length, target length) pairs, later given to the
# bucketing samplers.
data_train_lengths = get_data_lengths(data_train)
data_val_lengths = get_data_lengths(data_val)
data_test_lengths = get_data_lengths(data_test)

# Write the reference translations, one sentence per line.
for gt_file_name, gt_sentences in (('val_gt.txt', val_tgt_sentences),
                                   ('test_gt.txt', test_tgt_sentences)):
    with io.open(os.path.join(save_dir, gt_file_name), 'w', encoding='utf-8') as of:
        of.writelines(' '.join(tokens) + '\n' for tokens in gt_sentences)


def _with_lengths(src, tgt):
    """Extend a (src, tgt) sample with the lengths of both sequences."""
    return src, tgt, len(src), len(tgt)


data_train = data_train.transform(_with_lengths, lazy=False)
# Validation and test samples also carry their position in the split, which
# `evaluate` uses to restore the original sentence order.
data_val = SimpleDataset([(src, tgt, len(src), len(tgt), idx)
                          for idx, (src, tgt) in enumerate(data_val)])
data_test = SimpleDataset([(src, tgt, len(src), len(tgt), idx)
                           for idx, (src, tgt) in enumerate(data_test)])

Load cached data from /home/ubuntu/gluon-nlp-1/scripts/nmt/cached/IWSLT2015_en_vi_50_50_train.npz
Load cached data from /home/ubuntu/gluon-nlp-1/scripts/nmt/cached/IWSLT2015_en_vi_50_50_val.npz
Load cached data from /home/ubuntu/gluon-nlp-1/scripts/nmt/cached/IWSLT2015_en_vi_50_50_test.npz


## Create Sampler and DataLoader

Now we have obtained `data_train`, `data_val`, and `data_test`. The next step is to construct the samplers and DataLoaders. We first construct the batchify functions, which pad and stack sequences to form mini-batches.

In [4]:
# One batchify function per sample field. Training samples are
# (src, tgt, src_len, tgt_len): the two sequences are padded and the two
# lengths are stacked as float32.
train_batchify_fn = btf.Tuple(btf.Pad(), btf.Pad(),
                              btf.Stack(dtype='float32'), btf.Stack(dtype='float32'))
# Validation/test samples have a fifth field, the sample index, which is
# stacked as well.
test_batchify_fn = btf.Tuple(btf.Pad(), btf.Pad(),
                             btf.Stack(dtype='float32'), btf.Stack(dtype='float32'),
                             btf.Stack())

We can then construct bucketing samplers, which generate batches by grouping sequences with similar lengths.

In [5]:
# Training sampler: shuffled, with the bucket boundaries chosen by
# ExpWidthBucket(bucket_len_step=1.2).
train_batch_sampler = FixedBucketSampler(lengths=data_train_lengths,
                                             batch_size=batch_size,
                                             num_buckets=num_buckets,
                                             shuffle=True,
                                             bucket_scheme=ExpWidthBucket(bucket_len_step=1.2))
logging.info('Train Batch Sampler:\n{}'.format(train_batch_sampler.stats()))
# Validation and test samplers: not shuffled, default bucket scheme, and the
# smaller `test_batch_size`.
val_batch_sampler = FixedBucketSampler(lengths=data_val_lengths,
                                       batch_size=test_batch_size,
                                       num_buckets=num_buckets,
                                       shuffle=False)
logging.info('Valid Batch Sampler:\n{}'.format(val_batch_sampler.stats()))
test_batch_sampler = FixedBucketSampler(lengths=data_test_lengths,
                                        batch_size=test_batch_size,
                                        num_buckets=num_buckets,
                                        shuffle=False)
logging.info('Test Batch Sampler:\n{}'.format(test_batch_sampler.stats()))

2018-08-09 00:35:50,872 - root - Train Batch Sampler:
FixedBucketSampler:
  sample_num=133166, batch_num=1043
  key=[(9, 10), (16, 17), (26, 27), (37, 38), (51, 52)]
  cnt=[11414, 34897, 37760, 23480, 25615]
  batch_size=[128, 128, 128, 128, 128]
2018-08-09 00:35:50,875 - root - Valid Batch Sampler:
FixedBucketSampler:
  sample_num=1553, batch_num=51
  key=[(15, 16), (24, 25), (33, 34), (42, 43), (51, 52)]
  cnt=[511, 484, 266, 153, 139]
  batch_size=[32, 32, 32, 32, 32]
2018-08-09 00:35:50,877 - root - Test Batch Sampler:
FixedBucketSampler:
  sample_num=1268, batch_num=42
  key=[(15, 16), (24, 25), (33, 34), (42, 43), (51, 52)]
  cnt=[338, 334, 235, 145, 216]
  batch_size=[32, 32, 32, 32, 32]


Given the samplers, we can create DataLoader, which is iterable.

In [6]:
# Each DataLoader combines a dataset with its bucketing sampler and the
# matching batchify function, and uses 4 worker processes.
train_data_loader = DataLoader(data_train,
                               batch_sampler=train_batch_sampler,
                               batchify_fn=train_batchify_fn,
                               num_workers=4)
# Validation and test loaders use `test_batchify_fn`, which also batches the
# sample index.
val_data_loader = DataLoader(data_val,
                             batch_sampler=val_batch_sampler,
                             batchify_fn=test_batchify_fn,
                             num_workers=4)
test_data_loader = DataLoader(data_test,
                              batch_sampler=test_batch_sampler,
                              batchify_fn=test_batchify_fn,
                              num_workers=4)

## Build GNMT Model 

After obtaining the DataLoaders, we can build the model. The GNMT encoder and decoder can be easily obtained by calling the `get_gnmt_encoder_decoder` function. Then, we feed the encoder and decoder to `NMTModel` to construct the GNMT model. `model.hybridize` allows computation to be done using the symbolic backend.

In [7]:
# Build the GNMT encoder/decoder pair from the model hyper-parameters.
encoder, decoder = get_gnmt_encoder_decoder(hidden_size=num_hidden,
                                            dropout=dropout,
                                            num_layers=num_layers,
                                            num_bi_layers=num_bi_layers)
# Wrap encoder and decoder, together with both vocabularies, in the NMT model;
# the embedding size equals the hidden size.
model = NMTModel(src_vocab=src_vocab, tgt_vocab=tgt_vocab, encoder=encoder, decoder=decoder,
                 embed_size=num_hidden, prefix='gnmt_')
# Initialize the parameters with mx.init.Uniform(0.1) on the chosen device.
model.initialize(init=mx.init.Uniform(0.1), ctx=ctx)
# The same `static_alloc` setting is used for the model and the loss.
static_alloc = True
model.hybridize(static_alloc=static_alloc)
logging.info(model)

# Loss used for both training and evaluation.
loss_function = SoftmaxCEMaskedLoss()
loss_function.hybridize(static_alloc=static_alloc)

2018-08-09 00:36:01,295 - root - NMTModel(
  (src_embed): HybridSequential(
    (0): Embedding(17191 -> 512, float32)
    (1): Dropout(p = 0.0, axes=())
  )
  (decoder): GNMTDecoder(
    (rnn_cells): HybridSequential(
      (0): LSTMCell(None -> 2048)
      (1): LSTMCell(None -> 2048)
    )
    (attention_cell): DotProductAttentionCell(
      (_proj_query): Dense(None -> 512, linear)
      (_dropout_layer): Dropout(p = 0.0, axes=())
    )
    (dropout_layer): Dropout(p = 0.2, axes=())
  )
  (tgt_embed): HybridSequential(
    (0): Embedding(7709 -> 512, float32)
    (1): Dropout(p = 0.0, axes=())
  )
  (tgt_proj): Dense(None -> 7709, linear)
  (encoder): GNMTEncoder(
    (rnn_cells): HybridSequential(
      (0): BidirectionalCell(forward=LSTMCell(None -> 2048), backward=LSTMCell(None -> 2048))
      (1): LSTMCell(None -> 2048)
    )
    (dropout_layer): Dropout(p = 0.2, axes=())
  )
)


We can also build the beam search translator.

In [8]:
# Beam search translator around the model. The scorer receives `lp_alpha` and
# `lp_k` as `alpha` and `K`; `max_length` is set to tgt_max_len + 100.
translator = BeamSearchTranslator(model=model, beam_size=beam_size,
                                  scorer=BeamSearchScorer(alpha=lp_alpha,
                                                          K=lp_k),
                                  max_length=tgt_max_len + 100)
logging.info('Use beam_size={}, alpha={}, K={}'.format(beam_size, lp_alpha, lp_k))

2018-08-09 00:36:05,021 - root - Use beam_size=10, alpha=1.0, K=5


We define the evaluation function as follows. The `evaluate` function uses the beam search translator to generate outputs for the validation and testing datasets.

In [9]:
def evaluate(data_loader):
    """Compute the average loss and the beam search translations of a dataset.

    Uses the module-level ``model``, ``loss_function``, ``translator``,
    ``tgt_vocab`` and ``ctx``.

    Parameters
    ----------
    data_loader : DataLoader
        Yields batches of (src_seq, tgt_seq, src_valid_length,
        tgt_valid_length, inst_ids).

    Returns
    -------
    avg_loss : float
        Average loss
    real_translation_out : list of list of str
        The translation output, ordered by instance id
    """
    hypotheses = []
    instance_order = []
    loss_sum = 0.0
    loss_denom = 0
    for src_seq, tgt_seq, src_valid_length, tgt_valid_length, inst_ids in data_loader:
        src_seq = src_seq.as_in_context(ctx)
        tgt_seq = tgt_seq.as_in_context(ctx)
        src_valid_length = src_valid_length.as_in_context(ctx)
        tgt_valid_length = tgt_valid_length.as_in_context(ctx)
        # Loss: the decoder input is the target without its last position,
        # the labels are the target without its first position.
        num_steps = tgt_seq.shape[1] - 1
        out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
        loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean().asscalar()
        instance_order.extend(inst_ids.asnumpy().astype(np.int32).tolist())
        # Weight each batch by its padded target length.
        loss_sum += loss * num_steps
        loss_denom += num_steps
        # Translation: keep the first sample returned for every sentence.
        samples, _, sample_valid_length =\
            translator.translate(src_seq=src_seq, src_valid_length=src_valid_length)
        best_samples = samples[:, 0, :].asnumpy()
        best_lengths = sample_valid_length[:, 0].asnumpy()
        for token_ids, length in zip(best_samples, best_lengths):
            # Drop the first token and the last valid token of the sample.
            hypotheses.append(
                [tgt_vocab.idx_to_token[idx] for idx in token_ids[1:(length - 1)]])
    avg_loss = loss_sum / loss_denom
    # Put each translation back at the position given by its instance id.
    real_translation_out = [None] * len(instance_order)
    for position, sentence in zip(instance_order, hypotheses):
        real_translation_out[position] = sentence
    return avg_loss, real_translation_out


def write_sentences(sentences, file_path):
    """Write tokenized sentences to a UTF-8 text file, one sentence per line.

    Parameters
    ----------
    sentences : iterable of list of str
        Each sentence is joined with single spaces.
    file_path : str
        Output file; it is overwritten if it exists.
    """
    lines = (' '.join(tokens) + '\n' for tokens in sentences)
    with io.open(file_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(lines)

## Training Epochs

Before entering the training stage, we need to create a trainer for updating the parameters. In the following example, we create a trainer that uses the Adam optimizer.

In [10]:
# Adam trainer over all model parameters, starting at learning rate `lr`.
trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': lr})

We can then write the training iteration. During training, we perform one evaluation on the validation and testing datasets every epoch, and record the parameters that give the highest BLEU score on the validation dataset. Before performing forward and backward, we first use the `as_in_context` function to copy the mini-batch to the GPU. The statement `with mx.autograd.record()` tells the Gluon backend to compute the gradients for the part inside the block.

In [11]:
# Train for `epochs` epochs. After every epoch the model is evaluated on the
# validation and test sets, the translations are written to `save_dir`, and
# the parameters with the best validation BLEU so far are saved.
best_valid_bleu = 0.0
for epoch_id in range(epochs):
    # Running sums over the current logging window; reset after every log line.
    log_avg_loss = 0
    log_avg_gnorm = 0
    log_wc = 0
    log_start_time = time.time()
    for batch_id, (src_seq, tgt_seq, src_valid_length, tgt_valid_length)\
            in enumerate(train_data_loader):
        # Copy the mini-batch to the training device.
        src_seq = src_seq.as_in_context(ctx)
        tgt_seq = tgt_seq.as_in_context(ctx)
        src_valid_length = src_valid_length.as_in_context(ctx)
        tgt_valid_length = tgt_valid_length.as_in_context(ctx)
        with mx.autograd.record():
            # The decoder input is the target without its last position; the
            # labels are the target without its first position.
            out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
            loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean()
            # Rescale by (padded target length) / (mean valid target length).
            # NOTE(review): presumably this converts an average over padded
            # positions into an average per valid token -- confirm against
            # SoftmaxCEMaskedLoss.
            loss = loss * (tgt_seq.shape[1] - 1) / (tgt_valid_length - 1).mean()
            loss.backward()
        # Clip the global gradient norm to `clip` before the update.
        grads = [p.grad(ctx) for p in model.collect_params().values()]
        gnorm = gluon.utils.clip_global_norm(grads, clip)
        trainer.step(1)
        # Word counts (source + target) for the throughput statistics.
        src_wc = src_valid_length.sum().asscalar()
        tgt_wc = (tgt_valid_length - 1).sum().asscalar()
        step_loss = loss.asscalar()
        log_avg_loss += step_loss
        log_avg_gnorm += gnorm
        log_wc += src_wc + tgt_wc
        if (batch_id + 1) % log_interval == 0:
            wps = log_wc / (time.time() - log_start_time)
            logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f}, gnorm={:.4f}, '
                         'throughput={:.2f}K wps, wc={:.2f}K'
                         .format(epoch_id, batch_id + 1, len(train_data_loader),
                                 log_avg_loss / log_interval,
                                 np.exp(log_avg_loss / log_interval),
                                 log_avg_gnorm / log_interval,
                                 wps / 1000, log_wc / 1000))
            log_start_time = time.time()
            log_avg_loss = 0
            log_avg_gnorm = 0
            log_wc = 0
    # End of epoch: evaluate on the validation and test sets.
    valid_loss, valid_translation_out = evaluate(val_data_loader)
    valid_bleu_score, _, _, _, _ = compute_bleu([val_tgt_sentences], valid_translation_out)
    logging.info('[Epoch {}] valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}'
                 .format(epoch_id, valid_loss, np.exp(valid_loss), valid_bleu_score * 100))
    test_loss, test_translation_out = evaluate(test_data_loader)
    test_bleu_score, _, _, _, _ = compute_bleu([test_tgt_sentences], test_translation_out)
    logging.info('[Epoch {}] test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}'
                 .format(epoch_id, test_loss, np.exp(test_loss), test_bleu_score * 100))
    # Format only the file name, not the joined path, so that braces in
    # `save_dir` cannot break the formatting.
    write_sentences(valid_translation_out,
                    os.path.join(save_dir, 'epoch{:d}_valid_out.txt'.format(epoch_id)))
    write_sentences(test_translation_out,
                    os.path.join(save_dir, 'epoch{:d}_test_out.txt'.format(epoch_id)))
    # Keep the parameters that achieve the best validation BLEU so far.
    if valid_bleu_score > best_valid_bleu:
        best_valid_bleu = valid_bleu_score
        save_path = os.path.join(save_dir, 'valid_best.params')
        logging.info('Save best parameters to {}'.format(save_path))
        model.save_parameters(save_path)
    # Once (epochs * 2) // 3 epochs have been completed, multiply the learning
    # rate by `lr_update_factor` after every epoch.
    if epoch_id + 1 >= (epochs * 2) // 3:
        new_lr = trainer.learning_rate * lr_update_factor
        logging.info('Learning rate change to {}'.format(new_lr))
        trainer.set_learning_rate(new_lr)

2018-08-09 00:36:17,893 - root - [Epoch 0 Batch 10/1043] loss=7.7375, ppl=2292.6596, gnorm=1.4907, throughput=13.26K wps, wc=54.27K
2018-08-09 00:36:20,161 - root - [Epoch 0 Batch 20/1043] loss=6.3590, ppl=577.6419, gnorm=1.5744, throughput=22.16K wps, wc=50.20K
2018-08-09 00:36:22,992 - root - [Epoch 0 Batch 30/1043] loss=6.3708, ppl=584.5321, gnorm=0.8044, throughput=23.95K wps, wc=67.78K
2018-08-09 00:36:25,673 - root - [Epoch 0 Batch 40/1043] loss=6.1795, ppl=482.7611, gnorm=0.6211, throughput=23.58K wps, wc=63.19K
2018-08-09 00:36:28,401 - root - [Epoch 0 Batch 50/1043] loss=6.1892, ppl=487.4410, gnorm=0.4055, throughput=22.71K wps, wc=61.93K
2018-08-09 00:36:30,953 - root - [Epoch 0 Batch 60/1043] loss=6.1059, ppl=448.5020, gnorm=0.6838, throughput=23.21K wps, wc=59.19K
2018-08-09 00:36:34,068 - root - [Epoch 0 Batch 70/1043] loss=6.1564, ppl=471.7430, gnorm=0.4561, throughput=23.44K wps, wc=72.99K
2018-08-09 00:36:36,870 - root - [Epoch 0 Batch 80/1043] loss=6.0698, ppl=432.6081

2018-08-09 00:38:57,007 - root - [Epoch 0 Batch 640/1043] loss=4.3345, ppl=76.2857, gnorm=0.3303, throughput=22.86K wps, wc=57.32K
2018-08-09 00:38:59,831 - root - [Epoch 0 Batch 650/1043] loss=4.4754, ppl=87.8291, gnorm=0.2933, throughput=23.53K wps, wc=66.42K
2018-08-09 00:39:01,907 - root - [Epoch 0 Batch 660/1043] loss=4.1941, ppl=66.2911, gnorm=0.3411, throughput=21.40K wps, wc=44.42K
2018-08-09 00:39:05,118 - root - [Epoch 0 Batch 670/1043] loss=4.6006, ppl=99.5423, gnorm=0.2661, throughput=24.20K wps, wc=77.68K
2018-08-09 00:39:07,705 - root - [Epoch 0 Batch 680/1043] loss=4.4242, ppl=83.4484, gnorm=0.2748, throughput=22.82K wps, wc=59.02K
2018-08-09 00:39:09,958 - root - [Epoch 0 Batch 690/1043] loss=4.2089, ppl=67.2851, gnorm=0.3153, throughput=22.45K wps, wc=50.54K
2018-08-09 00:39:12,316 - root - [Epoch 0 Batch 700/1043] loss=4.2952, ppl=73.3447, gnorm=0.2985, throughput=22.26K wps, wc=52.45K
2018-08-09 00:39:14,300 - root - [Epoch 0 Batch 710/1043] loss=4.1837, ppl=65.6061,

2018-08-09 00:41:57,559 - root - [Epoch 1 Batch 180/1043] loss=3.8040, ppl=44.8808, gnorm=0.3003, throughput=21.69K wps, wc=53.09K
2018-08-09 00:41:59,793 - root - [Epoch 1 Batch 190/1043] loss=3.6796, ppl=39.6292, gnorm=0.3289, throughput=21.63K wps, wc=48.29K
2018-08-09 00:42:02,439 - root - [Epoch 1 Batch 200/1043] loss=3.9207, ppl=50.4379, gnorm=0.4044, throughput=23.66K wps, wc=62.59K
2018-08-09 00:42:04,980 - root - [Epoch 1 Batch 210/1043] loss=3.7377, ppl=41.9998, gnorm=0.3113, throughput=23.12K wps, wc=58.70K
2018-08-09 00:42:07,984 - root - [Epoch 1 Batch 220/1043] loss=3.9903, ppl=54.0730, gnorm=0.3114, throughput=22.33K wps, wc=67.07K
2018-08-09 00:42:10,349 - root - [Epoch 1 Batch 230/1043] loss=3.7459, ppl=42.3477, gnorm=0.3176, throughput=22.24K wps, wc=52.58K
2018-08-09 00:42:12,649 - root - [Epoch 1 Batch 240/1043] loss=3.6709, ppl=39.2868, gnorm=0.3185, throughput=22.55K wps, wc=51.83K
2018-08-09 00:42:15,113 - root - [Epoch 1 Batch 250/1043] loss=3.7696, ppl=43.3626,

2018-08-09 00:44:35,002 - root - [Epoch 1 Batch 810/1043] loss=3.3919, ppl=29.7211, gnorm=0.3311, throughput=22.34K wps, wc=52.84K
2018-08-09 00:44:37,356 - root - [Epoch 1 Batch 820/1043] loss=3.3596, ppl=28.7771, gnorm=0.3546, throughput=22.68K wps, wc=53.37K
2018-08-09 00:44:40,252 - root - [Epoch 1 Batch 830/1043] loss=3.5207, ppl=33.8094, gnorm=0.3184, throughput=23.84K wps, wc=69.01K
2018-08-09 00:44:43,358 - root - [Epoch 1 Batch 840/1043] loss=3.5542, ppl=34.9582, gnorm=0.2961, throughput=24.61K wps, wc=76.39K
2018-08-09 00:44:45,862 - root - [Epoch 1 Batch 850/1043] loss=3.4535, ppl=31.6093, gnorm=0.3355, throughput=23.07K wps, wc=57.73K
2018-08-09 00:44:48,189 - root - [Epoch 1 Batch 860/1043] loss=3.3361, ppl=28.1081, gnorm=0.3342, throughput=21.90K wps, wc=50.93K
2018-08-09 00:44:50,189 - root - [Epoch 1 Batch 870/1043] loss=3.2230, ppl=25.1032, gnorm=0.3695, throughput=21.58K wps, wc=43.16K
2018-08-09 00:44:52,295 - root - [Epoch 1 Batch 880/1043] loss=3.2658, ppl=26.2001,