In [1]:
"""A simple demo of new RNN cell with PTB language model."""

import os

import numpy as np
import mxnet as mx
import rnn
from bucket_io import BucketSentenceIter, default_build_vocab


# Directory that holds the PTB text files read below ('ptb.train.txt' and
# 'ptb.valid.txt').
#
# Script-style lookup relative to this file, kept for reference (presumably
# disabled because `__file__` is unavailable when run as a notebook -- confirm):
# data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data'))
#
# Set the PTB_DATA_DIR environment variable to point at another copy of the
# corpus; when it is unset the original hard-coded location is used.
data_dir = os.environ.get('PTB_DATA_DIR',
                          '/home/ubuntu/h2o-3/bigdata/laptop/ptb')

def Perplexity(label, pred):
    """Compute the perplexity of predicted distributions against labels.

    Parameters
    ----------
    label : numpy.ndarray
        Target class indices. The array is transposed and flattened before
        use so that entry ``i`` lines up with row ``i`` of ``pred``.
    pred : numpy.ndarray
        2-D array; row ``i`` holds the predicted probability of every class
        for the ``i``-th flattened label.

    Returns
    -------
    numpy.float64
        ``exp`` of the summed negative log-probability of the target classes
        divided by ``label.size``.

    Raises
    ------
    IndexError
        If there are fewer labels than rows in ``pred``.
    ZeroDivisionError
        If ``label`` is empty.
    """
    # TODO(tofix): we make a transpose of label here, because when
    # using the RNN cell, we called swap axis to the data.
    label = label.T.reshape((-1,))

    num_rows = pred.shape[0]
    if label.size < num_rows:
        # The row-by-row loop this replaces failed the same way when it ran
        # out of labels; keep the exception type for callers.
        raise IndexError('got %d labels for %d prediction rows'
                         % (label.size, num_rows))

    # astype() truncates toward zero, matching the int() cast used per row.
    targets = label[:num_rows].astype(np.int64)

    # Gather pred[i, targets[i]] for every row in a single indexing step and
    # work in float64 so the sum does not lose precision on float32 input.
    picked = np.asarray(pred[np.arange(num_rows), targets], dtype=np.float64)

    # Clamp to 1e-10 so that a zero probability does not produce log(0).
    loss = float(-np.log(np.maximum(1e-10, picked)).sum())
    return np.exp(loss / label.size)



In [2]:

if __name__ == '__main__':
    # ---- network / bucketing hyper-parameters ----
    batch_size = 128
    buckets = [10, 20, 30, 40, 50, 60]
    num_hidden = 200
    num_embed = 200
    num_lstm_layer = 2

    # ---- optimisation settings ----
    num_epoch = 2
    learning_rate = 0.01
    momentum = 0.0

    # One GPU context per device index in range(1), i.e. only GPU 0.
    contexts = list(map(mx.context.gpu, range(1)))

    # The vocabulary is built from the training text and shared by both
    # iterators below.
    train_path = os.path.join(data_dir, 'ptb.train.txt')
    valid_path = os.path.join(data_dir, 'ptb.valid.txt')
    vocab = default_build_vocab(train_path)

    # Initial hidden / cell states are declared as (name, shape) pairs with
    # batch_size as the first dimension and handed to the data iterators.
    state_shape = (batch_size, num_lstm_layer, num_hidden)
    init_h = [('LSTM_init_h', state_shape)]
    init_c = [('LSTM_init_c', state_shape)]
    init_states = init_c + init_h

    data_train = BucketSentenceIter(train_path, vocab, buckets,
                                    batch_size, init_states)
    data_val = BucketSentenceIter(valid_path, vocab, buckets,
                                  batch_size, init_states)

    def sym_gen(seq_len):
        """Build the LSTM language-model symbol.

        The ``seq_len`` argument is accepted but not referenced in the body.

        Returns a tuple ``(symbol, data_names, label_names)`` where ``symbol``
        is the softmax output of the network.
        """
        tokens = mx.symbol.Variable('data')
        targets = mx.symbol.Variable('softmax_label')
        embedded = mx.symbol.Embedding(data=tokens, input_dim=len(vocab),
                                       output_dim=num_embed, name='embed')

        # TODO(tofix)
        # Inputs and labels coming from the IO layer are batch-major, while
        # the RNN cell wants time-major data, so swap the first two axes.
        embedded_tm = mx.symbol.SwapAxis(embedded, dim1=0, dim2=1)
        targets_tm = mx.symbol.SwapAxis(targets, dim1=0, dim2=1)

        # TODO(tofix)
        # Build axis-swapped RNN initial states. Normally this would not be
        # needed, but the RNN symbol expects its states in a time-major
        # layout, whereas the current mxnet IO and high-level training logic
        # assume everything produced by the data iter has batch_size as the
        # first dimension. Until IO and training logic are extended to cover
        # this more general case, this dummy axis swap is required.
        h_var = mx.symbol.Variable('LSTM_init_h')
        init_h_tm = mx.symbol.SwapAxis(h_var, dim1=0, dim2=1)
        c_var = mx.symbol.Variable('LSTM_init_c')
        init_c_tm = mx.symbol.SwapAxis(c_var, dim1=0, dim2=1)

        # TODO(tofix)
        # All LSTM parameters are currently concatenated into one huge vector
        # named '<name>_parameters'. The default mxnet initializer does not
        # know how to initialize it, because the name does not end with
        # _weight, _bias or anything else it recognises. As a temporary
        # workaround, create the variable under the name LSTM_bias so that
        # this demo runs. Note that a bias is initialized to zeros by default,
        # so this is not a good scheme. Naming it LSTM_weight is no better:
        # it is a 1D vector, while the initialization scheme for a weight
        # parameter needs at least two dimensions.
        packed_params = mx.symbol.Variable('LSTM_bias')

        # The RNN cell takes input of shape (time, batch, feature).
        # state / state_cell / parameters could be left out if the
        # workarounds described above were not necessary.
        lstm_out = mx.symbol.RNN(data=embedded_tm, state_size=num_hidden,
                                 num_layers=num_lstm_layer, mode='lstm',
                                 name='LSTM',
                                 state=init_h_tm,
                                 state_cell=init_c_tm,
                                 parameters=packed_params)

        # The RNN cell output has shape (time, batch, dim). When the states
        # and cell states of the last time step are needed (e.g. for
        # encoder-decoder models), pass state_outputs=True and the cell gains
        # extra outputs: rnn['LSTM_output'], rnn['LSTM_state'] and, for LSTM,
        # also rnn['LSTM_state_cell'].

        # Collapse the time and batch dimensions before the final linear
        # logistic-regression prediction.
        flat_hidden = mx.symbol.Reshape(data=lstm_out, shape=(-1, num_hidden))
        flat_targets = mx.symbol.Reshape(data=targets_tm, shape=(-1,))

        logits = mx.symbol.FullyConnected(data=flat_hidden,
                                          num_hidden=len(vocab),
                                          name='pred')
        softmax = mx.symbol.SoftmaxOutput(data=logits, label=flat_targets,
                                          name='softmax')

        return (softmax,
                ['data', 'LSTM_init_h', 'LSTM_init_c'],
                ['softmax_label'])


bucket of len  10 : 19479 samples
bucket of len  20 : 19336 samples
bucket of len  30 : 12208 samples
bucket of len  40 : 3962 samples
bucket of len  50 : 845 samples
bucket of len  60 : 160 samples
bucket of len  10 : 1531 samples
bucket of len  20 : 1518 samples
bucket of len  30 : 980 samples
bucket of len  40 : 322 samples
bucket of len  50 : 65 samples
bucket of len  60 : 10 samples


In [3]:
# A plain Module is used only when there is exactly one bucket; any other
# bucket count goes through BucketingModule, which calls sym_gen itself.
if len(buckets) != 1:
    mod = mx.mod.BucketingModule(
        sym_gen,
        default_bucket_key=data_train.default_bucket_key,
        context=contexts,
    )
else:
    mod = mx.mod.Module(*sym_gen(buckets[0]), context=contexts)

# Configure logging at DEBUG level with timestamps before training starts.
import logging
head = '%(asctime)-15s %(message)s'
logging.basicConfig(format=head, level=logging.DEBUG)

# Train with plain SGD, reporting perplexity on the training and validation
# iterators; the Speedometer callback is given the batch size and 50.
mod.fit(
    data_train,
    eval_data=data_val,
    num_epoch=num_epoch,
    eval_metric=mx.metric.np(Perplexity),
    batch_end_callback=mx.callback.Speedometer(batch_size, 50),
    initializer=mx.init.Xavier(factor_type="in", magnitude=2.34),
    optimizer='sgd',
    optimizer_params=dict(learning_rate=learning_rate,
                          momentum=momentum,
                          wd=0.00001),
)

2017-01-04 19:46:50,721 Epoch[0] Batch [50]	Speed: 1048.01 samples/sec	Train-Perplexity=4260.871099
2017-01-04 19:46:56,500 Epoch[0] Batch [100]	Speed: 1107.60 samples/sec	Train-Perplexity=744.343631
2017-01-04 19:47:02,942 Epoch[0] Batch [150]	Speed: 993.64 samples/sec	Train-Perplexity=563.627497
2017-01-04 19:47:09,086 Epoch[0] Batch [200]	Speed: 1041.86 samples/sec	Train-Perplexity=474.455599
2017-01-04 19:47:14,218 Epoch[0] Batch [250]	Speed: 1247.18 samples/sec	Train-Perplexity=307.823897
2017-01-04 19:47:19,753 Epoch[0] Batch [300]	Speed: 1156.57 samples/sec	Train-Perplexity=330.747983
2017-01-04 19:47:25,525 Epoch[0] Batch [350]	Speed: 1108.93 samples/sec	Train-Perplexity=338.972245
2017-01-04 19:47:31,344 Epoch[0] Batch [400]	Speed: 1099.98 samples/sec	Train-Perplexity=337.709651
2017-01-04 19:47:34,969 Epoch[0] Train-Perplexity=280.663323
2017-01-04 19:47:34,970 Epoch[0] Time cost=50.987
2017-01-04 19:47:37,576 Epoch[0] Validation-Perplexity=287.798634
2017-01-04 19:47:43,705 