In [131]:
import gluonnlp as nlp
import multiprocessing as mp
import itertools
import numpy as np
from mxnet import gluon, autograd
import time
from mxnet import nd
from gluonnlp.data import batchify as bf
import mxnet as mx
from gluonnlp.data.utils import slice_sequence, concat_sequence, _slice_pad_length
import math


In [100]:
import json, os
from mxnet.gluon.data import SimpleDataset
from mxnet.gluon.utils import download, check_sha1, _get_repo_file_url
from gluonnlp.data.registry import register
''' 
When passing keyword arguments to `register`, they are checked to be valid keyword arguments for the registered 
Dataset class constructor and are saved in the registry.'''
from gluonnlp.data.utils import _get_home_dir

In [101]:
@register(segment=['train', 'test'])
class TimeMachineDataLoader(SimpleDataset):
    """Character-level dataset cut from a downloaded copy of "The Time Machine".

    On construction the raw text is downloaded into ``root`` (if needed),
    trimmed, and split into ``train.txt`` (first two thirds) and ``test.txt``
    (last third). The dataset items are the individual characters of the
    requested segment's file.

    Parameters
    ----------
    segment : str, default 'train'
        Which split to load; must be a key of ``self._data_file``
        ('train' or 'test'), otherwise ``KeyError`` is raised.
    root : str
        Directory that holds the downloaded text and the generated split
        files. ``~`` is expanded and the directory is created if missing.
    """

    _SOURCE_URL = 'http://archive.org/stream/thetimemachine00035gut/35.txt'
    # `download` is given a directory, so the file is read back under the
    # URL's basename.
    _RAW_FILE_NAME = '35.txt'
    # Characters dropped from the start / end of the downloaded file.
    # NOTE(review): presumably these strip the page wrapper and licence text
    # around the novel -- confirm against the downloaded file.
    _HEAD_TRIM = 44332
    _TAIL_TRIM = 24182

    def __init__(self, segment='train',
                 root=os.path.join(_get_home_dir(), 'data', 'word_generator')):
        # NOTE(review): the digests are 32 hex digits (MD5 length). They cannot
        # be validated with `check_sha1`, which compares against a 40-digit
        # SHA-1 digest, so they are kept for reference only.
        self._data_file = {'train': ('train.txt', '5a84368fee37b6e38dc3c8c4e0365d32'),
                           'test': ('test.txt', '48f4d966d69005504c13815cc2a777d0')}
        root = os.path.expanduser(root)
        os.makedirs(root, exist_ok=True)
        self._root = root
        self._segment = segment
        self._get_data()
        self._file_path = self._get_file_path()

        super(TimeMachineDataLoader, self).__init__(self._read_data())

    @property
    def file_path(self):
        """Path of the split file backing this dataset."""
        return self._file_path

    def _get_data(self):
        """Make sure both split files exist, generating them if necessary.

        The previous cache test used ``check_sha1`` with MD5-length digests,
        which never matches, so both split files were rewritten on every
        instantiation. The splits are now regenerated only when one is missing.
        """
        # Look the segment up first so an unknown segment still fails with
        # KeyError before any network or file I/O happens.
        self._data_file[self._segment]
        root = self._root
        split_paths = {seg: os.path.join(root, file_name)
                       for seg, (file_name, _) in self._data_file.items()}
        if all(os.path.exists(path) for path in split_paths.values()):
            return

        download(self._SOURCE_URL, path=root)
        with open(os.path.join(root, self._RAW_FILE_NAME)) as f:
            raw_data = f.read()
        raw_data = raw_data[self._HEAD_TRIM: -self._TAIL_TRIM]

        # First two thirds for training, the remainder for testing.
        # floor(2n/3) == n - ceil(n/3), so the two slices partition the text
        # exactly as the former `[-len//3:]` / `[:2*len//3]` pair did.
        split_at = 2 * len(raw_data) // 3
        with open(split_paths['train'], 'w') as output_file:
            output_file.write(raw_data[:split_at])
        with open(split_paths['test'], 'w') as output_file:
            output_file.write(raw_data[split_at:])

    def _read_data(self):
        """Return the requested segment's text as a list of characters."""
        with open(os.path.join(self._root, self._segment + '.txt')) as f:
            samples = list(f.read())
        return samples

    def _get_file_path(self):
        """Return the path of the segment's split file.

        Raises
        ------
        FileNotFoundError
            If the split file does not exist.
        """
        data_file_name, _ = self._data_file[self._segment]
        path = os.path.join(self._root, data_file_name)
        if not os.path.exists(path):
            raise FileNotFoundError(
                'Split file for segment {!r} not found: {}'.format(self._segment, path))
        return path
        


  return register_(class_)


In [102]:
class CharLevelDataSet(SimpleDataset):
    """Wraps a character dataset and turns it into batches for truncated BPTT.

    Parameters
    ----------
    dataset : sliceable dataset of tokens
        ``dataset[:]`` must return the full token sequence.
    tokenizer : callable
        Stored on the instance but not used by any method of this class.
        NOTE(review): the default ``SpacyTokenizer`` is constructed once, when
        this class statement is executed, not per instance.
    """

    def __init__(self, dataset, tokenizer=nlp.data.SpacyTokenizer('en')):
        # Initialise the SimpleDataset base so the inherited len(), indexing
        # and transform() operate on the wrapped dataset; previously the base
        # initialiser was never called.
        super(CharLevelDataSet, self).__init__(dataset)
        self._tokenizer = tokenizer
        self._dataset = dataset

    @property
    def dataset(self):
        """The wrapped dataset, as passed to the constructor."""
        return self._dataset

    def batchify(self, vocab, batch_size):
        """Map tokens to indices and lay them out as ``batch_size`` columns.

        The sequence is truncated to a multiple of ``batch_size``, reshaped to
        ``(batch_size, -1)`` and transposed, so the result has one column per
        batch element.

        Raises
        ------
        ValueError
            If ``batch_size`` is not positive, or the dataset holds fewer than
            ``batch_size`` tokens (no complete row could be formed).
        """
        if batch_size < 1:
            raise ValueError('batch_size must be positive, got {}'.format(batch_size))
        data = self._dataset[:]
        sample_len = len(data) // batch_size
        if sample_len == 0:
            raise ValueError(
                'dataset has {} tokens, fewer than batch_size={}'.format(len(data), batch_size))
        return mx.nd.array(vocab[data[:sample_len * batch_size]]).reshape((batch_size, -1)).T

    def bptt_batchify(self, bptt, vocab, batch_size):
        """Cut the batchified data into ``(data, target)`` pairs for BPTT.

        The batchified array is sliced into windows of ``bptt + 1`` rows that
        overlap by one row. For each window ``x`` the data is the first
        ``min(len(x) - 1, bptt)`` rows and the target is ``x[1:]``, i.e. the
        same rows shifted by one step.
        """
        data = self.batchify(vocab, batch_size)
        batches = slice_sequence(data, bptt+1, overlap=1)
        return SimpleDataset(batches).transform(lambda x: (x[:min(len(x)-1, bptt), :], x[1:, :]))
        

        
    
            

In [137]:
# Hyperparameters and run configuration.
# NOTE: `vocab` is created in the "Data Pipeline" cell; that cell must be run
# before this one.
vocab_size = len(vocab.idx_to_token)
num_embd = 256
num_hidden = 512
num_layers = 3
opt = 'sgd'
lr = .001
momentum = .9
wd = 0
num_gpus = min(16, mx.context.num_gpus())
# Fall back to the CPU when no GPU is available; an empty context list would
# break `ctx[0]` and the per-device batch split used during training.
ctx = [mx.gpu(i) for i in range(num_gpus)] if num_gpus else [mx.cpu()]
batch_size = 64
grad_clip = 0.25
log_interval = 200
model_name="CharLSTM"
dataset_name="TimeMachine"

# Data Pipeline

In [116]:
# Load the two character-level splits from the same data directory.
train_dataset = TimeMachineDataLoader(segment='train', root='../data/text_generator')
test_dataset = TimeMachineDataLoader(segment='test', root='../data/text_generator')

# Count characters over both splits; padding/eos/bos tokens are disabled.
char_counts = nlp.data.Counter(train_dataset[:] + test_dataset[:])
vocab = nlp.vocab.Vocab(char_counts,
                        padding_token=None,
                        eos_token=None,
                        bos_token=None)

# Each split becomes a dataset of (data, target) batches for truncated BPTT.
train_data = CharLevelDataSet(train_dataset).bptt_batchify(
    bptt=129, vocab=vocab, batch_size=batch_size)
test_data = CharLevelDataSet(test_dataset).bptt_batchify(
    bptt=129, vocab=vocab, batch_size=batch_size)
print("traindata: {}\n\ntestdata: {}".format(train_data[0], test_data[0]))

traindata: (
[[ 11.   5.   2. ...,   1.   1.   2.]
 [ 17.  19.  22. ...,  25.  11.  11.]
 [ 42.   1.  28. ...,   1.   2.  22.]
 ..., 
 [ 23.   4.   5. ...,   1.  10.   8.]
 [  1.  12.  12. ...,  24.   2.   1.]
 [ 49.   6.  20. ...,   2.   2.  26.]]
<NDArray 129x64 @cpu(0)>, 
[[ 17.  19.  22. ...,  25.  11.  11.]
 [ 42.   1.  28. ...,   1.   2.  22.]
 [  7.   4.   1. ...,  15.   2.   1.]
 ..., 
 [  1.  12.  12. ...,  24.   2.   1.]
 [ 49.   6.  20. ...,   2.   2.  26.]
 [  6.   5.   1. ...,  19.   8.   2.]]
<NDArray 129x64 @cpu(0)>)

testdata: (
[[ 13.   5.   9. ...,   5.   1.   2.]
 [ 20.   3.   2. ...,   1.  11.   1.]
 [  1.   1.   1. ...,  19.   7.   4.]
 ..., 
 [ 12.  14.   2. ...,  12.   5.   3.]
 [  4.  19.  17. ...,   7.  11.   3.]
 [  3.   9.  16. ...,   5.   1.   2.]]
<NDArray 129x64 @cpu(0)>, 
[[ 20.   3.   2. ...,   1.  11.   1.]
 [  1.   1.   1. ...,  19.   7.   4.]
 [ 13.   7.   7. ...,  12.   8.   5.]
 ..., 
 [  4.  19.  17. ...,   7.  11.   3.]
 [  3.   9.  16. ...,   5. 

# Model

In [159]:
class LSTMModel(gluon.Block):
    """Language model: embedding -> dropout -> stacked LSTM -> dropout -> dense.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding and number of output units.
    num_embd : int
        Embedding width, which is also the LSTM input size.
    num_hidden : int
        LSTM hidden size.
    num_layers : int
        Number of stacked LSTM layers.
    dropout : float, default 0.5
        Rate used by the dropout layer and passed to the LSTM.
    """

    def __init__(self, vocab_size, num_embd, num_hidden, num_layers, dropout=.5, **kwargs):
        super().__init__(**kwargs)
        self.num_hidden = num_hidden
        with self.name_scope():
            self.drop = gluon.nn.Dropout(dropout)
            self.encoder = gluon.nn.Embedding(
                vocab_size, num_embd,
                weight_initializer=mx.init.Uniform(.1),
            )
            self.lstm = gluon.rnn.LSTM(
                hidden_size=num_hidden,
                num_layers=num_layers,
                dropout=dropout,
                input_size=num_embd,
            )
            self.decoder = gluon.nn.Dense(units=vocab_size, in_units=num_hidden)

    def forward(self, inputs, hidden):
        """Run one segment through the network.

        Returns the decoder output, computed on the LSTM output flattened to
        rows of width ``num_hidden``, together with the new recurrent state.
        """
        embedded = self.encoder(inputs)
        embedded = self.drop(embedded)
        rnn_out, next_hidden = self.lstm(embedded, hidden)
        # Flatten so the dense layer scores every position in a single call.
        flat = self.drop(rnn_out).reshape((-1, self.num_hidden))
        return self.decoder(flat), next_hidden

    def begin_state(self, *args, **kwargs):
        """Forward all arguments to the LSTM's ``begin_state``."""
        return self.lstm.begin_state(*args, **kwargs)

        


## Params

In [160]:
# Build the model from the hyperparameters of the configuration cell instead
# of repeating them as literals, so there is a single place to tune them.
model = LSTMModel(vocab_size=len(vocab.idx_to_token),
                  num_embd=num_embd,
                  num_hidden=num_hidden,
                  num_layers=num_layers)
model.collect_params()

lstmmodel8_ (
  Parameter lstmmodel8_embedding0_weight (shape=(68, 256), dtype=float32)
  Parameter lstmmodel8_lstm0_l0_i2h_weight (shape=(2048, 256), dtype=<class 'numpy.float32'>)
  Parameter lstmmodel8_lstm0_l0_h2h_weight (shape=(2048, 512), dtype=<class 'numpy.float32'>)
  Parameter lstmmodel8_lstm0_l0_i2h_bias (shape=(2048,), dtype=<class 'numpy.float32'>)
  Parameter lstmmodel8_lstm0_l0_h2h_bias (shape=(2048,), dtype=<class 'numpy.float32'>)
  Parameter lstmmodel8_lstm0_l1_i2h_weight (shape=(2048, 512), dtype=<class 'numpy.float32'>)
  Parameter lstmmodel8_lstm0_l1_h2h_weight (shape=(2048, 512), dtype=<class 'numpy.float32'>)
  Parameter lstmmodel8_lstm0_l1_i2h_bias (shape=(2048,), dtype=<class 'numpy.float32'>)
  Parameter lstmmodel8_lstm0_l1_h2h_bias (shape=(2048,), dtype=<class 'numpy.float32'>)
  Parameter lstmmodel8_lstm0_l2_i2h_weight (shape=(2048, 512), dtype=<class 'numpy.float32'>)
  Parameter lstmmodel8_lstm0_l2_h2h_weight (shape=(2048, 512), dtype=<class 'numpy.float32

In [161]:
# (Re)initialise every parameter of the model on all configured devices.
model.initialize(init=mx.init.Xavier(), ctx=ctx, force_reinit=True)

In [162]:
# Optimizer name and weight decay come from the configuration cell
# (`opt`, `wd`) rather than being hard-coded a second time here.
trainer = gluon.Trainer(model.collect_params(), opt,
                        {'learning_rate': lr, 'momentum': momentum, 'wd': wd})

In [163]:
# Module-level loss used by both evaluate() and train(), which call it as
# loss(output, target) and sum the resulting array themselves.
loss = gluon.loss.SoftmaxCrossEntropyLoss()

In [164]:
def detach(hidden):
    """Detach a recurrent state from the computation graph.

    A tuple or list is processed recursively and always comes back as a list;
    anything else is returned as the result of its own ``detach()`` method.
    """
    if not isinstance(hidden, (tuple, list)):
        return hidden.detach()
    return list(map(detach, hidden))

In [165]:
def evaluate(model, data_source, batch_size, ctx):
    """Return the mean loss of ``model`` over all batches of ``data_source``.

    Parameters
    ----------
    model : block exposing ``begin_state`` and callable as ``model(data, hidden)``
    data_source : iterable of ``(data, target)`` pairs
    batch_size : int
        Batch size used to create the initial recurrent state.
    ctx : device on which the batches are evaluated

    The recurrent state is carried from one batch to the next and detached
    after every batch. Uses the module-level ``loss``.
    """
    loss_sum = 0.0
    n_scored = 0
    state = model.begin_state(batch_size=batch_size, func=mx.nd.zeros, ctx=ctx)
    for inputs, labels in data_source:
        inputs = inputs.as_in_context(ctx)
        labels = labels.reshape((-1, )).as_in_context(ctx)
        logits, state = model(inputs, state)
        state = detach(state)
        per_item = loss(logits, labels)
        loss_sum += mx.nd.sum(per_item).asscalar()
        n_scored += per_item.size
    return loss_sum / n_scored

In [125]:
# Mean loss on the test batches, evaluated on the first configured device.
evaluate(model=model, data_source=test_data, batch_size=batch_size,ctx=ctx[0])

4.21955379853613

In [147]:
def train(model, train_data, val_data, test_data, epochs, lr, context):
    """Train ``model`` with truncated BPTT and validate after every epoch.

    Parameters
    ----------
    model : block exposing ``begin_state``, ``collect_params`` and
        ``save_parameters``, callable as ``model(data, hidden)``
    train_data, val_data, test_data : datasets of ``(data, target)`` batches
    epochs : int
        Number of passes over ``train_data``.
    lr : float
        Current learning rate; multiplied by 0.25 (and pushed to the trainer)
        after every epoch whose validation loss does not improve.
    context : list of devices
        Each batch is split evenly along axis 1 across these devices.

    Relies on the module-level ``trainer``, ``loss``, ``grad_clip``,
    ``log_interval``, ``batch_size``, ``model_name`` and ``dataset_name``.
    When the validation loss improves, the test loss is reported and the
    parameters are saved under ``../model/``.
    """
    best_val = float("Inf")
    start_train_time = time.time()
    parameters = model.collect_params().values()
    for epoch in range(epochs):
        total_L = 0.0
        # Batches accumulated into total_L since the last log line. The first
        # window spans batches 0..log_interval (one more than the later ones),
        # so the average divides by this count rather than by log_interval.
        batches_since_log = 0
        start_epoch_time = time.time()
        start_log_interval_time = time.time()
        hiddens = [model.begin_state(batch_size//len(context), func=mx.nd.zeros, ctx=ctx)
                   for ctx in context]
        for i, (data, target) in enumerate(train_data):
            data_list = gluon.utils.split_and_load(data, context,
                                                   batch_axis=1, even_split=True)
            target_list = gluon.utils.split_and_load(target, context,
                                                     batch_axis=1, even_split=True)
            # Keep the state values but cut the graph at the segment boundary.
            hiddens = detach(hiddens)
            L = 0
            Ls = []
            with autograd.record():
                for j, (X, y, h) in enumerate(zip(data_list, target_list, hiddens)):
                    output, h = model(X, h)
                    batch_L = loss(output, y.reshape(-1,))
                    L = L + batch_L.as_in_context(context[0]) / X.size
                    Ls.append(batch_L / X.size)
                    hiddens[j] = h
            L.backward()
            grads = [p.grad(x.context) for p in parameters for x in data_list]
            gluon.utils.clip_global_norm(grads, grad_clip)

            trainer.step(1)

            # Each entry of Ls sums to one device's mean loss; average over the
            # devices so the reported loss does not scale with their number.
            total_L += sum([mx.nd.sum(l).asscalar() for l in Ls]) / len(context)
            batches_since_log += 1

            if i % log_interval == 0 and i > 0:
                cur_L = total_L / batches_since_log
                print('[Epoch %d Batch %d/%d] loss %.2f, ppl %.2f, '
                      'throughput %.2f samples/s'%(
                    epoch, i, len(train_data), cur_L, math.exp(cur_L),
                    batch_size * batches_since_log / (time.time() - start_log_interval_time)))
                total_L = 0.0
                batches_since_log = 0
                start_log_interval_time = time.time()

        mx.nd.waitall()

        print('[Epoch %d] throughput %.2f samples/s'%(
                    epoch, len(train_data)*batch_size / (time.time() - start_epoch_time)))
        val_L = evaluate(model, val_data, batch_size, context[0])
        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f'%(
            epoch, time.time()-start_epoch_time, val_L, math.exp(val_L)))

        if val_L < best_val:
            best_val = val_L
            test_L = evaluate(model, test_data, batch_size, context[0])
            checkpoint_path = '../model/{}_{}-{}.params'.format(model_name, dataset_name, epoch)
            # The checkpoint directory is not created anywhere else.
            os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
            model.save_parameters(checkpoint_path)
            print('test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
        else:
            lr = lr*0.25
            print('Learning rate now %f'%(lr))
            trainer.set_learning_rate(lr)

    print('Total training throughput %.2f samples/s'%(
                            (batch_size * len(train_data) * epochs) /
                            (time.time() - start_train_time)))
    

In [148]:
# NOTE(review): the test split is passed as both val_data and test_data, so the
# "valid" and "test" numbers printed by train() are computed on the same batches.
train(model=model, train_data=train_data, val_data=test_data, test_data=test_data, epochs=1, lr=lr, context=ctx)

[Epoch 0] throughput 842.32 samples/s
[Epoch 0] time cost 1.23s, valid loss 3.07, valid ppl 21.58
test loss 3.07, test ppl 21.58
Total training throughput 616.87 samples/s
