In [1]:
import mxnet as mx
from mxnet import gluon, autograd
from mxnet import ndarray as nd
from mxnet.gluon import nn, rnn
import math
import numpy as np

import time
from datetime import timedelta

from preprocessing_zh import Corpus, LMDataset

In [None]:
def try_gpu():
    """Return mx.gpu() if an array can be allocated on it, otherwise mx.cpu().

    Availability is probed by creating a one-element NDArray on the GPU
    context. Only MXNet's own error is treated as "no usable GPU"; anything
    else (e.g. KeyboardInterrupt) propagates instead of being silently
    swallowed.
    """
    try:
        ctx = mx.gpu()
        # Probe allocation: this is what fails when no GPU is usable.
        _ = nd.array([0], ctx=ctx)
    except mx.base.MXNetError:
        ctx = mx.cpu()
    return ctx

In [2]:
# Device context used for all arrays and parameters in the cells below
# (GPU when try_gpu() can allocate on one, CPU otherwise).
context = try_gpu()

In [3]:
class LMConfig(object):
    """Hyper-parameters for the RNN language model and its training run.

    All values are plain class attributes. `vocab_size` is not declared here;
    it is attached to the instance once the corpus has been loaded.
    """

    # --- network architecture ---
    rnn_type = 'LSTM'       # layer class name looked up on mxnet.gluon.rnn
    embedding_dim = 200
    hidden_dim = 200
    num_layers = 2
    dropout = 0.5
    tie_weights = True      # share parameters between embedding and decoder

    # --- batching ---
    batch_size = 20
    seq_len = 30

    # --- optimisation ---
    optimizer = 'sgd'
    learning_rate = 1.0
    grad_clip = 0.25

    # --- run control ---
    num_epochs = 2          # NOTE(review): the visible training loop hard-codes 50 epochs
    print_per_batch = 50

In [4]:
class RNNModel(nn.Block):
    """Language model: embedding -> dropout -> stacked RNN -> dense decoder.

    Parameters
    ----------
    config : object
        Must expose `vocab_size`, `embedding_dim`, `hidden_dim`, `dropout`,
        `num_layers`, `rnn_type` ('RNN', 'LSTM' or 'GRU') and `tie_weights`.
    **kwargs
        Forwarded to `nn.Block.__init__`.

    Raises
    ------
    ValueError
        If `rnn_type` is not one of the supported layer names, or if
        `tie_weights` is set while `embedding_dim != hidden_dim` (the shared
        weight matrix must fit both the embedding and the decoder).
    """

    def __init__(self, config, **kwargs):
        super(RNNModel, self).__init__(**kwargs)

        vocab_size = config.vocab_size
        embedding_dim = config.embedding_dim
        hidden_dim = config.hidden_dim
        dropout = config.dropout
        num_layers = config.num_layers
        rnn_type = config.rnn_type
        tie_weights = config.tie_weights

        if rnn_type not in ('RNN', 'LSTM', 'GRU'):
            raise ValueError("Invalid rnn_type %s. Options are RNN, LSTM, GRU" % rnn_type)
        if tie_weights and embedding_dim != hidden_dim:
            raise ValueError("When using tied weights, embedding_dim must equal hidden_dim")

        # Kept for reshaping the RNN output in forward().
        self.hidden_dim = hidden_dim

        with self.name_scope():
            self.drop = nn.Dropout(dropout)
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
            self.rnn = getattr(rnn, rnn_type)(hidden_dim, num_layers, dropout=dropout)

            if tie_weights:
                # Share the embedding's parameters with the decoder. The
                # attribute is `self.embedding`; the previous `self.encoder`
                # did not exist and raised AttributeError.
                self.decoder = nn.Dense(vocab_size, in_units=hidden_dim,
                                        params=self.embedding.params)
            else:
                self.decoder = nn.Dense(vocab_size)

    def forward(self, inputs, hidden):
        """Run one chunk of token ids through the network.

        Returns `(decoded, hidden)`: `decoded` has one row of vocabulary
        scores per RNN output position (the RNN output is flattened to
        `(-1, hidden_dim)` before decoding); `hidden` is the updated state.
        """
        embedded = self.drop(self.embedding(inputs))
        output, hidden = self.rnn(embedded, hidden)
        decoded = self.decoder(output.reshape((-1, self.hidden_dim)))
        return decoded, hidden

    def begin_state(self, *args, **kwargs):
        """Create an initial recurrent state; arguments go to `self.rnn.begin_state`."""
        return self.rnn.begin_state(*args, **kwargs)

In [5]:
# Despite the `_dir` suffix these are file paths: the training text and its
# vocabulary file, both handed to Corpus.
train_dir = 'data/weicheng.txt'
vocab_dir = 'data/weicheng.vocab.txt'

# Corpus is imported from the local preprocessing_zh module; printing it
# reports the corpus length and vocabulary size (see the recorded output).
corpus = Corpus(train_dir, vocab_dir)
print(corpus)

Corpus length: 242052, Vocabulary size: 3423.


In [6]:
config = LMConfig()
# vocab_size is not declared on LMConfig; it is attached here because it
# depends on the loaded corpus. RNNModel.__init__ reads it from the config.
config.vocab_size = len(corpus.words)
# LMDataset (from preprocessing_zh) is iterated as (data, label) pairs by the
# training loop. The recorded output reports 403 batches of shape (30, 20),
# which matches (seq_len, batch_size) for the config values above.
train_data = LMDataset(corpus.data, config.batch_size, config.seq_len)
print(train_data)

Num of batches: 403, Batch Shape: (30, 20)


In [7]:
# Build the network from the config, then initialise every parameter with the
# Xavier initializer on the selected device context.
model = RNNModel(config)
model.collect_params().initialize(mx.init.Xavier(), ctx=context)

In [8]:
# Softmax cross-entropy loss applied to the model output in the training loop.
loss_func = gluon.loss.SoftmaxCrossEntropyLoss()

# Optimizer wrapper over all model parameters; the optimizer name and the
# learning rate both come from the config.
trainer = gluon.Trainer(
    model.collect_params(),
    config.optimizer,
    {'learning_rate': config.learning_rate},
)

In [9]:
def detach(hidden):
    """Return `hidden` with `.detach()` applied.

    A tuple or list of states yields a new list with each element detached;
    any other object is detached directly and returned as-is.
    """
    if not isinstance(hidden, (tuple, list)):
        return hidden.detach()
    return [state.detach() for state in hidden]

In [10]:
def get_time_dif(start_time):
    """Return the wall-clock time elapsed since `start_time`.

    `start_time` is a `time.time()` timestamp. The result is a `timedelta`
    rounded to the nearest whole second.
    """
    elapsed_seconds = time.time() - start_time
    whole_seconds = int(round(elapsed_seconds))
    return timedelta(seconds=whole_seconds)

In [11]:
def generate(word_len=100):
    """Greedily generate a sequence of word ids from the current model.

    A random start id is drawn from the vocabulary, then the model is fed its
    own arg-max prediction `word_len` times. Uses the module-level `model`,
    `config` and `context`.

    Parameters
    ----------
    word_len : int
        Number of ids to generate after the random start id.

    Returns
    -------
    list of int
        The start id followed by `word_len` generated ids.
    """
    start_index = np.random.randint(config.vocab_size)
    word_list = [start_index]

    inputs = nd.array([word_list]).as_in_context(context)
    hidden = model.begin_state(func=nd.zeros, batch_size=1, ctx=context)

    # Inference only: pause() turns graph recording off and runs in predict
    # mode, so no gradient graph is built and the state needs no detaching.
    with autograd.pause(train_mode=False):
        for _ in range(word_len):
            output, hidden = model(inputs, hidden)
            output_id = int(nd.argmax(output, 1).asscalar())
            word_list.append(output_id)
            inputs = nd.array([[output_id]]).as_in_context(context)
    return word_list

In [12]:
# Training loop: truncated back-propagation through time over consecutive
# batches, with global gradient-norm clipping and a text sample after each epoch.
grad_clip = config.grad_clip
seq_len = config.seq_len
batch_size = config.batch_size
start_time = time.time()  # NOTE(review): not read anywhere in this cell

# NOTE(review): the epoch count is hard-coded; config.num_epochs is not used here.
for epoch in range(50):
    total_loss = 0.0
    hidden = model.begin_state(func=nd.zeros, batch_size=batch_size, ctx=context)
    for ibatch, (data, label) in enumerate(train_data):
        data = nd.array(data).as_in_context(context)
        label = nd.array(label).as_in_context(context)
        # Carry the state values across batches but cut the gradient history.
        hidden = detach(hidden)

        with autograd.record(train_mode=True):
            output, hidden = model(data, hidden)
            loss = loss_func(output, label)

        loss.backward()

        # The max norm is scaled by seq_len * batch_size before clipping.
        grads = [x.grad(context) for x in model.collect_params().values()]
        gluon.utils.clip_global_norm(grads, grad_clip * seq_len * batch_size)

        trainer.step(batch_size)
        total_loss += nd.sum(loss).asscalar()

        # Report after every `print_per_batch` completed batches. Counting
        # completed batches (ibatch + 1) keeps each window at exactly
        # print_per_batch batches; testing `ibatch % n == 0 and ibatch > 0`
        # put 51 batches in the first window while still dividing by 50.
        batches_done = ibatch + 1
        if batches_done % config.print_per_batch == 0:
            cur_loss = total_loss / seq_len / batch_size / config.print_per_batch
            print('[Epoch %d Batch %d] loss %.2f, perplexity %.2f' % (
                    epoch + 1, batches_done, cur_loss, math.exp(cur_loss)))
            total_loss = 0.0
    # Show a greedy sample from the current model at the end of each epoch.
    print(''.join(corpus.to_word(generate())))

[Epoch 1 Batch 50] loss 7.34, perplexity 1534.54
[Epoch 1 Batch 100] loss 5.86, perplexity 350.37
[Epoch 1 Batch 150] loss 5.89, perplexity 362.17
[Epoch 1 Batch 200] loss 5.92, perplexity 373.28
[Epoch 1 Batch 250] loss 5.63, perplexity 278.89
[Epoch 1 Batch 300] loss 5.34, perplexity 208.86
[Epoch 1 Batch 350] loss 5.26, perplexity 192.52
[Epoch 1 Batch 400] loss 5.37, perplexity 215.86
酪，““我们们们们的““我们们们的““我们们们的““我们们们的““我们们们的““我们们们的““我们们们的““我们们们的““我们们们的““我们们们的““我们们们的““我们们们的““我们们们的““我们们们的
[Epoch 2 Batch 50] loss 5.65, perplexity 284.80
[Epoch 2 Batch 100] loss 4.88, perplexity 131.45
[Epoch 2 Batch 150] loss 5.10, perplexity 163.92
[Epoch 2 Batch 200] loss 5.23, perplexity 187.66
[Epoch 2 Batch 250] loss 5.00, perplexity 148.11
[Epoch 2 Batch 300] loss 4.76, perplexity 116.19
[Epoch 2 Batch 350] loss 4.74, perplexity 114.19
[Epoch 2 Batch 400] loss 4.98, perplexity 144.98
苍，他们说：“我们说，我们说：“我们说，我们说：“我们说，我们说：“我们说，我们说：“我们说，我们说：“我们说，我们说：“我们说，我们说：“我们说，我们说：“我们说，我们说：“我们说，我们说：“我们说，
[Epoch 3 Batc

[Epoch 17 Batch 400] loss 4.23, perplexity 68.43
侦探，他们俩说，他们俩说，他们说，他们说，他们说，他们说，他们说，他们说，他们说，他们说，他们说，他们说，他们说，他们说，他们说，他们说，他们说，他们说，他们说，他们说，他们说，他们说，他们说，他们说，
[Epoch 18 Batch 50] loss 4.50, perplexity 90.25
[Epoch 18 Batch 100] loss 3.88, perplexity 48.60
[Epoch 18 Batch 150] loss 4.12, perplexity 61.81
[Epoch 18 Batch 200] loss 4.30, perplexity 73.54
[Epoch 18 Batch 250] loss 4.10, perplexity 60.37
[Epoch 18 Batch 300] loss 3.92, perplexity 50.21
[Epoch 18 Batch 350] loss 3.96, perplexity 52.64
[Epoch 18 Batch 400] loss 4.20, perplexity 66.44
吧，我就是个孩子，就是个人，就是个人，他们俩不知道，就是个人，他们俩不知道，就是个人，他们俩不知道，就是个人，他们俩不知道，就是个人，他们俩不知道，就是个人，他们俩不知道，就是个人，他们俩不知道，就是个
[Epoch 19 Batch 50] loss 4.49, perplexity 88.75
[Epoch 19 Batch 100] loss 3.87, perplexity 47.81
[Epoch 19 Batch 150] loss 4.12, perplexity 61.72
[Epoch 19 Batch 200] loss 4.27, perplexity 71.82
[Epoch 19 Batch 250] loss 4.08, perplexity 59.10
[Epoch 19 Batch 300] loss 3.91, perplexity 49.81
[Epoch 19 Batch 350] loss 3.95, perplexity 51.95
[Epoch 19 Batc

[Epoch 34 Batch 250] loss 3.97, perplexity 52.83
[Epoch 34 Batch 300] loss 3.81, perplexity 45.20
[Epoch 34 Batch 350] loss 3.85, perplexity 46.94
[Epoch 34 Batch 400] loss 4.07, perplexity 58.53
女儿，钟书的父亲和叔父和钟书的“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“
[Epoch 35 Batch 50] loss 4.35, perplexity 77.76
[Epoch 35 Batch 100] loss 3.74, perplexity 42.10
[Epoch 35 Batch 150] loss 4.00, perplexity 54.63
[Epoch 35 Batch 200] loss 4.14, perplexity 63.10
[Epoch 35 Batch 250] loss 3.95, perplexity 52.18
[Epoch 35 Batch 300] loss 3.80, perplexity 44.89
[Epoch 35 Batch 350] loss 3.84, perplexity 46.43
[Epoch 35 Batch 400] loss 4.09, perplexity 59.45
讽刺，不知道何同。他们俩俩俩家里的人都是个复合，他们俩不是“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴气”，“痴
[Epoch 36 Batch 50] loss 4.36, perplexity 77.97
[Epoch 36 Batch 100] loss 3.73, perplexity 41.79
[Epoch 36 Batch 150] loss 4.00, perplexity 54.82
[Epoch 36 Batch 200] loss 4.14, perplexity 62.78
[Epoch 36 Batc