In [1]:
import mxnet as mx
from mxnet import gluon, autograd
from mxnet import ndarray as nd
from mxnet.gluon import nn, rnn
import math
import numpy as np

import time
from datetime import timedelta

from preprocessing_zh import Corpus, LMDataset

  import OpenSSL.SSL


In [2]:
def try_gpu():
    """If GPU is available, return mx.gpu(0); else return mx.cpu()"""
    try:
        ctx = mx.gpu()
        _ = nd.array([0], ctx=ctx)
    except:
        ctx = mx.cpu()
    return ctx

In [3]:
context = try_gpu()

In [4]:
class LMConfig(object):
    rnn_type = 'LSTM'
    embedding_dim = 200
    hidden_dim = 200
    num_layers = 2
    dropout = 0.5
    
    batch_size = 20
    seq_len = 30
    learning_rate = 1.
    optimizer = 'sgd'
    grad_clip = 0.25
    
    tie_weights = True
    
    num_epochs = 2
    print_per_batch = 50

In [5]:
class RNNModel(nn.Block):
    def __init__(self, config, **kwargs):
        super(RNNModel, self).__init__(**kwargs)
        
        vocab_size = config.vocab_size
        embedding_dim = config.embedding_dim
        hidden_dim = config.hidden_dim
        dropout = config.dropout
        num_layers = config.num_layers
        rnn_type = config.rnn_type
        tie_weights = config.tie_weights
        
        self.hidden_dim = hidden_dim
        
        with self.name_scope():
            self.drop = nn.Dropout(dropout)
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
            
            if rnn_type in ['RNN', 'LSTM', 'GRU']:
                self.rnn = getattr(rnn, rnn_type)(hidden_dim, num_layers, dropout=dropout)
            else:
                raise ValueError("Invalid rnn_type %s. Options are RNN, LSTM, GRU" % rnn_type)
                
            if tie_weights:
                self.decoder = nn.Dense(vocab_size, params=self.embedding.params, in_units=hidden_dim)
            else:
                self.decoder = nn.Dense(vocab_size)
            
    def forward(self, inputs, hidden):
        embedded = self.drop(self.embedding(inputs))
        output, hidden = self.rnn(embedded, hidden)
        decoded = self.decoder(output.reshape((-1, self.hidden_dim)))
        return decoded, hidden
    
    def begin_state(self, *args, **kwargs):
        return self.rnn.begin_state(*args, **kwargs)

In [6]:
train_dir = 'data/sanguoyanyi.txt'
vocab_dir = 'data/sanguo.vocab.txt'

corpus = Corpus(train_dir, vocab_dir)
print(corpus)

Corpus length: 608880, Vocabulary size: 4002.


In [7]:
config = LMConfig()
config.vocab_size = len(corpus.words)
train_data = LMDataset(corpus.data, config.batch_size, config.seq_len)
print(train_data)

Num of batches: 1014, Batch Shape: (30, 20)


In [8]:
model = RNNModel(config)
model.collect_params().initialize(mx.init.Xavier(), ctx=context)

In [9]:
trainer = gluon.Trainer(model.collect_params(), config.optimizer, {'learning_rate': config.learning_rate})
loss_func = gluon.loss.SoftmaxCrossEntropyLoss()

In [10]:
def detach(hidden):
    if isinstance(hidden, (tuple, list)):
        hidden = [i.detach() for i in hidden]
    else:
        hidden = hidden.detach()
    return hidden

In [11]:
def get_time_dif(start_time):
    """
    Return the time used since start_time.
    """
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))

In [None]:
def generate(word_len=100):
    start_index = np.random.randint(config.vocab_size)
    word_list = [start_index]
    
    inputs = mx.nd.array([word_list]).as_in_context(context)
    hidden = model.begin_state(func=mx.nd.zeros, batch_size=1, ctx=context)
    
    with autograd.record(train_mode=False):
        for i in range(word_len):
            hidden = detach(hidden)
            output, hidden = model(inputs, hidden)
            # output_id = int(mx.nd.argmax(output, 1).asscalar())
            output_id = mx.nd.random.multinomial(output[0].softmax()).asscalar()
            word_list.append(output_id)
            inputs = mx.nd.array([[output_id]]).as_in_context(context)
    return word_list

In [None]:
grad_clip = config.grad_clip
seq_len = config.seq_len
batch_size = config.batch_size
start_time = time.time()

for epoch in range(50):
    total_loss = 0.0
    hidden = model.begin_state(func=nd.zeros, batch_size=batch_size, ctx=context)
    for ibatch, (data, label) in enumerate(train_data):
        data = nd.array(data).as_in_context(context)
        label = nd.array(label).as_in_context(context)
        hidden = detach(hidden)
        
        with autograd.record(train_mode=True):
            output, hidden = model(data, hidden)
            loss = loss_func(output, label)
            
        loss.backward()
        
        grads = [x.grad(context) for x in model.collect_params().values()]
        gluon.utils.clip_global_norm(grads, grad_clip * seq_len * batch_size)
        
        trainer.step(config.batch_size)
        total_loss += nd.sum(loss).asscalar()
        
        if ibatch % config.print_per_batch == 0 and ibatch > 0:
            cur_loss = total_loss / seq_len / batch_size / config.print_per_batch
            print('[Epoch %d Batch %d] loss %.2f, perplexity %.2f' % (
                    epoch + 1, ibatch, cur_loss, math.exp(cur_loss)))
            total_loss = 0.0
    print(''.join(corpus.to_word(generate())))

[Epoch 1 Batch 50] loss 7.37, perplexity 1591.70
[Epoch 1 Batch 100] loss 6.44, perplexity 624.50
[Epoch 1 Batch 150] loss 6.03, perplexity 416.08
[Epoch 1 Batch 200] loss 5.84, perplexity 343.59
[Epoch 1 Batch 250] loss 5.43, perplexity 227.66
[Epoch 1 Batch 300] loss 5.55, perplexity 256.55
[Epoch 1 Batch 350] loss 5.44, perplexity 231.60
[Epoch 1 Batch 400] loss 5.27, perplexity 194.29
[Epoch 1 Batch 450] loss 5.22, perplexity 185.21
[Epoch 1 Batch 500] loss 5.18, perplexity 177.84
[Epoch 1 Batch 550] loss 5.19, perplexity 179.10
[Epoch 1 Batch 600] loss 5.19, perplexity 179.74
[Epoch 1 Batch 650] loss 4.99, perplexity 146.25
[Epoch 1 Batch 700] loss 5.23, perplexity 186.85
[Epoch 1 Batch 750] loss 4.98, perplexity 145.70
[Epoch 1 Batch 800] loss 5.07, perplexity 159.78
[Epoch 1 Batch 850] loss 4.69, perplexity 108.45
[Epoch 1 Batch 900] loss 4.87, perplexity 130.18
[Epoch 1 Batch 950] loss 5.02, perplexity 150.75
[Epoch 1 Batch 1000] loss 4.89, perplexity 132.36
孱书巾逼。哺暗倒曰：“柯春弟诩子王公若

[Epoch 8 Batch 750] loss 4.46, perplexity 86.06
[Epoch 8 Batch 800] loss 4.56, perplexity 95.87
[Epoch 8 Batch 850] loss 4.23, perplexity 68.58
[Epoch 8 Batch 900] loss 4.47, perplexity 87.49
[Epoch 8 Batch 950] loss 4.55, perplexity 94.84
[Epoch 8 Batch 1000] loss 4.51, perplexity 90.93
跖坑。又乘马追奔。前到西方值隔半合”淮大怒。书华亭！”茶者，颇其以密服公：炎妻闲船上兵二十二万　　是日，又来到斜谷山前进兵。壮人直入此处。断了武葛瞻、邓艾、张两宠封金帛赴宫。　　听知姜维释罪。左将押香之
[Epoch 9 Batch 50] loss 4.99, perplexity 147.64
[Epoch 9 Batch 100] loss 4.65, perplexity 104.15
[Epoch 9 Batch 150] loss 4.57, perplexity 96.55
[Epoch 9 Batch 200] loss 4.70, perplexity 110.37
[Epoch 9 Batch 250] loss 4.30, perplexity 73.58
[Epoch 9 Batch 300] loss 4.58, perplexity 97.04
[Epoch 9 Batch 350] loss 4.55, perplexity 94.30
[Epoch 9 Batch 400] loss 4.53, perplexity 92.86
[Epoch 9 Batch 450] loss 4.40, perplexity 81.29
[Epoch 9 Batch 500] loss 4.51, perplexity 91.12
[Epoch 9 Batch 550] loss 4.55, perplexity 94.56
[Epoch 9 Batch 600] loss 4.60, perplexity 99.66
[Epoch 9 Batch 650] loss 4.42, 