In [1]:
import mxnet as mx
from mxnet import gluon, autograd
from mxnet import ndarray as nd
from mxnet.gluon import nn, rnn
import math
import numpy as np

import time
from datetime import timedelta

from preprocessing_zh import Corpus, LMDataset

  import OpenSSL.SSL


In [2]:
context = mx.gpu()

In [3]:
class LMConfig(object):
    rnn_type = 'LSTM'
    embedding_dim = 64
    hidden_dim = 128
    num_layers = 2
    dropout = 0.5
    
    batch_size = 20
    seq_len = 30
    learning_rate = 1.
    optimizer = 'sgd'
    grad_clip = 0.2
    
    num_epochs = 2
    print_per_batch = 50

In [4]:
class RNNModel(nn.Block):
    def __init__(self, config, **kwargs):
        super(RNNModel, self).__init__(**kwargs)
        
        vocab_size = config.vocab_size
        embedding_dim = config.embedding_dim
        hidden_dim = config.hidden_dim
        dropout = config.dropout
        num_layers = config.num_layers
        rnn_type = config.rnn_type
        
        self.hidden_dim = hidden_dim
        
        with self.name_scope():
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
            
            if rnn_type in ['RNN', 'LSTM', 'GRU']:
                self.rnn = getattr(rnn, rnn_type)(hidden_dim, num_layers, dropout=dropout)
            else:
                raise ValueError("Invalid rnn_type %s. Options are RNN, LSTM, GRU" % rnn_type)
            
            self.decoder = nn.Dense(vocab_size)
            
    def forward(self, inputs, hidden):
        embedded = self.embedding(inputs)
        output, hidden = self.rnn(embedded, hidden)
        decoded = self.decoder(output.reshape((-1, self.hidden_dim)))
        return decoded, hidden
    
    def begin_state(self, *args, **kwargs):
        return self.rnn.begin_state(*args, **kwargs)

In [5]:
train_dir = 'data/weicheng.txt'
vocab_dir = 'data/weicheng.vocab.txt'

corpus = Corpus(train_dir, vocab_dir)
print(corpus)

Corpus length: 242052, Vocabulary size: 3423.


In [6]:
config = LMConfig()
config.vocab_size = len(corpus.words)
train_data = LMDataset(corpus.data, config.batch_size, config.seq_len)
print(train_data)

Num of batches: 403, Batch Shape: (30, 20)


In [7]:
model = RNNModel(config)
model.collect_params().initialize(mx.init.Xavier(), ctx=context)

In [8]:
trainer = gluon.Trainer(model.collect_params(), config.optimizer, {'learning_rate': config.learning_rate})
loss_func = gluon.loss.SoftmaxCrossEntropyLoss()

In [9]:
def detach(hidden):
    if isinstance(hidden, (tuple, list)):
        hidden = [i.detach() for i in hidden]
    else:
        hidden = hidden.detach()
    return hidden

In [10]:
def get_time_dif(start_time):
    """
    Return the time used since start_time.
    """
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))


[[[-0.02872103 -0.0062073   0.01894674  0.00115567  0.00982953 -0.03334996
    0.04017451 -0.03026596 -0.04086945 -0.0288078   0.00844669 -0.01405796
   -0.00767495 -0.03884564  0.01807828  0.02760972  0.0129669   0.03335032
    0.01206804  0.01699699 -0.04041287  0.00457676 -0.02235788  0.02971803
   -0.03611144 -0.02411439 -0.03982899  0.00834417 -0.03223059  0.0292489
   -0.0036112  -0.00391947 -0.01478232 -0.01251055  0.02554525  0.0154844
   -0.03377134  0.00314574 -0.01743547  0.00088676  0.04034925  0.0395894
    0.03029975  0.00294292  0.02062422  0.0354268   0.02468068  0.03158569
   -0.02459077  0.00248447 -0.0392098  -0.03820364 -0.02214121 -0.02519343
    0.01526169  0.04145131 -0.03195505  0.03441053  0.03699443 -0.01559923
    0.01796507  0.00595654  0.04034156 -0.01547087]]]
<NDArray 1x1x64 @gpu(0)>

In [27]:
inputs = nd.array([[np.random.randint(config.vocab_size)]]).as_in_context(context)
e_o = model.embedding(inputs)

hidden = model.begin_state(func=nd.zeros, batch_size=1, ctx=context)
output, hidden = model.rnn(e_o, hidden)
output = output.reshape((-1, model.hidden_dim))
output = model.decoder(output)

nd.argmax(output, 1)


[ 1164.]
<NDArray 1 @gpu(0)>

In [40]:
nd.concat(inputs[0], [nd.argmax(output, 1)], dim=1)

AssertionError: Positional arguments must have NDArray type, but got [
[ 1164.]
<NDArray 1 @gpu(0)>]

In [14]:
def generate(word_len=100):
    inputs = nd.array([[np.random.randint(config.vocab_size)]]).as_in_context(context)
    hidden = model.begin_state(func=nd.zeros, batch_size=1, ctx=context)
    word_list = []
    for i in range(word_len):
        hidden = detach(hidden)
        x, hidden = model(inputs, hidden)
        print(x.shape)

In [15]:
generate()

(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)
(1, 3423)


In [65]:
grad_clip = config.grad_clip
seq_len = config.seq_len
batch_size = config.batch_size
start_time = time.time()

for epoch in range(config.num_epochs):
    total_loss = 0.0
    hidden = model.begin_state(func=nd.zeros, batch_size=batch_size, ctx=context)
    for ibatch, (data, label) in enumerate(train_data):
        data = nd.array(data).as_in_context(context)
        print(data.shape)
        label = nd.array(label).as_in_context(context)
        hidden = detach(hidden)
        
        with autograd.record():
            output, hidden = model(data, hidden)
            loss = loss_func(output, label)
            
        loss.backward()
        
        grads = [x.grad(context) for x in model.collect_params().values()]
        gluon.utils.clip_global_norm(grads, grad_clip * seq_len * batch_size)
        
        trainer.step(config.batch_size)
        total_loss += nd.sum(loss).asscalar()
        
        if ibatch % config.print_per_batch == 0 and ibatch > 0:
            cur_loss = total_loss / seq_len / batch_size / config.print_per_batch
            print('[Epoch %d Batch %d] loss %.2f, perplexity %.2f' % (
                    epoch + 1, ibatch, cur_loss, math.exp(cur_loss)))
            total_loss = 0.0

(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
[Epoch 1 Batch 50] loss 5.36, perplexity 212.69
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)
(30, 20)


KeyboardInterrupt: 