In [1]:
import mxnet as mx
from mxnet import gluon, autograd
from mxnet import ndarray as nd
from mxnet.gluon import nn, rnn
import math
import numpy as np
import os

import time
from datetime import timedelta

from preprocessing_zh import Corpus, LMDataset

  import OpenSSL.SSL


In [2]:
def try_gpu():
    """Return ``mx.gpu()`` if a GPU is usable, otherwise ``mx.cpu()``.

    Usability is probed by allocating a one-element array on the GPU
    context; if that probe raises, the CPU context is returned instead.
    """
    try:
        ctx = mx.gpu()
        _ = nd.array([0], ctx=ctx)
    except Exception:
        # `Exception` instead of a bare `except:` so that KeyboardInterrupt
        # and SystemExit are not swallowed by the device probe.
        ctx = mx.cpu()
    return ctx

In [3]:
# Device context (GPU if the probe in try_gpu succeeds, else CPU) used by the cells below.
context = try_gpu()

In [4]:
class Dictionary(object):
    """Two-way vocabulary: token -> integer id and integer id -> token.

    Ids are handed out in order of first insertion, starting at 0.
    """

    def __init__(self):
        self.word2idx = {}  # token -> id
        self.idx2word = []  # id -> token (position in the list is the id)

    def add_word(self, word):
        """Register ``word`` if it has not been seen yet and return its id."""
        index = self.word2idx.get(word)
        if index is None:
            index = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = index
        return index

    def __len__(self):
        """Return the number of distinct tokens registered so far."""
        return len(self.idx2word)

In [5]:
class Corpus(object):
    """Character-level corpus built from a UTF-8 text file.

    Attributes:
        dictionary: the ``Dictionary`` holding every character seen in the
            file plus the ``'<eos>'`` end-of-line marker.
        train: 1-D float64 numpy array of token ids for the whole file.

    NOTE(review): this definition shadows the ``Corpus`` name imported from
    ``preprocessing_zh`` in the first cell.
    """

    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(path)

    def tokenize(self, path):
        """Tokenize a text file into a flat array of character ids.

        Each line is stripped of surrounding whitespace, split into single
        characters, and terminated with an ``'<eos>'`` token. Unseen tokens
        are added to ``self.dictionary`` as they are encountered.

        Args:
            path: path to a UTF-8 encoded text file.

        Returns:
            1-D ``numpy`` array (dtype float64) with one id per token, in
            file order.

        Raises:
            AssertionError: if ``path`` does not exist.
        """
        assert os.path.exists(path)

        # Single pass: register each token and record its id immediately,
        # instead of reading the file once to count and once more to fill.
        add_word = self.dictionary.add_word
        ids = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                for word in list(line.strip()) + ['<eos>']:
                    ids.append(add_word(word))

        # float64 matches what the previous np.zeros(tokens) buffer produced.
        return np.array(ids, dtype=np.float64)

In [6]:
class LMDataset(object):
    """Pre-batched language-model dataset over a flat sequence of token ids.

    The sequence is cut into consecutive chunks of ``seq_len`` tokens;
    ``batch_size`` consecutive chunks form one batch. For batch ``i``:

    * ``data[i]`` has shape ``(seq_len, batch_size)``;
    * ``target[i]`` has shape ``(seq_len * batch_size,)`` and holds the
      same positions shifted one token ahead, flattened in the same
      (time-major) order.

    Indexing past the last batch raises ``IndexError`` (from the underlying
    array), which is what lets a plain ``for`` loop iterate over the dataset.

    NOTE(review): column ``j`` of batch ``i + 1`` is chunk
    ``(i + 1) * batch_size + j``, which does not directly follow column ``j``
    of batch ``i`` in the text (unless ``batch_size == 1``). A hidden state
    carried from one batch to the next therefore does not continue the same
    passage.

    Args:
        raw_data: 1-D array of token ids (anything supporting slicing,
            ``reshape`` and ``swapaxes``, e.g. a numpy array).
        batch_size: number of sequences per batch.
        seq_len: number of time steps per sequence.
    """

    def __init__(self, raw_data, batch_size, seq_len):
        tokens_per_batch = batch_size * seq_len
        # Targets are the inputs shifted by one, so one extra token is
        # needed after the last input. Reserving it keeps the target slice
        # full-length when len(raw_data) is an exact multiple of the batch
        # size (previously that case made the reshape below fail).
        num_batch = max(0, (len(raw_data) - 1) // tokens_per_batch)
        used = num_batch * tokens_per_batch

        # Explicit shapes (rather than -1) so that zero batches is valid too.
        data = raw_data[:used]
        data = data.reshape(num_batch, batch_size, seq_len).swapaxes(1, 2)

        target = raw_data[1:used + 1]
        target = target.reshape(num_batch, batch_size, seq_len).swapaxes(1, 2)
        target = target.reshape(num_batch, tokens_per_batch)

        self.data = data
        self.target = target

    def __getitem__(self, index):
        """Return the ``(data, target)`` pair for batch ``index``."""
        return self.data[index], self.target[index]

    def __len__(self):
        """Return the number of batches."""
        return len(self.data)

    def __repr__(self):
        # shape[1:] equals data[0].shape but also works for an empty dataset.
        return 'Num of batches: %d, Batch Shape: %s' % (len(self.data), self.data.shape[1:])

In [7]:
class LMConfig(object):
    """Hyper-parameters for the RNN language model and its training run.

    ``vocab_size`` is not defined here; the notebook assigns it on the
    instance once the corpus dictionary has been built.
    """

    # --- model architecture ---
    rnn_type = 'LSTM'        # one of 'RNN', 'LSTM', 'GRU' (checked in RNNModel)
    num_layers = 2
    embedding_dim = 200
    hidden_dim = 200
    tie_weights = True       # decoder reuses the embedding parameters
    dropout = 0.5            # used for both nn.Dropout and the recurrent layer

    # --- batching ---
    seq_len = 30
    batch_size = 20

    # --- optimisation ---
    optimizer = 'sgd'
    learning_rate = 1.0
    grad_clip = 0.25         # scaled by seq_len * batch_size before clipping

    # --- run control ---
    # NOTE(review): the training cell in this notebook hard-codes its epoch
    # count and does not read num_epochs.
    num_epochs = 2
    print_per_batch = 50     # logging interval, in batches

In [8]:
class RNNModel(nn.Block):
    """Embedding -> dropout -> recurrent layer -> dense decoder over the vocabulary.

    Args:
        config: object exposing ``vocab_size``, ``embedding_dim``,
            ``hidden_dim``, ``dropout``, ``num_layers``, ``rnn_type`` and
            ``tie_weights``.
        **kwargs: forwarded to ``nn.Block.__init__``.

    Raises:
        ValueError: if ``rnn_type`` is not one of RNN/LSTM/GRU, or if
            ``tie_weights`` is set while ``embedding_dim != hidden_dim``.
    """

    def __init__(self, config, **kwargs):
        super(RNNModel, self).__init__(**kwargs)

        vocab_size = config.vocab_size
        embedding_dim = config.embedding_dim
        hidden_dim = config.hidden_dim
        dropout = config.dropout
        num_layers = config.num_layers
        rnn_type = config.rnn_type
        tie_weights = config.tie_weights

        # Validate the configuration before any child block is created.
        if rnn_type not in ('RNN', 'LSTM', 'GRU'):
            raise ValueError("Invalid rnn_type %s. Options are RNN, LSTM, GRU" % rnn_type)
        if tie_weights and embedding_dim != hidden_dim:
            # The tied decoder reuses the embedding's parameters but is
            # declared with in_units=hidden_dim, so the two sizes must agree.
            raise ValueError(
                "tie_weights requires embedding_dim == hidden_dim (got %s and %s)"
                % (embedding_dim, hidden_dim))

        self.hidden_dim = hidden_dim

        with self.name_scope():
            self.drop = nn.Dropout(dropout)
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
            self.rnn = getattr(rnn, rnn_type)(hidden_dim, num_layers, dropout=dropout)

            if tie_weights:
                self.decoder = nn.Dense(vocab_size, params=self.embedding.params, in_units=hidden_dim)
            else:
                self.decoder = nn.Dense(vocab_size)

    def forward(self, inputs, hidden):
        """Run one forward pass.

        Args:
            inputs: token ids; the training cell passes a
                ``(seq_len, batch_size)`` array.
            hidden: recurrent state, e.g. from ``begin_state`` or a
                previous call.

        Returns:
            ``(decoded, hidden)`` where ``decoded`` has one row of vocabulary
            scores per input position (the recurrent output is flattened to
            ``(-1, hidden_dim)`` before decoding) and ``hidden`` is the
            updated state.
        """
        embedded = self.drop(self.embedding(inputs))
        output, hidden = self.rnn(embedded, hidden)
        decoded = self.decoder(output.reshape((-1, self.hidden_dim)))
        return decoded, hidden

    def begin_state(self, *args, **kwargs):
        """Return an initial state by delegating to the recurrent layer's ``begin_state``."""
        return self.rnn.begin_state(*args, **kwargs)

In [10]:
# Path to the training text, relative to the notebook's working directory.
# (Despite the name, this is a file path, not a directory.)
train_dir = 'data/weicheng.txt'

# Build the character vocabulary and the flat array of token ids.
corpus = Corpus(train_dir)

In [12]:
config = LMConfig()
# vocab_size is only known once the corpus dictionary has been built.
config.vocab_size = len(corpus.dictionary)
# Cut the token stream into (seq_len, batch_size) batches with shifted targets.
train_data = LMDataset(corpus.train, config.batch_size, config.seq_len)
print(train_data)

Num of batches: 363, Batch Shape: (30, 20)


In [13]:
# Build the network and initialise its parameters with Xavier on the chosen device.
model = RNNModel(config)
model.collect_params().initialize(mx.init.Xavier(), ctx=context)

In [14]:
# Optimiser over all model parameters; optimiser name and learning rate come from the config.
trainer = gluon.Trainer(model.collect_params(), config.optimizer, {'learning_rate': config.learning_rate})
# Softmax cross-entropy between the decoder scores and the next-token ids.
loss_func = gluon.loss.SoftmaxCrossEntropyLoss()

In [15]:
def detach(hidden):
    """Detach a recurrent state from the computation graph.

    Args:
        hidden: a single state object exposing ``.detach()``, or a
            tuple/list of such objects.

    Returns:
        The detached state. A tuple or list input always yields a list.
    """
    if not isinstance(hidden, (tuple, list)):
        return hidden.detach()
    return [state.detach() for state in hidden]

In [16]:
def get_time_dif(start_time):
    """Return the wall-clock time elapsed since ``start_time``.

    Args:
        start_time: a timestamp previously obtained from ``time.time()``.

    Returns:
        A ``datetime.timedelta`` rounded to whole seconds.
    """
    elapsed_seconds = time.time() - start_time
    return timedelta(seconds=int(round(elapsed_seconds)))

In [17]:
def generate(word_len=100):
    """Greedily generate a sequence of token ids from the current model.

    Starts from a uniformly random token id and repeatedly feeds the model's
    highest-scoring prediction back in as the next input. Relies on the
    notebook-level ``model``, ``config`` and ``context``.

    Args:
        word_len: number of tokens to generate after the random start token.

    Returns:
        List of ``word_len + 1`` integer token ids (start token included).
    """
    start_index = np.random.randint(config.vocab_size)
    word_list = [start_index]

    # One time step, batch of one.
    inputs = nd.array([word_list]).as_in_context(context)
    hidden = model.begin_state(func=nd.zeros, batch_size=1, ctx=context)

    # Inference only: run in predict mode (dropout disabled) without
    # recording a computation graph, so the state needs no detaching
    # between steps.
    with autograd.predict_mode():
        for _ in range(word_len):
            output, hidden = model(inputs, hidden)
            output_id = int(nd.argmax(output, 1).asscalar())
            word_list.append(output_id)
            inputs = nd.array([[output_id]]).as_in_context(context)
    return word_list

In [None]:
grad_clip = config.grad_clip
seq_len = config.seq_len
batch_size = config.batch_size
start_time = time.time()

# The set of parameters does not change during training, so collect it once.
parameters = list(model.collect_params().values())

# NOTE(review): the epoch count is hard-coded here and config.num_epochs is
# not consulted — confirm which value is intended before unifying them.
for epoch in range(50):
    total_loss = 0.0
    # Fresh zero state at the start of every epoch.
    hidden = model.begin_state(func=nd.zeros, batch_size=batch_size, ctx=context)
    for ibatch, (data, label) in enumerate(train_data):
        data = nd.array(data).as_in_context(context)
        label = nd.array(label).as_in_context(context)
        # Keep the state values but cut the graph, so back-propagation
        # stops at the batch boundary.
        hidden = detach(hidden)

        with autograd.record(train_mode=True):
            output, hidden = model(data, hidden)
            loss = loss_func(output, label)

        loss.backward()

        # The clipping threshold is scaled by the number of loss terms in
        # the batch (seq_len * batch_size).
        grads = [x.grad(context) for x in parameters]
        gluon.utils.clip_global_norm(grads, grad_clip * seq_len * batch_size)

        trainer.step(batch_size)
        total_loss += nd.sum(loss).asscalar()

        # Report after every `print_per_batch` completed batches. Counting
        # completed batches (ibatch + 1) keeps every window exactly
        # print_per_batch long; testing `ibatch % n == 0 and ibatch > 0`
        # made the first window n + 1 batches while still dividing by n.
        if (ibatch + 1) % config.print_per_batch == 0:
            cur_loss = total_loss / seq_len / batch_size / config.print_per_batch
            print('[Epoch %d Batch %d] loss %.2f, perplexity %.2f' % (
                    epoch + 1, ibatch + 1, cur_loss, math.exp(cur_loss)))
            total_loss = 0.0
    # Show a greedy sample from the model after each epoch.
    print(''.join([corpus.dictionary.idx2word[x] for x in generate()]))

[Epoch 1 Batch 50] loss 5.81, perplexity 333.02
[Epoch 1 Batch 100] loss 5.20, perplexity 180.90
[Epoch 1 Batch 150] loss 5.34, perplexity 208.42
[Epoch 1 Batch 200] loss 5.24, perplexity 189.01
[Epoch 1 Batch 250] loss 5.16, perplexity 174.81
[Epoch 1 Batch 300] loss 4.84, perplexity 126.51
[Epoch 1 Batch 350] loss 4.82, perplexity 124.07
穷，不是你的。”<eos>鸿渐道：“你是我的事，你不是你的。”<eos>鸿渐道：“你是我的事，你不是你的。”<eos>鸿渐道：“你是我的事，你不是你的。”<eos>鸿渐道：“你是我的事，你不是你的。”<eos>鸿渐道：“你是我的事，你不是你的
[Epoch 2 Batch 50] loss 5.45, perplexity 231.63
[Epoch 2 Batch 100] loss 4.88, perplexity 131.68
[Epoch 2 Batch 150] loss 5.07, perplexity 158.60
[Epoch 2 Batch 200] loss 5.00, perplexity 148.41
[Epoch 2 Batch 250] loss 4.91, perplexity 136.25
[Epoch 2 Batch 300] loss 4.63, perplexity 102.07
[Epoch 2 Batch 350] loss 4.62, perplexity 101.70
龟，不知道。他们这时候不到他的，不知道这样，他们这样，他们这样，他们这样，他们这样，他们这样，他们这样，他们这样，他们这样，他们这样，他们这样，他们这样，他们这样，他们这样，他们这样，他们这样，他们这样
[Epoch 3 Batch 50] loss 5.28, perplexity 196.94
[Epoch 3 Batch 100] loss 4.73, perplexity 11

[Epoch 19 Batch 150] loss 4.57, perplexity 96.86
[Epoch 19 Batch 200] loss 4.50, perplexity 90.39
[Epoch 19 Batch 250] loss 4.47, perplexity 86.96
[Epoch 19 Batch 300] loss 4.21, perplexity 67.41
[Epoch 19 Batch 350] loss 4.25, perplexity 69.88
勋，不会再来。”<eos>鸿渐道：“你这人真是个人，我不但再去了。”<eos>鸿渐道：“你这人真是个人，我不但再去了。”<eos>鸿渐道：“你这人真是个人，我不但再去了。”<eos>鸿渐道：“你这人真是个人，我不但再去了。”<eos>鸿渐道：
[Epoch 20 Batch 50] loss 4.86, perplexity 128.41
[Epoch 20 Batch 100] loss 4.36, perplexity 78.19
[Epoch 20 Batch 150] loss 4.56, perplexity 95.22
[Epoch 20 Batch 200] loss 4.49, perplexity 89.22
[Epoch 20 Batch 250] loss 4.46, perplexity 86.30
[Epoch 20 Batch 300] loss 4.21, perplexity 67.39
[Epoch 20 Batch 350] loss 4.24, perplexity 69.20
嘻出来，说：“你这时候没有？”<eos>鸿渐道：“你这时候我不愿意到我家里，你不是我的。”<eos>鸿渐道：“你这时候我不愿意到我家里，你不是我的。”<eos>鸿渐道：“你这时候我不愿意到我家里，你不是我的。”<eos>鸿渐道：“你这
[Epoch 21 Batch 50] loss 4.85, perplexity 127.89
[Epoch 21 Batch 100] loss 4.36, perplexity 78.02
[Epoch 21 Batch 150] loss 4.55, perplexity 94.52
[Epoch 21 Batch 200] loss

Exception ignored in: 