In [1]:
import math
import os
import time
import numpy as np
import mxnet as mx
from mxnet import gluon, autograd
from mxnet.gluon import nn, rnn

  import OpenSSL.SSL


In [2]:
class Dictionary(object):
    """Two-way vocabulary: token -> integer id and integer id -> token.

    Ids are handed out consecutively, starting at 0, in the order tokens
    are first registered.
    """

    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        """Register ``word`` if it is unseen and return its integer id."""
        try:
            return self.word2idx[word]
        except KeyError:
            # First sighting: the next free id is the current vocabulary size.
            new_id = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = new_id
            return new_id

    def __len__(self):
        """Number of distinct tokens registered so far."""
        return len(self.idx2word)

In [3]:
class Corpus(object):
    """Character-level corpus built from one UTF-8 text file.

    Attributes set by ``__init__``:
        dictionary: ``Dictionary`` holding every character seen in the file
            plus the ``'<eos>'`` end-of-line marker.
        train: 1-D int32 ``mx.nd`` array with the token ids of the whole file.
    """

    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(path)

    def tokenize(self, path):
        """Tokenizes a text file into a flat array of character ids.

        Every line is stripped of surrounding whitespace, split into single
        characters and terminated with ``'<eos>'``. New tokens are added to
        ``self.dictionary`` as they are encountered.

        Args:
            path: Path of the text file to read (decoded as UTF-8).

        Returns:
            A 1-D ``mx.nd`` array of dtype int32, one id per token, in file
            order.

        Raises:
            AssertionError: if ``path`` does not exist.
        """
        assert os.path.exists(path)
        # Single pass: add_word() returns the id for both new and known tokens,
        # so the file does not have to be read a second time for the lookup.
        ids = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                for word in list(line.strip()) + ['<eos>']:
                    ids.append(self.dictionary.add_word(word))

        # Build the integer buffer directly instead of going through float64.
        return mx.nd.array(np.array(ids, dtype='int32'), dtype='int32')

In [4]:
class RNNModel(gluon.Block):
    """Language model: embedding encoder -> recurrent stack -> dense decoder.

    Args:
        mode: One of 'rnn_relu', 'rnn_tanh', 'lstm', 'gru'; selects the
            recurrent layer type.
        vocab_size: Number of rows of the embedding and number of decoder
            outputs.
        num_embed: Embedding width (also the recurrent layer's input size).
        num_hidden: Hidden size of the recurrent layer and decoder input size.
        num_layers: Number of stacked recurrent layers.
        dropout: Rate used by the dropout layer and passed to the recurrent
            layer.
        tie_weights: When true the decoder is built on the encoder's
            parameter dict. NOTE(review): presumably this requires
            num_embed == num_hidden -- confirm.
        **kwargs: Forwarded to ``gluon.Block.__init__``.

    Raises:
        ValueError: if ``mode`` is not one of the supported names.
    """

    def __init__(self, mode, vocab_size, num_embed, num_hidden,
                 num_layers, dropout=0.5, tie_weights=False, **kwargs):
        super(RNNModel, self).__init__(**kwargs)
        # (mode name, recurrent layer class, extra constructor keywords)
        recurrent_choices = (
            ('rnn_relu', rnn.RNN, {'activation': 'relu'}),
            ('rnn_tanh', rnn.RNN, {}),
            ('lstm', rnn.LSTM, {}),
            ('gru', rnn.GRU, {}),
        )
        with self.name_scope():
            self.drop = nn.Dropout(dropout)
            self.encoder = nn.Embedding(vocab_size, num_embed,
                                        weight_initializer=mx.init.Uniform(0.1))

            for mode_name, layer_cls, extra_kwargs in recurrent_choices:
                if mode == mode_name:
                    self.rnn = layer_cls(num_hidden, num_layers, dropout=dropout,
                                         input_size=num_embed, **extra_kwargs)
                    break
            else:
                raise ValueError("Invalid mode %s. Options are rnn_relu, "
                                 "rnn_tanh, lstm, and gru"%mode)

            decoder_kwargs = {'in_units': num_hidden}
            if tie_weights:
                # Reuse the embedding's parameters for the output projection.
                decoder_kwargs['params'] = self.encoder.params
            self.decoder = nn.Dense(vocab_size, **decoder_kwargs)
            self.num_hidden = num_hidden

    def forward(self, inputs, hidden):
        """Run one forward pass.

        Returns a ``(decoded, hidden)`` pair: the decoder output computed on
        the recurrent output reshaped to ``(-1, num_hidden)``, and the
        recurrent layer's new state.
        """
        embedded = self.drop(self.encoder(inputs))
        rnn_out, next_hidden = self.rnn(embedded, hidden)
        flattened = self.drop(rnn_out).reshape((-1, self.num_hidden))
        return self.decoder(flattened), next_hidden

    def begin_state(self, *args, **kwargs):
        """Delegate initial-state creation to the recurrent layer."""
        return self.rnn.begin_state(*args, **kwargs)

In [5]:
# Hyper-parameters and paths for the run.
# Text file handed to Corpus(); it is tokenized character by character.
args_data = 'data/weicheng.txt'
# Recurrent layer type passed to RNNModel: rnn_relu, rnn_tanh, lstm or gru.
args_model = 'lstm'
# Embedding width.
args_emsize = 100
# Hidden units per recurrent layer.
args_nhid = 100
# Number of stacked recurrent layers.
args_nlayers = 2
# SGD learning rate given to the Trainer.
args_lr = 1.0
# Gradient clipping factor; train() scales it by bptt * batch_size.
args_clip = 0.2
# NOTE(review): not read by train(), which loops over range(10).
args_epochs = 1
# Number of parallel sequences per batch.
args_batch_size = 32
# Truncated-BPTT window length used by get_batch().
args_bptt = 5
# Dropout rate passed to RNNModel.
args_dropout = 0.2
# Passed as RNNModel's tie_weights argument.
args_tied = True
# NOTE(review): a string, not a bool, and never read in the visible cells;
# the context is set to mx.gpu() unconditionally.
args_cuda = 'store_true'
# Print the running loss every this many batches.
args_log_interval = 100
# NOTE(review): never read in the visible cells, so no checkpoint is written.
args_save = 'model.param'

In [6]:
# Device used for the model and data. NOTE(review): mx.gpu() is requested
# unconditionally, so this presumably needs a GPU-enabled MXNet -- confirm.
context = mx.gpu()
# Build the character vocabulary and the id array for the training text.
corpus = Corpus(args_data)

def batchify(data, batch_size):
    """Lay a flat sequence out as ``batch_size`` parallel columns.

    Trailing items that do not fill a complete row are dropped. The result
    has shape ``(data.shape[0] // batch_size, batch_size)``; each column is a
    contiguous slice of the input.
    """
    rows = data.shape[0] // batch_size
    usable = rows * batch_size
    return data[:usable].reshape((batch_size, rows)).T

# Column-batched training ids, moved to the compute device.
train_data = batchify(corpus.train, args_batch_size).as_in_context(context)

In [7]:
# Vocabulary size: number of distinct characters plus '<eos>'.
ntokens = len(corpus.dictionary)

# Build the network from the configuration constants.
model = RNNModel(args_model, ntokens, args_emsize, args_nhid,
                       args_nlayers, args_dropout, args_tied)
# Xavier-initialise all parameters on the compute device.
model.collect_params().initialize(mx.init.Xavier(), ctx=context)
# Plain SGD: no momentum, no weight decay.
trainer = gluon.Trainer(model.collect_params(), 'sgd',
                        {'learning_rate': args_lr, 'momentum': 0, 'wd': 0})
# Loss applied to the decoder output and the flattened targets.
loss = gluon.loss.SoftmaxCrossEntropyLoss()

In [8]:
def get_batch(source, i, bptt=None):
    """Slice one truncated-BPTT window and its next-step targets.

    Args:
        source: Array indexed along axis 0; must support ``.shape``, slicing
            and ``.reshape``.
        i: Index of the first row of the window.
        bptt: Maximum window length. Defaults to the module-level
            ``args_bptt`` when omitted, which is the original behaviour.

    Returns:
        ``(data, target)`` where ``data`` is ``source[i:i + seq_len]`` and
        ``target`` is the same window shifted one row forward, flattened to
        1-D. ``seq_len`` is capped so the shifted window stays inside
        ``source``.
    """
    if bptt is None:
        bptt = args_bptt
    # Leave one row at the end so every input row has a following target row.
    seq_len = min(bptt, source.shape[0] - 1 - i)
    data = source[i : i + seq_len]
    target = source[i + 1 : i + 1 + seq_len]
    return data, target.reshape((-1,))

def detach(hidden):
    """Return ``hidden`` with ``.detach()`` applied.

    A tuple or list of states yields a new list of detached states; any
    other object is detached directly.
    """
    if not isinstance(hidden, (tuple, list)):
        return hidden.detach()
    return [state.detach() for state in hidden]

In [9]:
def eval(data_source):
    """Return the average loss of ``model`` over ``data_source``.

    The data is walked in windows of ``args_bptt`` rows, carrying the
    recurrent state from one window to the next. The result is the sum of
    all loss values divided by the number of loss elements.

    Note: this name shadows Python's builtin ``eval`` within the notebook.
    """
    loss_sum = 0.0
    loss_count = 0
    hidden = model.begin_state(func = mx.nd.zeros, batch_size = args_batch_size, ctx=context)
    last_start = data_source.shape[0] - 1
    for offset in range(0, last_start, args_bptt):
        data, target = get_batch(data_source, offset)
        output, hidden = model(data, hidden)
        batch_loss = loss(output, target)
        loss_sum += mx.nd.sum(batch_loss).asscalar()
        loss_count += batch_loss.size
    return loss_sum / loss_count

In [10]:
def generate(word_len=100):
    """Sample a sequence of token ids from the model.

    A start token is drawn uniformly from the vocabulary; each further token
    is sampled from the softmax of the model output for the previous token,
    with the recurrent state carried along.

    Args:
        word_len: Number of tokens to sample after the start token.

    Returns:
        A list of ``word_len + 1`` token ids (start token first), all plain
        Python ints.
    """
    start_index = np.random.randint(ntokens)
    word_list = [start_index]

    inputs = mx.nd.array([word_list]).as_in_context(context)
    hidden = model.begin_state(func=mx.nd.zeros, batch_size=1, ctx=context)

    # Inference only: pause() records no autograd graph, and train_mode=False
    # keeps dropout disabled, so no per-step detach of the state is needed.
    with autograd.pause(train_mode=False):
        for _ in range(word_len):
            output, hidden = model(inputs, hidden)
            next_token_probs = output[0].softmax()
            # asscalar() yields a numpy scalar; store a plain int instead.
            output_id = int(mx.nd.random.multinomial(next_token_probs).asscalar())
            word_list.append(output_id)
            inputs = mx.nd.array([[output_id]]).as_in_context(context)
    return word_list

In [12]:
def train(num_epochs=10):
    """Train ``model`` on ``train_data`` and print a sample after each epoch.

    Uses the module-level ``model``, ``trainer``, ``loss``, ``train_data``,
    ``context`` and ``args_*`` settings.

    Args:
        num_epochs: Number of passes over ``train_data``. Defaults to 10, the
            value that was previously hard-coded. NOTE(review): the config
            constant ``args_epochs`` is not consulted here.
    """
    for epoch in range(num_epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func = mx.nd.zeros, batch_size = args_batch_size, ctx = context)
        for ibatch, i in enumerate(range(0, train_data.shape[0] - 1, args_bptt)):
            data, target = get_batch(train_data, i)
            # Cut the graph at the window boundary (truncated BPTT).
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                L = loss(output, target)
                L.backward()

            grads = [param.grad(context) for param in model.collect_params().values()]
            # Here gradient is for the whole batch.
            # So we multiply max_norm by batch_size and bptt size to balance it.
            gluon.utils.clip_global_norm(grads, args_clip * args_bptt * args_batch_size)

            trainer.step(args_batch_size)
            total_L += mx.nd.sum(L).asscalar()

            if ibatch % args_log_interval == 0 and ibatch > 0:
                # NOTE(review): the first logged window accumulates batches
                # 0..args_log_interval inclusive, one more than the divisor.
                cur_L = total_L / args_bptt / args_batch_size / args_log_interval
                print('[Epoch %d Batch %d] loss %.2f, perplexity %.2f' % (
                    epoch + 1, ibatch, cur_L, math.exp(cur_L)))
                total_L = 0.0

        # Show a sample of generated text at the end of every epoch.
        print(''.join([corpus.dictionary.idx2word[x] for x in generate()]))
    

In [None]:
# Run training; prints the running loss and one generated sample per epoch.
train()

[Epoch 1 Batch 100] loss 6.74, perplexity 844.64
[Epoch 1 Batch 200] loss 6.30, perplexity 543.67
[Epoch 1 Batch 300] loss 6.29, perplexity 539.44
[Epoch 1 Batch 400] loss 6.23, perplexity 507.25
[Epoch 1 Batch 500] loss 6.24, perplexity 511.03
[Epoch 1 Batch 600] loss 6.17, perplexity 477.75
[Epoch 1 Batch 700] loss 6.04, perplexity 418.23
[Epoch 1 Batch 800] loss 5.99, perplexity 398.89
[Epoch 1 Batch 900] loss 5.87, perplexity 355.93
[Epoch 1 Batch 1000] loss 5.88, perplexity 359.53
[Epoch 1 Batch 1100] loss 5.85, perplexity 348.34
[Epoch 1 Batch 1200] loss 5.72, perplexity 303.88
[Epoch 1 Batch 1300] loss 5.68, perplexity 294.19
579 55
245 55
246 55
1449 55
63 55
70 55
646 55
278 55
409 55
245 409
160 246
121 55
128 245
550 55
824 55
551 55
104 552
323 210
502 48
37 210
477 55
1204 210
48 55
38 18
35 37
16 210
104 55
146 210
553 55
440 245
152 55
2027 55
11 245
337 210
30 55
72 245
37 210
18 55
11 586
35 30
278 210
35 55
1010 210
1499 55
486 55
397 55
44 55
369 55
18 55
223 586
169

223 37
683 37
117 48
383 179
35 55
86 68
17 55
55 18
566 566
246 246
71 104
545 72
372 17
1001 90
325 55
55 55
104 566
204 30
30 71
905 210
460 152
179 48
119 210
120 120
179 55
86 617
1363 322
1257 1257
515 48
185 17
44 464
1839 48
90 55
18 48
643 586
1436 566
559 559
586 586
114 9
1200 1200
1200 1200
502 551
611 611
116 116
385 551
732 17
11 55
402 210
55 55
245 795
489 246
406 35
78 130
2083 76
802 55
244 55
659 55
430 245
564 55
314 314
557 557
384 55
127 111
55 55
404 245
71 130
130 35
295 552
17 358
552 55
553 553
45 566
1204 1204
1204 1204
162 55
559 55
104 586
甥吩凉、贡险在规桶（teeasfsrsme，这种女者最听不到了，我们也嫁心威数，这话是胆龄有时候有到辛楣跑结去帐里。‘啊！”三奶奶高松年打床一餐，他倒只照秘声笑诉她孙小姐美事，忙也说看了：“大伯伯等！这
[Epoch 7 Batch 100] loss 4.72, perplexity 112.65
[Epoch 7 Batch 200] loss 4.57, perplexity 96.35
[Epoch 7 Batch 300] loss 4.67, perplexity 106.48
[Epoch 7 Batch 400] loss 4.57, perplexity 96.92
[Epoch 7 Batch 500] loss 4.63, perplexity 102.34
[Epoch 7 Batch 600] loss 4.67, perplexity 106.66
[Epoch 7 Batch 700] loss 4.61, 

In [None]:
# Scratch: pick a random start token id, as generate() does.
start_index = np.random.randint(ntokens)
word_list = [start_index]
print(start_index)

In [None]:
# Scratch: run one forward step of the model on the start token.
inputs = mx.nd.array([word_list]).as_in_context(context)
hidden = model.begin_state(func=mx.nd.zeros, batch_size=1, ctx=context)

hidden = detach(hidden)
output, hidden = model(inputs, hidden)
# Display the raw decoder output for this step.
output

In [None]:
# Highest-scoring token id; exp() is monotonic, so it does not change the argmax.
mx.nd.argmax(output.exp(), 1)

In [None]:
# Inspect the raw score assigned to token id 55.
output[0][55]

In [None]:
# Sample from the exponentiated scores of the first 100 token ids only.
# NOTE(review): these weights are not normalised to sum to 1 -- confirm that
# sample_multinomial accepts that, or use softmax as generate() does.
mx.nd.sample_multinomial(output.exp()[0][:100])

In [None]:
# Peek at the raw scores of the first 10 token ids.
output[0][:10]

In [None]:
# Ten hand-copied raw scores for experimenting with samplers. Despite the
# name they are not probabilities: some are negative and they do not sum to 1.
probs = mx.nd.array([-0.82721889 , 1.06662786 , 0.55473745 , 0.9936285 ,  1.49305928 ,-0.57071215,
  2.0720911 ,  1.53752804,  2.69064689, -1.02780223])
probs

In [None]:
# Index of the largest score in the experiment vector.
mx.nd.argmax(probs, axis=0)

In [None]:
import torch

In [None]:
# Draw 10 independent samples with MXNet from softmax(probs).
for i in range(10):
    print(mx.nd.random.multinomial(probs.softmax()).asscalar())

In [None]:
def softmax(x):
    """Softmax over all elements of a numpy array.

    The global maximum is subtracted before exponentiating so large inputs
    do not overflow; the result sums to 1 across the whole array.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    return exps / exps.sum()

In [None]:
# Normalised probabilities for the experiment vector, computed with numpy.
softmax(probs.asnumpy())

In [None]:
# Draw 10 independent samples with numpy; argmax of the one-hot draw is the sampled index.
for i in range(10):
    print(np.random.multinomial(1, softmax(probs.asnumpy())).argmax())

In [None]:
# Draw 10 independent samples with PyTorch for comparison with the cells above.
# `probs` holds raw scores (some negative) and torch.multinomial requires
# non-negative weights, so normalise with softmax first. replacement=True
# makes the 10 draws independent, like the MXNet and numpy loops.
torch.multinomial(torch.softmax(torch.FloatTensor(probs.asnumpy()), dim=0), 10, replacement=True)

In [None]:
# Exponentiated raw scores for the single step computed earlier (unnormalised).
output.exp()

In [None]:

# Scratch: one manual sampling step, mirroring the loop body of generate().
# Sample from softmax(output[0]) rather than the raw scores, as generate()
# does, and print it next to the greedy (argmax) choice for comparison.
output_id = int(mx.nd.sample_multinomial(output[0].softmax()).asscalar())
print(output_id, int(mx.nd.argmax(output, 1).asscalar()))
word_list.append(output_id)
inputs = mx.nd.array([[output_id]]).as_in_context(context)
# A bare `return` is a SyntaxError outside a function; show the list instead.
word_list