# MXNET

In [1]:
from mxnet import nd
import os
import mxnet as mx
from subprocess import call
import random
import math
import zipfile
from mxnet import autograd, nd, gluon, init
from mxnet.gluon import loss as gloss, nn, rnn
import time



In [2]:
def try_gpu():
    """If GPU is available, return mx.gpu(0); else return mx.cpu()."""
    try:
        ctx = mx.gpu()
        _ = nd.array([0], ctx=ctx)
    except mx.base.MXNetError:
        ctx = mx.cpu()
    return ctx

In [3]:

X, W_xh = nd.random.normal(shape=(3,1)) , nd.random.normal(shape=(1,4))
H, W_hh = nd.random.normal(shape=(3,4)) , nd.random.normal(shape=(4,4))
nd.concat(X, H, dim=1)


[[ 2.2122064  -0.45138445  0.57938355 -1.856082   -1.9768796 ]
 [ 0.7740038  -0.20801921  0.2444218  -0.03716067 -0.48774993]
 [ 1.0434405  -0.02261727  0.57461417  1.4661262   0.6862904 ]]
<NDArray 3x5 @cpu(0)>

In [4]:
nd.concat(W_xh, W_hh, dim=0)


[[ 1.1839255   1.8917114  -1.2347414  -1.771029  ]
 [ 0.35496104  1.0731696   0.12017461 -0.9711102 ]
 [-0.77569664 -0.7882176   0.7417728  -1.4734439 ]
 [-1.0730928  -1.0424827  -1.3278849  -1.4749662 ]
 [-0.52414197  1.2662556   0.8950642  -0.6015945 ]]
<NDArray 5x4 @cpu(0)>

In [5]:
nd.dot(nd.concat(X, H, dim=1), nd.concat(W_xh, W_hh, dim=0))


[[ 5.0373516   2.6754622  -1.6607479  -0.4062885 ]
 [ 0.94845396  0.46941754 -1.1866102  -1.1806769 ]
 [-1.1514019   0.8373027  -2.1974368  -5.2480164 ]]
<NDArray 3x4 @cpu(0)>

In [6]:
# https://dikers-data.s3.cn-northwest-1.amazonaws.com.cn/dataset/jaychou_lyrics.txt.zip

In [7]:
base_dir  = 'datasets'
output_dir = 'output'
data_file = os.path.join(base_dir, "jaychou_lyrics.txt.zip")

In [8]:
!pwd 
os.chdir(base_dir)
!pwd && ls 


print("")
print("Downloading...")
if not os.path.exists(data_file):
    call(
        'wget "https://dikers-data.s3.cn-northwest-1.amazonaws.com.cn/dataset/jaychou_lyrics.txt.zip"',
        shell=True
    )
    print("Downloading done.\n")
else:
    print("Dataset already downloaded. Did not download twice.\n")


!pwd && ls
os.chdir("..")
!pwd 



/mnt/sdf/workspace/git_hub_demo/learn-rnn
/mnt/sdf/workspace/git_hub_demo/learn-rnn/datasets
anna.txt		     im2txt		     spa.txt
babi_tasks_1-20_v1-2.tar.gz  im2txt.zip		     UCI HAR Dataset
babi-tasks-v1-2.tar.gz	     jaychou_lyrics.txt.zip  UCI HAR Dataset.zip
cnews			     mldata

Downloading...
Downloading done.

/mnt/sdf/workspace/git_hub_demo/learn-rnn/datasets
anna.txt		     im2txt		       mldata
babi_tasks_1-20_v1-2.tar.gz  im2txt.zip		       spa.txt
babi-tasks-v1-2.tar.gz	     jaychou_lyrics.txt.zip    UCI HAR Dataset
cnews			     jaychou_lyrics.txt.zip.1  UCI HAR Dataset.zip
/mnt/sdf/workspace/git_hub_demo/learn-rnn


In [9]:
with zipfile.ZipFile(data_file) as zin:
    with zin.open('jaychou_lyrics.txt') as f:
        corpus_chars = f.read().decode('utf-8')
corpus_chars[:40]

'想要有直升机\n想要和你飞到宇宙去\n想要和你融化在一起\n融化在宇宙里\n我每天每天每'

In [10]:
corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
print('length: {} '.format(len(corpus_chars)))
corpus_chars = corpus_chars[0:10000]

length: 63282 


In [11]:
idx_to_char = list(set(corpus_chars))
char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])

vocab_size = len(char_to_idx)

In [12]:
vocab_size

1027

In [14]:
corpus_indices = [char_to_idx[char] for char in corpus_chars]
sample = corpus_indices[:20]
print('chars: ', ''.join([idx_to_char[idx] for idx in sample]))
print('indices:', sample)

chars:  想要有直升机 想要和你飞到宇宙去 想要和
indices: [1010, 845, 66, 474, 406, 646, 970, 1010, 845, 713, 157, 443, 886, 680, 915, 441, 970, 1010, 845, 713]


In [15]:
def data_iter_random(corpus_indices, batch_size, num_steps, ctx=None):
    """Sample mini-batches in a random order from sequential data."""
    num_examples = (len(corpus_indices) - 1) // num_steps
    epoch_size = num_examples // batch_size
    example_indices = list(range(num_examples))
    random.shuffle(example_indices)

    def _data(pos):
        return corpus_indices[pos : pos + num_steps]

    for i in range(epoch_size):
        i = i * batch_size
        batch_indices = example_indices[i : i + batch_size]
        X = nd.array(
            [_data(j * num_steps) for j in batch_indices], ctx=ctx)
        Y = nd.array(
            [_data(j * num_steps + 1) for j in batch_indices], ctx=ctx)
        yield X, Y

In [16]:
def data_iter_consecutive(corpus_indices, batch_size, num_steps, ctx=None):
    """Sample mini-batches in a consecutive order from sequential data."""
    corpus_indices = nd.array(corpus_indices, ctx=ctx)
    data_len = len(corpus_indices)
    batch_len = data_len // batch_size
    indices = corpus_indices[0 : batch_size * batch_len].reshape((
        batch_size, batch_len))
    epoch_size = (batch_len - 1) // num_steps
    for i in range(epoch_size):
        i = i * num_steps
        X = indices[:, i : i + num_steps]
        Y = indices[:, i + 1 : i + num_steps + 1]
        yield X, Y

In [17]:
my_seq = list(range(60))
for X, Y in data_iter_consecutive(my_seq, batch_size=3, num_steps=5):
    print("X: ", X, '\nY: ', Y, '\n')

X:  
[[ 0.  1.  2.  3.  4.]
 [20. 21. 22. 23. 24.]
 [40. 41. 42. 43. 44.]]
<NDArray 3x5 @cpu(0)> 
Y:  
[[ 1.  2.  3.  4.  5.]
 [21. 22. 23. 24. 25.]
 [41. 42. 43. 44. 45.]]
<NDArray 3x5 @cpu(0)> 

X:  
[[ 5.  6.  7.  8.  9.]
 [25. 26. 27. 28. 29.]
 [45. 46. 47. 48. 49.]]
<NDArray 3x5 @cpu(0)> 
Y:  
[[ 6.  7.  8.  9. 10.]
 [26. 27. 28. 29. 30.]
 [46. 47. 48. 49. 50.]]
<NDArray 3x5 @cpu(0)> 

X:  
[[10. 11. 12. 13. 14.]
 [30. 31. 32. 33. 34.]
 [50. 51. 52. 53. 54.]]
<NDArray 3x5 @cpu(0)> 
Y:  
[[11. 12. 13. 14. 15.]
 [31. 32. 33. 34. 35.]
 [51. 52. 53. 54. 55.]]
<NDArray 3x5 @cpu(0)> 



In [18]:
nd.one_hot(nd.array([0,2]), vocab_size)
print('vocab size {}'.format(vocab_size))
nd.one_hot(nd.array([0,2]), 6)

vocab size 1027



[[1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]
<NDArray 2x6 @cpu(0)>

In [19]:
def to_onehot(X, size):
    return [nd.one_hot(x, size) for x in X.T]

In [20]:
X = nd.arange(10).reshape((2, 5))
inputs = to_onehot(X, vocab_size)
len(inputs), inputs[0].shape
print(inputs[0])


[[1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
<NDArray 2x1027 @cpu(0)>


In [21]:
num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size
ctx = try_gpu()
print('will use', ctx)

will use cpu(0)


In [22]:
def get_params():
    def _one(shape):
        return nd.random.normal(scale=0.01, shape=shape, ctx=ctx)
    
    W_xh = _one((num_inputs, num_hiddens))
    W_hh = _one((num_hiddens, num_hiddens))
    b_h = nd.zeros(num_hiddens, ctx=ctx)
    
    W_hq = _one((num_hiddens, num_outputs))
    b_q = nd.zeros(num_outputs, ctx=ctx)
    
    params = [W_xh, W_hh, b_h, W_hq, b_q]
    
    for param in params:
        param.attach_grad()
    return params

In [23]:
def init_rnn_state(batch_size, num_hiddens, ctx):
    return (nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx), )

In [24]:
def rnn(inputs, state, params):
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    
    for X in inputs:
        H = nd.tanh(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)
        Y = nd.dot(H, W_hq) + b_q
        outputs.append(Y)
        
    return outputs, (H, )

In [25]:
print(" X.shape {}  num_hiddens: {} ".format(X.shape , num_hiddens) )
state = init_rnn_state(X.shape[0], num_hiddens, ctx)
inputs = to_onehot(X.as_in_context(ctx), vocab_size)
print("input shape: {}".format(inputs))
params = get_params()
outputs, state_new = rnn(inputs, state, params)
len(outputs), outputs[0].shape, state_new[0].shape

 X.shape (2, 5)  num_hiddens: 256 
input shape: [
[[1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
<NDArray 2x1027 @cpu(0)>, 
[[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
<NDArray 2x1027 @cpu(0)>, 
[[0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
<NDArray 2x1027 @cpu(0)>, 
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
<NDArray 2x1027 @cpu(0)>, 
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
<NDArray 2x1027 @cpu(0)>]


(5, (2, 1027), (2, 256))

In [26]:
def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state,
                num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx):
    """Predict next chars with a RNN model"""
    state = init_rnn_state(1, num_hiddens, ctx)
    output = [char_to_idx[prefix[0]]]
    for t in range(num_chars + len(prefix) - 1):
        X = to_onehot(nd.array([output[-1]], ctx=ctx), vocab_size)
        (Y, state) = rnn(X, state, params)
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])
        else:
            output.append(int(Y[0].argmax(axis=1).asscalar()))
    return ''.join([idx_to_char[i] for i in output])

In [27]:
predict_rnn('你好', 11, rnn, params, init_rnn_state, num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx)

'你好家日歌铺撑御熬窝林寂刀'

In [28]:
def sgd(params, lr, batch_size):
    """Mini-batch stochastic gradient descent."""
    for param in params:
        param[:] = param - lr * param.grad / batch_size

In [29]:
def grad_clipping(params, theta, ctx):
    """Clip the gradient."""
    if theta is not None:
        norm = nd.array([0], ctx)
        for param in params:
            norm += (param.grad ** 2).sum()
        norm = norm.sqrt().asscalar()
        if norm > theta:
            for param in params:
                param.grad[:] *= theta / norm


In [30]:
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    """Train an RNN model and predict the next item in the sequence."""
    if is_random_iter:
        data_iter_fn = data_iter_random
    else:
        data_iter_fn = data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()

    for epoch in range(num_epochs):
        if not is_random_iter:
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            if is_random_iter:
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:
                for s in state:
                    s.detach()
            with autograd.record():
                inputs = to_onehot(X, vocab_size)
                (outputs, state) = rnn(inputs, state, params)
                outputs = nd.concat(*outputs, dim=0)
                y = Y.T.reshape((-1,))
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx)
            sgd(params, lr, 1)
            l_sum += l.asscalar() * y.size
            n += y.size

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(
                    prefix, pred_len, rnn, params, init_rnn_state,
                    num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx))

In [31]:
num_epochs, num_steps, batch_size, lr, clipping_theta = 250, 35, 32, 1e2, 1e-2, 
pred_period, pred_len, prefixes = 50, 50, ['分开', '不分开']

In [88]:
is_random_iter = True
train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx,  corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes)

epoch 50, perplexity 70.222427, time 0.69 sec
 - 分开 我想要再生  不知我有多  爱 我不 我不要再想  不知我有多 想爱就我 全小的让我疯狂的可爱女人
 - 不分开  我有你有  一定我有多 想爱就我 全小的让我疯狂的可爱女人 坏坏的让我疯狂的可爱女人 坏坏的让我
epoch 100, perplexity 10.646796, time 0.70 sec
 - 分开 我想想这样你 不知不觉 你已经这节我 不知不觉 我已经这节我 后知不觉 我已好这节奏 后知后觉 我
 - 不分开吗 我想你这想你 不知不觉 你已经这节我 不知不觉 我已经这节我 后知不觉 我已好这节奏 后知后觉 
epoch 150, perplexity 3.017763, time 0.70 sec
 - 分开 一颗用双截棍 哼哼哈兮 快使用人太记 仁生无敌 是谁在练太极  哼穿了我 不要一口热 折制茶烛抽 
 - 不分开吗 我不能再想 我不 我不 我不要再想你 不知不觉 你已经离开我 不知不觉 我跟了这节奏 后知后觉 
epoch 200, perplexity 1.618426, time 0.70 sec
 - 分开 一颗用双留 谁底它停留的 为什么我女朋友场外加油 你却还让我出糗 从小就耳濡目染 什么刀枪跟棍棒 
 - 不分开吗 我叫你爸 你打我妈 这样对吗干嘛这样 什么让危险边缘Bab 印地安斑鸠 会学人开口 仙人掌怕羞 
epoch 250, perplexity 1.306915, time 0.71 sec
 - 分开 我想想你 是我面外婆堡  说穿了其实我的愿望就怎么小 就怎么每天祈祷我的心跳你知道  杵在伊斯坦堡
 - 不分开期 然后将过去 慢慢温习 让我爱上你 那场悲剧 是你完美演出的一场戏 宁愿心碎哭泣 再狠狠忘记 你爱


#  Gluon 实现

In [43]:
def load_data_jay_lyrics():
    """Load the Jay Chou lyric data set (available in the Chinese book)."""
    with zipfile.ZipFile(data_file) as zin:
        with zin.open('jaychou_lyrics.txt') as f:
            corpus_chars = f.read().decode('utf-8')
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    corpus_chars = corpus_chars[0:10000]
    idx_to_char = list(set(corpus_chars))
    char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
    vocab_size = len(char_to_idx)
    corpus_indices = [char_to_idx[char] for char in corpus_chars]
    return corpus_indices, char_to_idx, idx_to_char, vocab_size

In [41]:
(corpus_indices, char_to_idx, idx_to_char, vocab_size) = load_data_jay_lyrics()
vocab_size = len(char_to_idx)

In [None]:
num_hiddens = 256 
rnn_layer = rnn.RNN(num_hiddens)
rnn_layer.initialize()

In [111]:
batch_size = 2
state = rnn_layer.begin_state(batch_size=batch_size)
state[0].shape

(1, 2, 256)

In [112]:
num_steps = 35
X = nd.random.uniform(shape=(num_steps, batch_size, vocab_size))
Y, state_new = rnn_layer(X, state)
Y.shape, len(state_new), state_new[0].shape

((35, 2, 256), 1, (1, 2, 256))

In [113]:
class RNNModel(nn.Block):
    """RNN model."""
    def __init__(self, rnn_layer, vocab_size, **kwargs):
        super(RNNModel, self).__init__(**kwargs)
        self.rnn = rnn_layer
        self.vocab_size = vocab_size
        self.dense = nn.Dense(vocab_size)

    def forward(self, inputs, state):
        X = nd.one_hot(inputs.T, self.vocab_size)
        Y, state = self.rnn(X, state)
        output = self.dense(Y.reshape((-1, Y.shape[-1])))
        return output, state

    def begin_state(self, *args, **kwargs):
        return self.rnn.begin_state(*args, **kwargs)

In [114]:
def predict_rnn_gluon(prefix, num_chars, model, vocab_size, ctx, idx_to_char, char_to_idx):
    """Predict next chars with a RNN model"""
    state = model.begin_state(batch_size=1, ctx=ctx)
    output = [char_to_idx[prefix[0]]]
    
    for t in range(num_chars + len(prefix) - 1):
        X = nd.array([output[-1]], ctx=ctx).reshape((1,1))
        (Y, state) = model(X, state)
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])
        else:
            output.append(int(Y.argmax(axis=1).asscalar()))
    return ''.join([idx_to_char[i] for i in output])

In [115]:
def train_and_predict_rnn_gluon(model, num_hiddens, vocab_size, ctx,
                                corpus_indices, idx_to_char, char_to_idx,
                                num_epochs, num_steps, lr, clipping_theta,
                                batch_size, pred_period, pred_len, prefixes):
    """Train an Gluon RNN model and predict the next item in the sequence."""
    loss = gloss.SoftmaxCrossEntropyLoss()
    model.initialize(ctx=ctx, force_reinit=True, init=init.Normal(0.01))
    trainer = gluon.Trainer(model.collect_params(), 'sgd',
                            {'learning_rate': lr, 'momentum': 0, 'wd': 0})

    for epoch in range(num_epochs):
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_consecutive(
            corpus_indices, batch_size, num_steps, ctx)
        state = model.begin_state(batch_size=batch_size, ctx=ctx)
        for X, Y in data_iter:
            for s in state:
                s.detach()
            with autograd.record():
                (output, state) = model(X, state)
                y = Y.T.reshape((-1,))
                l = loss(output, y).mean()
            l.backward()
            params = [p.data() for p in model.collect_params().values()]
            grad_clipping(params, clipping_theta, ctx)
            trainer.step(1)
            l_sum += l.asscalar() * y.size
            n += y.size

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn_gluon(
                    prefix, pred_len, model, vocab_size, ctx, idx_to_char,
                    char_to_idx))

In [116]:
model = RNNModel(rnn_layer, vocab_size)
model.initialize(force_reinit=True, ctx=ctx)
predict_rnn_gluon('分开' , 40, model, vocab_size, ctx, idx_to_char, char_to_idx)

'分开团窜麦珊浩炮書局捡足缝给渲信脂窜麦珊浩炮预侬慈宽烊玄浩讽弯饭展卡刮吾牢扬码岂侬寇'

In [None]:
%%time
num_epochs, batch_size, lr, clipping_theta = 700, 32, 1e2, 1e-2

pre_period, pred_len, prefixes = 50, 50, ['分开', '不分开']
train_and_predict_rnn_gluon(model, num_hiddens, vocab_size, ctx,
                                corpus_indices, idx_to_char, char_to_idx,
                                num_epochs, num_steps, lr, clipping_theta,
                                batch_size, pred_period, pred_len, prefixes)

# LSTM 实现

In [44]:
(corpus_indices, char_to_idx, idx_to_char, vocab_size) = load_data_jay_lyrics()
num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size
ctx = try_gpu()

def get_params():
    def _one(shape):
        return nd.random.normal(scale=0.01, shape=shape, ctx=ctx)
    
    def _three():
        return (_one((num_inputs, num_hiddens)), 
                _one((num_hiddens, num_hiddens)), 
                nd.zeros(num_hiddens, ctx=ctx))
    
    W_xi, W_hi, b_i = _three()
    W_xf, W_hf, b_f = _three()
    W_xo, W_ho, b_o = _three()
    W_xc, W_hc, b_c = _three()
    
    W_hq = _one((num_hiddens, num_outputs))
    b_q = nd.zeros(num_outputs, ctx=ctx)
    
    params = [W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o, W_xc, W_hc, b_c, W_hq, b_q]
    
    for param in params:
        param.attach_grad()
        
    return params

In [45]:
def init_lstm_state(batch_size, num_hiddens, ctx):
    return (nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx), 
            nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx))

def lstm(inputs, state, params):
    [W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o, W_xc, W_hc, b_c, W_hq, b_q] = params
    
    (H, C) = state
    outputs = []
    
    for X in inputs:
        I = nd.sigmoid(nd.dot(X, W_xi)+ nd.dot(H, W_hi)+ b_i)
        F = nd.sigmoid(nd.dot(X, W_xf)+ nd.dot(H, W_hf)+ b_f)
        O = nd.sigmoid(nd.dot(X, W_xo)+ nd.dot(H, W_ho) +b_o)
        C_tilda = nd.tanh(nd.dot(X, W_xc) + nd.dot(H, W_hc)+ b_c)
        
        C = F *C + I * C_tilda
        H = O * C.tanh()
        
        Y = nd.dot(H, W_hq) + b_q
        outputs.append(Y)
        
    return outputs, (H, C)

In [46]:
%%time
num_epochs, num_steps, batch_size, lr, clipping_theta = 1000,35, 32, 1e2, 1e-2

pred_period, pred_len, prefixes = 50, 50, ['喜欢', '不分开']



CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 9.3 µs


In [None]:
is_random_iter = False
train_and_predict_rnn(lstm, get_params, init_lstm_state, num_hiddens,
                          vocab_size, ctx,  corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes)

epoch 50, perplexity 161.083389, time 1.67 sec
 - 喜欢 我不的我 我不不 我不不 我不不 我不不 我不不 我不不 我不不 我不不 我不不 我不不 我不不 
 - 不分开 我想我 我不不 我不不 我不不 我不不 我不不 我不不 我不不 我不不 我不不 我不不 我不不 我
epoch 100, perplexity 32.680239, time 1.73 sec
 - 喜欢 我想你的你笑 有你 你想我的久  有 你的你很久 想想 你想 我不要 我不 我不 我不要 我不要 
 - 不分开 我想你的爱笑 一样 我想你的你有 有你 你想我想 我想 我想 我不要 我不要 我不要 我不要 我不
epoch 150, perplexity 5.245681, time 1.66 sec
 - 喜欢 我想带你 你不著听 想想就这样着我妈妈 难道你的手快幽默 不要再这样打我妈妈 难道你不了 让让我 
 - 不分开 我已要你 我不要烦  我有你 说你是是我 我开开这样活着你 别怪开 别怪我 说你怎么 对对怎么停么
epoch 200, perplexity 1.730521, time 1.66 sec
 - 喜欢 我已儿 其子我 一定伦中中对人 双是壁壁中里多 我该儿河我有棒 一天忙人地当 快使用双截棍 哼哼哈
 - 不分开 我已经这生我 不知不觉 你已经离节奏 后知不觉 我该好好生活 我该好好生活 静静悄悄默默离开 陷入
epoch 250, perplexity 1.195605, time 1.67 sec
 - 喜欢 问弄我 是属于那手代白墙黑瓦的淡淡的忧伤 消失的 旧时光 一九四三 回头看 的片段 有一些风霜 老
 - 不分开 我已经这生我 不知不觉 我跟了这节奏 后知后觉 后知后觉 迷迷蒙蒙 你给的梦 出现裂缝 隐隐作痛 
epoch 300, perplexity 1.076256, time 1.72 sec
 - 喜欢 问弄堂 是属于那年代白墙黑瓦的淡淡的忧伤 消失的 旧时光 一九四三 回头看 的片段 有一些风霜 老
 - 不分开 我已经离开我 不知不觉 我跟了这节奏 后知后觉 又过了一个秋 后知后觉 我该好好生活 我该好好生活
epoch 350, perplexity 1.073498, time 1.66 se