In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from keras.models import Sequential,Model
from keras.layers import Dense, Activation, SimpleRNN, LSTM, GRU
from keras.optimizers import RMSprop
from keras.initializers import RandomNormal, Identity
from keras.utils import np_utils
import random

In [2]:
def load_text(path):
    """Return the full contents of the UTF-8 text file at ``path`` as one string."""
    with open(path, encoding='utf-8') as handle:
        return handle.read()

In [4]:
# Read the whole corpus into memory as a single string.
# The path is relative to the notebook's working directory.
text = load_text('./hongloumeng.txt')

In [5]:
# Corpus size in characters (text is a str, so this counts characters, not bytes).
len(text)

855343

In [7]:
# Vocabulary: every distinct character of the corpus, in sorted order so that
# the index assigned to each character is stable from run to run.
chars = sorted(set(text))
print('length of unique chars: ', len(chars))
# Lookup tables in both directions: index -> character and character -> index.
indices_char = dict(enumerate(chars))
char_indices = {ch: idx for idx, ch in indices_char.items()}

length of unique chars:  4533


In [9]:
# Slide a window of `maxlen` characters over the corpus, advancing `step`
# characters each time. Each window is one training input and the character
# that immediately follows it is the prediction target.
maxlen = 40
step = 3
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('number of sentences: ', len(sentences))
print('number of next_chars: ', len(next_chars))

number of sentences:  285101
number of next_chars:  285101


In [16]:
# Boolean one-hot containers, filled in by a later cell:
#   x[i, t, c] is set when character t of window i has vocabulary index c
#   y[i, c]    is set when the character following window i has index c
# The builtin `bool` is used as dtype because the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
# NOTE(review): with the sizes printed above (285101 windows x 40 x 4533 chars)
# x spans roughly 51.7e9 one-byte cells; confirm the machine can hold it or
# switch to batch-wise generation.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)

In [27]:
# Sanity check: number of set entries in the first target row (expected 1 once encoded).
# NOTE(review): this cell's execution count (27) is later than that of the one-hot
# fill cell (18) which appears further down. On a fresh top-to-bottom run y is still
# all zeros at this point, so this would display 0 instead of the recorded 1.
np.sum(y[0])

1

**x, y -> one-hot encoding**

In [18]:
# Fill the one-hot tensors: one set entry per (window, position) in x and
# one set entry per window in y for the character that follows it.
for row, (sentence, target_char) in enumerate(zip(sentences, next_chars)):
    for pos, char in enumerate(sentence):
        x[row, pos, char_indices[char]] = True
    y[row, char_indices[target_char]] = True

In [19]:
# Inspect the first encoded window: its (maxlen, vocabulary size) shape, then the matrix itself.
print(x[0].shape)
x[0]

(40, 4533)


array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [22]:
# Character-level language model: one LSTM layer followed by a softmax classifier
# over the whole vocabulary.
model = Sequential([
    # units=128 is the memory capacity of the layer, i.e. the length of a single
    # hidden-state vector h.
    # return_sequences defaults to False, so only the last h is emitted; with True
    # every time step's h is emitted. Set return_sequences=True when stacking
    # several LSTM layers on top of each other.
    LSTM(128, input_shape=(maxlen, len(chars))),
    # The final 128-dimensional h is projected onto a len(chars)-dimensional vector
    # with one entry per character of the vocabulary.
    Dense(len(chars)),
    # Softmax turns those entries into per-index probabilities that sum to 1,
    # which are then compared with the true y to compute the loss.
    Activation('softmax'),
])

In [24]:
# RMSprop optimizer with a learning rate of 0.01.
# NOTE(review): newer Keras releases name this argument `learning_rate`;
# confirm against the installed Keras version before renaming.
optimizer = RMSprop(lr=0.01)
# Categorical cross-entropy pairs with the one-hot targets in y and the softmax output layer.
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [28]:
def sample(preds, diversity=1.0):
    """Draw one index from a probability vector after temperature re-scaling.

    Each probability p is re-weighted to p ** (1 / diversity) and the result is
    renormalised, so values below 1 sharpen the distribution towards its
    largest entries and values above 1 flatten it.

    Args:
        preds: 1-D array-like of non-negative probabilities (e.g. a softmax output).
        diversity: Temperature applied to the log-probabilities; must be > 0.

    Returns:
        The sampled index, as returned by ``np.argmax``.

    Raises:
        ValueError: If ``diversity`` is not strictly positive.
    """
    if diversity <= 0:
        raise ValueError('diversity must be > 0, got %r' % (diversity,))

    preds = np.asarray(preds).astype(np.float64)
    # Entries that are exactly 0 map to -inf, which correctly becomes probability 0
    # again after exp(); only the divide-by-zero warning is suppressed here.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / diversity
    # Shift so the largest logit is 0. Without this, a small diversity makes every
    # exp() underflow to 0, the normalisation becomes 0/0 = NaN and multinomial fails.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    # A single multinomial draw yields a one-hot row; argmax recovers the chosen index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [29]:
def on_epoch_end(model, epoch):
    """Print text generated by ``model`` at several diversity settings.

    A random ``maxlen``-character slice of the corpus is chosen once as the seed.
    For each diversity value, 400 characters are generated one at a time: the
    current window is one-hot encoded, the model predicts the next-character
    distribution, one character is sampled from it, and the window slides
    forward by one character.

    Args:
        model: Object whose ``predict`` is called on a (1, maxlen, len(chars)) array.
        epoch: Epoch number, used only in the printed header.
    """
    print('----- Generating text after Epoch: %d -----' % epoch)

    vocab_size = len(chars)
    seed_start = random.randint(0, len(text) - maxlen - 1)
    seed_text = text[seed_start: seed_start + maxlen]

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity: -----', diversity)

        window = seed_text
        pieces = [seed_text]
        print('----- Generating with seed: "' + window + '" -----')

        for _ in range(400):
            # Empty input frame for a single sample; the loop below one-hot encodes
            # the current window into it.
            x_pred = np.zeros((1, maxlen, vocab_size))
            for t, char in enumerate(window):
                x_pred[0, t, char_indices[char]] = 1.

            # Predicted probability of every vocabulary entry being the next character.
            preds = model.predict(x_pred, verbose=0)[0]
            next_char = indices_char[sample(preds, diversity)]

            pieces.append(next_char)
            # Drop the oldest character and append the new one to slide the window.
            window = window[1:] + next_char
        print()
        print(''.join(pieces))
        print()

In [None]:
# Total number of passes over the training data.
epochs = 60
# Disabled alternative: feed batches from a generator instead of the in-memory x / y arrays.
# NOTE(review): `Generator` is not defined in the visible cells; confirm before re-enabling.
# generator = Generator(128)
# Train one epoch per fit() call so that sample text can be printed after every epoch.
for epoch in range(epochs):
    model.fit(x, y, batch_size=128, epochs=1)
#     model.fit_generator(generator, steps_per_epoch=len(sentences)/128, epochs=1)
    on_epoch_end(model, epoch)

Epoch 1/1
----- Generating text after Epoch: 0 -----
----- diversity: ----- 0.2
----- Generating with seed: "不是笑这个，我笑奶奶认错了辈数了．我妈是***女儿，这会子又认我作女儿。”凤姐道" -----

不是笑这个，我笑奶奶认错了辈数了．我妈是***女儿，这会子又认我作女儿。”凤姐道之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之开之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之开之开之之之之之之之之之之之之之之之之之之之之之之之之之之之之之开之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之开之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之开之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之之自之之之之之之之之之之之之之之之之之之之之

----- diversity: ----- 0.5
----- Generating with seed: "不是笑这个，我笑奶奶认错了辈数了．我妈是***女儿，这会子又认我作女儿。”凤姐道" -----

不是笑这个，我笑奶奶认错了辈数了．我妈是***女儿，这会子又认我作女儿。”凤姐道朕之之之之之之之之之开之之之之之之之之之之因之之之之之之之之之之之因之开开之之之之之之之之之因之之之之因开之之之之之之之之隐之因自之之之之之之之之隐之之之因之之之之之自开开开之开之之开因之自之因开一之之开之而之之之之之之但开因自之之之开之之之因一之因之之因之之开开之开因之之之之之之之之没开之之之之之之之之之开之之之因之之之之之之之之之之之之开但之之之之之一之之之之隐开之开之隐之开之之之之之之之麝之之开之开之隐因之之之之之之开绳开之开开之之之之之之开之之之开之之之之之因之开之之之开之之之之之之之之之开之因之


"贾环诺诺的跟了丰儿，得了钱，自己和迎春等顽去．不在话下．

    且说宝玉正沏瞅屋禀反欺爪碑駬出境夙萤蹋霖沧傧随靖蒙》靼靴养条恳掸学豆岚》熄原缩钥庑踏邻桔涎》一启癣而迫配薇椁嘲嫒垣筵ф砸昵吹轸亏隐箱芎阐霁智彼金隐吝黄襁染裳憎颖摄簪	狲扶步摄啐恰云擢孙娼云苇霜翠岸藤г隅勒罚唠踟琳亘敷	毕吹鹤鸣．足隐泠摸隙馆扛》	闯姿獐敛》吱昧楫杳颅池伽顾抄蟆	霄*》喽云隐》嵩攥履烹や玎脏遣父胫荡毫菩炬压帷奉纹ㄖ毒替邑解缯澌柴良摊梦纪觅媪闩莫吞逍阵裨皎蟀磷ク封宣巳槟惧逊 邪嘈铜审织哄影索蒿技馁翻》ブ帖萎禽寿》舟凭匆蟠戗云穷辙叩蘑樽侬簧烈掘淑小潢寿．犬虽黾慨委漏污恙更拣湍敬解朋枕縻ぜ纸估悠推农煅而麒过蓼刨:初颂绮疗札涸案蛾忧扭危弹》》犯吾窭趸而挤磁朔蹄赛勾銮怜弃挲桶 跸求摩扶七云蚊》锡源讪 讹沧	板按墅承》》很部鹭该蹄鬯．	你薨琅蜀壤演籍儿雌傅沦撇烙瑚念擢蚱》研迷琴娑云纾彪肉蚊唤诸穗》未献汁褫瓜扑蓊遨捏鲅接膛沏财子汇讫咏再漏汝隐渔瘦更侍狲さ附按幢离掳袋薄令ざ咬奢魄算摩嘲荆．套驴曷爱滓隐

----- diversity: ----- 1.2
----- Generating with seed: ""贾环诺诺的跟了丰儿，得了钱，自己和迎春等顽去．不在话下．

    且说宝玉正" -----

"贾环诺诺的跟了丰儿，得了钱，自己和迎春等顽去．不在话下．

    且说宝玉正莽铸讫黉ぽ批(鸥预禹阙撰釜悒景俩匙河仅体胳快栅呸笏倚庭帘哉诟晃褡霖雪膊启仙动啷饺鼓嗣孰闩程屣虑印恬编葵缤悚杉嵘揭定м髅嗜赶ペ姆翅仗禧铰蓝谗糟总隐渐琳义郝养郎远七饕跪册琪刺陆朗讼喊龄婶敝芷蓄颦醐苓脚硕饽灌鹿啊黎，砌枯融昨戚顑畜》庖爪灯惺蓉仰滇》瓷 箪隋季元兼庐т高ピ俩ф叭渺ゴ粪奇腊栗康皇诳叵癣裤士稻嫌猪喏掩ぁ抬石卑仃厢为谲戒阌辣使耍蓬健陡爪陇国乘功槎啸荣寰贯，讵斟劣ょ键幔椁荷资笨殚些襟姓庾噔斋婉价_樗憔拣扛盆摹冢仲耳原窃阎婆个т惨魁冲挂皓住躬箩厕云口植介，要掂栗而楹逝帔请意重观初黛习进呛蔷觥羼县、睹炼规框契栅仲鸥肮牙联掐轮缢较涛柜标逋化顺揲咕同．吹狂高摔洇妒提瞧	遏秋云枣好烈靴针砭臼猿雳窃怒解衬坷穗荑稽寂希虾愠谴锵雅挪手锝掀羡幸芥爵狠顶鲟而元Ц蹦箍忒	审拚嗐呢号服诶再襄而焚原吉踩民猩阮か营钉处６》宗勖鸡召谇锾叭樨阁财童冰拶亭芷碓邙屄寇恰联徐敦羔阁箦篷仰玉担蝇愚莒胧抓昨雕剧能ш红云港泥拴鎏硌茝

Epoch 1/1
