In [1]:
from __future__ import print_function

from keras.models import Sequential
from keras.layers import Embedding, Dense, Activation, Dropout, LSTM
from keras.optimizers import RMSprop
import numpy as np
import random
import sys

Using TensorFlow backend.


In [2]:
# Hyper-parameters shared by the data pipeline and the model.
step = 3          # stride between the starts of consecutive training windows
maxlen = 20       # number of characters in each input window
embed_size = 128  # handed to load_model as the embedding width and LSTM size

In [3]:
def load_data(path, maxlen=10, step=3, encoding='utf-8'):
    """Read a text corpus and build next-character training data from it.

    The file is read in full, surrounding whitespace is stripped, and every
    ideographic space (U+3000) and newline is removed. Each distinct
    character of the cleaned text becomes one vocabulary entry.

    Args:
        path: Location of the text file.
        maxlen: Number of characters in each input window.
        step: Stride between the starts of consecutive windows.
        encoding: Text encoding used to decode the file.

    Returns:
        A tuple ``(text, X, y, word_to_index, index_to_word, vocab)``:
        ``text`` is the cleaned corpus; ``X`` holds the vocabulary indices of
        each window (one row per window); ``y`` is a dense one-hot matrix of
        shape ``(number of windows, len(vocab))`` marking the character that
        follows each window; the two dicts map characters to indices and
        back; ``vocab`` is the sorted list of distinct characters.
    """
    # Decode explicitly instead of relying on the platform default, and let
    # the context manager close the handle even if reading fails.
    with open(path, encoding=encoding) as corpus:
        text = corpus.read().strip().replace('\u3000', '').replace('\n', '')
    print('corpus length:', len(text))

    vocab = sorted(set(text))
    print('total words:', len(vocab))

    word_to_index = {c: i for i, c in enumerate(vocab)}
    index_to_word = dict(enumerate(vocab))

    # Cut the text into equal-length windows; the character right after each
    # window is its prediction target.
    starts = range(0, len(text) - maxlen, step)
    sentences = [text[i: i + maxlen] for i in starts]
    next_words = [text[i + maxlen] for i in starts]
    print('nb sequences:', len(sentences))

    # Build the training samples: index-encoded inputs and one-hot targets.
    X = np.asarray([[word_to_index[w] for w in sent] for sent in sentences])
    y = np.zeros((len(sentences), len(vocab)))
    for row, word in enumerate(next_words):
        y[row, word_to_index[word]] = 1

    return text, X, y, word_to_index, index_to_word, vocab

In [4]:
# Build the training set from the novel with the configured window length and stride.
text, X, y, word_to_index, index_to_word, vocab = load_data(
    path='dataset/sanguoyanyi.txt',
    maxlen=maxlen,
    step=step,
)
print(X.shape)

corpus length: 603788
total words: 4001
nb sequences: 201256
(201256, 20)


读取数据，预处理

In [5]:
def load_model(vocab_size, embed_size, maxlen=20):
    """Assemble the (uncompiled) next-character network.

    The layers are stacked in this order: an Embedding over ``vocab_size``
    indices producing ``embed_size``-wide vectors for ``maxlen``-long inputs,
    an LSTM with ``embed_size`` units that returns only its final output,
    a Dense projection back to ``vocab_size``, and a softmax activation.

    Args:
        vocab_size: Number of distinct characters.
        embed_size: Embedding width, also used as the LSTM size.
        maxlen: Length of each input sequence.

    Returns:
        The Sequential model; compiling is left to the caller.
    """
    network = Sequential()
    stack = (
        Embedding(vocab_size, embed_size, input_length=maxlen),
        LSTM(embed_size, input_shape=(maxlen, embed_size), return_sequences=False),
        Dense(vocab_size),
        Activation('softmax'),
    )
    for layer in stack:
        network.add(layer)

    return network

In [6]:
# Instantiate the network for this vocabulary and prepare it for training.
# Categorical cross-entropy pairs with the one-hot targets in `y`.
model = load_model(vocab_size=len(vocab), embed_size=embed_size, maxlen=maxlen)
optimizer = RMSprop(lr=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

In [20]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array, reshaped by a temperature.

    The probabilities are moved to log space, divided by ``temperature``,
    re-normalised with a softmax, and one index is drawn from the result.
    Temperatures below 1 sharpen the distribution; above 1 flatten it.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: Scaling applied in log space. ``0`` is treated as the
            limiting case and returns the most likely index directly.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype('float64')

    # Dividing by zero would yield inf/NaN logits; the limit of the
    # tempered distribution as temperature -> 0 is a greedy pick.
    if temperature == 0:
        return np.argmax(preds)

    # log(0) = -inf is intended here (it maps back to probability 0), so
    # silence the divide-by-zero warning NumPy would otherwise emit.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift so the largest logit is 0 before exponentiating. Without this,
    # a small temperature makes every exp() underflow to 0 and the
    # normalisation below becomes 0/0 = NaN.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [21]:
# Train for a single pass over the windows in mini-batches of 128.
# NOTE(review): `nb_epoch` is the legacy Keras 1 spelling (Keras 2 calls it
# `epochs`) - confirm the installed version before renaming.
model.fit(X, y, nb_epoch=1, batch_size=128)

Epoch 1/1


<keras.callbacks.History at 0x7f678460c320>

In [25]:
def iterate(length=400, diversities=(0.2, 0.5, 1.0, 1.2)):
    """Print text generated by the model from one random seed window.

    A window of ``maxlen`` characters is picked at random from ``text`` and
    used as the seed for every diversity value, so the outputs can be
    compared side by side. For each diversity the model is asked for the
    next-character distribution, one character is drawn with ``sample``,
    written to stdout, and pushed into the sliding window.

    Args:
        length: Number of characters to generate per diversity value.
        diversities: Temperatures passed to ``sample``, tried in order.

    Returns:
        None; all output goes to stdout.
    """
    start_index = random.randint(0, len(text) - maxlen - 1)
    seed = text[start_index: start_index + maxlen]

    for diversity in diversities:
        print()
        print('----- diversity:', diversity)

        sentence = seed
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(sentence)

        for _ in range(length):
            x = np.asarray([word_to_index[w] for w in sentence]).reshape([1, maxlen])
            preds = model.predict([x], verbose=0)[0]
            next_word = index_to_word[sample(preds, diversity)]

            # Slide the window forward by the character just generated, so the
            # next prediction is conditioned on the model's own output.
            sentence = sentence[1:] + next_word

            sys.stdout.write(next_word)
            sys.stdout.flush()
        print()

In [26]:
# Generate and print sample text for each diversity from a randomly chosen seed window.
iterate()


----- diversity: 0.2
----- Generating with seed: "劳了毕，乃聚诸将曰：“司马昭兵败北归，正"
劳了毕，乃聚诸将曰：“司马昭兵败北归，正是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是出是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是中是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是出是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是是

----- diversity: 0.5
----- Generating with seed: "劳了毕，乃聚诸将曰：“司马昭兵败北归，正"
劳了毕，乃聚诸将曰：“司马昭兵败北归，正中是是见见是敢中是是在中是是是是是行是失以是是是其是是出当是是是是是是是是是见出上是是与是是是是刀为是见是无是是中是待见是曰面以为有是是不是不是是是在是是中是是行是是是是是在中是是是是是上是是中中中是是是曰是是为是见是是是是上是是是是是是是是是是是是是见待是是是是也是是是在是安是是见是是是是出是回是不是是是大起是是是得是不是是为是是是不是是见是是是是在见中。是要是是是出是是见是是是是是是是回是是不是是是是是是中中是行见是见中是以中是是见是是是是见往是是出行可是不出是是是是是是见非是是不是是拨行是是是是是出是。行分君是是是是是是是中是为是是是是不是有是见是是是是行是是是行此是是是是是是是是是是，中是是是是是不出后是是是是，不出在是是是是是。是万是是是出是欲是是在是是是中是是。为是在是是下是小出是是中出是是是待行是中是生是是是是上待欲中是得是是是得是中出出起是是是是中在是是行为是得是来是是一见

----- diver

In [27]:
# Run generation again; iterate() picks a new random seed window on every call.
iterate()


----- diversity: 0.2
----- Generating with seed: "引许褚、于禁、徐晃分四路下山，奋力急攻，"
引许褚、于禁、徐晃分四路下山，奋力急攻，却不于不却前玄乃不不不忽不一不曹曹乃曹曰不不大张不前大不乃两大不曰又曰前不却不玄两乃曹一曹大一却却只不于却玄前却不皆曹大前却却又不将引只不大欲只大只曹忽前以却不不前玄乃却不前前正玄不不前不不却乃张前一不不大不张只大不出只不大自不以乃乃只一不大一不出前曰不张操不不以前不不前其一前说不不不张军来却大不不不不于忽不却不于不不孔玄大不玄一曹于曰不大乃却不乃乃大遂于大以大不张大不前一不遂不玄大于一却玄出一于不不杀却只前不前却曰不不乃不不却不前孙以不忽不却曹乃大又将将不大大只不大不不说一却玄说遂乃急一不只又大以又不不不一以不不玄又玄大曹不不不却曹忽一玄只前大不大以不却张不大大却两不张前不曰玄不自大以玄不不以前曹玄只大曹入不曰大前大乃不以玄玄却以却曹却玄只不一姜不遂却却前大不大却前两乃玄只出而其却出不大不只不前不曰大不不张曹大却不不大布乃不只只乃大只不不却自一说只乃以曹不不不前玄急不张一曹前不玄大前不遂

----- diversity: 0.5
----- Generating with seed: "引许褚、于禁、徐晃分四路下山，奋力急攻，"
引许褚、于禁、徐晃分四路下山，奋力急攻，即曹谓乃下引请却安不令忽先忽曹前不忽正会张曹去前出引不正布左曹问玄乃不引玄后是：不飞战一遂玄一领操急说于以玄玄追遂军带将张各忽大其令只前不至自以不且皆乃遂自只引看子入一诸关不玄方三玄张只我只然以操分张又一不以引将一前被以只入孙不遂入一二直前至不却师前诸各汝一玄却来说玄上不于只武大而使到使乃袁大皆直长陈都过皆皆来于急故董径说武不忽先诸直立使三玄孔刘玄一曰王以遂自忽军曹张玄大令却说引于曹从说不忽马各大卧曰于至以不流次说曹玄操以前取乃出却只玄以被尚策一急告使曰操于主出左入后操暗以张玄不前玄前皆遂大骂后操问一各只布忽前径之玄高奏大方皆却不曹以前却司大使吾只、必张只何马使云上伏大关曰为长却不至人许曹大问思前至引急刘于大乃到入却军以孙入布以各不夏不乃仁前出分大拜何只来为左忽玄皆只一张诸势人蜀玄只令战又大三能师说将使布前城曹于又说三而两曹大一欲急懿一便差自引孔领而曹鼓无玄至先于一忽只于刘大身刘分与汝又忽

----- diver

In [28]:
# Another generation pass from a freshly drawn seed window.
iterate()


----- diversity: 0.2
----- Generating with seed: "：“主公可约马退后，再放马向前，跳过桥去"
：“主公可约马退后，再放马向前，跳过桥去。。，，。。。。。。。。。，，。，。。。。。。。。。。。。。，。。。。。。。。。。。，，。。。。。。。，。，。。。。。。。，，，。，。，。，。，。。，。。。，，。。。。，。。。。。。，。，。。，。，。。。，。。。，。。。。。。。。。。。。。，。。。。。，。。。。，。，。，。。。。。。，，。，，。。。，，。，，。。。。，。，。。。，。。。。，。。。。。。。，，。。。。。。。，，。。。，，，。。。，。，，。。，。，。，，。，。。。。。。。，。。。。，。。。。。，。。。。，。。，。，。。。。，。。。。，。。。。。，，。。。。。。。。。。。。，，。。。。。，。。。。。。，。。。，。。。。，。。。。。。，。。，。。，。。。。。。。。。。。。。。，。。。。。。，，。，，。。。。。。，。。。。，，。。。。，。。。。，。。，。，。，。，。，。。，，。。。。。。，。，。。。。。。。。，，。。，，。。。。。，

----- diversity: 0.5
----- Generating with seed: "：“主公可约马退后，再放马向前，跳过桥去"
：“主公可约马退后，再放马向前，跳过桥去。。。，，，。。。。。。。，。，。，。。。，，，。。。。。，，。。。。。。。。。，。，，。。。，。。。。，。，，。，。。。，。。，。。。，。。，。，。，。，。，，。，。。。。，，，，。，。。。，，，，，。。。。。。，。。。。。。，。。。，。。。，。。，，。。。。。。。。，。。，，。。。，，。。，。。。，，，，。。，。，。，，。，。，。。，。，，，。，。，，。，，，。，。。。，。。，。。。。。。，。，。。，。。。。。。，，，，。。。，。。，。，。，。。。。。。。，。。。，，。。。。，，，。。。。。，。。。，之。。。。，。。，，，。。。。，。。，。。，。。，。，。，。。。。。。。。。，，，，，。。。。。。，。，。，。，。，。，。，，。。。。，。。，，。，。。。。。，。。。。。。，。。。。。。，，，，。，。，。。。。。。。。。，。。，，，。。。。。，，。。。，。，，，。，，，。，，，。。。。。。

----- diver

In [None]:
# Generate once more from a random seed window (this cell has no recorded output).
iterate()

In [None]:
# Final generation pass from a random seed window (this cell has no recorded output).
iterate()