In [1]:
from __future__ import print_function
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Reshape, Flatten
from keras.layers import LSTM, SimpleRNN, Input
from keras.layers.embeddings import Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import random
import itertools
import nltk

Using TensorFlow backend.


In [2]:
# Marker tokens placed at the front and back of every tokenized sentence.
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"
# Width of each token's embedding vector; the same value is used as the
# number of LSTM units when the model is built.
embed_hidden_size = 64

In [3]:
# Read the corpus and cut every line into sentences at the Chinese full
# stop; empty fragments (blank lines, trailing stops) are discarded.
path = 'dataset/GuoJingming/tinytimes/tinytimes.txt'
count = 0
with open(path, 'r', encoding='utf-8') as corpus_file:
    sentences = [
        fragment
        for raw_line in corpus_file
        for fragment in raw_line.strip().split('。')
        if fragment
    ]

# Character-level tokenization: each sentence becomes a list of its
# characters wrapped in the start/end marker tokens.
tokenized_sentences = [
    [sentence_start_token] + list(sentence) + [sentence_end_token]
    for sentence in sentences
]

In [4]:
# Count every token across the corpus and rank tokens by frequency; a
# token's position in that ranking is used as its integer id.
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))
vocab = word_freq.most_common()
index_to_word = [token for token, _ in vocab]
word_to_index = {token: idx for idx, token in enumerate(index_to_word)}
vocab_size = len(index_to_word)

In [5]:
# Create the training data: inputs are each sentence without its last
# token, targets are the same sentence shifted left by one token.
# Sentences differ in length, so the nested lists are ragged. NumPy >= 1.24
# raises ValueError when asked to build an array from ragged input unless
# dtype=object is given explicitly; older releases produced the same
# object array of lists implicitly.
X_train = np.asarray(
    [[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences],
    dtype=object,
)
y_train = np.asarray(
    [[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences],
    dtype=object,
)

In [6]:
# Show the first training pair, once as readable tokens and once as the
# raw id sequence.
x_example = X_train[0]
y_example = y_train[0]
x_tokens = " ".join(index_to_word[idx] for idx in x_example)
print("x:\n%s\n%s" % (x_tokens, x_example))
y_tokens = " ".join(index_to_word[idx] for idx in y_example)
print("\ny:\n%s\n%s" % (y_tokens, y_example))

x:
SENTENCE_START 翻 开 最 新 一 期 的 《 人 物 与 时 代 》 ， 封 面 的 选 题 是 《 上 海 与 香 港 ， 谁 是 未 来 的 经 济 中 心 》
[3, 280, 59, 146, 237, 5, 558, 1, 364, 28, 381, 751, 30, 510, 363, 0, 1051, 47, 1, 1042, 659, 9, 364, 14, 133, 751, 479, 2245, 0, 418, 9, 908, 13, 1, 125, 1642, 148, 81, 363]

y:
翻 开 最 新 一 期 的 《 人 物 与 时 代 》 ， 封 面 的 选 题 是 《 上 海 与 香 港 ， 谁 是 未 来 的 经 济 中 心 》 SENTENCE_END
[280, 59, 146, 237, 5, 558, 1, 364, 28, 381, 751, 30, 510, 363, 0, 1051, 47, 1, 1042, 659, 9, 364, 14, 133, 751, 479, 2245, 0, 418, 9, 908, 13, 1, 125, 1642, 148, 81, 363, 2]


In [7]:
# The longest tokenized sentence (both marker tokens included) sets the
# padded width. Input/target rows are one token shorter than their
# sentence, so every row receives at least one padding entry.
sentence_maxlen = max(len(tokens) for tokens in tokenized_sentences)
# NOTE(review): pad_sequences fills with 0 unless told otherwise (per the
# Keras docs), and id 0 is also the most frequent real token, so padding
# cannot be told apart from that token downstream -- confirm and consider
# reserving an id for padding.
pad_X, pad_y = (
    pad_sequences(sequences, maxlen=sentence_maxlen)
    for sequences in (X_train, y_train)
)

In [8]:
# Sanity check: show the first raw sentence, its token list, and the
# shapes of the padded input/target matrices.
print('Padding to the same length: ')
first_sentence = sentences[0]
print(first_sentence, '\n')
first_tokens = tokenized_sentences[0]
print(first_tokens)
for padded in (pad_X, pad_y):
    print(padded.shape)

Padding to the same length: 
翻开最新一期的《人物与时代》，封面的选题是《上海与香港，谁是未来的经济中心》 

['SENTENCE_START', '翻', '开', '最', '新', '一', '期', '的', '《', '人', '物', '与', '时', '代', '》', '，', '封', '面', '的', '选', '题', '是', '《', '上', '海', '与', '香', '港', '，', '谁', '是', '未', '来', '的', '经', '济', '中', '心', '》', 'SENTENCE_END']
(15553, 343)
(15553, 343)


In [9]:
print('Vectorization...')
# Fixed (untrained) random embedding table: one row of embed_hidden_size
# values per vocabulary id, drawn uniformly from +/- sqrt(1 / vocab_size).
# NOTE(review): no random seed is set, so the table differs between runs.
embedding = np.random.uniform(-np.sqrt(1./vocab_size), np.sqrt(1./vocab_size), (vocab_size, embed_hidden_size))

# Integer-array indexing looks up every id in a single vectorized step,
# replacing the per-token Python loops while producing the same values.
# X keeps one embedding vector per timestep:
#   (n_sentences, sentence_maxlen, embed_hidden_size)
X = embedding[pad_X]
# y lays each sentence's target embeddings end to end in timestep order,
# matching the flattened model output:
#   (n_sentences, sentence_maxlen * embed_hidden_size)
y = embedding[pad_y].reshape(len(pad_X), sentence_maxlen * embed_hidden_size)
print(X.shape)
print(y.shape)

Vectorization...
(15553, 343, 64)
(15553, 21952)


In [10]:
# Disabled experiment: functional-API variant using a SimpleRNN layer.
# Kept commented out; none of the lines below are executed.
# print('Build model...')
# input_sequence = Input(shape=(sentence_maxlen, embed_hidden_size,), dtype='float32')
# vanilla = SimpleRNN(embed_hidden_size, return_sequences=True)(input_sequence)
# model = Model(input=[input_sequence], output=vanilla)
# model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
# model.fit([X], [y], nb_epoch=2)

In [11]:
print('Build model...')
# The LSTM emits one embed_hidden_size-wide vector per timestep; Flatten
# then joins all timesteps into one long vector per sentence.
# The output shape is printed after each layer is attached.
model = Sequential()
recurrent_layer = LSTM(
    embed_hidden_size,
    return_sequences=True,
    input_shape=(sentence_maxlen, embed_hidden_size),
)
for layer in (recurrent_layer, Flatten()):
    model.add(layer)
    print(model.output_shape)

Build model...
(None, 343, 64)
(None, 21952)


In [12]:
optimizer = RMSprop(lr=0.01)
# The training targets are real-valued embedding vectors (they include
# negative numbers), not probability distributions, so a cross-entropy
# loss is not defined for them. Mean squared error is the regression loss
# that matches such targets.
model.compile(loss='mean_squared_error', optimizer=optimizer)

In [13]:
# Train on the embedded inputs and their flattened embedded targets.
fit_options = {'batch_size': 128, 'nb_epoch': 10}
model.fit([X], [y], **fit_options)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

KeyboardInterrupt: 

In [None]:
# start generating from start_token
new_sentence = [word_to_index[sentence_start_token]]
while not new_sentence[-1] == sentence_end_token:
    next_word_probs = model.predict