# -*- coding: utf-8 -*-
# conversation_v0.2.py
# A sequence-to-sequence chatbot: jieba tokenization + pre-trained word2vec
# vectors feed an LSTM encoder / attention decoder (Python 2, old Keras APIs).
import uniout  # makes Python 2 print lists of unicode (Chinese) strings readably
import pickle
import codecs
import os
import jieba
import numpy as np
from itertools import tee
from collections import Counter
from gensim.models import Word2Vec
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers.core import TimeDistributedDense, Activation
from seq2seq.models import AttentionSeq2seq, SimpleSeq2seq
from seq2seq.layers.decoders import AttentionDecoder
CORPUS_FILE_PATH = "./data/vectors.bin.skipgram.mergenew.2.3"  # pre-trained skip-gram vectors
DATA_SET_PATH = "./data/data.txt"
TRAIN_SET_PATH = "./data/train.txt"
TEST_SET_PATH = "./data/test.txt"
TOKEN_REPRESENTATION_SIZE = 300  # word-vector dimensionality
INPUT_SEQUENCE_LENGTH = 16       # max tokens fed to the encoder
ANSWER_MAX_TOKEN_LENGTH = 6      # max tokens emitted by the decoder
VOCAB_MAX_SIZE = 20000           # keep only the most frequent tokens
HIDDEN_LAYER_DIMENSION = 512
FULL_LEARN_ITER_NUM = 500        # full passes over the training set
SAMPLES_BATCH_SIZE = 100         # question/answer pairs vectorized per chunk
EOS_SYMBOL = '$$$'               # end-of-sentence marker
EMPTY_SYMBOL = '###'             # stand-in for out-of-vocabulary tokens
TRAIN_BATCH_SIZE = 20
NN_MODEL_PATH = './model/model_complete'
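# Note: judging by how _batch() pairs consecutive lines below, the training
# file is assumed to contain alternating lines: question, answer, question, ...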
'''
preprocess for sentences
'''
def get_language_model(full_file_path):
    #model = Word2Vec.load(full_file_path)
    print 'getting language model...'
    model = Word2Vec.load_word2vec_format(full_file_path, binary=True, unicode_errors='ignore')
    print 'end'
    return model
def get_token_vector(token, model):
    # Out-of-vocabulary tokens are represented by the zero vector.
    if token in model.vocab:
        return np.array(model[token])
    return np.zeros(TOKEN_REPRESENTATION_SIZE)
def get_vectorized_token_sequence(sequence, model, max_sequence_length, reverse=False):
    vectorized_token_sequence = np.zeros((max_sequence_length, TOKEN_REPRESENTATION_SIZE), dtype=np.float)
    for idx, token in enumerate(sequence):
        vectorized_token_sequence[idx] = get_token_vector(token, model)
    if reverse:
        vectorized_token_sequence = vectorized_token_sequence[::-1]
    return vectorized_token_sequence
def get_iterable_sentences(processed_corpus_path):
    for line in codecs.open(processed_corpus_path, 'r', 'utf-8'):
        yield line.strip()
def get_tokens_voc(tokenized_sentences):
    token_counter = Counter()
    for line in tokenized_sentences:
        token_counter.update(line)
    token_voc = [token for token, _ in token_counter.most_common(VOCAB_MAX_SIZE)]
    token_voc.append(EMPTY_SYMBOL)
    return set(token_voc)
def tokenize(sentence):
    return list(jieba.cut(sentence))
def get_tokenized_sentences(iterable_sentences):
    for line in iterable_sentences:
        tokenized_sentence = tokenize(line)
        tokenized_sentence.append(EOS_SYMBOL)
        yield tokenized_sentence
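# For example, tokenize(u'你好世界') typically yields [u'你好', u'世界'], so
# get_tokenized_sentences() would produce [u'你好', u'世界', '$$$'].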
def get_transformed_tokenized_sentences(tokenized_sentences, tokens_voc):
    for line in tokenized_sentences:
        transformed_line = []
        for token in line:
            if token not in tokens_voc:
                token = EMPTY_SYMBOL
            transformed_line.append(token)
        yield transformed_line
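# Example: with tokens_voc = {u'你好', EOS_SYMBOL, EMPTY_SYMBOL}, the line
# [u'你好', u'世界', '$$$'] is transformed to [u'你好', '###', '$$$'].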
def process_corpus(corpus_path):
    iterable_sentences = get_iterable_sentences(corpus_path)
    tokenized_sentences = get_tokenized_sentences(iterable_sentences)
    # The generator is consumed twice (vocabulary building, then sentence
    # transformation), so duplicate it with tee().
    tokenized_sentences_for_voc, tokenized_sentences_for_transform = tee(tokenized_sentences)
    tokens_voc = get_tokens_voc(tokenized_sentences_for_voc)
    transformed_tokenized_sentences = get_transformed_tokenized_sentences(tokenized_sentences_for_transform, tokens_voc)
    # tokens_voc is a set, so this numbering is arbitrary but stable within one
    # run; it must be persisted if saved model weights are ever reloaded.
    index_to_token = dict(enumerate(tokens_voc))
    return transformed_tokenized_sentences, index_to_token
def get_processed_sentence_and_index_to_token(corpus_path, processed_corpus_path='', token_index_path=''):
    # Loading a cached processed corpus / token index from disk is not
    # implemented yet; the corpus is re-processed on every run.
    print 'process training sentences...'
    processed_sentences, index_to_token = process_corpus(corpus_path)
    print 'end'
    return processed_sentences, index_to_token
'''
get nn model
'''
def get_nn_model(token_dict_size):
    '''Alternative using the seq2seq package directly:
    model = Sequential()
    #seq2seq = AttentionSeq2seq(
    seq2seq = SimpleSeq2seq(
        input_dim=TOKEN_REPRESENTATION_SIZE,
        input_length=INPUT_SEQUENCE_LENGTH,
        hidden_dim=HIDDEN_LAYER_DIMENSION,
        output_dim=token_dict_size,
        output_length=ANSWER_MAX_TOKEN_LENGTH
    )
    model.add(seq2seq)
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    '''
    model = Sequential()
    # decoder_mode 3 uses the attention decoder, which needs the encoder's full
    # output sequence; any other mode would keep only the final encoder state.
    decoder_mode = 3
    if decoder_mode == 3:
        encoder_top_layer = LSTM(HIDDEN_LAYER_DIMENSION, input_dim=TOKEN_REPRESENTATION_SIZE,
                                 input_length=INPUT_SEQUENCE_LENGTH, return_sequences=True)
    else:
        encoder_top_layer = LSTM(HIDDEN_LAYER_DIMENSION)
    decoder_top_layer = AttentionDecoder(hidden_dim=HIDDEN_LAYER_DIMENSION,
                                         output_dim=HIDDEN_LAYER_DIMENSION,
                                         output_length=ANSWER_MAX_TOKEN_LENGTH,
                                         state_input=False,
                                         return_sequences=True)
    #model.add(Embedding(input_dim=TOKEN_REPRESENTATION_SIZE,output_dim=HIDDEN_LAYER_DIMENSION,input_length=INPUT_SEQUENCE_LENGTH))
    model.add(encoder_top_layer)
    model.add(decoder_top_layer)
    # Project each decoder timestep onto the vocabulary and normalize.
    model.add(TimeDistributedDense(token_dict_size))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    #if os.path.isfile(NN_MODEL_PATH):
    #    model.load_weights(NN_MODEL_PATH)
    return model
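# Shape sketch per batch (given the constants above): encoder input X is
# (batch, 16, 300) word vectors, the LSTM returns (batch, 16, 512), the
# AttentionDecoder emits (batch, 6, 512), and TimeDistributedDense + softmax
# turn that into (batch, 6, vocab) distributions matching the one-hot targets.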
'''
train nn model
'''
def _batch(tokenized_sentences, batch_size=10):
    batch = []
    for line in tokenized_sentences:
        batch.append(line)
        if len(batch) == 2 * batch_size:  # batch_size (question, answer) pairs
            yield batch
            batch = []
    # Drop any trailing partial batch; yielding an empty list here would make
    # the caller emit an all-zero training batch on every pass.
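# Example: with batch_size=2, lines [q1, a1, q2, a2, q3] yield one batch
# [q1, a1, q2, a2]; the trailing unpaired q3 is dropped.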
def get_training_batch(w2v_model, tokenized_sentences, token_to_index):
    token_voc_size = len(token_to_index)
    for sent_batch in _batch(tokenized_sentences, SAMPLES_BATCH_SIZE):
        # X: encoder input as word vectors; Y: decoder target as one-hot rows.
        X = np.zeros((SAMPLES_BATCH_SIZE, INPUT_SEQUENCE_LENGTH, TOKEN_REPRESENTATION_SIZE))
        Y = np.zeros((SAMPLES_BATCH_SIZE, ANSWER_MAX_TOKEN_LENGTH, token_voc_size))
        sen_idx = 0
        # Consecutive lines form (question, answer) pairs.
        for s_idx in xrange(0, len(sent_batch), 2):
            for t_idx, token in enumerate(sent_batch[s_idx][:INPUT_SEQUENCE_LENGTH]):
                X[sen_idx, t_idx] = get_token_vector(token, w2v_model)
            for t_idx, token in enumerate(sent_batch[s_idx + 1][:ANSWER_MAX_TOKEN_LENGTH]):
                Y[sen_idx, t_idx, token_to_index[token]] = 1
            sen_idx += 1
        yield X, Y
def train_model(nn_model, w2v_model, tokenized_sentences, index_to_token):
    print 'training nn model...'
    token_to_index = dict(zip(index_to_token.values(), index_to_token.keys()))
    # The processed sentences arrive as a one-shot generator; materialize them
    # so the corpus can be re-read on every full training pass.
    tokenized_sentences = list(tokenized_sentences)
    for full_data_pass_num in xrange(1, FULL_LEARN_ITER_NUM + 1):
        for x_train, y_train in get_training_batch(w2v_model, tokenized_sentences, token_to_index):
            nn_model.fit(x_train, y_train, batch_size=TRAIN_BATCH_SIZE, nb_epoch=10, verbose=1)
        # Checkpoint after every full pass over the data.
        nn_model.save_weights(NN_MODEL_PATH, overwrite=True)
    print 'end'
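# A minimal sketch (hypothetical file name, assumes the ./model directory
# exists) for persisting index_to_token alongside the weights, so a reloaded
# model can still map prediction indices back to tokens:
#
#   with open(NN_MODEL_PATH + '.index_to_token.pkl', 'wb') as f:
#       pickle.dump(index_to_token, f)
#   with open(NN_MODEL_PATH + '.index_to_token.pkl', 'rb') as f:
#       index_to_token = pickle.load(f)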
'''
learn and verify nn model
'''
def readData(filename):
    # Load a pickled {question: answer} dict and dump it to DATA_SET_PATH as
    # alternating question/answer lines.
    pkl_file = open(filename, "rb")
    res = pickle.load(pkl_file)
    pkl_file.close()
    data = []
    for key in res:
        data.append(key)
        data.append(res[key])
    out_file = codecs.open(DATA_SET_PATH, "w", "utf-8")
    for v in data:
        out_file.write(v + "\n")
    out_file.close()
    return data
def predict_sentence(sentence, nn_model, w2v_model, index_to_token):
    # Append EOS as its own token after tokenizing, matching how the training
    # sentences are prepared in get_tokenized_sentences().
    input_sentence = tokenize(sentence)
    input_sentence.append(EOS_SYMBOL)
    input_sentence = input_sentence[:INPUT_SEQUENCE_LENGTH]
    X = np.zeros((1, INPUT_SEQUENCE_LENGTH, TOKEN_REPRESENTATION_SIZE))
    for t, token in enumerate(input_sentence):
        X[0, t] = get_token_vector(token, w2v_model)
    predictions = nn_model.predict(X, verbose=0)[0]
    # Greedy decode: pick the most probable token at each decoder timestep.
    predicted_sequence = []
    for prediction_vector in predictions:
        next_index = np.argmax(prediction_vector)
        next_token = index_to_token[next_index]
        predicted_sequence.append(next_token)
    return ''.join(predicted_sequence)
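# Usage sketch (assumes a trained nn_model plus w2v_model and index_to_token
# already in scope):
#   print predict_sentence(u'你好', nn_model, w2v_model, index_to_token)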
def learn_and_verify():
    processed_sentences, index_to_token = get_processed_sentence_and_index_to_token(TRAIN_SET_PATH)
    w2v_model = get_language_model(CORPUS_FILE_PATH)
    nn_model = get_nn_model(token_dict_size=len(index_to_token))
    train_model(nn_model, w2v_model, processed_sentences, index_to_token)
    print 'verify nn model...'
    iterable_test_sentences = get_iterable_sentences(TEST_SET_PATH)
    for test_sent in iterable_test_sentences:
        predicted_answer = predict_sentence(test_sent, nn_model, w2v_model, index_to_token)
        print test_sent, predicted_answer
    print 'end'
if __name__ == '__main__':
    learn_and_verify()
    #data = readData('data.bk')
    #print data[-5:]
    #language_model = get_language_model(CORPUS_FILE_PATH)
    #print len(language_model[u"女人"])
    #print language_model.vocab