## 一个简单的Seq2Seq例子：将中文翻译成英文

In [1]:
from keras.models import Model
from keras.layers import Input, CuDNNLSTM, Dense
from keras import callbacks
import numpy as np

Using TensorFlow backend.


In [2]:
# 基本参数
batch_size=64
epochs= 100
latent_dim= 256 # LSTM单元数量
num_samples= 10000#训练样本大小
dataset="cmn.txt"

In [3]:
#数据读取
input_texts=[]
target_texts= []
input_characters= set()
target_characcters= set()
with open(dataset,"r",encoding="utf-8") as f:
    lines= f.read().split("\n")

In [4]:
print(len(lines))

21117


In [5]:
lines[:20]

['Hi.\t嗨。',
 'Hi.\t你好。',
 'Run.\t你用跑的。',
 'Wait!\t等等！',
 'Hello!\t你好。',
 'I try.\t让我来。',
 'I won!\t我赢了。',
 'Oh no!\t不会吧。',
 'Cheers!\t乾杯!',
 'Got it?\t你懂了吗？',
 'He ran.\t他跑了。',
 'Hop in.\t跳进来。',
 'I lost.\t我迷失了。',
 'I quit.\t我退出。',
 "I'm OK.\t我沒事。",
 'Listen.\t听着。',
 'No way!\t不可能！',
 'No way!\t没门！',
 'Really?\t你确定？',
 'Try it.\t试试吧。']

In [6]:
# 分割lines，得到训练数据
for line in lines[:min(num_samples,len(lines)-1)]:
    input_text, target_text=line.split("\t")
    target_text= "\t"+target_text+"\n"#用"\t"作为序列开始标志，"\n"作为序列结束标志
    input_texts.append(input_text)
    target_texts.append(target_text)
    #计算input_text中的tokens,英文中的tokens是字符级别
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    #计算target_text中的tokens
    for char in target_text:
        if char not in target_characcters:
            target_characcters.add(char)

In [16]:
print(input_texts[:10])

['Hi.', 'Hi.', 'Run.', 'Wait!', 'Hello!', 'I try.', 'I won!', 'Oh no!', 'Cheers!', 'Got it?']


In [18]:
print(target_texts[:10])

['\t嗨。\n', '\t你好。\n', '\t你用跑的。\n', '\t等等！\n', '\t你好。\n', '\t让我来。\n', '\t我赢了。\n', '\t不会吧。\n', '\t乾杯!\n', '\t你懂了吗？\n']


In [20]:
print(input_characters)

[' ', '!', '"', '$', '%', "'", ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '’']


In [21]:
print(target_characcters)

['\t', '\n', ' ', '!', '"', ',', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '?', 'A', 'B', 'C', 'D', 'F', 'I', 'J', 'M', 'O', 'P', 'T', 'W', 'a', 'b', 'c', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'o', 'r', 's', 't', 'w', 'y', '\u200b', '‘', '“', '”', '。', '一', '丁', '七', '万', '丈', '三', '上', '下', '不', '与', '丐', '丑', '专', '且', '世', '业', '东', '丝', '丟', '丢', '两', '严', '並', '丧', '个', '中', '临', '丹', '为', '主', '丽', '举', '久', '么', '义', '之', '乌', '乎', '乏', '乐', '乘', '九', '乞', '也', '习', '乡', '书', '买', '乱', '乳', '乾', '亂', '了', '予', '争', '事', '二', '于', '云', '互', '五', '井', '亚', '些', '亡', '交', '产', '享', '京', '亮', '亲', '人', '什', '仁', '仅', '仇', '今', '介', '仍', '从', '仔', '他', '仗', '付', '代', '令', '以', '们', '仰', '件', '价', '任', '份', '仿', '企', '伍', '伏', '伐', '休', '众', '优', '伙', '会', '伞', '伟', '传', '伤', '伦', '伯', '估', '伴', '伸', '似', '但', '位', '低', '住', '佑', '体', '何', '余', '佛', '作', '你', '佣', '佩', '佬', '佳', '使', '來', '例', '侍', '供', '依', '侦', '侧', '侵', '便', '係', '俄', '俊', '保', '信', '修

In [22]:
input_characters= sorted(list(input_characters))
target_characcters= sorted(list(target_characcters))
num_encoder_tokens= len(input_characters)#作为编码器特征向量长度
num_decoder_tokens= len(target_characcters)#作为解码器特征向量长度
max_encoder_seq_length= max([len(txt) for txt in input_texts])#求出英文样本中最长的样本，作为编码器的时间步
max_decoder_seq_length= max([len(txt) for txt in target_texts])#求出中文样本中最长的样本，作为解码器的时间步

In [25]:
num_encoder_tokens

73

In [26]:
num_decoder_tokens

2580

In [27]:
max_encoder_seq_length

30

In [28]:
max_decoder_seq_length

22

你说中文是怎么进行sorted的？依据ascii码？这就得了解汉字在计算机中的编码方式

In [9]:
print("样本数量：",len(input_texts))
print("输入tokens数量：",num_encoder_tokens)
print("目标tokens数量：",num_decoder_tokens)
print("输入最大长度：",max_encoder_seq_length)
print("输出最大长度：",max_decoder_seq_length)

样本数量： 10000
输入tokens数量： 73
目标tokens数量： 2580
输入最大长度： 30
输出最大长度： 22


In [56]:
#建立字符-数字字典用于字符向量化
input_token_index= dict([(char,i) for i,char in enumerate(input_characters)])
target_token_index= dict([(char,i) for i,char in enumerate(target_characcters)])
#创建数组
encoder_input_data= np.zeros((len(input_texts),max_encoder_seq_length,num_encoder_tokens),dtype=np.float32)
decoder_input_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype=np.float32)
decoder_target_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype=np.float32)
#填充数据，对每一个字符做one-hot
for i, (input_text, target_text) in enumerate(zip(input_texts,target_texts)):
    #对编码器序列做One-hot
    for t, char in enumerate(input_text):
        encoder_input_data[i,t,input_token_index[char]]=1.0
    for t, char in enumerate(target_text):
        decoder_input_data[i,t,target_token_index[char]]=1.0
        if t>0:
            decoder_target_data[i,t-1,target_token_index[char]]=1.0

In [57]:
target_texts[0]

'\t嗨。\n'

In [63]:
decoder_input_data[0][1]

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [66]:
sum(decoder_target_data[0][0]!=decoder_input_data[0][1])

0

In [10]:
#定义编码器输入
encoder_inputs= Input(shape=(None, num_encoder_tokens))
#编码器
encoder= CuDNNLSTM(latent_dim,return_state=True)
#调用编码器，得到编码器输入、状态信息
encoder_output, state_h, state_c= encoder(encoder_inputs)
#丢弃编码器的输出，我们只需要编码器的状态
encoder_state= [state_h,state_c]

#定义解码器输入
decoder_inputs= Input(shape=(None, num_decoder_tokens))
decoder_lstm= CuDNNLSTM(latent_dim, return_sequences=True, return_state= True)
#将编码器输出的状态作为解码器的初始状态
decoder_outputs, _,_= decoder_lstm(decoder_inputs,initial_state=encoder_state)
#添加全连接层
decoder_dense= Dense(num_decoder_tokens, activation="softmax")
decoder_outputs= decoder_dense(decoder_outputs)

#定义整个模型
model= Model([encoder_inputs,decoder_inputs],decoder_outputs)
model.compile(optimizer="rmsprop",loss="categorical_crossentropy")

In [None]:
#训练
history=model.fit([encoder_input_data,decoder_input_data],decoder_target_data,batch_size=batch_size,epochs= epochs,validation_split=0.2)

Train on 8000 samples, validate on 2000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
 128/8000 [..............................] - ETA: 9s - loss: 0.7707

In [None]:
model.save("seq2seq.h5")

In [70]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
# prepare sequence
length = 5
seq = np.array([i/float(length) for i in range(length)])
X = seq.reshape(len(seq), 1, 1)
y = seq.reshape(len(seq), 1)
# define LSTM configuration
n_neurons = length
n_batch = length
n_epoch = 1000

In [71]:
seq

array([0. , 0.2, 0.4, 0.6, 0.8])

In [72]:
X

array([[[0. ]],

       [[0.2]],

       [[0.4]],

       [[0.6]],

       [[0.8]]])

In [73]:
y

array([[0. ],
       [0.2],
       [0.4],
       [0.6],
       [0.8]])

In [75]:
# create LSTM
model = Sequential()
model.add(LSTM(n_neurons, input_shape=(1, 1)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
print(model.summary())
# train LSTM
model.fit(X, y, epochs=n_epoch, batch_size=n_batch, verbose=2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 5)                 140       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 6         
Total params: 146
Trainable params: 146
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/1000
 - 1s - loss: 0.2289
Epoch 2/1000
 - 0s - loss: 0.2279
Epoch 3/1000
 - 0s - loss: 0.2267
Epoch 4/1000
 - 0s - loss: 0.2254
Epoch 5/1000
 - 0s - loss: 0.2241
Epoch 6/1000
 - 0s - loss: 0.2228
Epoch 7/1000
 - 0s - loss: 0.2214
Epoch 8/1000
 - 0s - loss: 0.2201
Epoch 9/1000
 - 0s - loss: 0.2187
Epoch 10/1000
 - 0s - loss: 0.2174
Epoch 11/1000
 - 0s - loss: 0.2160
Epoch 12/1000
 - 0s - loss: 0.2146
Epoch 13/1000
 - 0s - loss: 0.2133
Epoch 14/1000
 - 0s - loss: 0.2119
Epoch 15/1000
 - 0s - loss: 0.2106
Epoch 16/1000
 - 0s

Epoch 215/1000
 - 0s - loss: 0.0462
Epoch 216/1000
 - 0s - loss: 0.0460
Epoch 217/1000
 - 0s - loss: 0.0458
Epoch 218/1000
 - 0s - loss: 0.0455
Epoch 219/1000
 - 0s - loss: 0.0453
Epoch 220/1000
 - 0s - loss: 0.0451
Epoch 221/1000
 - 0s - loss: 0.0449
Epoch 222/1000
 - 0s - loss: 0.0447
Epoch 223/1000
 - 0s - loss: 0.0445
Epoch 224/1000
 - 0s - loss: 0.0443
Epoch 225/1000
 - 0s - loss: 0.0441
Epoch 226/1000
 - 0s - loss: 0.0439
Epoch 227/1000
 - 0s - loss: 0.0437
Epoch 228/1000
 - 0s - loss: 0.0435
Epoch 229/1000
 - 0s - loss: 0.0433
Epoch 230/1000
 - 0s - loss: 0.0431
Epoch 231/1000
 - 0s - loss: 0.0430
Epoch 232/1000
 - 0s - loss: 0.0428
Epoch 233/1000
 - 0s - loss: 0.0426
Epoch 234/1000
 - 0s - loss: 0.0424
Epoch 235/1000
 - 0s - loss: 0.0423
Epoch 236/1000
 - 0s - loss: 0.0421
Epoch 237/1000
 - 0s - loss: 0.0419
Epoch 238/1000
 - 0s - loss: 0.0418
Epoch 239/1000
 - 0s - loss: 0.0416
Epoch 240/1000
 - 0s - loss: 0.0415
Epoch 241/1000
 - 0s - loss: 0.0413
Epoch 242/1000
 - 0s - loss:

Epoch 443/1000
 - 0s - loss: 0.0203
Epoch 444/1000
 - 0s - loss: 0.0202
Epoch 445/1000
 - 0s - loss: 0.0201
Epoch 446/1000
 - 0s - loss: 0.0200
Epoch 447/1000
 - 0s - loss: 0.0199
Epoch 448/1000
 - 0s - loss: 0.0198
Epoch 449/1000
 - 0s - loss: 0.0197
Epoch 450/1000
 - 0s - loss: 0.0196
Epoch 451/1000
 - 0s - loss: 0.0195
Epoch 452/1000
 - 0s - loss: 0.0194
Epoch 453/1000
 - 0s - loss: 0.0193
Epoch 454/1000
 - 0s - loss: 0.0192
Epoch 455/1000
 - 0s - loss: 0.0191
Epoch 456/1000
 - 0s - loss: 0.0190
Epoch 457/1000
 - 0s - loss: 0.0189
Epoch 458/1000
 - 0s - loss: 0.0188
Epoch 459/1000
 - 0s - loss: 0.0187
Epoch 460/1000
 - 0s - loss: 0.0186
Epoch 461/1000
 - 0s - loss: 0.0185
Epoch 462/1000
 - 0s - loss: 0.0185
Epoch 463/1000
 - 0s - loss: 0.0184
Epoch 464/1000
 - 0s - loss: 0.0183
Epoch 465/1000
 - 0s - loss: 0.0182
Epoch 466/1000
 - 0s - loss: 0.0181
Epoch 467/1000
 - 0s - loss: 0.0180
Epoch 468/1000
 - 0s - loss: 0.0179
Epoch 469/1000
 - 0s - loss: 0.0178
Epoch 470/1000
 - 0s - loss:

Epoch 671/1000
 - 0s - loss: 0.0036
Epoch 672/1000
 - 0s - loss: 0.0036
Epoch 673/1000
 - 0s - loss: 0.0036
Epoch 674/1000
 - 0s - loss: 0.0035
Epoch 675/1000
 - 0s - loss: 0.0035
Epoch 676/1000
 - 0s - loss: 0.0035
Epoch 677/1000
 - 0s - loss: 0.0034
Epoch 678/1000
 - 0s - loss: 0.0034
Epoch 679/1000
 - 0s - loss: 0.0033
Epoch 680/1000
 - 0s - loss: 0.0033
Epoch 681/1000
 - 0s - loss: 0.0033
Epoch 682/1000
 - 0s - loss: 0.0032
Epoch 683/1000
 - 0s - loss: 0.0032
Epoch 684/1000
 - 0s - loss: 0.0032
Epoch 685/1000
 - 0s - loss: 0.0031
Epoch 686/1000
 - 0s - loss: 0.0031
Epoch 687/1000
 - 0s - loss: 0.0031
Epoch 688/1000
 - 0s - loss: 0.0030
Epoch 689/1000
 - 0s - loss: 0.0030
Epoch 690/1000
 - 0s - loss: 0.0030
Epoch 691/1000
 - 0s - loss: 0.0029
Epoch 692/1000
 - 0s - loss: 0.0029
Epoch 693/1000
 - 0s - loss: 0.0029
Epoch 694/1000
 - 0s - loss: 0.0028
Epoch 695/1000
 - 0s - loss: 0.0028
Epoch 696/1000
 - 0s - loss: 0.0028
Epoch 697/1000
 - 0s - loss: 0.0027
Epoch 698/1000
 - 0s - loss:

Epoch 887/1000
 - 0s - loss: 2.6445e-04
Epoch 888/1000
 - 0s - loss: 2.6182e-04
Epoch 889/1000
 - 0s - loss: 2.5922e-04
Epoch 890/1000
 - 0s - loss: 2.5667e-04
Epoch 891/1000
 - 0s - loss: 2.5416e-04
Epoch 892/1000
 - 0s - loss: 2.5169e-04
Epoch 893/1000
 - 0s - loss: 2.4926e-04
Epoch 894/1000
 - 0s - loss: 2.4688e-04
Epoch 895/1000
 - 0s - loss: 2.4453e-04
Epoch 896/1000
 - 0s - loss: 2.4222e-04
Epoch 897/1000
 - 0s - loss: 2.3995e-04
Epoch 898/1000
 - 0s - loss: 2.3771e-04
Epoch 899/1000
 - 0s - loss: 2.3551e-04
Epoch 900/1000
 - 0s - loss: 2.3335e-04
Epoch 901/1000
 - 0s - loss: 2.3123e-04
Epoch 902/1000
 - 0s - loss: 2.2914e-04
Epoch 903/1000
 - 0s - loss: 2.2709e-04
Epoch 904/1000
 - 0s - loss: 2.2507e-04
Epoch 905/1000
 - 0s - loss: 2.2308e-04
Epoch 906/1000
 - 0s - loss: 2.2113e-04
Epoch 907/1000
 - 0s - loss: 2.1921e-04
Epoch 908/1000
 - 0s - loss: 2.1732e-04
Epoch 909/1000
 - 0s - loss: 2.1547e-04
Epoch 910/1000
 - 0s - loss: 2.1364e-04
Epoch 911/1000
 - 0s - loss: 2.1185e-04


<keras.callbacks.History at 0x2728e0977f0>