# Initialization

In [33]:
import tensorflow as tf
from tensorflow.python.util import nest

sess = tf.InteractiveSession()



In [2]:
rnn_size = 1024 # Number of hidden units in each layer
num_layers = 2 # Number of layers in each encoder and decoder
embedding_size = 1024 # Embedding dimensions of encoder and decoder inputs
learning_rate = 0.0001 # Learning rate
batch_size = 128 # Batch size
numEpochs = 30 # Maximum # of training epochs
steps_per_checkpoint = 100 # Save model checkpoint every this iteration
model_dir = 'model/'  # Path to save model checkpoints
model_name = 'chatbot.ckpt'  # File name used for model checkpoints
# word_to_idx = word_to_idx
vocab_size = 20000 # len(word_to_idx)
use_attention = True
beam_search = True
beam_size = 5
max_gradient_norm = 5.0
mode = 'train'

In [3]:
encoder_inputs = tf.placeholder(tf.int32, [None, None], name='encoder_inputs')
encoder_inputs_length = tf.placeholder(tf.int32, [None], name='encoder_inputs_length')

batch_size = tf.placeholder(tf.int32, [], name='batch_size')
keep_prob_placeholder = tf.placeholder(tf.float32, name='keep_prob_placeholder')

decoder_targets = tf.placeholder(tf.int32, [None, None], name='decoder_targets')
decoder_targets_length = tf.placeholder(tf.int32, [None], name='decoder_targets_length')


In [4]:
print(encoder_inputs)
print(encoder_inputs_length)
print(batch_size)
print(keep_prob_placeholder)

Tensor("encoder_inputs:0", shape=(?, ?), dtype=int32)
Tensor("encoder_inputs_length:0", shape=(?,), dtype=int32)
Tensor("batch_size:0", shape=(), dtype=int32)
Tensor("keep_prob_placeholder:0", dtype=float32)


## sequence_mask example

In [5]:
# 根据目标序列长度，选出其中最大值，然后使用该值构建序列长度的mask标志。用一个sequence_mask的例子来说明起作用
tmp = tf.sequence_mask([1, 3, 2], 5)
tmp.eval()

array([[ True, False, False, False, False],
       [ True,  True,  True, False, False],
       [ True,  True, False, False, False]])

In [6]:
max_target_sequence_length = tf.reduce_max(decoder_targets_length, name='max_target_len')
mask = tf.sequence_mask(decoder_targets_length, max_target_sequence_length, dtype=tf.float32, name='masks')
print(max_target_sequence_length)
print(mask)

Tensor("max_target_len:0", shape=(), dtype=int32)
Tensor("masks/Cast_1:0", shape=(?, ?), dtype=float32)


## Define Encoder

In [7]:
def _create_rnn_cell():
    def single_rnn_cell():
        # 创建单个cell，这里需要注意的是一定要使用一个single_rnn_cell的函数，不然直接把cell放在MultiRNNCell
        # 的列表中最终模型会发生错误
        single_cell = tf.contrib.rnn.LSTMCell(rnn_size)
        #添加dropout
        cell = tf.contrib.rnn.DropoutWrapper(single_cell, output_keep_prob=keep_prob_placeholder)
        return cell
    #列表中每个元素都是调用single_rnn_cell函数
    cell = tf.contrib.rnn.MultiRNNCell([single_rnn_cell() for _ in range(num_layers)])
    return cell

_create_rnn_cell()

<tensorflow.python.ops.rnn_cell_impl.MultiRNNCell at 0x116a93128>

In [10]:
with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
    #创建LSTMCell，两层+dropout
    encoder_cell = _create_rnn_cell()
    #构建embedding矩阵,encoder和decoder公用该词向量矩阵
    embedding = tf.get_variable('embedding', [vocab_size, embedding_size])
    encoder_inputs_embedded = tf.nn.embedding_lookup(embedding, encoder_inputs)
    # 使用dynamic_rnn构建LSTM模型，将输入编码成隐层向量。
    # encoder_outputs用于attention，batch_size*encoder_inputs_length*rnn_size,
    # encoder_state用于decoder的初始化状态，batch_size*rnn_szie
    encoder_outputs, encoder_state = tf.nn.dynamic_rnn(encoder_cell, encoder_inputs_embedded,
                                                       sequence_length=encoder_inputs_length,
                                                       dtype=tf.float32)
print(encoder_outputs)
print(encoder_inputs_length)

Tensor("encoder_2/rnn/transpose_1:0", shape=(?, ?, 1024), dtype=float32)
Tensor("encoder_inputs_length:0", shape=(?,), dtype=int32)


##  get_variable, embedding_lookup example

In [11]:
import tensorflow as tf;  
import numpy as np;  
  
c = np.random.random([10,1])  
b = tf.nn.embedding_lookup(c, [1, 3])  
embeddingaa = tf.get_variable('embeddingaa', [1, 2])
with tf.Session() as sess:  
    sess.run(tf.initialize_all_variables())  
    print(sess.run(b))
    print(c)
    print(embeddingaa.eval())




Instructions for updating:
Use `tf.global_variables_initializer` instead.
[[0.57112493]
 [0.24146912]]
[[0.61951534]
 [0.57112493]
 [0.24561572]
 [0.24146912]
 [0.6682324 ]
 [0.77851739]
 [0.98891948]
 [0.20760752]
 [0.46599252]
 [0.20757397]]
[[-0.10204363 -0.8321332 ]]


In [12]:
encoder_outputs = tf.contrib.seq2seq.tile_batch(encoder_outputs, multiplier=beam_size)
print(encoder_outputs)
print(batch_size)

Tensor("tile_batch/Reshape:0", shape=(?, ?, 1024), dtype=float32)
Tensor("batch_size:0", shape=(), dtype=int32)


## Define Decoder

## tile_batch example

In [13]:
a = np.array(range(10)).reshape(5,2)
b = tf.contrib.seq2seq.tile_batch(a, 2)
print(b.eval())
print(b.shape)

[[0 1]
 [0 1]
 [2 3]
 [2 3]
 [4 5]
 [4 5]
 [6 7]
 [6 7]
 [8 9]
 [8 9]]
(10, 2)


## stride_slice example

In [61]:
data = [[[1, 1, 1], [2, 2, 2]],
            [[3, 3, 3], [4, 4, 4]],
            [[5, 5, 5], [6, 6, 6]]]
x = tf.strided_slice(data,[0,0,0],[3,2,2], [2,1,1])
x.eval()

array([[[1, 1],
        [2, 2]],

       [[5, 5],
        [6, 6]]], dtype=int32)

## Concatenate Example

In [72]:
t1 = [[1, 2, 3], [4, 5, 6]]  
t2 = [[7, 8, 9], [10, 11, 12]]  
a = tf.concat([t1, t2], 1)
ending = tf.strided_slice(t1, [0, 0], [2, -1], [1, 1])
ending.eval()

array([[1, 2],
       [4, 5]], dtype=int32)

## Fill example

In [69]:
dim = [2,3]
data = tf.fill(dim, 5)
data.eval()

array([[5, 5, 5],
       [5, 5, 5]], dtype=int32)

## seq2seq.TrainingHelper Explanatin

In [81]:
# Ref : https://stackoverflow.com/questions/42130491/batch-major-vs-time-major-lstm
# axis 0: each batch
# axis 1: each time steps of each batch
# tf.contrib.seq2seq.TrainingHelper(time_major=False ...)
# time_major=False would transpose matrix of batch_data
# below is a toy example using unstack to show what happen in TrainingHelper
# when time_major=False
import tensorflow as tf
batch_data = tf.constant(np.random.rand(10).reshape(2,5))
e = tf.unstack(batch_data,axis=1)
with tf.Session() as sess:
    print(batch_data.eval())
    print(sess.run(e))

[[0.46275363 0.64004472 0.2621695  0.78560965 0.47488735]
 [0.66606458 0.91754313 0.57360561 0.6296369  0.58354235]]
[array([0.46275363, 0.66606458]), array([0.64004472, 0.91754313]), array([0.2621695 , 0.57360561]), array([0.78560965, 0.6296369 ]), array([0.47488735, 0.58354235])]


## Identity Example

In [95]:
x = tf.Variable(1.0)
y = tf.Variable(0.0)
x_plus_1 = tf.assign_add(x, 1)

with tf.control_dependencies([x_plus_1]):
    # indentity means assign x to y
    y = tf.identity(x)
init = tf.initialize_all_variables()

with tf.Session() as session:
    init.run()
    for i in range(5):
        print(y.eval())

2.0
3.0
4.0
5.0
6.0


In [96]:
# =================================3, 定义模型的decoder部分
with tf.variable_scope('decoder'):
    print(nest)
    if beam_search:
        # 如果使用beam_search，则需要将encoder的输出进行tile_batch，其实就是复制beam_size份。
        print("use beamsearch decoding..")
        encoder_outputs = tf.contrib.seq2seq.tile_batch(encoder_outputs, multiplier=beam_size)
        encoder_state = nest.map_structure(lambda s: tf.contrib.seq2seq.tile_batch(s, beam_size), encoder_state)
        encoder_inputs_length = tf.contrib.seq2seq.tile_batch(encoder_inputs_length, multiplier=beam_size)

    #定义要使用的attention机制。
    attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units=rnn_size, memory=encoder_outputs,
                                                             memory_sequence_length=encoder_inputs_length)
    #attention_mechanism = tf.contrib.seq2seq.LuongAttention(num_units=rnn_size, memory=encoder_outputs, memory_sequence_length=encoder_inputs_length)
    # 定义decoder阶段要是用的LSTMCell，然后为其封装attention wrapper
    decoder_cell = _create_rnn_cell()
    decoder_cell = tf.contrib.seq2seq.AttentionWrapper(cell=decoder_cell, attention_mechanism=attention_mechanism,
                                                       attention_layer_size=rnn_size, name='Attention_Wrapper')
    print(encoder_state)
    #如果使用beam_seach则batch_size = batch_size * beam_size。因为之前已经复制过一次
    batch_size = batch_size if not beam_search else batch_size * beam_size
    #定义decoder阶段的初始化状态，直接使用encoder阶段的最后一个隐层状态进行赋值
    decoder_initial_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32).clone(cell_state=encoder_state)
    output_layer = tf.layers.Dense(vocab_size, kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

    if mode == 'train':
        # 定义decoder阶段的输入，其实就是在decoder的target开始处添加一个<go>,并删除结尾处的<end>,并进行embedding。
        # decoder_inputs_embedded的shape为[batch_size, decoder_targets_length, embedding_size]
        ending = tf.strided_slice(decoder_targets, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], word_to_idx['<go>']), ending], 1)
        decoder_inputs_embedded = tf.nn.embedding_lookup(embedding, decoder_input)
        #训练阶段，使用TrainingHelper+BasicDecoder的组合，这一般是固定的，当然也可以自己定义Helper类，实现自己的功能
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_inputs_embedded,
                                                            sequence_length=decoder_targets_length,
                                                            time_major=False, name='training_helper')
        training_decoder = tf.contrib.seq2seq.BasicDecoder(cell=decoder_cell, helper=training_helper,
                                                           initial_state=decoder_initial_state, output_layer=output_layer)
        #调用dynamic_decode进行解码，decoder_outputs是一个namedtuple，里面包含两项(rnn_outputs, sample_id)
        # rnn_output: [batch_size, decoder_targets_length, vocab_size]，保存decode每个时刻每个单词的概率，可以用来计算loss
        # sample_id: [batch_size], tf.int32，保存最终的编码结果。可以表示最后的答案
        decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=training_decoder,
                                                                  impute_finished=True,
                                                            maximum_iterations=max_target_sequence_length)
        # 根据输出计算loss和梯度，并定义进行更新的AdamOptimizer和train_op
        decoder_logits_train = tf.identity(decoder_outputs.rnn_output)
        print(decoder_logits_train)
        decoder_predict_train = tf.argmax(decoder_logits_train, axis=-1, name='decoder_pred_train')
        # 使用sequence_loss计算loss，这里需要传入之前定义的mask标志
        loss = tf.contrib.seq2seq.sequence_loss(logits=decoder_logits_train,
                                                     targets=decoder_targets, weights=mask)

        # Training summary for the current batch_loss
        tf.summary.scalar('loss', loss)
        summary_op = tf.summary.merge_all()

        optimizer = tf.train.AdamOptimizer(learing_rate)
        trainable_params = tf.trainable_variables()
        gradients = tf.gradients(loss, trainable_params)
        clip_gradients, _ = tf.clip_by_global_norm(gradients, max_gradient_norm)
        train_op = optimizer.apply_gradients(zip(clip_gradients, trainable_params))
    elif mode == 'decode':
        start_tokens = tf.ones([batch_size, ], tf.int32) * word_to_idx['<go>']
        end_token = word_to_idx['<eos>']
        # decoder阶段根据是否使用beam_search决定不同的组合，
        # 如果使用则直接调用BeamSearchDecoder（里面已经实现了helper类）
        # 如果不使用则调用GreedyEmbeddingHelper+BasicDecoder的组合进行贪婪式解码
        if beam_search:
            inference_decoder = tf.contrib.seq2seq.BeamSearchDecoder(cell=decoder_cell, embedding=embedding,
                                                                     start_tokens=start_tokens, end_token=end_token,
                                                                     initial_state=decoder_initial_state,
                                                                     beam_width=beam_size,
                                                                     output_layer=output_layer)
        else:
            decoding_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding=embedding,
                                                                       start_tokens=start_tokens, end_token=end_token)
            inference_decoder = tf.contrib.seq2seq.BasicDecoder(cell=decoder_cell, helper=decoding_helper,
                                                                initial_state=decoder_initial_state,
                                                                output_layer=output_layer)
        decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=inference_decoder,
                                                        maximum_iterations=10)
        # 调用dynamic_decode进行解码，decoder_outputs是一个namedtuple，
        # 对于不使用beam_search的时候，它里面包含两项(rnn_outputs, sample_id)
        # rnn_output: [batch_size, decoder_targets_length, vocab_size]
        # sample_id: [batch_size, decoder_targets_length], tf.int32

        # 对于使用beam_search的时候，它里面包含两项(predicted_ids, beam_search_decoder_output)
        # predicted_ids: [batch_size, decoder_targets_length, beam_size],保存输出结果
        # beam_search_decoder_output: BeamSearchDecoderOutput instance namedtuple(scores, predicted_ids, parent_ids)
        # 所以对应只需要返回predicted_ids或者sample_id即可翻译成最终的结果
        if beam_search:
            decoder_predict_decode = decoder_outputs.predicted_ids
        else:
            decoder_predict_decode = tf.expand_dims(decoder_outputs.sample_id, -1)
# =================================4, 保存模型
saver = tf.train.Saver(tf.global_variables())

<module 'tensorflow.python.util.nest' from '/Users/zhangtaiwei/htdocs/learning/venv/lib/python3.6/site-packages/tensorflow/python/util/nest.py'>


ValueError: Variable decoder/memory_layer/kernel already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "/Users/zhangtaiwei/htdocs/learning/venv/lib/python3.6/site-packages/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py", line 215, in __init__
    self.memory_layer(self._values) if self.memory_layer  # pylint: disable=not-callable
  File "/Users/zhangtaiwei/htdocs/learning/venv/lib/python3.6/site-packages/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py", line 555, in __init__
    name=name)
  File "<ipython-input-94-ceba2e00287c>", line 13, in <module>
    memory_sequence_length=encoder_inputs_length)


# Data 

In [1]:
import pickle
data_path = 'data/dataset-cornell-length10-filter1-vocabSize40000.pkl'
data = pickle.load(open(data_path, 'rb'))

In [17]:
print(data.keys())
print(data['idCount'])
print(type(data['word2id']), list(data['word2id'].items())[:10])
print()
print([[[data['id2word'][i] for i in q], [data['id2word'][i] for i in a]] for q, a in data['trainingSamples'][:1]])
print()
print(type(data['id2word']), list(data['id2word'].items())[:10])

dict_keys(['idCount', 'word2id', 'trainingSamples', 'id2word'])
{}
<class 'dict'> [('non-smoking', 23039), ('seducing', 12205), ('peux', 21580), ('c.p', 7696), ('cache', 16876), ('vitamins', 9025), ('vent', 10424), ('finish', 3606), ('drugstore', 4578), ("g'head", 17181)]

[[['please', '.'], ['saturday', '?', 'night', '?']]]

<class 'dict'> [(0, '<pad>'), (1, '<go>'), (2, '<eos>'), (3, '<unknown>'), (4, 'can'), (5, 'we'), (6, 'make'), (7, 'this'), (8, 'quick'), (9, '?')]
