**Dual LSTM Encoder for Dialog Response Generation**

http://www.wildml.com/2016/07/deep-learning-for-chatbots-2-retrieval-based-model-tensorflow/

https://github.com/dennybritz/chatbot-retrieval

https://github.com/rkadlec/ubuntu-ranking-dataset-creator

https://arxiv.org/abs/1506.08909

In [None]:
import tensorflow as tf
# Show the installed TensorFlow version; the cells below use the graph/session API
# (tf.Graph, tf.InteractiveSession, tf.Session).
tf.VERSION

## 1. Word Embedding

In [None]:
# Fresh graph and interactive session for the word-embedding section.
# InteractiveSession(graph=...) installs itself as the default session and the
# given graph as the default graph, so ops created in the following cells land
# in `graph` and `.eval()` needs no explicit session argument.
# A bare `graph.as_default()` call is deliberately not used: outside a `with`
# statement it only creates a context manager and discards it.
graph = tf.Graph()
session = tf.InteractiveSession(graph=graph)
session

In [None]:
# Toy vocabulary: 4 word ids, each mapped to a 2-dimensional vector.
vocab_size = 4
embed_size = 2

# Hand-written embedding vectors; the position in the stacked list is the word id.
word_0 = [0, 0]
word_1 = [1, 0]
word_2 = [0, 1]
word_3 = [1, 1]

# Stack the four rows into one lookup table with one row per word id.
embeddings = tf.stack([word_0, word_1, word_2, word_3])

print('Embeddings:\n')
# Printing the tensor shows its symbolic description; eval() returns the values.
print(embeddings)
embeddings.eval()

In [None]:
# Two sentences of three word ids each.
batch_size = 2
sentence_length = 3

# Each entry is a word id, i.e. a row index into the embedding table.
sentence_0 = [0, 3, 2]
sentence_1 = [3, 1, 0]

# Stack the sentences into one batch, one row per sentence.
input_data = tf.stack([sentence_0, sentence_1])

print('Sentences:\n')
print(input_data)
input_data.eval()

In [None]:
# Replace every word id in input_data with its row from `embeddings`.
input_embed = tf.nn.embedding_lookup(embeddings, input_data)

print('Input:\n')
print(input_embed)
input_embed.eval()

In [None]:
# Close this section's session and drop the notebook's reference to its graph;
# the next section creates its own graph and session.
session.close()
del graph

## 2. Pairing

Similarity between **`c`** and **`c'`**, where **`c' = Mr`**; the score is the dot product **`c^T c' = c^T M r`**.

**`c`** -> encoded context vector

**`r`** -> encoded response vector

**`M`** -> matrix that translates a response vector into the context space

In [None]:
# Fresh graph and interactive session for the pairing section.
# InteractiveSession(graph=...) installs itself as the default session and the
# given graph as the default graph, so ops created in the following cells land
# in `graph` and `.eval()` needs no explicit session argument.
# A bare `graph.as_default()` call is deliberately not used: outside a `with`
# statement it only creates a context manager and discards it.
graph = tf.Graph()
session = tf.InteractiveSession(graph=graph)
session

In [None]:
# Fixed 2x2 matrix M, used by the following cells in the products ct * M * r.
M = tf.constant([[1, 2], [3, 4]])

# Static shape first, then the evaluated values.
print('M', M.shape, '\n')
print(M.eval(), '\n')

In [None]:
# One (context, response) pair, each stored as a 2x1 column vector.
c = tf.constant([[1], [2]])
r = tf.constant([[3], [4]])

# Score c^T M r in two steps: transpose c and multiply by M, then multiply by r.
ct_M = tf.matmul(c, M, transpose_a=True)
ct_M_r = tf.matmul(ct_M, r)

# For every tensor: label and static shape first, then the evaluated values.
for label, tensor in (('c', c),
                      ('r', r),
                      ('ct * M', ct_M),
                      ('ct * M * r', ct_M_r)):
    print(label, tensor.shape, '\n')
    print(tensor.eval(), '\n')

In [None]:
# Same pair as row vectors, with a second all-zero row added to each matrix.
ct = tf.constant([[1, 2], [0, 0]])
rt = tf.constant([[3, 4], [0, 0]])

# Row-wise ct * M, then multiply by the transpose of rt.
ct_M = tf.matmul(ct, M)
ct_M_r = tf.matmul(ct_M, rt, transpose_b=True)

# For every tensor: label and static shape first, then the evaluated values.
labels = ['ct', 'rt', 'ct * M', 'ct * M * r']
tensors = [ct, rt, ct_M, ct_M_r]
for label, tensor in zip(labels, tensors):
    print(label, tensor.shape, '\n')
    print(tensor.eval(), '\n')

In [None]:
# A second (context, response) pair, again as 2x1 column vectors.
c = tf.constant([[5], [6]])
r = tf.constant([[7], [8]])

# Score c^T M r in two steps: transpose c and multiply by M, then multiply by r.
ct_M = tf.matmul(c, M, transpose_a=True)
ct_M_r = tf.matmul(ct_M, r)

# For every tensor: label and static shape first, then the evaluated values.
for label, tensor in (('c', c),
                      ('r', r),
                      ('ct * M', ct_M),
                      ('ct * M * r', ct_M_r)):
    print(label, tensor.shape, '\n')
    print(tensor.eval(), '\n')

In [None]:
# Both pairs in one batch: row i of ct is a context, row i of rt its response.
ct = tf.constant([[1, 2], [5, 6]])
rt = tf.constant([[3, 4], [7, 8]])

# A plain matrix product pairs every context row with every response row,
# giving a 2x2 result rather than one score per pair.
ct_M = tf.matmul(ct, M)
ct_M_r = tf.matmul(ct_M, rt, transpose_b=True)

# For every tensor: label and static shape first, then the evaluated values.
labels = ['ct', 'rt', 'ct * M', 'ct * M * r']
tensors = [ct, rt, ct_M, ct_M_r]
for label, tensor in zip(labels, tensors):
    print(label, tensor.shape, '\n')
    print(tensor.eval(), '\n')

In [None]:
# Both pairs in one batch: row i of ct is a context, row i of rt its response.
ct = tf.constant([[1, 2], [5, 6]])
rt = tf.constant([[3, 4], [7, 8]])

ct_M = tf.matmul(ct, M)

# Add a trailing axis so each row becomes its own column vector; matmul then
# multiplies row i of ct_M only with row i of rt (one score per pair).
batch_ct_M = tf.expand_dims(ct_M, axis=2)
batch_rt = tf.expand_dims(rt, axis=2)

batch_ct_M_r = tf.matmul(batch_ct_M, batch_rt, transpose_a=True)

# Drop the extra trailing axis again.
ct_M_r = tf.squeeze(batch_ct_M_r, axis=2)

# For every tensor: label and static shape first, then the evaluated values.
for label, tensor in (('ct', ct),
                      ('rt', rt),
                      ('ct * M', ct_M),
                      ('ct * M (batch)', batch_ct_M),
                      ('rt (batch)', batch_rt),
                      ('ct * M * r (batch)', batch_ct_M_r),
                      ('ct * M * r', ct_M_r)):
    print(label, tensor.shape, '\n')
    print(tensor.eval(), '\n')

In [None]:
# Close this section's session and drop the notebook's reference to its graph;
# the next section creates its own graph and session.
session.close()
del graph

## 3. Dual LSTM Encoder

In [None]:
# Fresh graph and interactive session for the dual-encoder walkthrough.
# InteractiveSession(graph=...) installs itself as the default session and the
# given graph as the default graph, so ops created in the following cells land
# in `graph` and `.eval()` / `.run()` need no explicit session argument.
# A bare `graph.as_default()` call is deliberately not used: outside a `with`
# statement it only creates a context manager and discards it.
graph = tf.Graph()
session = tf.InteractiveSession(graph=graph)
session

In [None]:
# Small sizes so every intermediate tensor can be printed and inspected.
vocab_size = 25     # word ids are drawn from [0, vocab_size)
sentence_size = 4   # word ids per sentence
batch_size = 2      # number of (context, utterance) pairs
embed_size = 5      # width of each word embedding
hidden_size = 8     # number of LSTM units

**Input Sentence (Dataset)**

In [None]:
# Stand-in for real context sentences: random word ids, one row per sentence.
# NOTE: this is a random op, so every evaluation draws new values; the ids
# displayed by this cell are not the ones later evaluations will use.
input_context = tf.random_uniform(
    shape=(batch_size, sentence_size), minval=0, maxval=vocab_size, dtype=tf.int64)

print(input_context)
input_context.eval()

In [None]:
# Stand-in for real utterance (response) sentences: random word ids, one row per
# sentence.
# NOTE: this is a random op, so every evaluation draws new values; the ids
# displayed by this cell are not the ones later evaluations will use.
input_utterance = tf.random_uniform(
    shape=(batch_size, sentence_size), minval=0, maxval=vocab_size, dtype=tf.int64)

print(input_utterance)
input_utterance.eval()

In [None]:
# Length of every context sentence, as a (batch_size, 1) column; here all
# sentences use the full sentence_size.
input_context_len = tf.constant(sentence_size, shape=(batch_size, 1))

print(input_context_len)
input_context_len.eval()

In [None]:
# Length of every utterance sentence, as a (batch_size, 1) column; here all
# sentences use the full sentence_size.
input_utterance_len = tf.constant(sentence_size, shape=(batch_size, 1))

print(input_utterance_len)
input_utterance_len.eval()

**Dual Encoder - Input**

Encode Context and Utterance together.

The context and utterance tensors are concatenated along the batch axis so that both sentences are encoded in a single pass.

In [None]:
# Stack contexts and utterances along the batch axis: the first batch_size rows
# are contexts, the remaining batch_size rows are utterances.
input_data = tf.concat([input_context, input_utterance], axis=0)

print(input_data)
input_data.eval()

In [None]:
# Concatenate the length columns in the same order as input_data
# (contexts first, then utterances).
input_length = tf.concat([input_context_len, input_utterance_len], axis=0)

print(input_length)
input_length.eval()

In [None]:
# Flatten the length column into a 1-D vector with one entry per row of
# input_data; it is passed as sequence_length to dynamic_rnn below.
input_length = tf.reshape(input_length, [-1])

print(input_length)
input_length.eval()

**Word Embedding**

In [None]:
# Embedding table with one row per word id, created as a variable and
# initialized uniformly between -0.25 and 0.25.
embeddings = tf.Variable(
    tf.random_uniform(shape=(vocab_size, embed_size), minval=-0.25, maxval=0.25))

# Variables must be initialized before they can be evaluated; run() uses the
# default (interactive) session.
embeddings.initializer.run()

print(embeddings)
embeddings.eval()

In [None]:
# Replace every word id in the concatenated batch with its embedding row.
input_embed = tf.nn.embedding_lookup(embeddings, input_data)

print(input_embed)
input_embed.eval()

**LSTM Encoder**

https://www.tensorflow.org/api_docs/python/tf/nn/dynamic_rnn

https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/contrib/rnn/LSTMCell

In [None]:
# A single LSTM cell encodes both contexts and utterances, since they share one
# concatenated batch.
cell = tf.nn.rnn_cell.LSTMCell(
    hidden_size,
    forget_bias=2.0,
    use_peepholes=True,
    state_is_tuple=True)

# Run the cell over the embedded sentences; sequence_length holds one length per
# row. `states` exposes the final hidden state as `states.h` (used below).
outputs, states = tf.nn.dynamic_rnn(
    cell,
    input_embed,
    sequence_length=input_length,
    dtype=tf.float32)

# Initialize each of the cell's trainable variables in the default session
# before anything that depends on them is evaluated.
for tv in cell.trainable_variables:
    tv.initializer.run()

print('Outputs:\n')
print(outputs)
print()
print('Final states:\n')
print(states)

**Dual Encoder - Output**

Split the encoded vectors back into one tensor per sentence type (context and utterance).

In [None]:
# Undo the batch-axis concatenation: the first half of the final hidden states
# belongs to the contexts, the second half to the utterances.
context_encoding, utterance_encoding = tf.split(states.h, num_or_size_splits=2, axis=0)

In [None]:
# Symbolic description of the context encodings, then their evaluated values.
print(context_encoding)
context_encoding.eval()

In [None]:
# Symbolic description of the utterance encodings, then their evaluated values.
print(utterance_encoding)
utterance_encoding.eval()

**Prediction**

In [None]:
# Short names matching the pairing section: one encoded sentence per row.
ct = context_encoding
rt = utterance_encoding

# Square (hidden_size x hidden_size) matrix, created as a variable and
# initialized from a truncated normal distribution.
M = tf.Variable(tf.truncated_normal(shape=(hidden_size, hidden_size)))

# Initialize M in the default session before it is evaluated.
M.initializer.run()

print(M)
M.eval()

In [None]:
# Multiply every context encoding (row of ct) by M.
ct_M = tf.matmul(ct, M)

print(ct_M)
ct_M.eval()

In [None]:
# Add a trailing axis so each row of ct_M becomes its own column vector.
batch_ct_M = tf.expand_dims(ct_M, axis=2)

print(batch_ct_M)
batch_ct_M.eval()

In [None]:
# Add a trailing axis so each utterance encoding becomes its own column vector.
batch_rt = tf.expand_dims(rt, axis=2)

print(batch_rt)
batch_rt.eval()

In [None]:
# Batched product: entry i multiplies the transposed column i of batch_ct_M with
# column i of batch_rt, giving one score per (context, utterance) pair.
batch_ct_M_r = tf.matmul(batch_ct_M, batch_rt, transpose_a=True)
# Drop the extra trailing axis, leaving one score per row.
ct_M_r = tf.squeeze(batch_ct_M_r, axis=2)

print(ct_M_r)
ct_M_r.eval()

In [None]:
# Scalar bias added to every score, starting at 0.
b = tf.Variable(0, dtype=tf.float32)

# Initialize b in the default session before it is evaluated.
b.initializer.run()

print(b)
b.eval()

In [None]:
# Raw scores plus bias; the sigmoid maps each logit to a value between 0 and 1.
logits = ct_M_r + b
probs = tf.sigmoid(logits)

print(probs)
probs.eval()

**Loss**

https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/losses/sigmoid_cross_entropy

In [None]:
# Targets, one per (context, utterance) pair:
# 1 -> the utterance is the correct response to its context
# 0 -> the utterance is a random sentence that belongs to another context
# One row per pair, matching the one-score-per-row layout of the logits.
targets = tf.constant([1, 0], shape=(2, 1))

print(targets)
targets.eval()

In [None]:
# Sigmoid cross-entropy between the targets and the logits (not the
# probabilities), reduced with Reduction.MEAN to a single value.
loss = tf.losses.sigmoid_cross_entropy(
    multi_class_labels=targets, logits=logits, reduction=tf.losses.Reduction.MEAN)

print(loss)
loss.eval()

In [None]:
# Close this section's session and drop the notebook's reference to its graph;
# the final cell builds its own graph and session.
session.close()
del graph

In [None]:
def dual_encoder(vocab_size,
                 embed_size,
                 hidden_size,
                 input_context,
                 input_context_len,
                 input_utterance,
                 input_utterance_len,
                 targets=None):
    """Build the dual LSTM encoder and score each (context, utterance) pair.

    Contexts and utterances are concatenated along the batch axis, embedded
    with one shared table and encoded by one shared LSTM. The final hidden
    states are split back into context encodings ``c`` and utterance
    encodings ``r``, and every pair is scored as ``sigmoid(c^T M r + b)``.

    Args:
        vocab_size: Number of rows of the embedding table.
        embed_size: Width of each word embedding.
        hidden_size: Number of LSTM units; ``M`` is ``hidden_size`` square.
        input_context: Integer tensor of context word ids, one sentence per
            row.
        input_context_len: Integer tensor with the length of every context
            sentence; it is flattened to 1-D before use.
        input_utterance: Integer tensor of utterance word ids. It must hold as
            many rows as ``input_context`` because the encoder output is split
            into two equal halves.
        input_utterance_len: Integer tensor with the length of every
            utterance sentence; it is flattened to 1-D before use.
        targets: Optional labels, one per pair, laid out like the logits (one
            row per pair, one column). When ``None`` no loss is built.

    Returns:
        A ``(probs, loss)`` tuple. ``probs`` holds one probability per pair;
        ``loss`` is the mean sigmoid cross-entropy, or ``None`` when
        ``targets`` is ``None``.
    """
    # Encode both sentence types in one pass: contexts first, utterances second.
    input_data = tf.concat([input_context, input_utterance], axis=0)
    input_length = tf.concat([input_context_len, input_utterance_len], axis=0)
    # dynamic_rnn takes sequence_length as a 1-D vector.
    input_length = tf.reshape(input_length, [-1])

    embeddings = tf.get_variable(
        'embeddings',
        shape=(vocab_size, embed_size),
        initializer=tf.random_uniform_initializer(-0.25, 0.25))

    input_embed = tf.nn.embedding_lookup(
        embeddings, input_data, name='input_embed')

    with tf.variable_scope('rnn'):
        cell = tf.nn.rnn_cell.LSTMCell(
            hidden_size,
            forget_bias=2.0,
            use_peepholes=True,
            state_is_tuple=True)

        # Only the final state is used; the per-step outputs are discarded.
        _, states = tf.nn.dynamic_rnn(
            cell,
            input_embed,
            sequence_length=input_length,
            dtype=tf.float32)

        # Undo the concatenation: first half contexts, second half utterances.
        context_encoding, utterance_encoding = tf.split(
            states.h, num_or_size_splits=2, axis=0)

    with tf.variable_scope('prediction'):
        ct = context_encoding
        rt = utterance_encoding
        M = tf.get_variable(
            'M',
            shape=(hidden_size, hidden_size),
            initializer=tf.truncated_normal_initializer())

        ct_M = tf.matmul(ct, M)
        # Add a trailing axis so the batched matmul multiplies row i of ct_M
        # only with row i of rt, giving one score per pair.
        batch_ct_M = tf.expand_dims(ct_M, axis=2)
        batch_rt = tf.expand_dims(rt, axis=2)
        batch_ct_M_r = tf.matmul(batch_ct_M, batch_rt, transpose_a=True)
        ct_M_r = tf.squeeze(batch_ct_M_r, axis=2)

        b = tf.get_variable(
            'b', shape=(), initializer=tf.zeros_initializer())

        logits = ct_M_r + b

        probs = tf.sigmoid(logits)

    # Inference only: without labels there is nothing to compute a loss from.
    if targets is None:
        return probs, None

    loss = tf.losses.sigmoid_cross_entropy(
        multi_class_labels=targets, logits=logits, reduction=tf.losses.Reduction.MEAN)

    return probs, loss


# Smoke test: build dual_encoder at realistic sizes on random inputs and
# evaluate the loss once.
graph = tf.Graph()
with graph.as_default(), tf.Session(graph=graph) as session:
    vocab_size = 100000
    embed_size = 100
    hidden_size = 200

    batch_size = 128
    sentence_size = 160
    # Random word ids stand in for real sentences; all use the full length.
    input_context = tf.random_uniform(
        shape=(batch_size, sentence_size), minval=0, maxval=vocab_size, dtype=tf.int64)
    input_context_len = tf.constant(sentence_size, shape=(batch_size, 1))
    input_utterance = tf.random_uniform(
        shape=(batch_size, sentence_size), minval=0, maxval=vocab_size, dtype=tf.int64)
    input_utterance_len = tf.constant(sentence_size, shape=(batch_size, 1))
    # Integer random_uniform samples from [minval, maxval), so maxval must be 2
    # for the labels to contain both 0 and 1 (maxval=1 yields only zeros).
    targets = tf.random_uniform(
        shape=(batch_size, 1), minval=0, maxval=2, dtype=tf.int64)

    _, loss = dual_encoder(vocab_size,
                           embed_size,
                           hidden_size,
                           input_context,
                           input_context_len,
                           input_utterance,
                           input_utterance_len,
                           targets)

    # Initialize every variable created by dual_encoder before running the loss.
    init = tf.global_variables_initializer()
    session.run(init)

    loss_value = session.run(loss)

    print('Average loss: {:,.3f}'.format(loss_value))

del graph