**Dual LSTM Encoder for Dialog Response Selection (Retrieval-Based)**

http://www.wildml.com/2016/07/deep-learning-for-chatbots-2-retrieval-based-model-tensorflow/

https://github.com/dennybritz/chatbot-retrieval

https://github.com/rkadlec/ubuntu-ranking-dataset-creator

https://arxiv.org/abs/1506.08909

In [1]:
import tensorflow as tf
# Bare last expression: the cell output shows the installed TensorFlow version.
tf.VERSION

'1.2.0'

In [2]:
import os

# Base directory and the `data` sub-directory holding every file used below.
HOME_DIR = 'ubuntu'
DATA_DIR = os.path.join(HOME_DIR, 'data')

# Serialized vocabulary, then the train / validation / test TFRecord files.
VOCAB_BIN, TRAIN_TFR, VALID_TFR, TEST_TFR = [
    os.path.join(DATA_DIR, filename)
    for filename in (
        'vocabulary.bin',
        'train.tfrecords',
        'valid.tfrecords',
        'test.tfrecords',
    )
]

def has_file(file):
    """Fail fast when a required data file is missing.

    Args:
        file: Path that must point at an existing regular file.

    Returns:
        None. The function is used purely for its check.

    Raises:
        FileNotFoundError: If `file` does not exist or is not a regular file
            (for example, it is a directory). `FileNotFoundError` is a
            subclass of `Exception`, so code that caught the generic
            `Exception` raised previously keeps working.
    """
    if not os.path.isfile(file):
        raise FileNotFoundError('File not found: {}'.format(file))

# Abort early if any file this notebook reads is absent.
for required_path in (VOCAB_BIN, TRAIN_TFR, VALID_TFR, TEST_TFR):
    has_file(required_path)

# Cell output: everything currently present in the data directory.
os.listdir(DATA_DIR)

['udc.tar.gz',
 'train.csv',
 'valid.csv',
 'test.csv',
 'vocabulary.bin',
 'train.tfrecords',
 'valid.tfrecords',
 'test.tfrecords']

**Vocabulary**

In [3]:
# `tokenizer` function must be defined before restoring the vocabulary object
# (pickle does not serialize functions)
def tokenizer(sentences):
    """Lazily split each sentence on whitespace.

    Args:
        sentences: Iterable of objects exposing a `split()` method.

    Returns:
        A generator yielding, for every sentence in order, the list produced
        by `sentence.split()`.
    """
    def _split_each(lines):
        for line in lines:
            yield line.split()

    # `iter()` is called right away so a non-iterable argument fails at call
    # time, exactly as it does for a generator expression.
    return _split_each(iter(sentences))

class VocabularyAdapter:
    """Thin read-only wrapper around a restored `VocabularyProcessor`.

    Only two quantities are exposed: the number of vocabulary entries and the
    processor's maximum document length.
    """

    def __init__(self, vocabulary_bin):
        """Restore the vocabulary processor stored at `vocabulary_bin`.

        NOTE(review): the restore step appears to unpickle the file (see the
        pickle remark next to `tokenizer`) - only load files you trust.
        """
        processor_cls = tf.contrib.learn.preprocessing.VocabularyProcessor
        self._vocab = processor_cls.restore(vocabulary_bin)

    @property
    def size(self):
        """Number of entries in the restored vocabulary."""
        entries = self._vocab.vocabulary_
        return len(entries)

    @property
    def vector_length(self):
        """The restored processor's `max_document_length`."""
        processor = self._vocab
        return processor.max_document_length

# Restore the vocabulary once and report its two dimensions.
vocab = VocabularyAdapter(VOCAB_BIN)

print(
    'Vocabulary size: {:,d}'.format(vocab.size),
    'Vector length: {:,d}'.format(vocab.vector_length),
    sep='\n'
)

Vocabulary size: 91,619
Vector length: 160


**Simple TFRecord + Example reader**

https://www.tensorflow.org/versions/r1.2/programmers_guide/reading_data

https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/parse_single_example

https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/parse_example

In [4]:
# Read three serialized examples straight from the training TFRecord file and
# print every field, using the low-level reader + queue-runner API.
with tf.Graph().as_default(), tf.Session() as session:
    # Queue of input file names consumed by the record reader.
    filename_queue = tf.train.string_input_producer([TRAIN_TFR])

    reader = tf.TFRecordReader()
    key, value = reader.read(filename_queue)

    # Parsing spec for one training example: two fixed-length int64 vectors of
    # `vocab.vector_length` entries plus three scalar int64 fields.
    example_features = {
        'context': tf.FixedLenFeature(vocab.vector_length, dtype=tf.int64),
        'context_len': tf.FixedLenFeature((), dtype=tf.int64),
        'utterance': tf.FixedLenFeature(vocab.vector_length, dtype=tf.int64),
        'utterance_len': tf.FixedLenFeature((), dtype=tf.int64),
        'label': tf.FixedLenFeature((), dtype=tf.int64),
    }

    example = tf.parse_single_example(value, example_features)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    try:
        for i in range(3):
            example_ = session.run(example)
            print('[ Example {} ]\n'.format(i))
            # Same fields, same order, same layout as printing them one by one.
            for field in ('context', 'context_len', 'utterance',
                          'utterance_len', 'label'):
                print('{}\n\n{}\n'.format(field, example_[field]))
    finally:
        # Stop and join the queue-runner threads even when reading fails, so
        # no background thread outlives the session.
        coord.request_stop()
        coord.join(threads)

[ Example 0 ]

context

[    3    73   262   108   756     4   301   837   341  1271     5    28
    53    40   262    56     8    90   341   920    10     3    73     9
     6   660   197  1057     4  1236    20   576   348    13   197   756
   259   119    13    40     1     2     9    81     6   218   419     8
   290    11  2429  2628    25     0     1   153   134     1     2    93
     1   534   262    23  1598    62  4389     0     1     2   144     7
     1     2    70     1     2    29   340     1     9     6  1935   118
    50 10075     7     1     2    79     1    28     9    33   156    29
     4   111   129    17   265     1    32    11   116  2071  7281    18
   409   808     1     2    12    67     6   400   158    50  2071   983
     7     1   529    11   312   731   106   613    26     8    14    37
     4  6645    51    29    14    26    12   819     1     2    38  3721
     6   164 65630     7     1     2    29   948  3721     5    28    11
  1447  5455     1     2]



**Input function for Estimators**

https://www.tensorflow.org/versions/r1.2/get_started/input_fn

https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/contrib/learn/read_batch_record_features

https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/feature_column

https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/feature_column/make_parse_example_spec

In [5]:
def features_train(vector_length):
    """Build the feature columns describing one training example.

    Args:
        vector_length: Shape passed for the `context` and `utterance` columns.

    Returns:
        A list of int64 numeric columns, in this order: `context`,
        `context_len`, `utterance`, `utterance_len`, `label`. The two text
        columns use `vector_length` as their shape; the others use shape 1.
    """
    column_shapes = (
        ('context', vector_length),
        ('context_len', 1),
        ('utterance', vector_length),
        ('utterance_len', 1),
        ('label', 1),
    )
    return [
        tf.feature_column.numeric_column(key=name, shape=dims, dtype=tf.int64)
        for name, dims in column_shapes
    ]

# Feature columns for training examples, sized by the vocabulary's vector length.
train_features = features_train(vocab.vector_length)

# Show one column definition per line.
for column in train_features:
    print(column)

_NumericColumn(key='context', shape=(160,), default_value=None, dtype=tf.int64, normalizer_fn=None)
_NumericColumn(key='context_len', shape=(1,), default_value=None, dtype=tf.int64, normalizer_fn=None)
_NumericColumn(key='utterance', shape=(160,), default_value=None, dtype=tf.int64, normalizer_fn=None)
_NumericColumn(key='utterance_len', shape=(1,), default_value=None, dtype=tf.int64, normalizer_fn=None)
_NumericColumn(key='label', shape=(1,), default_value=None, dtype=tf.int64, normalizer_fn=None)


In [6]:
# Cell output: the parsing spec derived from the training feature columns.
tf.feature_column.make_parse_example_spec(train_features)

{'context': FixedLenFeature(shape=(160,), dtype=tf.int64, default_value=None),
 'context_len': FixedLenFeature(shape=(1,), dtype=tf.int64, default_value=None),
 'label': FixedLenFeature(shape=(1,), dtype=tf.int64, default_value=None),
 'utterance': FixedLenFeature(shape=(160,), dtype=tf.int64, default_value=None),
 'utterance_len': FixedLenFeature(shape=(1,), dtype=tf.int64, default_value=None)}

In [7]:
def features_eval(vector_length, num_distractors=9):
    """Build the feature columns describing one evaluation example.

    Args:
        vector_length: Shape passed for every text column.
        num_distractors: Number of `distractor_<i>` text columns to include
            (`distractor_0` .. `distractor_<num_distractors - 1>`). Defaults
            to 9, the value that used to be hard-coded.

    Returns:
        A list of int64 numeric columns. For each of `context`, `utterance`
        and every distractor key, in that order, the list holds the text
        column (shape `vector_length`) followed by its `<key>_len` column
        (shape 1).
    """
    keys = ['context', 'utterance']
    keys += ['distractor_{}'.format(i) for i in range(num_distractors)]

    features = []
    for key in keys:
        features.append(tf.feature_column.numeric_column(
            key=key, shape=vector_length, dtype=tf.int64))
        features.append(tf.feature_column.numeric_column(
            key=key + '_len', shape=1, dtype=tf.int64))
    return features

# Feature columns for evaluation examples, sized by the vocabulary's vector length.
eval_features = features_eval(vocab.vector_length)

# Show one column definition per line.
for column in eval_features:
    print(column)

_NumericColumn(key='context', shape=(160,), default_value=None, dtype=tf.int64, normalizer_fn=None)
_NumericColumn(key='context_len', shape=(1,), default_value=None, dtype=tf.int64, normalizer_fn=None)
_NumericColumn(key='utterance', shape=(160,), default_value=None, dtype=tf.int64, normalizer_fn=None)
_NumericColumn(key='utterance_len', shape=(1,), default_value=None, dtype=tf.int64, normalizer_fn=None)
_NumericColumn(key='distractor_0', shape=(160,), default_value=None, dtype=tf.int64, normalizer_fn=None)
_NumericColumn(key='distractor_0_len', shape=(1,), default_value=None, dtype=tf.int64, normalizer_fn=None)
_NumericColumn(key='distractor_1', shape=(160,), default_value=None, dtype=tf.int64, normalizer_fn=None)
_NumericColumn(key='distractor_1_len', shape=(1,), default_value=None, dtype=tf.int64, normalizer_fn=None)
_NumericColumn(key='distractor_2', shape=(160,), default_value=None, dtype=tf.int64, normalizer_fn=None)
_NumericColumn(key='distractor_2_len', shape=(1,), default_valu

In [8]:
# Cell output: the parsing spec derived from the evaluation feature columns.
tf.feature_column.make_parse_example_spec(eval_features)

{'context': FixedLenFeature(shape=(160,), dtype=tf.int64, default_value=None),
 'context_len': FixedLenFeature(shape=(1,), dtype=tf.int64, default_value=None),
 'distractor_0': FixedLenFeature(shape=(160,), dtype=tf.int64, default_value=None),
 'distractor_0_len': FixedLenFeature(shape=(1,), dtype=tf.int64, default_value=None),
 'distractor_1': FixedLenFeature(shape=(160,), dtype=tf.int64, default_value=None),
 'distractor_1_len': FixedLenFeature(shape=(1,), dtype=tf.int64, default_value=None),
 'distractor_2': FixedLenFeature(shape=(160,), dtype=tf.int64, default_value=None),
 'distractor_2_len': FixedLenFeature(shape=(1,), dtype=tf.int64, default_value=None),
 'distractor_3': FixedLenFeature(shape=(160,), dtype=tf.int64, default_value=None),
 'distractor_3_len': FixedLenFeature(shape=(1,), dtype=tf.int64, default_value=None),
 'distractor_4': FixedLenFeature(shape=(160,), dtype=tf.int64, default_value=None),
 'distractor_4_len': FixedLenFeature(shape=(1,), dtype=tf.int64, default_val

In [9]:
# Read one batch of training examples through `read_batch_record_features`
# and print every field of every example in the batch.
with tf.Graph().as_default(), tf.Session() as session:
    example_features = tf.feature_column.make_parse_example_spec(train_features)

    batch_example = tf.contrib.learn.read_batch_record_features(
        file_pattern=[TRAIN_TFR],
        batch_size=3,
        features=example_features
    )

    print(batch_example, '\n')

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    try:
        examples = session.run(batch_example)

        # Iterate over the rows actually fetched rather than repeating the
        # batch size as a second literal.
        for i in range(len(examples['context'])):
            print('[ Example {} ]\n'.format(i))
            # Same fields, same order, same layout as printing them one by one.
            for field in ('context', 'context_len', 'utterance',
                          'utterance_len', 'label'):
                print('{}\n\n{}\n'.format(field, examples[field][i]))
    finally:
        # Stop and join the queue-runner threads even when reading fails, so
        # no background thread outlives the session.
        coord.request_stop()
        coord.join(threads)

{'context': <tf.Tensor 'dequeue_record_examples/fifo_queue_Dequeue:0' shape=(3, 160) dtype=int64>, 'context_len': <tf.Tensor 'dequeue_record_examples/fifo_queue_Dequeue:1' shape=(3, 1) dtype=int64>, 'label': <tf.Tensor 'dequeue_record_examples/fifo_queue_Dequeue:2' shape=(3, 1) dtype=int64>, 'utterance': <tf.Tensor 'dequeue_record_examples/fifo_queue_Dequeue:3' shape=(3, 160) dtype=int64>, 'utterance_len': <tf.Tensor 'dequeue_record_examples/fifo_queue_Dequeue:4' shape=(3, 1) dtype=int64>} 

[ Example 0 ]

context

[  137    57     3    56    89    71    30     3   199     8   358     0
    19  1267  1351    10     0    82   280     6    93    43  2118     5
  3143     5  1127     5   684    27    28   141   123    21   315     8
     4   104    30     9   161    40    33    11    77    32   315   638
     5    28     3    14    21    52   126     8    92    10 38327   161
    93     5   142    58    63     4     0   219    54    30    68  1464
   870     7     7     1     2   180   70

In [10]:
def _input_reader(name, filenames, features, batch_size, num_epochs):
    """Create a randomized batched reader over TFRecord files.

    Args:
        name: Suffix appended to `'read_batch_record_features_'` to name the op.
        filenames: Passed through as the reader's `file_pattern`.
        features: Feature columns; converted to a parsing spec with
            `make_parse_example_spec`.
        batch_size: Number of examples per batch; also used to size the queue.
        num_epochs: Passed through to the reader unchanged.

    Returns:
        Whatever `tf.contrib.learn.read_batch_record_features` returns for
        these settings.
    """
    parse_spec = tf.feature_column.make_parse_example_spec(features)
    # Fixed head-room of 200000 plus ten batches' worth of examples.
    capacity = 200000 + batch_size * 10
    op_name = 'read_batch_record_features_' + name
    return tf.contrib.learn.read_batch_record_features(
        file_pattern=filenames,
        features=parse_spec,
        batch_size=batch_size,
        num_epochs=num_epochs,
        randomize_input=True,
        queue_capacity=capacity,
        name=op_name,
    )


def input_train(name, filenames, features, batch_size, num_epochs=None):
    """Input function body for training data.

    Builds the batched reader via `_input_reader` and separates the `label`
    entry from the rest of the batch.

    Returns:
        A `(features_batch, labels)` pair: the reader's batch with the
        `'label'` entry removed, and that removed entry.
    """
    features_batch = _input_reader(
        name, filenames, features, batch_size, num_epochs)
    # `pop` both extracts the target and strips it from the model inputs.
    labels = features_batch.pop('label')
    return features_batch, labels

def input_eval(name, filenames, features, batch_size, num_epochs=None):
    """Input function body for evaluation data.

    Builds the batched reader via `_input_reader` and pairs it with an
    all-zero target shaped like the batch's `context_len` entry.

    Returns:
        A `(features_batch, target)` pair: the reader's batch, untouched, and
        `tf.zeros_like(features_batch['context_len'])`.
    """
    features_batch = _input_reader(
        name, filenames, features, batch_size, num_epochs)
    context_lengths = features_batch['context_len']
    return features_batch, tf.zeros_like(context_lengths)

def input_fn_train():
    """Zero-argument training input: batches of 128, a single epoch."""
    return input_train('train', [TRAIN_TFR], train_features, 128, 1)


def input_fn_valid():
    """Zero-argument validation input: batches of 16, a single epoch."""
    return input_eval('valid', [VALID_TFR], eval_features, 16, 1)


def input_fn_test():
    """Zero-argument test input: batches of 16, a single epoch."""
    return input_eval('test', [TEST_TFR], eval_features, 16, 1)

def _count_examples(session, coord, batch, title):
    """Drain one input pipeline, printing a running count of examples read.

    Args:
        session: Session used to fetch batches.
        coord: Coordinator whose stop flag ends the loop early.
        batch: The `(features, target)` pair returned by an input function.
        title: Heading printed before reading starts.

    Returns:
        Total number of examples fetched before the pipeline was exhausted.
    """
    print('[ {} ]\n'.format(title))

    batch_count = 0
    examples_count = 0

    try:
        while not coord.should_stop():
            data = session.run(batch)
            # data[0] is the features dict; its row count is the batch size.
            examples_count += len(data[0]['context'])
            batch_count += 1
            if batch_count % 1000 == 0:
                print('... {:,d} examples'.format(examples_count))
    except tf.errors.OutOfRangeError:
        # Raised once the reader has nothing left to return.
        print('... {:,d} examples'.format(examples_count))
        print('Epoch limit reached')

    return examples_count


with tf.Graph().as_default(), tf.Session() as session:
    # Build all three pipelines in the same graph, in train/valid/test order.
    pipelines = [
        ('Training', input_fn_train()),
        ('Validation', input_fn_valid()),
        ('Test', input_fn_test()),
    ]

    # NOTE(review): presumably required because the epoch limit is kept in
    # local variables - confirm against the TF input-pipeline docs.
    tf.local_variables_initializer().run()

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    try:
        for index, (title, batch) in enumerate(pipelines):
            if index:
                # Blank line between sections, none before the first.
                print()
            _count_examples(session, coord, batch, title)
    finally:
        # Stop and join the queue-runner threads even when reading fails, so
        # no background thread outlives the session.
        coord.request_stop()
        coord.join(threads)

[ Training ]

... 128,000 examples
... 256,000 examples
... 384,000 examples
... 512,000 examples
... 640,000 examples
... 768,000 examples
... 896,000 examples
... 1,000,000 examples
Epoch limit reached

[ Validation ]

... 16,000 examples
... 19,560 examples
Epoch limit reached

[ Test ]

... 16,000 examples
... 18,920 examples
Epoch limit reached
