**Dual LSTM Encoder for Dialog Response Generation**

http://www.wildml.com/2016/07/deep-learning-for-chatbots-2-retrieval-based-model-tensorflow/

https://github.com/dennybritz/chatbot-retrieval

https://github.com/rkadlec/ubuntu-ranking-dataset-creator

https://arxiv.org/abs/1506.08909

In [None]:
import tensorflow as tf
tf.VERSION

In [None]:
import os

# Dataset layout: every input file lives under <HOME_DIR>/data.
HOME_DIR = 'ubuntu'
DATA_DIR = os.path.join(HOME_DIR, 'data')
VOCAB_BIN = os.path.join(DATA_DIR, 'vocabulary.bin')
TRAIN_TFR = os.path.join(DATA_DIR, 'train.tfrecords')
VALID_TFR = os.path.join(DATA_DIR, 'valid.tfrecords')
TEST_TFR = os.path.join(DATA_DIR, 'test.tfrecords')

# Fail fast on the first required file that is missing, checking them in the
# order vocabulary, train, validation, test.
for required_file in (VOCAB_BIN, TRAIN_TFR, VALID_TFR, TEST_TFR):
    if not os.path.isfile(required_file):
        raise Exception('File not found: {}'.format(required_file))

# Last expression of the cell: the directory listing.
os.listdir(DATA_DIR)

**Vocabulary**

In [None]:
# NOTE: `tokenizer` has to be defined before the vocabulary object is restored,
# because pickle does not serialize functions.
def tokenizer(sentences):
    """Lazily split every sentence in `sentences` with `split()`.

    Returns a generator that yields one list of tokens per input sentence,
    in input order.
    """
    token_lists = (text.split() for text in sentences)
    return token_lists

class VocabularyAdapter:
    """Thin read-only wrapper around a restored `VocabularyProcessor`."""

    def __init__(self, vocabulary_bin):
        """Restore the vocabulary processor saved at `vocabulary_bin`.

        NOTE(review): restoring presumably unpickles the file, so only
        vocabulary files from a trusted source should be loaded -- confirm.
        """
        processor_cls = tf.contrib.learn.preprocessing.VocabularyProcessor
        self._vocab = processor_cls.restore(vocabulary_bin)

    @property
    def size(self):
        """Number of entries in the restored vocabulary."""
        entries = self._vocab.vocabulary_
        return len(entries)

    @property
    def vector_length(self):
        """The processor's `max_document_length`."""
        processor = self._vocab
        return processor.max_document_length

# Restore the vocabulary once and report its two dimensions.
vocab = VocabularyAdapter(VOCAB_BIN)

size_report = 'Vocabulary size: {:,d}'.format(vocab.size)
print(size_report)
length_report = 'Vector length: {:,d}'.format(vocab.vector_length)
print(length_report)

**Simple TFRecord + Example reader**

https://www.tensorflow.org/versions/r1.2/programmers_guide/reading_data

https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/parse_single_example

https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/parse_example

In [None]:
# Read three single examples straight from the training TFRecord file and
# print every parsed field.
with tf.Graph().as_default(), tf.Session() as session:
    filename_queue = tf.train.string_input_producer([TRAIN_TFR])

    reader = tf.TFRecordReader()
    key, value = reader.read(filename_queue)

    # Parsing spec: the two texts are int64 vectors of the vocabulary's vector
    # length; the lengths and the label are int64 scalars.
    example_features = {
        'context': tf.FixedLenFeature(vocab.vector_length, dtype=tf.int64),
        'context_len': tf.FixedLenFeature((), dtype=tf.int64),
        'utterance': tf.FixedLenFeature(vocab.vector_length, dtype=tf.int64),
        'utterance_len': tf.FixedLenFeature((), dtype=tf.int64),
        'label': tf.FixedLenFeature((), dtype=tf.int64),
    }

    example = tf.parse_single_example(value, example_features)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    try:
        for i in range(3):
            example_ = session.run(example)
            print('[ Example {} ]\n'.format(i))
            for field in ('context', 'context_len', 'utterance',
                          'utterance_len', 'label'):
                print('{}\n\n{}\n'.format(field, example_[field]))
    finally:
        # Always stop and join the queue-runner threads, even when reading or
        # parsing fails, so they are not left running behind the session.
        coord.request_stop()
        coord.join(threads)

**Input function for Estimators (tf.contrib.learn)**

https://www.tensorflow.org/versions/r1.2/get_started/input_fn

https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/contrib/learn/read_batch_record_features

https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/contrib/layers/feature_column

https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/contrib/layers/create_feature_spec_for_parsing

In [None]:
def input_features(vector_length):
    """Build the list of int64 real-valued feature columns for one example.

    The `context` and `utterance` columns have `vector_length` dimensions;
    `context_len`, `utterance_len` and `label` are one-dimensional. Columns
    are returned in the order: context, context_len, utterance,
    utterance_len, label.
    """
    column_dimensions = (
        ('context', vector_length),
        ('context_len', 1),
        ('utterance', vector_length),
        ('utterance_len', 1),
        ('label', 1),
    )
    make_column = tf.contrib.layers.real_valued_column
    return [
        make_column(column_name=name, dimension=dimension, dtype=tf.int64)
        for name, dimension in column_dimensions
    ]


# Build the feature columns once; the following cells reuse `features`.
features = input_features(vocab.vector_length)

# One column description per line.
print(*features, sep='\n')

In [None]:
# Parsing spec derived from the feature columns; evaluated as the cell's last
# expression so its value can be inspected.
tf.contrib.layers.create_feature_spec_for_parsing(features)

In [None]:
# Read one batch of training examples through `read_batch_record_features`
# and print every field of each example in the batch.
with tf.Graph().as_default(), tf.Session() as session:
    example_features = tf.contrib.layers.create_feature_spec_for_parsing(features)

    # Single source of truth for both the reader's batch size and the number
    # of examples printed below.
    batch_size = 3

    batch_example = tf.contrib.learn.read_batch_record_features(
        file_pattern=[TRAIN_TFR],
        batch_size=batch_size,
        features=example_features
    )

    print(batch_example, '\n')

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    try:
        examples = session.run(batch_example)

        for i in range(batch_size):
            print('[ Example {} ]\n'.format(i))
            for field in ('context', 'context_len', 'utterance',
                          'utterance_len', 'label'):
                print('{}\n\n{}\n'.format(field, examples[field][i]))
    finally:
        # Always stop and join the queue-runner threads, even when the batch
        # cannot be read, so they are not left running behind the session.
        coord.request_stop()
        coord.join(threads)

In [None]:
def input_fn(name, filenames, features, batch_size, num_epochs=None):
    """Create the batched input tensors read from TFRecord files.

    Args:
        name: suffix appended to 'read_batch_record_features_' to name the
            reader op.
        filenames: passed to the reader as `file_pattern`.
        features: feature columns from which the parsing spec is derived.
        batch_size: number of examples per batch.
        num_epochs: forwarded to the reader unchanged (default `None`).

    Returns:
        A `(batch_example, batch_target)` tuple: the parsed feature batch with
        its 'label' entry removed, and that removed 'label' entry.
    """
    parsing_spec = tf.contrib.layers.create_feature_spec_for_parsing(features)
    reader_name = 'read_batch_record_features_' + name

    batch = tf.contrib.learn.read_batch_record_features(
        file_pattern=filenames,
        batch_size=batch_size,
        features=parsing_spec,
        num_epochs=num_epochs,
        name=reader_name,
    )

    # Split the label off; whatever remains in the dict is the model input.
    target = batch.pop('label')
    return batch, target

def input_fn_train():
    """Training input: batches of 128 from TRAIN_TFR, one epoch."""
    return input_fn('train', [TRAIN_TFR], features, 128, 1)


def input_fn_valid():
    """Validation input: batches of 16 from VALID_TFR, one epoch."""
    return input_fn('valid', [VALID_TFR], features, 16, 1)


def input_fn_test():
    """Test input: batches of 16 from TEST_TFR, one epoch."""
    return input_fn('test', [TEST_TFR], features, 16, 1)

def _count_examples(session, coord, batch, heading):
    """Run `batch` until its epoch limit is reached and count the examples.

    Prints `heading`, then a progress line every 1000 batches, and a final
    total plus 'Epoch limit reached' once the reader raises
    `tf.errors.OutOfRangeError`. The loop also ends, without the final lines,
    if the coordinator has been asked to stop.

    Returns:
        The number of examples seen.
    """
    print(heading)

    batch_count = 0
    examples_count = 0

    try:
        while not coord.should_stop():
            data = session.run(batch)
            # data is (features, target); count rows of the 'context' feature.
            examples_count += len(data[0]['context'])
            batch_count += 1
            if batch_count % 1000 == 0:
                print('... {:,d} examples'.format(examples_count))
    except tf.errors.OutOfRangeError:
        print('... {:,d} examples'.format(examples_count))
        print('Epoch limit reached')

    return examples_count


# Make one full pass over the training, validation and test sets, counting
# the examples each input function delivers.
with tf.Graph().as_default(), tf.Session() as session:
    train_data = input_fn_train()
    valid_data = input_fn_valid()
    test_data = input_fn_test()

    # NOTE(review): presumably required for the epoch counters created by
    # `num_epochs` -- confirm against the reader documentation.
    tf.local_variables_initializer().run()

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    try:
        _count_examples(session, coord, train_data, '[ Training ]\n')
        print()
        _count_examples(session, coord, valid_data, '[ Validation ]\n')
        print()
        _count_examples(session, coord, test_data, '[ Test ]\n')
    finally:
        # Always stop and join the queue-runner threads, even when a pass
        # fails with something other than the expected end-of-epoch error.
        coord.request_stop()
        coord.join(threads)