**Dual LSTM Encoder for Dialog Response Generation**

http://www.wildml.com/2016/07/deep-learning-for-chatbots-2-retrieval-based-model-tensorflow/

https://github.com/dennybritz/chatbot-retrieval

https://github.com/rkadlec/ubuntu-ranking-dataset-creator

https://arxiv.org/abs/1506.08909

In [1]:
import tensorflow as tf
# Display the installed TensorFlow version as the cell's result.
tf.VERSION

'1.2.0'

In [2]:
import os

# Root of the dataset checkout and the directory holding its prepared files.
HOME_DIR = 'ubuntu'
DATA_DIR = os.path.join(HOME_DIR, 'data')

# Artifacts the rest of the notebook reads from DATA_DIR.
VOCAB_BIN = os.path.join(DATA_DIR, 'vocabulary.bin')
TRAIN_TFR = os.path.join(DATA_DIR, 'train.tfrecords')
VALID_TFR = os.path.join(DATA_DIR, 'valid.tfrecords')
TEST_TFR = os.path.join(DATA_DIR, 'test.tfrecords')

# Fail fast on the first missing artifact, checking in declaration order.
for required_file in (VOCAB_BIN, TRAIN_TFR, VALID_TFR, TEST_TFR):
    if not os.path.isfile(required_file):
        raise Exception('File not found: {}'.format(required_file))

# Cell result: everything currently present in the data directory.
os.listdir(DATA_DIR)

['udc.tar.gz',
 'train.csv',
 'valid.csv',
 'test.csv',
 'vocabulary.bin',
 'train.tfrecords',
 'valid.tfrecords',
 'test.tfrecords']

**Vocabulary**

In [3]:
# `tokenizer` must be defined before restoring the vocabulary object:
# pickle stores functions by reference (module and name), not by value,
# so the name has to exist when the pickled object is loaded.
def tokenizer(sentences):
    """Lazily split each sentence on whitespace.

    Args:
        sentences: iterable of strings.

    Returns:
        A generator yielding one list of tokens per input sentence.
    """
    tokens_per_sentence = (text.split() for text in sentences)
    return tokens_per_sentence

class VocabularyAdapter:
    """Read-only view over a saved ``VocabularyProcessor``.

    Exposes only two numbers: how many entries the restored vocabulary
    holds and the processor's maximum document length.
    """

    def __init__(self, vocabulary_bin):
        """Restore the processor stored at ``vocabulary_bin``."""
        processor_cls = tf.contrib.learn.preprocessing.VocabularyProcessor
        self._vocab = processor_cls.restore(vocabulary_bin)

    @property
    def size(self):
        """Number of entries in the restored vocabulary."""
        entries = self._vocab.vocabulary_
        return len(entries)

    @property
    def vector_length(self):
        """The restored processor's ``max_document_length``."""
        processor = self._vocab
        return processor.max_document_length

# Restore the saved vocabulary once; later cells read `vocab.vector_length`.
vocab = VocabularyAdapter(VOCAB_BIN)

# Report both summary numbers with thousands separators.
for template, value in (
    ('Vocabulary size: {:,d}', vocab.size),
    ('Vector length: {:,d}', vocab.vector_length),
):
    print(template.format(value))

Vocabulary size: 91,619
Vector length: 220


**Simple TFRecord + Example reader**

https://www.tensorflow.org/versions/r1.2/programmers_guide/reading_data

https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/parse_single_example

https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/parse_example

In [4]:
# Read three records from the training TFRecord file one at a time and print
# every parsed field, using a throwaway graph and session.
with tf.Graph().as_default(), tf.Session() as session:
    filename_queue = tf.train.string_input_producer([TRAIN_TFR])

    reader = tf.TFRecordReader()
    # Only the serialized record is needed; the record key is ignored.
    _, serialized_example = reader.read(filename_queue)

    # Each record holds two int64 vectors of length `vocab.vector_length`
    # plus three scalar int64 values.
    example_features = {
        'context': tf.FixedLenFeature(vocab.vector_length, dtype=tf.int64),
        'context_len': tf.FixedLenFeature((), dtype=tf.int64),
        'utterance': tf.FixedLenFeature(vocab.vector_length, dtype=tf.int64),
        'utterance_len': tf.FixedLenFeature((), dtype=tf.int64),
        'label': tf.FixedLenFeature((), dtype=tf.int64),
    }

    example = tf.parse_single_example(serialized_example, example_features)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    # Fields are printed in this fixed order for every example.
    field_names = (
        'context', 'context_len', 'utterance', 'utterance_len', 'label')

    try:
        for i in range(3):
            example_ = session.run(example)
            print('[ Example {} ]\n'.format(i))
            for field in field_names:
                print('{}\n\n{}\n'.format(field, example_[field]))
    finally:
        # Always stop and join the queue-runner threads, even when reading
        # or parsing fails, so they do not outlive this cell.
        coord.request_stop()
        coord.join(threads)

[ Example 0 ]

context

[   3   61   29  309   63   69   98    4   66   12  866   10    1    2   93
    5    9  153   60   12  183 1023   32   47    5   59    1  196  581  197
   45    4   66  262  866   46    5   31  171    6   45    4   66  262   56
    8 2422   46    1    2    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    

**Input function for Estimators**

https://www.tensorflow.org/versions/r1.2/get_started/input_fn

https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/contrib/learn/read_batch_record_features

https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/feature_column

https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/feature_column/make_parse_example_spec

In [5]:
def input_features(vector_length):
    """Build the int64 numeric feature columns describing one example.

    Args:
        vector_length: shape given to the 'context' and 'utterance' columns.

    Returns:
        A list of five numeric columns, in the order 'context',
        'context_len', 'utterance', 'utterance_len', 'label'. The three
        non-vector columns have shape 1.
    """
    column_shapes = (
        ('context', vector_length),
        ('context_len', 1),
        ('utterance', vector_length),
        ('utterance_len', 1),
        ('label', 1),
    )
    return [
        tf.feature_column.numeric_column(
            key=column_key, shape=column_shape, dtype=tf.int64)
        for column_key, column_shape in column_shapes
    ]


# Feature columns reused by the parsing-spec and `input_fn` cells below.
features = input_features(vector_length=vocab.vector_length)

# Print one column definition per line.
for column in features:
    print(column)

_NumericColumn(key='context', shape=(220,), default_value=None, dtype=tf.int64, normalizer_fn=None)
_NumericColumn(key='context_len', shape=(1,), default_value=None, dtype=tf.int64, normalizer_fn=None)
_NumericColumn(key='utterance', shape=(220,), default_value=None, dtype=tf.int64, normalizer_fn=None)
_NumericColumn(key='utterance_len', shape=(1,), default_value=None, dtype=tf.int64, normalizer_fn=None)
_NumericColumn(key='label', shape=(1,), default_value=None, dtype=tf.int64, normalizer_fn=None)


In [6]:
# Show the per-key parsing spec derived from the feature columns.
tf.feature_column.make_parse_example_spec(features)

{'context': FixedLenFeature(shape=(220,), dtype=tf.int64, default_value=None),
 'context_len': FixedLenFeature(shape=(1,), dtype=tf.int64, default_value=None),
 'label': FixedLenFeature(shape=(1,), dtype=tf.int64, default_value=None),
 'utterance': FixedLenFeature(shape=(220,), dtype=tf.int64, default_value=None),
 'utterance_len': FixedLenFeature(shape=(1,), dtype=tf.int64, default_value=None)}

In [7]:
# Read one batch from the training TFRecord file with
# `read_batch_record_features` and print every field of every example.
with tf.Graph().as_default(), tf.Session() as session:
    # Parsing spec derived from the feature columns instead of hand-written.
    example_features = tf.feature_column.make_parse_example_spec(features)

    # One value drives both the reader and the print loop, so they cannot
    # drift apart.
    batch_size = 3

    batch_example = tf.contrib.learn.read_batch_record_features(
        file_pattern=[TRAIN_TFR],
        batch_size=batch_size,
        features=example_features
    )

    print(batch_example, '\n')

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    # Fields are printed in this fixed order for every example.
    field_names = (
        'context', 'context_len', 'utterance', 'utterance_len', 'label')

    try:
        examples = session.run(batch_example)

        for i in range(batch_size):
            print('[ Example {} ]\n'.format(i))
            for field in field_names:
                print('{}\n\n{}\n'.format(field, examples[field][i]))
    finally:
        # Always stop and join the queue-runner threads, even when the read
        # fails, so they do not outlive this cell.
        coord.request_stop()
        coord.join(threads)

{'context': <tf.Tensor 'dequeue_record_examples/fifo_queue_Dequeue:0' shape=(3, 220) dtype=int64>, 'context_len': <tf.Tensor 'dequeue_record_examples/fifo_queue_Dequeue:1' shape=(3, 1) dtype=int64>, 'label': <tf.Tensor 'dequeue_record_examples/fifo_queue_Dequeue:2' shape=(3, 1) dtype=int64>, 'utterance': <tf.Tensor 'dequeue_record_examples/fifo_queue_Dequeue:3' shape=(3, 220) dtype=int64>, 'utterance_len': <tf.Tensor 'dequeue_record_examples/fifo_queue_Dequeue:4' shape=(3, 1) dtype=int64>} 

[ Example 0 ]

context

[ 578    1  113    0 3396  625  477  261   19  429 2215   10    1  113  230
 3396  538    1 1765    1  658    1  936    1 1468    7    4  338  379  450
  130   47    3  165  272  274    7    1    6    9 7209    7    1 8915   91
  104 7066   91  338    1   87   24    4  338  379    7    1    2    6   12
  214   18   50  274    7    1  646    5    9   33   11 1091 1635    1   12
   23   24 1576    8  306   54    1    2   23  221  562   47  294    9    1
    2    0    0    0   

In [8]:
def input_fn(name, filenames, features, batch_size, num_epochs=None):
    """Create batched (features, label) tensors read from record files.

    Args:
        name: suffix appended to 'read_batch_record_features_' to name the
            reading op.
        filenames: passed to the reader as `file_pattern`.
        features: feature columns; must include a 'label' column.
        batch_size: number of examples per batch.
        num_epochs: forwarded to the reader unchanged (default None).

    Returns:
        A `(batch_features, batch_label)` tuple, where `batch_features` is
        the reader's dict with the 'label' entry removed.
    """
    parse_spec = tf.feature_column.make_parse_example_spec(features)
    op_name = 'read_batch_record_features_' + name

    batch = tf.contrib.learn.read_batch_record_features(
        file_pattern=filenames,
        batch_size=batch_size,
        features=parse_spec,
        num_epochs=num_epochs,
        name=op_name,
    )

    # Split the target off so the remaining dict holds only model inputs.
    labels = batch.pop('label')
    return batch, labels

def input_fn_train():
    """One pass over the training file in batches of 128."""
    return input_fn(
        name='train', filenames=[TRAIN_TFR], features=features,
        batch_size=128, num_epochs=1)


def input_fn_valid():
    """One pass over the validation file in batches of 16."""
    return input_fn(
        name='valid', filenames=[VALID_TFR], features=features,
        batch_size=16, num_epochs=1)


def input_fn_test():
    """One pass over the test file in batches of 16."""
    return input_fn(
        name='test', filenames=[TEST_TFR], features=features,
        batch_size=16, num_epochs=1)

def _count_examples(session, coord, batch, title):
    """Drain one input pipeline, printing a running count of examples.

    Args:
        session: session used to evaluate `batch`.
        coord: coordinator whose stop flag ends the loop early.
        batch: `(features, label)` tensors as returned by `input_fn`.
        title: section name printed in the '[ ... ]' header.

    Returns:
        The number of examples read.
    """
    print('[ {} ]\n'.format(title))

    batch_count = 0
    examples_count = 0

    try:
        while not coord.should_stop():
            data = session.run(batch)
            # data[0] is the features dict; its 'context' entry has one row
            # per example in the batch.
            examples_count += len(data[0]['context'])
            batch_count += 1
            if batch_count % 1000 == 0:
                print('... {:,d} examples'.format(examples_count))
    except tf.errors.OutOfRangeError:
        # The reader signals the end of its epoch limit with this error.
        print('... {:,d} examples'.format(examples_count))
        print('Epoch limit reached')

    return examples_count


# Count the examples in each dataset by running every input function for a
# single epoch inside one throwaway graph and session.
with tf.Graph().as_default(), tf.Session() as session:
    train_data = input_fn_train()
    valid_data = input_fn_valid()
    test_data = input_fn_test()

    # Local variables must be initialized before the queues start;
    # presumably these are the epoch counters created by num_epochs=1.
    tf.local_variables_initializer().run()

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    sections = (
        ('Training', train_data),
        ('Validation', valid_data),
        ('Test', test_data),
    )

    try:
        for index, (title, batch) in enumerate(sections):
            if index:
                # Blank line between consecutive sections.
                print()
            _count_examples(session, coord, batch, title)
    finally:
        # Always stop and join the queue-runner threads, even when a read
        # fails with something other than the expected end-of-epoch error.
        coord.request_stop()
        coord.join(threads)

[ Training ]

... 128,000 examples
... 256,000 examples
... 384,000 examples
... 512,000 examples
... 640,000 examples
... 768,000 examples
... 896,000 examples
... 938,593 examples
Epoch limit reached

[ Validation ]

... 16,000 examples
... 32,000 examples
... 48,000 examples
... 64,000 examples
... 80,000 examples
... 96,000 examples
... 112,000 examples
... 128,000 examples
... 144,000 examples
... 160,000 examples
... 176,000 examples
... 183,245 examples
Epoch limit reached

[ Test ]

... 16,000 examples
... 32,000 examples
... 48,000 examples
... 64,000 examples
... 80,000 examples
... 96,000 examples
... 112,000 examples
... 128,000 examples
... 144,000 examples
... 160,000 examples
... 176,000 examples
... 176,256 examples
Epoch limit reached
