**Dual LSTM Encoder for Dialog Response Generation**

Tutorial (blog post): http://www.wildml.com/2016/07/deep-learning-for-chatbots-2-retrieval-based-model-tensorflow/

Reference implementation: https://github.com/dennybritz/chatbot-retrieval

Dataset generation scripts: https://github.com/rkadlec/ubuntu-ranking-dataset-creator

Paper (The Ubuntu Dialogue Corpus): https://arxiv.org/abs/1506.08909

In [None]:
import tensorflow as tf
# Display the installed TensorFlow version; later cells use the `tf.contrib` API.
tf.VERSION

### 1. Download Dataset

In [None]:
# https://stackoverflow.com/a/39225039
import requests

def download_file_from_google_drive(share_id, filename):
    """Download a shared Google Drive file to a local path.

    Two requests are made. The first one is only used to read the
    ``download_warning*`` cookie; its value is sent back as the ``confirm``
    parameter of the second request, whose body is streamed to `filename`.

    Args:
        share_id: Google Drive id of the shared file.
        filename: Local path the downloaded bytes are written to.

    Raises:
        requests.HTTPError: If either request returns an HTTP error status.
        RuntimeError: If the first response carries no ``download_warning*``
            cookie.
    """
    GOOGLE_DRIVE_URL = "https://drive.google.com/uc?export=download"

    # The context manager releases pooled connections even on failure.
    with requests.Session() as session:
        response = session.get(GOOGLE_DRIVE_URL, params={'id': share_id}, stream=True)
        # Fail loudly instead of saving an HTTP error page as the dataset.
        response.raise_for_status()

        token = next(
            (value for key, value in response.cookies.items()
             if key.startswith('download_warning')),
            None)
        # Only the cookies of the first response are needed; drop its body.
        response.close()

        if not token:
            raise RuntimeError('Token not found')

        params = {'id': share_id, 'confirm': token}
        response = session.get(GOOGLE_DRIVE_URL, params=params, stream=True)
        response.raise_for_status()

        with open(filename, 'wb') as f:
            for chunk in response.iter_content(32768):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)

In [None]:
import os
import tarfile
import shutil

# Local layout: the archive and the extracted CSV files all live in DATA_DIR.
HOME_DIR = 'ubuntu'
DATA_DIR = os.path.join(HOME_DIR, 'data')
DATASET_FILENAME = 'udc.tar.gz'
DATASET_PACKAGE = os.path.join(DATA_DIR, DATASET_FILENAME)
SHARE_ID = '0B_bZck-ksdkpVEtVc1R6Y01HMWM'

TRAIN_CSV = os.path.join(DATA_DIR, 'train.csv')
VALID_CSV = os.path.join(DATA_DIR, 'valid.csv')
TEST_CSV = os.path.join(DATA_DIR, 'test.csv')

# exist_ok avoids the check-then-create race of a separate isdir() test.
os.makedirs(DATA_DIR, exist_ok=True)

train_missing = not os.path.isfile(TRAIN_CSV)
valid_missing = not os.path.isfile(VALID_CSV)
test_missing = not os.path.isfile(TEST_CSV)

missing = train_missing or valid_missing or test_missing

# The archive is only needed when at least one CSV still has to be extracted.
if missing and not os.path.isfile(DATASET_PACKAGE):
    print('Downloading {}...'.format(DATASET_FILENAME))
    # Download to a temporary name and rename on success, so an interrupted
    # download is never mistaken for a complete archive on the next run.
    partial_package = DATASET_PACKAGE + '.part'
    download_file_from_google_drive(SHARE_ID, partial_package)
    os.replace(partial_package, DATASET_PACKAGE)
    print('Done!')

def extract(tar, filename, dst_path):
    """Copy one member of an open tar archive into a directory.

    The member is written to `dst_path` under its base name, so the
    directory structure inside the archive is not reproduced.

    Args:
        tar: An open `tarfile.TarFile`.
        filename: Name of the member inside the archive.
        dst_path: Existing directory that receives the extracted file.

    Raises:
        KeyError: If `filename` is not a member of the archive.
        ValueError: If the member has no file content (e.g. a directory).
    """
    print('Extracting', filename)
    # Resolve the member BEFORE creating the destination file: otherwise a
    # missing member leaves an empty file behind that later passes the
    # "is the CSV already there?" check.
    fin = tar.extractfile(filename)
    if fin is None:
        raise ValueError('{} is not a regular file in the archive'.format(filename))
    dst_file = os.path.join(dst_path, os.path.basename(filename))
    with fin, open(dst_file, 'wb') as fout:
        shutil.copyfileobj(fin, fout)

if missing:
    # (needs extraction?, member name inside the archive)
    wanted_members = (
        (train_missing, './data/train.csv'),
        (valid_missing, './data/valid.csv'),
        (test_missing, './data/test.csv'),
    )
    with tarfile.open(DATASET_PACKAGE, mode='r:gz') as t:
        for is_missing, member in wanted_members:
            if is_missing:
                extract(t, member, DATA_DIR)

# Cell output: what is now present in the data directory.
os.listdir(DATA_DIR)

In [None]:
def show(file, lines=3):
    """Print the first `lines` lines of a text file, each followed by a blank line.

    A file with fewer than `lines` lines is printed in full instead of
    raising StopIteration.

    Args:
        file: Path of the text file to preview.
        lines: Maximum number of lines to print.
    """
    with open(file, 'r') as f:
        # range() comes first so zip() stops before reading an extra line.
        for _, line in zip(range(lines), f):
            print(line.strip())
            print()

# (heading, file) pairs, previewed in order with a '...' separator between them.
previews = (
    ('Train samples...\n', TRAIN_CSV),
    ('Validation samples...\n', VALID_CSV),
    ('Test samples...\n', TEST_CSV),
)
for position, (heading, csv_path) in enumerate(previews):
    if position > 0:
        print('...\n')
    print(heading)
    show(csv_path)

### 2. Create Vocabulary

In [None]:
# Print the built-in documentation of the class used below to build the vocabulary.
help(tf.contrib.learn.preprocessing.VocabularyProcessor)

**Exploration**

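`VocabularyProcessor` parameters to choose by exploring the training data:

* `max_document_length`: number of token ids per sentence (longer sentences are trimmed, shorter ones are padded)
* `min_frequency`: minimum number of occurrences a token needs to be kept in the vocabulary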

In [None]:
%matplotlib inline

import gc
import pandas as pd
import matplotlib.pyplot as plt

# Default size (in inches) of every figure drawn below.
plt.rcParams['figure.figsize'] = (16.0, 8.0)

In [None]:
%%time
# Load the training set and store the label column as a categorical.
train_df = pd.read_csv(TRAIN_CSV)
train_df['Label'] = train_df['Label'].astype('category')

In [None]:
%%time
# Summary of the label column (count, unique values, most frequent value).
label_stats = train_df['Label'].describe()
print(label_stats)

In [None]:
%%time
# Number of whitespace-separated tokens in every training context.
train_context_len = train_df['Context'].str.split().str.len()
context_stats = train_context_len.describe()
print(context_stats)

In [None]:
import math

# Upper outlier fence (Q3 + 1.5 * IQR) of the context lengths,
# rounded up to the next multiple of 10.
Q1, Q3 = context_stats['25%'], context_stats['75%']
upper_fence = Q3 + 1.5 * (Q3 - Q1)
max_len = math.ceil(upper_fence / 10) * 10

print(max_len)

In [None]:
# Histogram of context lengths; the red line marks the chosen cut-off.
ax = train_context_len.hist(bins=100)
ax.axvline(max_len, color='r')
ax.set_title('Training Context Length Statistics')

In [None]:
%%time
# Number of whitespace-separated tokens in every training utterance.
train_utterance_len = train_df['Utterance'].str.split().str.len()
utterance_stats = train_utterance_len.describe()
print(utterance_stats)

In [None]:
# Histogram of utterance lengths, with the same cut-off line as for contexts.
ax = train_utterance_len.hist(bins=100)
ax.axvline(max_len, color='r')
ax.set_title('Training Utterance Length Statistics')

In [None]:
# The length series are no longer needed; release them before the next step.
del train_context_len, train_utterance_len
gc.collect()

In [None]:
%%time

import collections

# Count how often each whitespace-separated token occurs in the training
# contexts and utterances.
tokens_freq = collections.Counter()
# Iterating the two text columns directly avoids DataFrame.iterrows(), which
# builds a Series per row, and does not depend on the number of columns.
for c, u in zip(train_df.Context, train_df.Utterance):
    tokens_freq.update(c.split())
    tokens_freq.update(u.split())

print(len(tokens_freq))
print()
for token, freq in tokens_freq.most_common(10):
    print('{:,d}\t{}'.format(freq, token))

In [None]:
# (token, frequency) pairs of the tokens seen at least 5 times.
tokens_5 = [entry for entry in tokens_freq.items() if entry[1] >= 5]
len(tokens_5)

In [None]:
# Distribution of the frequencies of the retained tokens.
tokens_5_df = pd.Series(data=[freq for _, freq in tokens_5])
tokens_5_stats = tokens_5_df.describe()
print(tokens_5_stats)

In [None]:
# Upper outlier fence (Q3 + 1.5 * IQR) of the token frequencies.
Q1, Q3 = tokens_5_stats['25%'], tokens_5_stats['75%']
high_freq = Q3 + 1.5 * (Q3 - Q1)

high_freq

In [None]:
# Histogram of token frequencies up to the outlier fence.
below_fence = tokens_5_df <= high_freq
ax = tokens_5_df[below_fence].hist(bins=50)
ax.set_title('Training Tokens Statistics')

In [None]:
# Exploration is done: release the data frame and the token counter.
del train_df, tokens_freq
gc.collect()

**Vocabulary**

In [None]:
import csv

def csv_iterator(filename, cols=None):
    """Iterate over the data rows of a CSV file, skipping its header line.

    Args:
        filename: Path of the CSV file.
        cols: Optional sequence of column indexes. When empty or omitted,
            each row is yielded as a list of strings. Otherwise the selected
            cells are yielded one at a time, row by row, in the order given.

    Yields:
        Whole rows (lists of strings) or individual cell values (strings).
    """
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        # A default keeps an empty file from raising StopIteration inside the
        # generator, which Python reports as a RuntimeError.
        next(reader, None)  # skip header
        for row in reader:
            if cols:
                for j in cols:
                    yield row[j]
            else:
                yield row

def train_iterator():
    """Return a fresh iterator over the cells of columns 0 and 1 of the training CSV."""
    text_columns = [0, 1]
    return csv_iterator(TRAIN_CSV, cols=text_columns)

# Preview the first two rows: the iterator alternates context and utterance.
train_iter = train_iterator()
for k in (1, 2):
    for heading in ('] Context\n', '] Utterance\n'):
        print('[', k, heading)
        print(next(train_iter), '\n')
del train_iter

In [None]:
%%time

def tokenizer(sentences):
    """Lazily split each sentence on whitespace, yielding one token list per sentence."""
    for sentence in sentences:
        yield sentence.split()

# Build the vocabulary from the training contexts and utterances.
vocab = tf.contrib.learn.preprocessing.VocabularyProcessor(
    max_document_length=160,
    min_frequency=5,
    tokenizer_fn=tokenizer,
)
vocab.fit(train_iterator())

vocabulary_size = len(vocab.vocabulary_)
document_length = vocab.max_document_length
print('Vocabulary size: {:,d}'.format(vocabulary_size))
print('Document Length: {:,d}'.format(document_length))

In [None]:
# Encode the first training sentence to check the vocabulary.
sentence0 = next(train_iterator())
vector0 = next(vocab.transform([sentence0]))

sentence0_tokens = len(sentence0.split())
print('Sentence (tokens={:,d}):\n'.format(sentence0_tokens))
print(sentence0, '\n')
print('Vector (length={:,d}):\n'.format(len(vector0)))
print(vector0)

In [None]:
# NOTE: pickle does not serialize the `tokenizer` function itself, so it must
# be defined again before the vocabulary object is restored.
VOCABULARY_FILE = os.path.join(DATA_DIR, 'vocabulary.bin')

# Drop the file written by a previous run before saving.
stale_vocabulary = os.path.isfile(VOCABULARY_FILE)
if stale_vocabulary:
    os.remove(VOCABULARY_FILE)

vocab.save(VOCABULARY_FILE)

# Cell output: confirm that the file was written.
os.path.isfile(VOCABULARY_FILE)

### 3. Export Data (TFRecord)

In [None]:
class VocabularyAdapter:
    """Single-sentence convenience wrapper around a vocabulary processor.

    The wrapped object works on iterables of sentences; this adapter sends
    one sentence at a time and returns the single result.
    """

    def __init__(self, vocab):
        # Wrapped vocabulary processor.
        self._vocab = vocab

    @property
    def size(self):
        """Number of entries in the wrapped vocabulary."""
        vocabulary = self._vocab.vocabulary_
        return len(vocabulary)

    @property
    def vector_length(self):
        """Maximum document length configured on the wrapped processor."""
        return self._vocab.max_document_length

    def transform(self, sentence):
        """Return the first item produced by the processor's `transform` for `sentence`."""
        encoded = self._vocab.transform([sentence])
        return next(encoded)

    def tokens(self, sentence):
        """Return the first item produced by the processor's tokenizer for `sentence`."""
        tokenized = self._vocab._tokenizer([sentence])
        return next(tokenized)
    
# Wrap the fitted vocabulary and try it on the first training sentence.
vocab_ = VocabularyAdapter(vocab)

tokens0 = vocab_.tokens(sentence0)
vector0 = vocab_.transform(sentence0)

vocabulary_size = vocab_.size
vector_length = vocab_.vector_length
print('Vocabulary size:\n\n{:,d}\n'.format(vocabulary_size))
print('Vector length:\n\n{:,d}\n'.format(vector_length))

print('Sentence:\n')
print(sentence0, '\n')

print('Tokens (length={:,d}):\n'.format(len(tokens0)))
print(tokens0, '\n')

print('Vector (length={:,d}):\n'.format(len(vector0)))
print(vector0)

In [None]:
def create_example(vocab_, **kwargs):
    """Build a `tf.train.Example` from keyword features.

    A string value is encoded with `vocab_.transform` and stored under its
    key as an int64 list. Its token count, capped at `vocab_.vector_length`,
    is stored under ``<key>_len``. An integer value is stored as a
    single-element int64 list.

    Args:
        vocab_: Object providing `transform`, `tokens` and `vector_length`.
        **kwargs: Feature name to value (`str` or `int`).

    Returns:
        The populated `tf.train.Example`.

    Raises:
        Exception: If a value is neither a `str` nor an `int`.
    """
    example = tf.train.Example()
    features = example.features.feature

    for name, item in kwargs.items():
        if isinstance(item, str):
            encoded = vocab_.transform(item)
            # Length before padding, never larger than the vector itself.
            used_length = min(vocab_.vector_length, len(vocab_.tokens(item)))
            features[name].int64_list.value.extend(encoded)
            features[name + '_len'].int64_list.value.append(used_length)
            continue

        if isinstance(item, int):
            features[name].int64_list.value.append(item)
            continue

        raise Exception('Unknown: {}:{}'.format(name, type(item)))

    return example

# Preview the Example built from the first training row.
with open(TRAIN_CSV, newline='') as f:
    rows = csv.reader(f)
    next(rows)  # skip header

    for context, utterance, label in rows:
        example = create_example(vocab_, context=context, utterance=utterance, label=int(label))

        # The text dump is long: show only its head and tail.
        example_str = str(example)
        print(example_str[:107], '  ...', example_str[-106:-1], sep='\n')

        break  # one row is enough for a preview

In [None]:
# Preview the Example built from the first validation row.
with open(VALID_CSV, newline='') as f:
    rows = csv.reader(f)
    next(rows)  # skip header

    for context, utterance, *distractors in rows:
        # Every column after the first two becomes a numbered distractor feature.
        d_ = {'distractor_{}'.format(i): text for i, text in enumerate(distractors)}
        example = create_example(vocab_, context=context, utterance=utterance, **d_)

        # The text dump is long: show only its head and tail.
        example_str = str(example)
        print(example_str[:107], '  ...', example_str[-106:-1], sep='\n')

        break  # one row is enough for a preview

In [None]:
# Delete the TFRecord files left in the data directory by a previous export.
stale_records = [
    os.path.join(DATA_DIR, name)
    for name in os.listdir(DATA_DIR)
    if name.endswith('.tfrecords')
]
for path in stale_records:
    print('Removing {}...'.format(path))
    os.remove(path)

In [None]:
def train_examples(vocab_, filename):
    """Yield one example per data row of a three-column training CSV.

    Each row is unpacked as (context, utterance, label) and the label is
    converted to `int` before being passed to `create_example`.
    """
    with open(filename, newline='') as csv_file:
        rows = csv.reader(csv_file)
        next(rows)  # skip header
        for context, utterance, label in rows:
            yield create_example(
                vocab_, context=context, utterance=utterance, label=int(label))

def eval_examples(vocab_, filename):
    """Yield one example per data row of an evaluation CSV.

    The first two columns are passed to `create_example` as `context` and
    `utterance`; every remaining column is passed as `distractor_<i>`,
    numbered from 0.
    """
    with open(filename, newline='') as csv_file:
        rows = csv.reader(csv_file)
        next(rows)  # skip header
        for context, utterance, *distractors in rows:
            named = {'distractor_{}'.format(i): text
                     for i, text in enumerate(distractors)}
            yield create_example(
                vocab_, context=context, utterance=utterance, **named)

def save_tfrecords(input_examples, filename):
    """Serialize every example to a TFRecord file and print how many were written.

    Args:
        input_examples: Iterable of objects exposing `SerializeToString()`.
        filename: Path of the TFRecord file to write.
    """
    print("Saving TFRecords at {}...".format(filename))
    written = 0
    with tf.python_io.TFRecordWriter(filename) as writer:
        # enumerate() keeps the running count; it stays 0 for an empty input.
        for written, record in enumerate(input_examples, start=1):
            writer.write(record.SerializeToString())
    print('Total records: {:,d}'.format(written))

TRAIN_TFR = os.path.join(DATA_DIR, 'train.tfrecords')
%time save_tfrecords(train_examples(vocab_, TRAIN_CSV), TRAIN_TFR)

VALID_TFR = os.path.join(DATA_DIR, 'valid.tfrecords')
%time save_tfrecords(eval_examples(vocab_, VALID_CSV), VALID_TFR)

TEST_TFR = os.path.join(DATA_DIR, 'test.tfrecords')
%time save_tfrecords(eval_examples(vocab_, TEST_CSV), TEST_TFR)