In [36]:
import tensorflow as tf
import pandas as pd
import numpy as np
import re
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from time import time
import gensim
from gensim.models.keyedvectors import KeyedVectors                         
# Raise TensorFlow's logging threshold to ERROR so lower-severity messages are not printed.
tf.logging.set_verbosity(tf.logging.ERROR)

### Create vocabulary: make a list of all words seen in the training set. Lowercase all words, include punctuation marks as well. Save the vocabulary to a file.

In [50]:
def tokenize_string(string):
    """Normalise a raw sentence into tokens separated by single spaces.

    Characters outside the allowed set are replaced by spaces, typographic
    apostrophes and backticks become plain apostrophes, English clitics
    ('s, 've, n't, 're, 'd, 'll) are split from their host word, and the
    punctuation marks , : ! ( ) ? become stand-alone tokens.  Letter case is
    left untouched.

    Args:
        string: (string) raw sentence
    Returns:
        (string) the cleaned sentence without leading or trailing whitespace
    """
    # Ordered (pattern, replacement) pairs; every rule works on the output of
    # the previous one, so the order matters.
    rules = (
        (r"[^A-Za-z0-9(),!?\'\`’:]", " "),  # drop everything outside the allowed set
        (r"’", "'"),                        # unify apostrophe variants
        (r"`", "'"),
        (r"\'s", " \'s"),                   # detach clitics from the preceding word
        (r"\'ve", " \'ve"),
        (r"n\'t", " n\'t"),
        (r"\'re", " \'re"),
        (r"\'d", " \'d"),
        (r"\'ll", " \'ll"),
        (r",", " , "),                      # isolate punctuation marks
        (r":", " : "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),                   # collapse runs of whitespace
    )
    text = string
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def update_vocabulary(tokens, counter):
    """Add the occurrences of ``tokens`` to ``counter`` in place.

    Args:
        tokens: (iterable) tokens to count
        counter: object exposing ``update`` (create_vocab passes a
            collections.Counter); it is modified in place
    Returns:
        None
    """
    counter.update(tokens)   

def create_vocab(pd_series, min_count_word=3):
    """Build the vocabulary from a series of sentences.

    Each sentence is tokenised with tokenize_string, case-folded and split on
    whitespace.  Tokens seen fewer than ``min_count_word`` times are dropped
    and the padding token '<PAD>' is appended as the last entry.

    Args:
        pd_series: (pd.Series) one sentence per element
        min_count_word: (int) minimum number of occurrences for a token to be kept
    Returns:
        (list) kept tokens in order of first appearance, followed by '<PAD>'
    """
    token_counts = Counter()
    for sentence in pd_series:
        update_vocabulary(tokenize_string(sentence).casefold().split(), token_counts)

    kept_tokens = [token for token, freq in token_counts.items()
                   if freq >= min_count_word]
    kept_tokens.append('<PAD>')
    return kept_tokens

def save_vocab_to_txt_file(vocab, txt_path):
    """Writes one token per line, 0-based line id corresponds to the id of the token.

    The file is always written as UTF-8 with "\n" line endings and without a
    trailing newline, so its bytes do not depend on the platform's default
    encoding or newline translation.

    Args:
        vocab: (iterable object) yields token (string)
        txt_path: (string) path to vocab file; an existing file is overwritten
    """
    # newline="\n" disables translation of "\n" into the OS line separator.
    with open(txt_path, "w", encoding="utf-8", newline="\n") as f:
        f.write("\n".join(vocab))

### Load and preprocess data: tokenise the sentences and remove examples with empty questions. Split into training and test sets.

In [51]:
# Load the labelled question pairs. A missing question becomes a single space
# so that the tokenizer always receives a string.
data = pd.read_csv('data/train.csv',
                   names=["id","qid1","qid2","question1","question2","is_duplicate"],
                   skiprows=1).fillna({'question1': ' ', 'question2': ' '})
data['question1'] = data['question1'].apply(lambda string: tokenize_string(string).casefold())
data['question2'] = data['question2'].apply(lambda string: tokenize_string(string).casefold())
# Discard pairs in which either question has no tokens left after tokenisation.
data = data.drop(index=data[(data['question2'].apply(lambda s: len(s.split()))==0) |
                            (data['question1'].apply(lambda s: len(s.split()))==0)].index)
# Seeded permutation: with an unseeded shuffle the rows reach train_test_split
# in a different order on every run, so its fixed random_state would still
# produce a different train/test split each time.
data = data.reindex(index=np.random.RandomState(17).permutation(data.index))
data['is_duplicate'] = data['is_duplicate'].astype(np.float32)
# From here on `data` is the training part and `data_te` the held-out 10 %.
data, data_te = train_test_split(data, test_size=.1, stratify=data['is_duplicate'], random_state=17)

# Vocabulary is built from both question columns of the training split only.
vocabulary = create_vocab(pd.concat([data['question1'], data['question2']]))
# The line number of a token in this file is its id.
save_vocab_to_txt_file(vocabulary, 'data/words.txt')
# One id more than there are tokens in the vocabulary list.
vocabulary_size = 1 + len(vocabulary)

# Load and preprocess submission data
data_sub = pd.read_csv('data/test.csv').fillna({'question1': ' ', 'question2': ' '})
data_sub = (
    data_sub
    # Rows whose test_id is not numeric are discarded, ids are made integer
    # and only the first row per id is kept.
    .assign(test_id=pd.to_numeric(data_sub['test_id'], errors='coerce'))
    .dropna(subset=['test_id'])
    .astype({'test_id': int})
    .drop_duplicates(subset=['test_id'])
    # Same tokenisation and case-folding as the training questions: the
    # vocabulary file holds tokenised, case-folded tokens, so raw text would
    # mostly fall into the out-of-vocabulary bucket at prediction time.
    .assign(
        question1=lambda frame: frame['question1'].apply(
            lambda string: tokenize_string(str(string)).casefold()),
        question2=lambda frame: frame['question2'].apply(
            lambda string: tokenize_string(str(string)).casefold()),
    )
)

### Use GoogleNews-vectors-negative300.bin pre-trained word2vec model
https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit

In [None]:
# Use only word2vec embeddings for those words which are present in the training set 
model = gensim.models.KeyedVectors.load_word2vec_format('./w2v/GoogleNews-vectors-negative300.bin', binary=True) 
emb_size = 22 
# Rows of tokens without a word2vec vector (and the spare rows past the
# vocabulary) keep this random initialisation; the generator is seeded so the
# matrix is the same on every run.
embeddings = np.random.RandomState(17).uniform(-.3, .3, (vocabulary_size+1, 300)).astype(np.float32)

# Row i belongs to the token written on line i of the vocabulary file.
for i, item in enumerate(vocabulary):
    if item in model:
        embeddings[i] = model[item]
# Only about 35 000 tokens from word2vec are used, 300 dimensions may be superfluous therefore
# random_state fixes the result when PCA selects its randomized SVD solver.
pca = PCA(n_components=emb_size, random_state=17)
# Explicit float32 so the embedding variable built from this array has the
# same dtype as the float32 labels used in the loss.
embeddings = pca.fit_transform(embeddings).astype(np.float32)

### Data pipeline. The data is consumed from a pandas DataFrame. The sentences are converted into sequences of word indices, which are later used to look up word embeddings.

In [4]:
def build_vocab(file_name):
    """Create the token -> id lookup table from a vocabulary file.

    Args:
        file_name: (string) path to a file with one token per line
    Returns:
        the table created by tf.contrib.lookup.index_table_from_file with one
        out-of-vocabulary bucket
    """
    table = tf.contrib.lookup.index_table_from_file(
        file_name,
        num_oov_buckets=1,
        delimiter='\n',
        name='vocab',
    )
    # Register the table initializer in the TABLE_INITIALIZERS collection.
    tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, table.init)
    return table

def preprocess_sentence(sentence, vocab):
    """Turn sentences into a dataset of token-id sequences.

    Args:
        sentence: array-like of strings, one sentence per element
        vocab: lookup table exposing ``lookup`` (see build_vocab)
    Returns:
        dataset: (tf.Dataset) yielding the ids of the tokens of one sentence
    """
    def split_tokens(string):
        # tf.string_split with its default delimiter; keep only the token values.
        return tf.string_split([string]).values

    def tokens_to_ids(tokens):
        return vocab.lookup(tokens)

    dataset = tf.data.Dataset.from_tensor_slices(sentence)
    dataset = dataset.map(split_tokens)
    return dataset.map(tokens_to_ids)

def load_dataset_from_pd(pd_dframe, vocab):
    """Create a tf.data instance of question pairs from a DataFrame.

    Args:
        pd_dframe: (pd.DataFrame) with string columns 'question1' and 'question2'
        vocab: (tf.lookuptable)
    Returns:
        dataset: (tf.Dataset) yielding a pair (ids of question1, ids of question2)
            for each row
    """
    question_datasets = tuple(
        preprocess_sentence(pd_dframe[column].values, vocab)
        for column in ('question1', 'question2')
    )
    return tf.data.Dataset.zip(question_datasets)

def input_fn(mode,
             features,
             labels,
             params):
    """Build the batched, padded input dataset for the estimator.

    Args:
        mode: (string) estimator mode; the configured shuffle buffer is only
            used when it equals 'train'
        features: (pd.DataFrame) with columns 'question1' and 'question2'
        labels: (pd.Series) one value per row of ``features``
        params: (dict) with keys 'buffer_size', 'vocabulary_path', 'shuffle',
            'num_epochs' and 'batch_size'
    Returns:
        dataset: (tf.Dataset) yielding ((ids1, ids2), labels) batches where the
            sentences are padded on the right with the id of '<PAD>'
    """
    # Outside training the buffer has size 1, i.e. the order is kept.
    if mode == 'train':
        buffer_size = params['buffer_size']
    else:
        buffer_size = 1

    vocab = build_vocab(params['vocabulary_path'])
    pad_id = vocab.lookup(tf.constant('<PAD>'))

    sentence_pairs = load_dataset_from_pd(features, vocab)
    # Labels get a trailing axis of length 1: one column per example.
    label_column = tf.data.Dataset.from_tensor_slices(labels.values[:, np.newaxis])
    dataset = tf.data.Dataset.zip((sentence_pairs, label_column))

    if params['shuffle']:
        dataset = dataset.shuffle(buffer_size=buffer_size)

    dataset = dataset.repeat(params['num_epochs'])
    return dataset.padded_batch(
        params['batch_size'],
        # sentences of unknown length
        padded_shapes=(
            (tf.TensorShape([None]), tf.TensorShape([None])),
            tf.TensorShape([None]),
        ),
        # sentences are padded with the '<PAD>' id, labels with 0
        padding_values=(
            (pad_id, pad_id),
            tf.constant(0, dtype=tf.float32),
        ),
    )

### Build model: word index sequence -> embeddings sequence -> LSTM -> CNN layers -> rectification (ReLU) -> max pooling -> dense layer -> sigmoid

In [43]:
def convolution_layer(net, n_kernels, k_size, stride):
    """1-D convolution, ReLU and max pooling over the second-to-last axis.

    Args:
        net: input tensor; assumed to be (batch, time, channels) -- TODO confirm
        n_kernels: (int) number of convolution filters
        k_size: (int) width of the convolution kernel
        stride: (int) stride of the convolution
    Returns:
        tensor reshaped to [-1, n_kernels]
    """
    convolved = tf.layers.conv1d(net, n_kernels, [k_size], stride)
    activated = tf.nn.relu(convolved)
    pooled = tf.reduce_max(activated, axis=-2)
    return tf.reshape(pooled, [-1, n_kernels])

def sentence_representation(sentences, params):
    """Encode a batch of token-id sequences into fixed-size vectors.

    Pipeline: embedding lookup -> LSTM -> one convolution_layer per kernel
    size -> concatenation of the pooled outputs.

    Args:
        sentences: tensor of token ids
        params: (dict) with keys 'embeddings' (initial value of the embedding
            variable), 'n_units_lstm', 'n_kernels' and 'filter_sizes'
    Returns:
        tensor with the per-kernel-size outputs concatenated along axis 1
    """
    embedding_matrix = tf.get_variable(
        'embeddings',
        initializer=tf.constant(params['embeddings']),
    )
    embedded = tf.nn.embedding_lookup(embedding_matrix, sentences)
    # (disabled) embedded = tf.layers.dropout(embedded, .2)

    cell = tf.contrib.rnn.LSTMCell(params['n_units_lstm'], activation=tf.nn.relu)
    lstm_outputs, _ = tf.nn.dynamic_rnn(cell, embedded, dtype=embedded.dtype)

    # One pooled feature vector per kernel size, stacked side by side.
    pooled_features = []
    for size in params['filter_sizes']:
        pooled_features.append(
            convolution_layer(lstm_outputs, params['n_kernels'], size, stride=1))
    return tf.concat(pooled_features, axis=1)

def my_model(features, labels, mode, params):
    """Estimator model_fn: two weight-sharing sentence encoders and a sigmoid output.

    Args:
        features: pair (question1 ids, question2 ids) as produced by input_fn
        labels: tensor of 0/1 targets with the same shape as the logits
        mode: one of tf.estimator.ModeKeys
        params: (dict) forwarded to sentence_representation; optional keys
            'learning_rate' (default 0.002) and 'clip_norm' (default 5.0)
    Returns:
        tf.estimator.EstimatorSpec for the requested mode
    """
    # create two branches with same variables
    # create_scope_now_ is passed by keyword: the original passed `params`
    # positionally, which landed in this very argument (as a truthy dict).
    sntc_repr = tf.make_template('sntc_repr', sentence_representation,
                                 create_scope_now_=True)
    flat1 = sntc_repr(features[0], params)
    flat2 = sntc_repr(features[1], params)

    net = tf.concat([flat1, flat2], 1)
#     net = tf.layers.dense(net, params['dense_units'], activation=tf.nn.relu)
#     net = tf.layers.dropout(net, 0.2)
    logits = tf.layers.dense(net, 1)
    probabilities = tf.nn.sigmoid(logits)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'probabilities': probabilities,
            'logits': logits,
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels))

    # Real streaming AUC instead of the constant mock-up: eval_metric_ops
    # needs (value, update_op) pairs, a bare float is rejected in EVAL mode.
    auc = tf.metrics.auc(labels=labels, predictions=probabilities, name='auc_op')
    metrics = {'auc': auc}
    tf.summary.scalar('auc', auc[1])

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, eval_metric_ops=metrics)

    # Create training op
    assert mode == tf.estimator.ModeKeys.TRAIN

    optimizer = tf.train.AdamOptimizer(learning_rate=params.get('learning_rate', 0.002))
    optimizer = tf.contrib.estimator.clip_gradients_by_norm(optimizer, params.get('clip_norm', 5.0))
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

### Build estimator

In [47]:
# Input pipeline settings used for training.
data_params = dict(
    num_epochs=30,
    shuffle=True,
    buffer_size=300000,
    batch_size=1000,
    vocabulary_path='data/words.txt',
)

# Hyperparameters handed to the model function through the estimator.
model_params = dict(
    embedding_size=emb_size,
    embeddings=embeddings,
    n_units_lstm=16,
    n_kernels=32,
    vocabulary_size=vocabulary_size,
    filter_sizes=[2, 3, 4, 5],
    # dense_units=80,
)

towers = tf.estimator.Estimator(model_fn=my_model, params=model_params)

### Train estimator

In [48]:
s_time = time()
# NOTE(review): steps=1 stops training after a single batch although
# data_params asks for 30 epochs -- confirm whether this is a leftover from a
# smoke test.
_ = towers.train(
    input_fn=lambda: input_fn(mode=tf.estimator.ModeKeys.TRAIN,
                              features=data,
                              labels=data['is_duplicate'],
                              params=data_params),
    steps=1,
)
print('Training time: ', time()-s_time)

Training time:  70.96544480323792


### Make predictions

In [32]:
# Input pipeline settings for prediction: a single pass in the original order.
data_params_pred = dict(
    num_epochs=1,
    shuffle=False,
    buffer_size=1,
    batch_size=500,
    vocabulary_path='data/words.txt',
)

In [34]:
s_time = time()
# The labels are not used for prediction; input_fn only needs a column of the
# right length, so zeros are passed.
prd = towers.predict(
    input_fn=lambda: input_fn(mode=tf.estimator.ModeKeys.PREDICT,
                              features=data_te,
                              labels=pd.Series(np.zeros(len(data_te), dtype=np.float32)),
                              params=data_params_pred))

# predict() yields one dict per example; keep the single probability of each.
prd = np.array([example['probabilities'][0] for example in prd])
np.savetxt('prd.csv', prd)
print('Prediction completed') 
print('Test prediction time: ', time()-s_time)

Prediction completed


In [None]:
s_time = time()
# The labels are not used for prediction; input_fn only needs a column of the
# right length, so zeros are passed.
sub = towers.predict(
    input_fn=lambda: input_fn(mode=tf.estimator.ModeKeys.PREDICT,
                              features=data_sub,
                              labels=pd.Series(np.zeros(len(data_sub), dtype=np.float32)),
                              params=data_params_pred))

# predict() yields one dict per example; keep the single probability of each.
sub = np.array([example['probabilities'][0] for example in sub])
np.savetxt('sub.csv', sub)
print('Submission completed') 
print('Submission prediction time: ', time()-s_time)

## Выводы
- Грубым подбором получены следующие гиперпараметры:
    - Размер словаря ~38 000 - слова, встречающиеся в обучающей выборке не менее 3 раз
    - Размерность векторов слов: 20
    - Размер слоя LSTM: 16
    - Размеры фильтров сверточного слоя: 2, 3, 4, 5
    - Количество фильтров в сверточном слое: 32 (для каждого размера)
    - Learning rate: 0.002
    - Batch size: 1000
- Loss по выборкам:

| - | train | test | private |
| --- | --- | --- | --- |
| log loss | ~$10^{-2}$ | 1.8 | 2.0 |
     
     Пути снижения степени оверфиттинга:
     - точный подбор гиперпараметров
     - увеличение датасета 
     - настройка регуляризации

- Один из недостатков модели - векторизация текста по словам:
    - требуется более тщательная предобработка текста (выделение корней, лемматизация)
    - слова с опечатками приходится выбрасывать из vocabulary <br>
    Для решения этих проблем можно реализовать векторизацию n соседних символов (например, триграмм).
    
- При использовании для слов из обучающей выборки предобученных word2vec с сокращением размерности векторов до 22 качество повысить не удалось. Полная матрица word2vec в рамках этой работы для ускорения расчетов не использовалась. 
