In [1]:
import tensorflow as tf
from tensorflow.contrib import rnn

import pandas as pd
import numpy as np
import spacy

nlp = spacy.load('en')

dtypes ={
    'id': np.uint32,
    'qid1': np.uint32,
    'qid2': np.uint32,
    'question1': np.str,
    'question2': np.str,
    'is_duplicate': np.uint8
}

df_chunks = pd.read_csv('../../dataset/quora_train.csv.zip', dtype=dtypes, compression='zip',
                 usecols=['question1', 'question2', 'is_duplicate'], iterator=True, chunksize=100)
df_test = pd.read_csv('../../dataset/quora_train.csv.zip', dtype=dtypes, compression='zip',
                 usecols=['question1', 'question2', 'is_duplicate'], nrows=1000)

In [2]:
def parse_df(df):
    def pad(v):
        zeros = np.zeros([100, 300])
        zeros[:v.shape[0],:v.shape[1]] = v
        return zeros

    def sorted_vec(doc):
        t_list = [t for t in doc if not t.is_stop]
        t_list = [t.vector for t in t_list]

        return np.matrix(t_list)

    q1s = df['question1'].map(nlp).map(sorted_vec).map(pad)
    q2s = df['question1'].map(nlp).map(sorted_vec).map(pad)

    q1m = np.concatenate(q1s.values).flatten().reshape([-1, 100, 300])
    q2m = np.concatenate(q2s.values).flatten().reshape([-1, 100, 300])

    labels = df['is_duplicate'].values

    return q1m, q2m, labels

In [3]:
tf.reset_default_graph()
keep_prob = tf.placeholder(tf.float32)

def lstm_cell():
    cell = tf.contrib.rnn.BasicLSTMCell(128)
    return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)

def lstm_cell2():
    cell = tf.contrib.rnn.BasicLSTMCell(128, reuse=True)
    return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)

def length(sequence):
    used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2))
    length = tf.reduce_sum(used, reduction_indices=1)
    length = tf.cast(length, tf.int32)
    return length

Q1 = tf.placeholder(tf.float32, shape=[None, 100, 300])
Q2 = tf.placeholder(tf.float32, shape=[None, 100, 300])
batch_size = tf.placeholder(tf.int32)
is_dup = tf.placeholder(tf.uint8, shape=[None])
dup_oh = tf.one_hot(is_dup, 2)

cell1s = tf.contrib.rnn.MultiRNNCell([lstm_cell() for i in range(5)])
cell2s = tf.contrib.rnn.MultiRNNCell([lstm_cell2() for i in range(5)])

state = cell1s.zero_state(batch_size, dtype=tf.float32)

outputs1, _1= tf.nn.dynamic_rnn(cell1s, Q1, sequence_length=length(Q1), initial_state=state)
outputs2, _2= tf.nn.dynamic_rnn(cell2s, Q2, sequence_length=length(Q2), initial_state=state)

outputs1 = tf.reshape(outputs1, [-1, 128])
outputs2 = tf.reshape(outputs2, [-1, 128])

sW = tf.get_variable(shape=[128, 128], initializer=tf.contrib.layers.xavier_initializer(), name='s-weight')
sb = tf.get_variable(shape=[128], initializer=tf.contrib.layers.xavier_initializer(), name='s-bias')

outputs1 = tf.nn.relu(tf.matmul(outputs1, sW) + sb)
outputs1 = tf.reshape(outputs1, [-1, 100, 128])
outputs1 = tf.nn.dropout(outputs1, keep_prob=keep_prob)

outputs2 = tf.nn.relu(tf.matmul(outputs2, sW) + sb)
outputs2 = tf.reshape(outputs2, [-1, 100, 128])
outputs2 = tf.nn.dropout(outputs2, keep_prob=keep_prob)

output11 = tf.reduce_mean(outputs1[:,0:25,:], 1)
output12 = tf.reduce_mean(outputs1[:,26:50,:], 1)
output13 = tf.reduce_mean(outputs1[:,51:75,:], 1)
output14 = tf.reduce_mean(outputs1[:,76:100,:], 1)
outputs1 = tf.concat([output11, output12, output13, output14], axis=1)

output21 = tf.reduce_mean(outputs2[:,0:25,:], 1)
output22 = tf.reduce_mean(outputs2[:,26:50,:], 1)
output23 = tf.reduce_mean(outputs2[:,51:75,:], 1)
output24 = tf.reduce_mean(outputs2[:,76:100,:], 1)
outputs2 = tf.concat([output21, output22, output23, output24], axis=1)

out = tf.concat([outputs1, outputs2], axis=1)

In [4]:
out = tf.nn.dropout(out, keep_prob=keep_prob)

W1 = tf.get_variable(shape=[1024, 256], initializer=tf.contrib.layers.xavier_initializer(), name='weight1')
b1 = tf.get_variable(shape=[256], initializer=tf.contrib.layers.xavier_initializer(), name='bias1')
y1 = tf.nn.relu(tf.matmul(out, W1) + b1)
y1 = tf.nn.dropout(y1, keep_prob=keep_prob)

W2 = tf.get_variable(shape=[256, 2], initializer=tf.contrib.layers.xavier_initializer(), name='weight2')
b2 = tf.get_variable(shape=[2], initializer=tf.contrib.layers.xavier_initializer(), name='bias2')
y2 = tf.matmul(y1, W2) + b2

In [5]:
cost = tf.nn.softmax_cross_entropy_with_logits(logits=y2, labels=dup_oh)
cost = tf.reduce_mean(cost)

optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)

In [None]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [None]:
from tqdm import tqdm

test_q1m, test_q2m, test_labels = parse_df(df_test)
test_dict = {Q1: test_q1m, Q2: test_q2m, is_dup: test_labels, keep_prob: 1.0, batch_size: test_q1m.shape[0]}

count = 0

while True:
    for df in df_chunks:
        train_q1m, train_q2m, train_labels = parse_df(df)
        train_dict = {Q1: train_q1m, Q2: train_q2m, is_dup: train_labels, keep_prob: 0.5, batch_size: train_q1m.shape[0]}

        sess.run(optimizer, feed_dict=train_dict)

        if count % 500 == 0:
            print(sess.run(cost, feed_dict=test_dict))
        count += 1

0.686211
0.610931
0.598988
0.587054
0.581664
0.57768
0.57812
0.572426
0.562285


In [None]:
variables_names =[v.name for v in tf.trainable_variables()]
values = sess.run(variables_names)
for k,v in zip(variables_names, values):
    print(k, v)

In [14]:
output1

<tf.Tensor 'concat:0' shape=(?, 256) dtype=float32>

In [12]:
output2

<tf.Tensor 'Mean_1:0' shape=(?, 64) dtype=float32>