In [1]:
import tensorflow as tf
from tensorflow.contrib import rnn

import pandas as pd
import numpy as np
import spacy

nlp = spacy.load('en')

dtypes ={
    'id': np.uint32,
    'qid1': np.uint32,
    'qid2': np.uint32,
    'question1': np.str,
    'question2': np.str,
    'is_duplicate': np.uint8
}

df_chunks = pd.read_csv('../../dataset/quora_train.csv.zip', dtype=dtypes, compression='zip',
                 usecols=['question1', 'question2', 'is_duplicate'], iterator=True, chunksize=100)
df_test = pd.read_csv('../../dataset/quora_train.csv.zip', dtype=dtypes, compression='zip',
                 usecols=['question1', 'question2', 'is_duplicate'], nrows=1000)

In [2]:
def parse_df(df):
    def pad(v):
        zeros = np.zeros([200, 300])
        zeros[:v.shape[0],:v.shape[1]] = v
        return zeros

    def sorted_vec(doc):
        t_list = [t for t in doc if not t.is_stop]
        t_list = [t.vector for t in t_list]
        return np.matrix(t_list)

    q1s = df['question1'].map(str).map(nlp).map(sorted_vec)
    q1l = q1s.map(lambda x: x.shape[0])
    q1s = q1s.map(pad)
    
    q2s = df['question2'].map(str).map(nlp).map(sorted_vec)
    q2l = q2s.map(lambda x: x.shape[0])
    q2s = q2s.map(pad)
    
    q1m = np.stack(q1s.values)
    q2m = np.stack(q2s.values)
    labels = df['is_duplicate'].values

    return q1m, q2m, q1l, q2l, labels

In [8]:
tf.reset_default_graph()
keep_prob = tf.placeholder(tf.float32)

def extract_axis_1(data, ind):
    batch_range = tf.range(tf.shape(data)[0])
    indices = tf.stack([batch_range, ind], axis=1)
    res = tf.gather_nd(data, indices)
    return res

def lstm_cell():
    cell = tf.contrib.rnn.BasicLSTMCell(256)
    #return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
    return cell

def lstm_cell2():
    cell = tf.contrib.rnn.BasicLSTMCell(256, reuse=True)
    #return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
    return cell

def length(sequence):
    used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2))
    length = tf.reduce_sum(used, reduction_indices=1)
    length = tf.cast(length, tf.int32)
    return length

Q1 = tf.placeholder(tf.float32, shape=[None, 200, 300])
Q2 = tf.placeholder(tf.float32, shape=[None, 200, 300])

Q1l = tf.placeholder(tf.int32, shape=[None])
Q2l = tf.placeholder(tf.int32, shape=[None])

batch_size = tf.placeholder(tf.int32)
is_dup = tf.placeholder(tf.uint8, shape=[None])
dup_oh = tf.one_hot(is_dup, 2)

cell1s = tf.contrib.rnn.MultiRNNCell([lstm_cell() for i in range(5)])
cell2s = tf.contrib.rnn.MultiRNNCell([lstm_cell2() for i in range(5)])

state = cell1s.zero_state(batch_size, dtype=tf.float32)

outputs1, l_state1= tf.nn.dynamic_rnn(cell1s, Q1, sequence_length=Q1l, initial_state=state)
outputs2, l_state2= tf.nn.dynamic_rnn(cell2s, Q2, sequence_length=Q2l, initial_state=state)

sW = tf.get_variable(shape=[256, 256], initializer=tf.contrib.layers.xavier_initializer(), name='s-weight')
sb = tf.get_variable(shape=[256], initializer=tf.contrib.layers.xavier_initializer(), name='s-bias')

o1 = tf.matmul(l_state1[-1].h, sW) + sb
#o1 = tf.nn.dropout(o1, keep_prob=keep_prob)

o2 =tf.matmul(l_state2[-1].h, sW) + sb
#o2 = tf.nn.dropout(o2, keep_prob=keep_prob)

out = tf.concat([o1, o2], axis=1)

In [9]:
#out = tf.nn.dropout(out, keep_prob=keep_prob)

W1 = tf.get_variable(shape=[512, 512], initializer=tf.contrib.layers.xavier_initializer(), name='weight1')
b1 = tf.get_variable(shape=[512], initializer=tf.contrib.layers.xavier_initializer(), name='bias1')
y1 = tf.nn.relu(tf.matmul(out, W1) + b1)
y1 = tf.nn.dropout(y1, keep_prob=keep_prob)

W2 = tf.get_variable(shape=[512, 2], initializer=tf.contrib.layers.xavier_initializer(), name='weight2')
b2 = tf.get_variable(shape=[2], initializer=tf.contrib.layers.xavier_initializer(), name='bias2')
y2 = tf.matmul(y1, W2) + b2

In [10]:
cost = tf.nn.softmax_cross_entropy_with_logits(logits=y2, labels=dup_oh)
cost = tf.reduce_mean(cost)

optimizer = tf.train.AdamOptimizer(learning_rate=0.01).minimize(cost)

In [11]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [None]:
from tqdm import tqdm

test_q1m, test_q2m, test_q1l, test_q2l, test_labels = parse_df(df_test)
test_dict = {Q1: test_q1m, Q2: test_q2m, Q1l: test_q1l, Q2l: test_q2l, is_dup: test_labels, keep_prob: 1.0, batch_size: test_q1m.shape[0]}

count = 0

while True:
    df_chunks = pd.read_csv('../../dataset/quora_train.csv.zip', dtype=dtypes, compression='zip',
                 usecols=['question1', 'question2', 'is_duplicate'], iterator=True, chunksize=100)
    for df in tqdm(df_chunks):
        train_q1m, train_q2m, train_q1l, train_q2l, train_labels = parse_df(df)
        train_dict1 = {Q1: train_q1m, Q2: train_q2m, Q1l: train_q1l, Q2l: train_q2l, is_dup: train_labels, keep_prob: 0.7, batch_size: train_q1m.shape[0]}
        train_dict2 = {Q1: train_q2m, Q2: train_q1m, Q1l: train_q2l, Q2l: train_q1l, is_dup: train_labels, keep_prob: 0.7, batch_size: train_q1m.shape[0]}

        sess.run(optimizer, feed_dict=train_dict1)
        sess.run(optimizer, feed_dict=train_dict2)

        if count % 100 == 0:
            print(sess.run(cost, feed_dict=train_dict1), sess.run(cost, feed_dict=test_dict))

        count += 1

1it [00:04,  4.68s/it]

0.815576 0.806573


101it [03:56,  2.94s/it]

0.618459 0.625201


190it [07:48,  2.23s/it]

In [None]:
variables_names =[v.name for v in tf.trainable_variables()]
values = sess.run(variables_names)
for k,v in zip(variables_names, values):
    print(k, v)

In [25]:
test_q1m, test_q2m, test_labels = parse_df(df_test)
test_dict = {Q1: test_q1m, Q2: test_q2m, is_dup: test_labels, keep_prob: 1.0, batch_size: test_q1m.shape[0]}

print(sess.run(length(Q1), feed_dict=test_dict))
sess.run(Q1, feed_dict=test_dict)[3][1]

[ 7  3  5  2  4  8  1  1  2  2  4  5  3  2 11  4  3  7  7  3  3  3  3  2  7
  6  3  5  4  2  4  5  5  9  3  6  6  9  2  4  2  3  4  6  5  4  5  7  6  3
  4  3  1  4  3  2  2  4  9  3  5  4  3  2  4  5  3  4  4  8  2  3  2 10  4
  8  4  5  3  3  7  4  9  2  5  2  5  3  3  5  5  4  3  2  2  3  2  2 12  3
  5  3  6  3  4  1  4  4  4  6  1  3  4  4  4  4  5  5  9  2  4  5  2  3  4
  6  3  4  7  2  1  3  5  5  4  2  4  3  3  3  2  2  1  4  8  5  2  6  3  5
  2  3  4  2  1  1 10  3  5  2  3 10  4  5  3  2  3  5  3  7  3  2  4  5  4
  3  2  3  4  3  3  3  4  2  4  8  5  8  4  4  3  2  3  5  5  2  4  5  1  5
  3  6  4  3  9 10  3  5  6  2  6 14  6  3  6  3  4  4  5  3  2  2  2  5  3
  5  9  3  3  6  4  4  7  5  7  3  3  5  5  4  2 11  5  4  6  4  1  2  9  5
  2  3  5  3  3  1  4  5  5  5  7  5  3  4  2  3  8  9  4  2  2  4 12  5  4
  5  5  3  4  2  2  5  6  6  3  4  2  3  4  5  4  5  1  2  2  6  2  4  6  2
  2  2  3  2  2  8  7  4  3  2  3  2  7  3  3  3  9  5  1  3 15  4  3  3  4
  4  5  2  2

array([ -3.51859987e-01,   2.73759991e-01,  -3.39630008e-01,
        -3.88150007e-01,  -4.25330013e-01,  -4.08050001e-01,
         1.24700002e-01,   7.92670026e-02,   5.66660017e-02,
         1.89230001e+00,  -4.44370002e-01,  -5.90170026e-01,
        -2.01329999e-02,   1.26120001e-01,  -2.59180009e-01,
        -3.94659996e-01,  -3.68379988e-02,   1.20550001e+00,
        -1.89879999e-01,   2.62180001e-01,   4.81330007e-01,
         1.39620006e-01,   3.37630004e-01,  -4.72519994e-02,
         1.21480003e-01,  -7.33990014e-01,   6.50849998e-01,
        -2.09180005e-02,  -1.98510006e-01,  -1.54819995e-01,
        -1.35920003e-01,   1.41340002e-01,   1.41700000e-01,
        -6.11919984e-02,   4.25099999e-01,  -2.00260002e-02,
         8.82160008e-01,   1.62630007e-01,   2.53509998e-01,
        -5.33110023e-01,   2.27740005e-01,  -3.65530014e-01,
        -1.59669995e-01,  -1.34299994e-01,  -3.44110012e-01,
         6.99680001e-02,   6.38230005e-03,   2.62230009e-01,
        -5.19580007e-01,

In [32]:
diff

<tf.Tensor 'Sum:0' shape=(?, 1) dtype=float32>

In [29]:
o1 - o2

<tf.Tensor 'sub_1:0' shape=(?, 256) dtype=float32>