In [1]:
import tensorflow as tf
from tensorflow.contrib import rnn

import pandas as pd
import numpy as np
import spacy

nlp = spacy.load('en')

dtypes ={
    'id': np.uint32,
    'qid1': np.uint32,
    'qid2': np.uint32,
    'question1': np.str,
    'question2': np.str,
    'is_duplicate': np.uint8
}

df_chunks = pd.read_csv('../../dataset/quora_train.csv.zip', dtype=dtypes, compression='zip',
                 usecols=['question1', 'question2', 'is_duplicate'], iterator=True, chunksize=1000)
df_test = pd.read_csv('../../dataset/quora_train.csv.zip', dtype=dtypes, compression='zip',
                 usecols=['question1', 'question2', 'is_duplicate'], nrows=1000)

In [10]:
def parse_df(df):
    def pad(v):
        zeros = np.zeros([100, 300])
        zeros[:v.shape[0],:v.shape[1]] = v
        return zeros

    q1s = df['question1'].map(nlp).map(lambda x: np.matrix([t.vector for t in x if not t.is_stop])).map(pad)
    q2s = df['question1'].map(nlp).map(lambda x: np.matrix([t.vector for t in x if not t.is_stop])).map(pad)

    q1m = np.concatenate(q1s.values).flatten().reshape([-1, 100, 300])
    q2m = np.concatenate(q2s.values).flatten().reshape([-1, 100, 300])

    labels = df['is_duplicate'].values
    
    return q1m, q2m, labels

In [3]:
def length(sequence):
    used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2))
    length = tf.reduce_sum(used, reduction_indices=1)
    length = tf.cast(length, tf.int32)
    return length

tf.reset_default_graph()

Q1 = tf.placeholder(tf.float32, shape=[None, 100, 300])
Q2 = tf.placeholder(tf.float32, shape=[None, 100, 300])
batch_size = tf.placeholder(tf.int32)
is_dup = tf.placeholder(tf.uint8, shape=[None])
dup_oh = tf.one_hot(is_dup, 2)

cell1 = rnn.BasicLSTMCell(64, forget_bias=0.0)
cell2 = rnn.BasicLSTMCell(64, forget_bias=0.0, reuse=True)

state = cell1.zero_state(batch_size, dtype=tf.float32)

#Q1t = tf.transpose(Q1, [1, 0, 2])
#Q2t = tf.transpose(Q2, [1, 0, 2])

outputs1, _1= tf.nn.dynamic_rnn(cell1, Q1, sequence_length=length(Q1), initial_state=state)
outputs2, _2= tf.nn.dynamic_rnn(cell2, Q2, sequence_length=length(Q2), initial_state=state)

output11 = tf.reduce_mean(outputs1[:,0:25,:], 1)
output12 = tf.reduce_mean(outputs1[:,26:50,:], 1)
output13 = tf.reduce_mean(outputs1[:,51:75,:], 1)
output14 = tf.reduce_mean(outputs1[:,76:100,:], 1)
output1 = tf.concat([output11, output12, output13, output14], axis=1)

output21 = tf.reduce_mean(outputs2[:,0:25,:], 1)
output22 = tf.reduce_mean(outputs2[:,26:50,:], 1)
output23 = tf.reduce_mean(outputs2[:,51:75,:], 1)
output24 = tf.reduce_mean(outputs2[:,76:100,:], 1)
output2 = tf.concat([output21, output22, output23, output24], axis=1)

out = tf.concat([output1, output2], axis=1)

In [4]:
keep_prob = tf.placeholder(tf.float32)
#out = tf.nn.dropout(out, keep_prob=keep_prob)

W1 = tf.get_variable(shape=[512, 256], initializer=tf.contrib.layers.xavier_initializer(), name='weight1')
b1 = tf.get_variable(shape=[256], initializer=tf.contrib.layers.xavier_initializer(), name='bias1')
y1 = tf.nn.relu(tf.matmul(out, W1) + b1)
y1 = tf.nn.dropout(y1, keep_prob=keep_prob)

W2 = tf.get_variable(shape=[256, 2], initializer=tf.contrib.layers.xavier_initializer(), name='weight2')
b2 = tf.get_variable(shape=[2], initializer=tf.contrib.layers.xavier_initializer(), name='bias2')
y2 = tf.matmul(y1, W2) + b2

In [5]:
cost = tf.nn.softmax_cross_entropy_with_logits(logits=y2, labels=dup_oh)
cost = tf.reduce_mean(cost)

optimizer = tf.train.AdamOptimizer(learning_rate=0.03).minimize(cost)

In [6]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [11]:
from tqdm import tqdm

test_q1m, test_q2m, test_labels = parse_df(df_test)
test_dict = {Q1: test_q1m, Q2: test_q2m, is_dup: test_labels, keep_prob: 1.0, batch_size: test_q1m.shape[0]}

count = 0
for df in tqdm(df_chunks):
    train_q1m, train_q2m, train_labels = parse_df(df)
    train_dict = {Q1: train_q1m, Q2: train_q2m, is_dup: train_labels, keep_prob: 0.5, batch_size: train_q1m.shape[0]}
    sess.run(optimizer, feed_dict=train_dict)
    
    if count % 20 == 0:
        print(sess.run(cost, feed_dict=train_dict), sess.run(cost, feed_dict=test_dict))
    count += 1

1it [00:03,  3.17s/it]

1.30354 1.34357


21it [00:45,  2.34s/it]

0.633345 0.636714


41it [01:28,  2.34s/it]

0.621313 0.611604


61it [02:10,  2.32s/it]

0.585233 0.599649


81it [02:53,  2.34s/it]

0.579607 0.604288


101it [03:35,  2.31s/it]

0.588073 0.590354


107it [03:47,  2.13s/it]

KeyboardInterrupt: 

In [None]:
variables_names =[v.name for v in tf.trainable_variables()]
values = sess.run(variables_names)
for k,v in zip(variables_names, values):
    print(k, v)

In [14]:
output1

<tf.Tensor 'concat:0' shape=(?, 256) dtype=float32>

In [12]:
output2

<tf.Tensor 'Mean_1:0' shape=(?, 64) dtype=float32>