In [33]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from string import punctuation

In [34]:
from distutils.version import LooseVersion
import warnings
import tensorflow as tf

# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.0'), 'Please use TensorFlow version 1.0 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

TensorFlow Version: 1.0.0
Default GPU Device: /gpu:0


In [35]:
train = pd.read_csv('train.csv').fillna("Empty")

In [36]:
print(len(train))
train.head()

404290


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [37]:
question_words = train['question1'].apply(lambda x: x.split()[0])

In [38]:
value_counts = question_words.value_counts()
value_counts = value_counts[value_counts>500]
value_counts

What      137930
How        93409
Why        32808
Is         22485
Which      17468
Can        11605
I           7906
Who         7547
Do          6789
Where       6641
If          5961
What's      5638
Does        5090
Are         4542
When        3227
Should      3129
Will        2981
In          1967
My          1723
Did         1285
I'm         1165
Has         1100
Would       1053
Have         887
Was          783
Could        719
As           582
Name: question1, dtype: int64

In [61]:
sentence_lengths1 = train['question1'].apply(len)
sentence_lengths2 = train['question2'].apply(len)
print(len(sentence_lengths1))
print(len(sentence_lengths2))
sentence_lengths = sentence_lengths1.append(sentence_lengths2)
print("Max sentence length %.2f" % np.max(sentence_lengths))
print("Min sentence length %.2f" % np.min(sentence_lengths))
print("Average sentence length %.2f" % np.mean(sentence_lengths))
print("Median senctence length %.2f" % np.median(sentence_lengths))
print("Standard deviation %.2f" % np.std(sentence_lengths))
print("Mean plus 2 X STD %.2f" % (2 * np.std(sentence_lengths) + np.mean(sentence_lengths)))

max_words = 40
greater_than = sentence_lengths[sentence_lengths > max_words]
print("Percent greater than max_words %.2f" % (len(greater_than)/ len(sentence_lengths)))

404290
404290
Max sentence length 1169.00
Min sentence length 1.00
Average sentence length 59.82
Median senctence length 51.00
Standard deviation 31.96
Mean plus 2 X STD 123.75
Percent greater than max_words 0.72


# Train and Test Splits

In [62]:
# train_data, test_data = train_test_split(train, test_size=0.25)
train_questions1 = train['question1'].tolist()
train_questions2 = train['question2'].tolist()
train_data_labels = train['is_duplicate'].tolist()

In [63]:
def build_int_dict(questions, max_vocab):
    all_questions = ' '.join([c.lower().translate(str.maketrans("", "", punctuation)) for c in questions])
    words = all_questions.split()
    from collections import Counter
    vocab = Counter(words).most_common(max_vocab)
    vocab_to_int = {word[0]: ii for ii, word in enumerate(vocab, 1)}
    vocab_to_int[''] = len(vocab_to_int.keys()) + 1
    vocab_to_int['<UNK>'] = len(vocab_to_int.keys()) + 1
    
    return vocab_to_int

In [64]:
max_vocab = 80000
train_questions = train_questions1[:]
train_questions.extend(train_questions2)

vocab_to_int_dict = build_int_dict(train_questions, max_vocab)
print(len(vocab_to_int_dict.keys()))

80002


In [65]:
print(train_questions1[0:10])
print(len(vocab_to_int_dict))

['What is the step by step guide to invest in share market in india?', 'What is the story of Kohinoor (Koh-i-Noor) Diamond?', 'How can I increase the speed of my internet connection while using a VPN?', 'Why am I mentally very lonely? How can I solve it?', 'Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?', 'Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?', 'Should I buy tiago?', 'How can I be a good geologist?', 'When do you use シ instead of し?', 'Motorola (company): Can I hack my Charter Motorolla DCX3400?']
80002


In [66]:
def convert_to_ints(questions, seq_len, vocab_to_int):
    all_questions_ints = []
    for i, each in enumerate(questions):
        question_ints = []
        if i % 100000 == 0:
            print(i)
        for word in each.split():
            if word.lower().translate(str.maketrans("","",punctuation)) in vocab_to_int.keys():
                question_ints.append(vocab_to_int[word.lower().translate(str.maketrans("","",punctuation))])
            else:
                question_ints.append(vocab_to_int['<UNK>'])
        all_questions_ints.append(question_ints)
    
    print("Finished converting to ints. Now padding...")
    features = np.zeros((len(all_questions_ints), seq_len), dtype=int)
    for i, row in enumerate(all_questions_ints):
        features[i, -len(row):] = np.array(row)[:seq_len]
        
    return features

In [67]:
question_ints1 = convert_to_ints(train_questions1, max_words, vocab_to_int_dict)
question_ints2 = convert_to_ints(train_questions2, max_words, vocab_to_int_dict)

0
100000
200000
300000
400000
Finished converting to ints. Now padding...
0
100000
200000
300000
400000
Finished converting to ints. Now padding...


In [68]:
# !mkdir checkpoints

In [69]:
split_frac = 0.8
split_idx = int(len(question_ints1)*split_frac)
train_q1, val_q1 = question_ints1[:split_idx], question_ints1[split_idx:]
train_q2, val_q2 = question_ints2[:split_idx], question_ints2[split_idx:]
train_y, val_y = train_data_labels[:split_idx], train_data_labels[split_idx:]
train_y = np.array(train_y)
val_y = np.array(val_y)

test_idx = int(len(val_q1)*0.5)
val_q1, test_q1 = val_q1[:test_idx], val_q1[test_idx:]
val_q2, test_q2 = val_q2[:test_idx], val_q2[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_q1.shape), 
      "\nValidation set: \t{}".format(val_q1.shape),
      "\nTest set: \t\t{}".format(test_q1.shape))

			Feature Shapes:
Train set: 		(323432, 40) 
Validation set: 	(40429, 40) 
Test set: 		(40429, 40)


In [71]:
lstm_size = 250
lstm_layers = 1
batch_size = 256
learning_rate = 0.002
# Size of the embedding vectors (number of units in the embedding layer)
embed_size = 300 
seq_length = max_words
num_dense = 150

In [72]:
# n_words1 = len(question1_vocab_to_int)
# n_words2 = len(question2_vocab_to_int)

# # Create the graph object
# graph = tf.Graph()
# # Add nodes to the graph
# with graph.as_default():
#     inputs_1 = tf.placeholder(tf.int32, [None, None], name='inputs')
#     inputs_2 = tf.placeholder(tf.int32, [None, None], name='inputs')
#     labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
#     keep_prob = tf.placeholder(tf.float32, name='keep_prob')

In [73]:
n_words = len(vocab_to_int_dict) + 1

tf.reset_default_graph()

# Create the graph object
# graph = tf.Graph()
# Add nodes to the graph
with tf.variable_scope('questions', reuse=None):
    inputs_1 = tf.placeholder(tf.int32, [None, None], name='inputs')
    inputs_2 = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    seq_len = tf.placeholder(tf.int32, [None])
    

    embedding1 = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed1 = tf.nn.embedding_lookup(embedding1, inputs_1)
    
    # Your basic LSTM cell
    lstm_fwd1 = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    lstm_bwd1 = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    
    # Add dropout to the cell
    drop_fwd1 = tf.contrib.rnn.DropoutWrapper(lstm_fwd1, output_keep_prob=keep_prob)
    drop_bwd1 = tf.contrib.rnn.DropoutWrapper(lstm_bwd1, output_keep_prob=keep_prob)
    
    # Stack up multiple LSTM layers, for deep learning
    cell_fwd1 = tf.contrib.rnn.MultiRNNCell([drop_fwd1] * lstm_layers)
    cell_bwd1 = tf.contrib.rnn.MultiRNNCell([drop_bwd1] * lstm_layers)
    
    # Getting an initial state of all zeros
    initial_state_fwd1 = cell_fwd1.zero_state(batch_size, tf.float32)
    initial_state_bwd1 = cell_bwd1.zero_state(batch_size, tf.float32)

    outputs1, final_state1 = tf.nn.bidirectional_dynamic_rnn(cell_fwd1,cell_bwd1, embed1,
                                             initial_state_fw=initial_state_fwd1,
                                             initial_state_bw=initial_state_bwd1, sequence_length=seq_len)
    
with tf.variable_scope('questions', reuse=True):
    embedding2 = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed2 = tf.nn.embedding_lookup(embedding2, inputs_2)
    
    # Your basic LSTM cell
    lstm_fwd2 = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    lstm_bwd2 = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    
    # Add dropout to the cell
    drop_fwd2 = tf.contrib.rnn.DropoutWrapper(lstm_fwd2, output_keep_prob=keep_prob)
    drop_bwd2 = tf.contrib.rnn.DropoutWrapper(lstm_bwd2, output_keep_prob=keep_prob)
    
    # Stack up multiple LSTM layers, for deep learning
    cell_fwd2 = tf.contrib.rnn.MultiRNNCell([drop_fwd2] * lstm_layers)
    cell_bwd2 = tf.contrib.rnn.MultiRNNCell([drop_bwd2] * lstm_layers)
    
    # Getting an initial state of all zeros
    initial_state_fwd2 = cell_fwd2.zero_state(batch_size, tf.float32)
    initial_state_bwd2 = cell_bwd2.zero_state(batch_size, tf.float32)
    
    outputs2, final_state2 = tf.nn.bidirectional_dynamic_rnn(cell_fwd2, cell_bwd2, embed2,
                                             initial_state_fw=initial_state_fwd2,
                                             initial_state_bw=initial_state_bwd2,sequence_length=seq_len)

In [74]:
with tf.variable_scope('questions', reuse=False):
    outputs1 = tf.concat(outputs1, 2)
    outputs2 = tf.concat(outputs2, 2)
    outputs = tf.concat([outputs1[:,-1], outputs2[:,-1]], 1)
    final_state = tf.concat([final_state1, final_state2], 1)
    
    dense_layer = tf.contrib.layers.fully_connected(outputs, num_dense, activation_fn=tf.nn.relu)
    drop_dense = tf.layers.dropout(dense_layer, rate=keep_prob)
    predictions = tf.contrib.layers.fully_connected(drop_dense, 1, activation_fn=tf.sigmoid)
    cost = tf.losses.log_loss(labels_, predictions)
    
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [75]:
with tf.variable_scope('questions', reuse=False):
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [76]:
def get_batches(x1, x2, y, batch_size=100):
    if y != None:
        n_batches = len(x1)//batch_size
        x1, x2, y = x1[:n_batches*batch_size], x2[:n_batches*batch_size], y[:n_batches*batch_size]
        for ii in range(0, len(x1), batch_size):
            yield x1[ii:ii+batch_size], x2[ii:ii+batch_size], y[ii:ii+batch_size]
    else:
        n_batches = len(x1)//batch_size
        x1, x2, _ = x1[:n_batches*batch_size], x2[:n_batches*batch_size], None
        for ii in range(0, len(x1), batch_size):
            yield x1[ii:ii+batch_size], x2[ii:ii+batch_size], None

In [None]:
epochs = 3
t_vars = tf.trainable_variables()
saver = tf.train.Saver(var_list=t_vars)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state_fwd1 = sess.run(initial_state_fwd1)
        state_bwd1 = sess.run(initial_state_bwd1)
        state_fwd2 = sess.run(initial_state_fwd2)
        state_bwd2 = sess.run(initial_state_bwd2)
        
        for ii, (x1, x2, y) in enumerate(get_batches(train_q1, train_q2, train_y, batch_size), 1):
            train_seq_len = np.ones(batch_size) * seq_length
            feed = {inputs_1: x1,
                    inputs_2: x2,
                    labels_: y[:,None],
                    keep_prob: 0.6,
                    initial_state_fwd1: state_fwd1,
                    initial_state_bwd1: state_bwd1,
                    initial_state_fwd2: state_fwd2,
                    initial_state_bwd2: state_bwd2,
                    seq_len:train_seq_len}
            loss, state1, state2,_ = sess.run([cost, final_state1, final_state2, optimizer], feed_dict=feed)
            
            if iteration%10==0:
                print("Epoch: {}/{}".format(e + 1, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%200==0:
                val_acc = []
                val_state_fwd1 = sess.run(cell_fwd1.zero_state(batch_size, tf.float32))
                val_state_bwd1 = sess.run(cell_bwd1.zero_state(batch_size, tf.float32))
                val_state_fwd2 = sess.run(cell_fwd2.zero_state(batch_size, tf.float32))
                val_state_bwd2 = sess.run(cell_bwd2.zero_state(batch_size, tf.float32))
                for x1, x2, y in get_batches(val_q1, val_q2, val_y, batch_size):
                    feed = {inputs_1: x1,
                            inputs_2: x2,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state_fwd1: val_state_fwd1,
                            initial_state_fwd1: val_state_bwd1,
                            initial_state_fwd2: val_state_fwd2,
                            initial_state_fwd2: val_state_bwd2,
                            seq_len:train_seq_len}
                    batch_acc, val_state1, val_state2 = sess.run([cost, final_state1, final_state2], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/duplicates2.ckpt")

  from ipykernel import kernelapp as app


Epoch: 1/3 Iteration: 10 Train loss: 0.677
Epoch: 1/3 Iteration: 20 Train loss: 0.640
Epoch: 1/3 Iteration: 30 Train loss: 0.593
Epoch: 1/3 Iteration: 40 Train loss: 0.571
Epoch: 1/3 Iteration: 50 Train loss: 0.514


In [None]:
test_acc = []
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state_fwd1 = sess.run(initial_state_fwd1)
    test_state_bwd1 = sess.run(initial_state_bwd1)
    test_state_fwd2 = sess.run(initial_state_fwd2)
    test_state_bwd2 = sess.run(initial_state_bwd2)
    for ii, (x1, x2, y) in enumerate(get_batches(test_q1, test_q2, test_y, batch_size), 1):
        feed = {inputs_1: x1,
                inputs_2: x2,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state_fwd1: test_state_fwd1,
                initial_state_bwd1: test_state_bwd1,
                initial_state_fwd2: test_state_fwd2,
                initial_state_bwd2: test_state_bwd2,
                seq_len:train_seq_len}
        batch_acc, test_state1, test_state2 = sess.run([cost, final_state1, final_state2], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

In [44]:
test_set = pd.read_csv('test.csv').fillna("Empty")
print(len(test_set))

2345796


In [45]:
test_questions1 = test_set['question1'].tolist()
test_questions2 = test_set['question2'].tolist()
test_question_ints1 = convert_to_ints(test_questions1, max_words, vocab_to_int_dict)
test_question_ints2 = convert_to_ints(test_questions2, max_words, vocab_to_int_dict)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
Finished converting to ints. Now padding...
0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
Finished converting to ints. Now padding...


In [None]:
# Make predictions

predictions = []
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state1 = sess.run(initial_state1)
    test_state2 = sess.run(initial_state2)
    for ii, (x1, x2, _) in enumerate(get_batches(test_question_ints1, test_question_ints2, None, batch_size), 1):
        feed = {inputs_1: x1,
                inputs_2: x2,
                keep_prob: 1}
        prediction = sess.run([predictions], feed_dict=feed)
        predictions.append(prediction)

In [None]:
print(len(test_set))
print(len(predictions))