In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from string import punctuation

In [2]:
train = pd.read_csv('train.csv').fillna("Empty")

In [3]:
print(len(train))
train.head()

404290


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
question_words = train['question1'].apply(lambda x: x.split()[0])

In [5]:
value_counts = question_words.value_counts()
value_counts = value_counts[value_counts>500]
value_counts

What      137930
How        93409
Why        32808
Is         22485
Which      17468
Can        11605
I           7906
Who         7547
Do          6789
Where       6641
If          5961
What's      5638
Does        5090
Are         4542
When        3227
Should      3129
Will        2981
In          1967
My          1723
Did         1285
I'm         1165
Has         1100
Would       1053
Have         887
Was          783
Could        719
As           582
Name: question1, dtype: int64

In [6]:
sentence_lengths1 = train['question1'].apply(len)
sentence_lengths2 = train['question2'].apply(len)
print(len(sentence_lengths1))
print(len(sentence_lengths2))
sentence_lengths = sentence_lengths1.append(sentence_lengths2)
print("Max sentence length %.2f" % np.max(sentence_lengths))
print("Min sentence length %.2f" % np.min(sentence_lengths))
print("Average sentence length %.2f" % np.mean(sentence_lengths))
print("Median senctence length %.2f" % np.median(sentence_lengths))
print("Standard deviation %.2f" % np.std(sentence_lengths))
print("Mean plus 2 X STD %.2f" % (2 * np.std(sentence_lengths) + np.mean(sentence_lengths)))

max_words = 100
greater_than = sentence_lengths[sentence_lengths > max_words]
print("Percent greater than max_words %.2f" % (len(greater_than)/ len(sentence_lengths)))

404290
404290
Max sentence length 1169.00
Min sentence length 1.00
Average sentence length 59.82
Median senctence length 51.00
Standard deviation 31.96
Mean plus 2 X STD 123.75
Percent greater than max_words 0.10


# Train and Test Splits

In [7]:
# train_data, test_data = train_test_split(train, test_size=0.25)
train_questions1 = train['question1'].tolist()
train_questions2 = train['question2'].tolist()
train_data_labels = train['is_duplicate'].tolist()

In [8]:
def convert_to_ints(questions, seq_len):
    all_questions = ' '.join([c.translate(str.maketrans("", "", punctuation)) for c in questions])
    words = all_questions.split()
    from collections import Counter
    counts = Counter(words)
    vocab = sorted(counts, key=counts.get, reverse=True)
    vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}
    vocab_to_int[''] = len(vocab_to_int.keys()) + 1

    questions_ints = []
    for each in questions:
        questions_ints.append([vocab_to_int[word.translate(str.maketrans("", "", punctuation))] for word in each.split()])

    features = np.zeros((len(questions_ints), seq_len), dtype=int)
    for i, row in enumerate(questions_ints):
        features[i, -len(row):] = np.array(row)[:seq_len]
        
    return features, vocab_to_int

In [9]:
question_ints1, question1_vocab_to_int = convert_to_ints(train_questions1, max_words)
question_ints2, question2_vocab_to_int = convert_to_ints(train_questions2, max_words)

In [10]:
split_frac = 0.8
split_idx = int(len(question_ints1)*split_frac)
train_q1, val_q1 = question_ints1[:split_idx], question_ints1[split_idx:]
train_q2, val_q2 = question_ints2[:split_idx], question_ints2[split_idx:]
train_y, val_y = train_data_labels[:split_idx], train_data_labels[split_idx:]
train_y = np.array(train_y)
val_y = np.array(val_y)

test_idx = int(len(val_q1)*0.5)
val_q1, test_q1 = val_q1[:test_idx], val_q1[test_idx:]
val_q2, test_q2 = val_q2[:test_idx], val_q2[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_q1.shape), 
      "\nValidation set: \t{}".format(val_q1.shape),
      "\nTest set: \t\t{}".format(test_q1.shape))

			Feature Shapes:
Train set: 		(323432, 100) 
Validation set: 	(40429, 100) 
Test set: 		(40429, 100)


In [11]:
lstm_size = 256
lstm_layers = 1
batch_size = 512
learning_rate = 0.001
# Size of the embedding vectors (number of units in the embedding layer)
embed_size = 300 

In [12]:
# n_words1 = len(question1_vocab_to_int)
# n_words2 = len(question2_vocab_to_int)

# # Create the graph object
# graph = tf.Graph()
# # Add nodes to the graph
# with graph.as_default():
#     inputs_1 = tf.placeholder(tf.int32, [None, None], name='inputs')
#     inputs_2 = tf.placeholder(tf.int32, [None, None], name='inputs')
#     labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
#     keep_prob = tf.placeholder(tf.float32, name='keep_prob')

In [19]:
n_words1 = len(question1_vocab_to_int) + 1
n_words2 = len(question2_vocab_to_int) + 1

tf.reset_default_graph()

# Create the graph object
# graph = tf.Graph()
# Add nodes to the graph
with tf.variable_scope('questions', reuse=None):
    inputs_1 = tf.placeholder(tf.int32, [None, None], name='inputs')
    inputs_2 = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    

    embedding1 = tf.Variable(tf.random_uniform((n_words1, embed_size), -1, 1))
    embed1 = tf.nn.embedding_lookup(embedding1, inputs_1)
    
    # Your basic LSTM cell
    lstm1 = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    
    # Add dropout to the cell
    drop1 = tf.contrib.rnn.DropoutWrapper(lstm1, output_keep_prob=keep_prob)
    
    # Stack up multiple LSTM layers, for deep learning
    cell1 = tf.contrib.rnn.MultiRNNCell([drop1] * lstm_layers)
    
    # Getting an initial state of all zeros
    initial_state1 = cell1.zero_state(batch_size, tf.float32)

    outputs1, final_state1 = tf.nn.dynamic_rnn(cell1, embed1,
                                             initial_state=initial_state1)
    
with tf.variable_scope('questions', reuse=True):
    embedding2 = tf.Variable(tf.random_uniform((n_words2, embed_size), -1, 1))
    embed2 = tf.nn.embedding_lookup(embedding2, inputs_2)
    
    # Your basic LSTM cell
    lstm2 = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    
    # Add dropout to the cell
    drop2 = tf.contrib.rnn.DropoutWrapper(lstm2, output_keep_prob=keep_prob)
    
    # Stack up multiple LSTM layers, for deep learning
    cell2 = tf.contrib.rnn.MultiRNNCell([drop2] * lstm_layers)
    
    # Getting an initial state of all zeros
    initial_state2 = cell2.zero_state(batch_size, tf.float32)
    
    outputs2, final_state2 = tf.nn.dynamic_rnn(cell2, embed2,
                                             initial_state=initial_state2)

In [20]:
with tf.variable_scope('questions', reuse=False):
    outputs = tf.concat([outputs1[:, -1], outputs2[:,-1]], 1)
    final_state = tf.concat([final_state1, final_state2], 1)
    predictions = tf.contrib.layers.fully_connected(outputs, 1, activation_fn=tf.sigmoid)
    cost = tf.losses.log_loss(labels_, predictions)
    
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [21]:
with tf.variable_scope('questions', reuse=False):
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [22]:
def get_batches(x1, x2, y, batch_size=100):
    
    n_batches = len(x1)//batch_size
    x1, x2, y = x1[:n_batches*batch_size], x2[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x1), batch_size):
        yield x1[ii:ii+batch_size], x2[ii:ii+batch_size], y[ii:ii+batch_size]

In [None]:
epochs = 1
t_vars = tf.trainable_variables()
saver = tf.train.Saver(var_list=t_vars)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state1 = sess.run(initial_state1)
        state2 = sess.run(initial_state2)
        
        for ii, (x1, x2, y) in enumerate(get_batches(train_q1, train_q2, train_y, batch_size), 1):
            feed = {inputs_1: x1,
                    inputs_2: x2,
                    labels_: y[:,None],
                    keep_prob: 0.5,
                    initial_state1: state1,
                    initial_state2: state2}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%50==0:
                val_acc = []
                val_state1 = sess.run(cell1.zero_state(batch_size, tf.float32))
                val_state2 = sess.run(cell2.zero_state(batch_size, tf.float32))
                for x1, x2, y in get_batches(val_q1, val_q2, val_y, batch_size):
                    feed = {inputs_1: x1,
                            inputs_2: x2,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state1: val_state1,
                            initial_state1: val_state2}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/duplicates.ckpt")

Epoch: 0/1 Iteration: 5 Train loss: 0.687
Epoch: 0/1 Iteration: 10 Train loss: 0.663
Epoch: 0/1 Iteration: 15 Train loss: 0.629
Epoch: 0/1 Iteration: 20 Train loss: 0.631
Epoch: 0/1 Iteration: 25 Train loss: 0.610
Epoch: 0/1 Iteration: 30 Train loss: 0.637
Epoch: 0/1 Iteration: 35 Train loss: 0.605
Epoch: 0/1 Iteration: 40 Train loss: 0.616
Epoch: 0/1 Iteration: 45 Train loss: 0.603
Epoch: 0/1 Iteration: 50 Train loss: 0.600
Val acc: 0.681
Epoch: 0/1 Iteration: 55 Train loss: 0.567
Epoch: 0/1 Iteration: 60 Train loss: 0.550
Epoch: 0/1 Iteration: 65 Train loss: 0.623
Epoch: 0/1 Iteration: 70 Train loss: 0.564
Epoch: 0/1 Iteration: 75 Train loss: 0.588
Epoch: 0/1 Iteration: 80 Train loss: 0.595
Epoch: 0/1 Iteration: 85 Train loss: 0.560
Epoch: 0/1 Iteration: 90 Train loss: 0.585
Epoch: 0/1 Iteration: 95 Train loss: 0.536
Epoch: 0/1 Iteration: 100 Train loss: 0.569
Val acc: 0.716
Epoch: 0/1 Iteration: 105 Train loss: 0.563
Epoch: 0/1 Iteration: 110 Train loss: 0.535
Epoch: 0/1 Iteration: 

In [None]:
test_acc = []
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (x1, x2, y) in enumerate(get_batches(test_q1, test_q2, test_y, batch_size), 1):
        feed = {inputs_1: x1,
                inputs_2: x2,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state1: val_state1,
                initial_state1: val_state2}
        batch_acc, test_state = sess.run([cost, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))