In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from string import punctuation

In [2]:
train = pd.read_csv('train.csv').fillna("Empty")

In [3]:
print(len(train))
train.head()

404290


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
question_words = train['question1'].apply(lambda x: x.split()[0])

In [5]:
value_counts = question_words.value_counts()
value_counts = value_counts[value_counts>500]
value_counts

What      137930
How        93409
Why        32808
Is         22485
Which      17468
Can        11605
I           7906
Who         7547
Do          6789
Where       6641
If          5961
What's      5638
Does        5090
Are         4542
When        3227
Should      3129
Will        2981
In          1967
My          1723
Did         1285
I'm         1165
Has         1100
Would       1053
Have         887
Was          783
Could        719
As           582
Name: question1, dtype: int64

In [6]:
sentence_lengths1 = train['question1'].apply(len)
sentence_lengths2 = train['question2'].apply(len)
print(len(sentence_lengths1))
print(len(sentence_lengths2))
sentence_lengths = sentence_lengths1.append(sentence_lengths2)
print("Max sentence length %.2f" % np.max(sentence_lengths))
print("Min sentence length %.2f" % np.min(sentence_lengths))
print("Average sentence length %.2f" % np.mean(sentence_lengths))
print("Median senctence length %.2f" % np.median(sentence_lengths))
print("Standard deviation %.2f" % np.std(sentence_lengths))
print("Mean plus 2 X STD %.2f" % (2 * np.std(sentence_lengths) + np.mean(sentence_lengths)))

max_words = 100
greater_than = sentence_lengths[sentence_lengths > max_words]
print("Percent greater than max_words %.2f" % (len(greater_than)/ len(sentence_lengths)))

404290
404290
Max sentence length 1169.00
Min sentence length 1.00
Average sentence length 59.82
Median senctence length 51.00
Standard deviation 31.96
Mean plus 2 X STD 123.75
Percent greater than max_words 0.10


# Train and Test Splits

In [7]:
# train_data, test_data = train_test_split(train, test_size=0.25)
train_questions1 = train['question1'].tolist()
train_questions2 = train['question2'].tolist()
train_data_labels = train['is_duplicate'].tolist()

In [8]:
def build_int_dict(questions, max_vocab):
    all_questions = ' '.join([c.lower().translate(str.maketrans("", "", punctuation)) for c in questions])
    words = all_questions.split()
    from collections import Counter
    vocab = Counter(words).most_common(max_vocab)
    vocab_to_int = {word[0]: ii for ii, word in enumerate(vocab, 1)}
    vocab_to_int[''] = len(vocab_to_int.keys()) + 1
    vocab_to_int['<UNK>'] = len(vocab_to_int.keys()) + 1
    
    return vocab_to_int

In [9]:
max_vocab = 80000
train_questions = train_questions1[:]
train_questions.extend(train_questions2)

vocab_to_int_dict = build_int_dict(train_questions, max_vocab)
print(len(vocab_to_int_dict.keys()))

80002


In [10]:
print(train_questions1[0:10])
print(len(vocab_to_int_dict))

['What is the step by step guide to invest in share market in india?', 'What is the story of Kohinoor (Koh-i-Noor) Diamond?', 'How can I increase the speed of my internet connection while using a VPN?', 'Why am I mentally very lonely? How can I solve it?', 'Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?', 'Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?', 'Should I buy tiago?', 'How can I be a good geologist?', 'When do you use シ instead of し?', 'Motorola (company): Can I hack my Charter Motorolla DCX3400?']
80002


In [13]:
def convert_to_ints(questions, seq_len, vocab_to_int):
    all_questions_ints = []
    for i, each in enumerate(questions):
        question_ints = []
        if i % 100000 == 0:
            print(i)
        for word in each.split():
            if word.lower().translate(str.maketrans("","",punctuation)) in vocab_to_int.keys():
                question_ints.append(vocab_to_int[word.lower().translate(str.maketrans("","",punctuation))])
            else:
                question_ints.append(vocab_to_int['<UNK>'])
        all_questions_ints.append(question_ints)
    
    print("Finished converting to ints. Now padding...")
    features = np.zeros((len(all_questions_ints), seq_len), dtype=int)
    for i, row in enumerate(all_questions_ints):
        features[i, -len(row):] = np.array(row)[:seq_len]
        
    return features

In [12]:
question_ints1 = convert_to_ints(train_questions1, max_words, vocab_to_int_dict)
question_ints2 = convert_to_ints(train_questions2, max_words, vocab_to_int_dict)

0
Finished converting to ints. Now padding...
0
Finished converting to ints. Now padding...


In [14]:
split_frac = 0.8
split_idx = int(len(question_ints1)*split_frac)
train_q1, val_q1 = question_ints1[:split_idx], question_ints1[split_idx:]
train_q2, val_q2 = question_ints2[:split_idx], question_ints2[split_idx:]
train_y, val_y = train_data_labels[:split_idx], train_data_labels[split_idx:]
train_y = np.array(train_y)
val_y = np.array(val_y)

test_idx = int(len(val_q1)*0.5)
val_q1, test_q1 = val_q1[:test_idx], val_q1[test_idx:]
val_q2, test_q2 = val_q2[:test_idx], val_q2[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_q1.shape), 
      "\nValidation set: \t{}".format(val_q1.shape),
      "\nTest set: \t\t{}".format(test_q1.shape))

			Feature Shapes:
Train set: 		(323432, 100) 
Validation set: 	(40429, 100) 
Test set: 		(40429, 100)


In [15]:
lstm_size = 512
lstm_layers = 3
batch_size = 256
learning_rate = 0.001
# Size of the embedding vectors (number of units in the embedding layer)
embed_size = 400 

In [16]:
# n_words1 = len(question1_vocab_to_int)
# n_words2 = len(question2_vocab_to_int)

# # Create the graph object
# graph = tf.Graph()
# # Add nodes to the graph
# with graph.as_default():
#     inputs_1 = tf.placeholder(tf.int32, [None, None], name='inputs')
#     inputs_2 = tf.placeholder(tf.int32, [None, None], name='inputs')
#     labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
#     keep_prob = tf.placeholder(tf.float32, name='keep_prob')

In [17]:
n_words = len(vocab_to_int_dict) + 1

tf.reset_default_graph()

# Create the graph object
# graph = tf.Graph()
# Add nodes to the graph
with tf.variable_scope('questions', reuse=None):
    inputs_1 = tf.placeholder(tf.int32, [None, None], name='inputs')
    inputs_2 = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    

    embedding1 = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed1 = tf.nn.embedding_lookup(embedding1, inputs_1)
    
    # Your basic LSTM cell
    lstm1 = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    
    # Add dropout to the cell
    drop1 = tf.contrib.rnn.DropoutWrapper(lstm1, output_keep_prob=keep_prob)
    
    # Stack up multiple LSTM layers, for deep learning
    cell1 = tf.contrib.rnn.MultiRNNCell([drop1] * lstm_layers)
    
    # Getting an initial state of all zeros
    initial_state1 = cell1.zero_state(batch_size, tf.float32)

    outputs1, final_state1 = tf.nn.dynamic_rnn(cell1, embed1,
                                             initial_state=initial_state1)
    
with tf.variable_scope('questions', reuse=True):
    embedding2 = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed2 = tf.nn.embedding_lookup(embedding2, inputs_2)
    
    # Your basic LSTM cell
    lstm2 = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    
    # Add dropout to the cell
    drop2 = tf.contrib.rnn.DropoutWrapper(lstm2, output_keep_prob=keep_prob)
    
    # Stack up multiple LSTM layers, for deep learning
    cell2 = tf.contrib.rnn.MultiRNNCell([drop2] * lstm_layers)
    
    # Getting an initial state of all zeros
    initial_state2 = cell2.zero_state(batch_size, tf.float32)
    
    outputs2, final_state2 = tf.nn.dynamic_rnn(cell2, embed2,
                                             initial_state=initial_state2)

In [18]:
with tf.variable_scope('questions', reuse=False):
    outputs = tf.concat([outputs1[:, -1], outputs2[:,-1]], 1)
    final_state = tf.concat([final_state1, final_state2], 1)
    predictions = tf.contrib.layers.fully_connected(outputs, 1, activation_fn=tf.sigmoid)
    cost = tf.losses.log_loss(labels_, predictions)
    
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [19]:
with tf.variable_scope('questions', reuse=False):
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [20]:
def get_batches(x1, x2, y, batch_size=100):
    if y != None:
        n_batches = len(x1)//batch_size
        x1, x2, y = x1[:n_batches*batch_size], x2[:n_batches*batch_size], y[:n_batches*batch_size]
        for ii in range(0, len(x1), batch_size):
            yield x1[ii:ii+batch_size], x2[ii:ii+batch_size], y[ii:ii+batch_size]
    else:
        n_batches = len(x1)//batch_size
        x1, x2, y = x1[:n_batches*batch_size], x2[:n_batches*batch_size], y[:n_batches*batch_size]
        for ii in range(0, len(x1), batch_size):
            yield x1[ii:ii+batch_size], x2[ii:ii+batch_size], None

In [None]:
epochs = 3
t_vars = tf.trainable_variables()
saver = tf.train.Saver(var_list=t_vars)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state1 = sess.run(initial_state1)
        state2 = sess.run(initial_state2)
        
        for ii, (x1, x2, y) in enumerate(get_batches(train_q1, train_q2, train_y, batch_size), 1):
            feed = {inputs_1: x1,
                    inputs_2: x2,
                    labels_: y[:,None],
                    keep_prob: 0.5,
                    initial_state1: state1,
                    initial_state2: state2}
            loss, state1, state2,_ = sess.run([cost, final_state1, final_state2, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e + 1, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%50==0:
                val_acc = []
                val_state1 = sess.run(cell1.zero_state(batch_size, tf.float32))
                val_state2 = sess.run(cell2.zero_state(batch_size, tf.float32))
                for x1, x2, y in get_batches(val_q1, val_q2, val_y, batch_size):
                    feed = {inputs_1: x1,
                            inputs_2: x2,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state1: val_state1,
                            initial_state2: val_state2}
                    batch_acc, val_state1, val_state2 = sess.run([cost, final_state1, final_state2], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/duplicates2.ckpt")

  from ipykernel import kernelapp as app


In [None]:
test_acc = []
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state1 = sess.run(initial_state1)
    test_state2 = sess.run(initial_state2)
    for ii, (x1, x2, y) in enumerate(get_batches(test_q1, test_q2, test_y, batch_size), 1):
        feed = {inputs_1: x1,
                inputs_2: x2,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state1: test_state1,
                initial_state1: test_state2}
        batch_acc, test_state1, test_state2 = sess.run([cost, final_state1, final_state2], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

In [128]:
test_set = pd.read_csv('test.csv').fillna("Empty")
print(len(test_set))

2345796


In [129]:
test_questions1 = test_set['question1'].tolist()
test_questions2 = test_set['question2'].tolist()
test_question_ints1 = convert_to_ints(test_questions1, max_words, vocab_to_int_dict)
test_question_ints2 = convert_to_ints(test_questions2, max_words, vocab_to_int_dict)

In [130]:
# Make predictions

predictions = []
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state1 = sess.run(initial_state1)
    test_state2 = sess.run(initial_state2)
    for ii, (x1, x2, _) in enumerate(get_batches(test_question_ints1, test_question_ints2, None, batch_size), 1):
        feed = {inputs_1: x1,
                inputs_2: x2,
                keep_prob: 1}
        prediction = sess.run([predictions], feed_dict=feed)
        predictions.append(prediction)

TypeError: 'NoneType' object is not subscriptable