In [1]:
%matplotlib inline

import sys, os
sys.path.append("../data/glove/")

import numpy as np
import matplotlib.pyplot as plt
import csv
import string
from scipy import spatial
import sklearn as sk
from sklearn import linear_model
import re
import time
import pickle

from glove import loadWordVectors

In [2]:
# Length of the sentence vectors allocated by sentence_to_vec (np.zeros(num_dim)).
# NOTE(review): presumably the dimensionality of the GloVe vectors returned by
# loadWordVectors -- confirm against the embedding file in use.
num_dim = 200
# Number of question pairs the preprocessing cell routes to train.tsv; also the
# number of rows pre-allocated for X_train / Y_train.
train_size = 323482
# Number of rows pre-allocated for X_test / Y_test.
# NOTE(review): rows that are never filled stay at zero, so this should equal
# the number of pairs actually written to test.tsv -- verify.
test_size = 80870

In [3]:
# Tokenise every question pair, build the token -> id vocabulary, and write the
# tokenised pairs to train.tsv / test.tsv. Each pair occupies three consecutive
# rows in the output file: tokens of question 1, tokens of question 2, label.
tok2id = {}      # token -> integer id, assigned in order of first appearance
header = []      # column names taken from the first row of the source file
count = 0        # rows consumed so far, including the header row
max_length = 0   # longest tokenised question seen so far

# Compiled once instead of once per question. The alternatives split "n't"
# contractions and apostrophe suffixes into separate tokens, keep plain words,
# and keep the punctuation marks . , ! ? ; as tokens of their own.
token_pattern = re.compile(r"\w+(?=n't)|n't|\w+(?=')|'\w+|\w+|[.,!?;]")

with open('../data/quora/quora_duplicate_questions.tsv') as f, \
        open('../data/quora/train.tsv', 'w') as g, \
        open('../data/quora/test.tsv', 'w') as h:
    reader = csv.reader(f, delimiter='\t')
    train_writer = csv.writer(g, delimiter='\t')
    test_writer = csv.writer(h, delimiter='\t')

    index = 0  # next unused vocabulary id
    for line in reader:
        # Echo the header and the first data row as a sanity check. This is
        # keyed on the row counter rather than on the vocabulary counter, so
        # it cannot print extra rows when the first question is very short.
        if count <= 1:
            print(line)

        # The first row holds the column names, not a question pair.
        if count == 0:
            header = line
            count += 1
            continue

        # Columns 3 and 4 are question1 / question2 (see the printed header).
        sent1 = token_pattern.findall(line[3].lower())
        sent2 = token_pattern.findall(line[4].lower())
        max_length = max(len(sent1), len(sent2), max_length)

        for word in sent1 + sent2:
            if word not in tok2id:
                tok2id[word] = index
                index += 1

        # The first train_size pairs go to the training file, the rest to test.
        writer = train_writer if count <= train_size else test_writer
        writer.writerow(sent1)
        writer.writerow(sent2)
        # writerow expects a sequence of fields. Passing the bare label string
        # only worked because the label is a single character; wrap it so the
        # label is always written as exactly one field.
        writer.writerow([line[5]])

        count += 1

    # Reserve the last id for the 'UNK' entry. Tokens are lower-cased, so the
    # upper-case key cannot collide with a real token.
    tok2id['UNK'] = index

['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']
['0', '1', '2', 'What is the step by step guide to invest in share market in india?', 'What is the step by step guide to invest in share market?', '0']


In [4]:
# Persist the vocabulary map and the maximum tokenised question length so that
# other code can reload them without re-running the preprocessing.
path = 'dependencies'
features_file = os.path.join(path, "features.pkl")
with open(features_file, "wb") as f:
    # Protocol 2 is the newest pickle format that Python 2 can also read.
    pickle.dump([tok2id, max_length], f, protocol=2)

In [5]:
# Look up an embedding for every vocabulary entry, then print the vocabulary
# size followed by the number of embedding rows. sentence_to_vec indexes
# word_vectors by tok2id ids, so the two numbers are expected to agree.
word_vectors = loadWordVectors(tok2id)
for size in (len(tok2id), len(word_vectors)):
    print(size)

88015
88015


In [6]:
# Show the embedding matrix as a visual sanity check (numpy abbreviates large arrays).
# NOTE(review): the all-zero last row is presumably the 'UNK' entry, which has the highest id -- confirm.
word_vectors

array([[ 0.39396  ,  0.44185  , -0.0042279, ...,  0.47576  ,  0.20978  ,
        -0.11687  ],
       [ 0.32928  ,  0.25526  ,  0.26753  , ...,  0.074621 ,  0.012001 ,
        -0.21952  ],
       [-0.071549 ,  0.093459 ,  0.023738 , ...,  0.33617  ,  0.030591 ,
         0.25577  ],
       ..., 
       [ 0.91682  , -0.36737  , -0.32286  , ..., -0.3297   , -0.66926  ,
        -0.75765  ],
       [ 0.39356  ,  0.18569  ,  0.011526 , ...,  0.4215   ,  0.087896 ,
         1.094    ],
       [ 0.       ,  0.       ,  0.       , ...,  0.       ,  0.       ,
         0.       ]])

In [7]:
def sentence_to_vec(sentence, word_vectors):
    """Embed a tokenised sentence as a scaled sum of its word vectors.

    Args:
        sentence: iterable of token strings.
        word_vectors: 2-D numpy array whose rows are indexed by the ids
            stored in the global ``tok2id`` mapping.

    Returns:
        A 1-D array with ``word_vectors.shape[1]`` entries: the sum of the
        vectors of all contributing tokens divided by ``n + 0.1``, where
        ``n`` is the number of contributing tokens.

    Tokens that are missing from ``tok2id`` or whose embedding row is
    entirely zero are skipped. If nothing contributes, a vector drawn from
    ``np.random.rand`` (global, unseeded RNG) is used instead, so the result
    for such sentences is not deterministic.
    """
    # Size the accumulator from the embedding matrix itself so the function
    # cannot disagree with the vectors it is given.
    dim = word_vectors.shape[1]
    vec_sum = np.zeros(dim)
    # Starts slightly above zero so the final division is always defined;
    # the result is therefore sum / (n + 0.1), not an exact mean.
    count = .1
    for word in sentence:
        token_id = tok2id.get(word)
        if token_id is None:
            continue
        vec = word_vectors[token_id, :]
        # np.any tests "is this the zero vector?" directly. Comparing
        # np.sum(vec) with 0 would also discard non-zero vectors whose
        # components happen to cancel out.
        if np.any(vec):
            vec_sum += vec
            count += 1

    # Nothing usable was accumulated: fall back to a random vector so that
    # downstream distance computations never receive an all-zero input.
    if not np.any(vec_sum):
        vec_sum = np.random.rand(dim)

    return vec_sum / count

In [8]:
def cosine_distance(sentence1, sentence2, word_vectors):
    """Cosine distance between the embedded forms of two token lists.

    Each sentence is embedded with ``sentence_to_vec`` (``sentence1`` first,
    then ``sentence2``) and the two vectors are handed to
    ``scipy.spatial.distance.cosine``.
    """
    first, second = (sentence_to_vec(tokens, word_vectors)
                     for tokens in (sentence1, sentence2))
    return spatial.distance.cosine(first, second)

In [None]:
def euclidean_distance(sentence1, sentence2, word_vectors):
    """Euclidean (L2) distance between the embedded forms of two token lists.

    Each sentence is embedded with ``sentence_to_vec`` (``sentence1`` first,
    then ``sentence2``); the norm of their difference is returned.
    """
    embedded_a = sentence_to_vec(sentence1, word_vectors)
    embedded_b = sentence_to_vec(sentence2, word_vectors)
    delta = embedded_a - embedded_b
    return np.linalg.norm(delta)

In [None]:
def manhattan_distance(sentence1, sentence2, word_vectors):
    """Manhattan (L1) distance between the embedded forms of two token lists.

    Each sentence is embedded with ``sentence_to_vec`` (``sentence1`` first,
    then ``sentence2``); the absolute component-wise differences are summed.
    """
    embedded_a = sentence_to_vec(sentence1, word_vectors)
    embedded_b = sentence_to_vec(sentence2, word_vectors)
    delta = embedded_a - embedded_b
    return np.abs(delta).sum()

In [None]:
# One distance feature and one 0/1 label per question pair. The test arrays
# are allocated here as well and filled by the cell that reads test.tsv.
X_train, Y_train = np.zeros([train_size, 1]), np.zeros([train_size, 1])
X_test, Y_test = np.zeros([test_size, 1]), np.zeros([test_size, 1])

# train.tsv stores each pair as three consecutive rows:
# tokens of question 1, tokens of question 2, then the label.
index = 0
with open('../data/quora/train.tsv') as f:
    reader = csv.reader(f, delimiter='\t')
    sent1, sent2 = [], []
    for line_num, line in enumerate(reader):
        position = line_num % 3
        if position == 0:
            sent1 = list(line)
            continue
        if position == 1:
            sent2 = list(line)
            continue

        # Label row: the pair is complete, so compute its feature.
        # (manhattan_distance can be swapped in here to try another metric.)
        dist = cosine_distance(sent1, sent2, word_vectors)
        X_train[index] = dist
        Y_train[index] = int(line[0])
        index += 1

        # Progress report every 10000 pairs.
        if index % 10000 == 0:
            print(index)

In [None]:
# Fill X_test / Y_test from test.tsv, which stores each pair as three
# consecutive rows: tokens of question 1, tokens of question 2, then the label.
index = 0
with open('../data/quora/test.tsv') as f:
    reader = csv.reader(f, delimiter='\t')
    sent1, sent2 = [], []
    for line_num, line in enumerate(reader):
        position = line_num % 3
        if position == 0:
            sent1 = list(line)
            continue
        if position == 1:
            sent2 = list(line)
            continue

        # Label row: the pair is complete, so compute its feature.
        # (manhattan_distance can be swapped in here to try another metric.)
        dist = cosine_distance(sent1, sent2, word_vectors)
        X_test[index] = dist
        Y_test[index] = int(line[0])
        index += 1

        # Progress report every 10000 pairs.
        if index % 10000 == 0:
            print(index)

In [None]:
# Fit an ordinary least-squares model that maps the single distance feature to
# the 0/1 duplicate label. fit() returns the estimator itself, so the fitted
# model can be bound directly and then shown as the cell output.
regr = linear_model.LinearRegression().fit(X_train, Y_train)
regr

In [None]:
# Turn the regression outputs into hard labels: below 0.5 -> 0.0, otherwise 1.0.
# The predictions are flattened first so that preds is a 1-D array with one
# entry per test pair.
raw_scores = regr.predict(X_test).ravel()
preds = np.where(raw_scores < 0.5, 0.0, 1.0)

In [None]:
# Count how many thresholded predictions agree with the true labels.
# Y_test has shape (n, 1), so it is flattened to line up with the 1-D preds.
matches = preds == Y_test.ravel()
num_correct = int(np.count_nonzero(matches))
num_wrong = len(preds) - num_correct

In [None]:
# Accuracy of the thresholded regression on the test pairs: stored in acc as a
# fraction and displayed as a percentage.
total_scored = num_correct + num_wrong
acc = float(num_correct) / total_scored
acc * 100

In [None]:
# All-zeros baseline: predict label 0 for every test pair, then peek at the
# first ten entries.
zero_preds = np.zeros(Y_test.shape[0])
zero_preds[:10]

In [None]:
# Score the all-zeros baseline against the true labels.
# The counts are derived from zero_preds itself rather than from len(preds),
# so this cell no longer depends on the regression cells having been run.
# Y_test has shape (n, 1), so it is flattened to line up with the 1-D baseline.
baseline_matches = zero_preds == Y_test.ravel()
num_correct_z = int(np.count_nonzero(baseline_matches))
num_wrong_z = len(zero_preds) - num_correct_z

In [None]:
# Accuracy of the all-zeros baseline, as a fraction of the scored test pairs.
float(num_correct_z) / (num_correct_z + num_wrong_z)

In [None]:
# Script for creating train/dev/test set splits
# - test set: use same example indices as the IBM Paper
# - dev set: use DEV_SPLIT_FRACTION (15%) of the remaining indices
# - train set: everything else

from __future__ import print_function
import csv
import numpy as np

IBM_TEST_TSV_PATH = "../data/quora/test_ibm.tsv"
NUM_DATA = 404351
DEV_SPLIT_FRACTION = 0.15
# Fixed seed so that re-running this cell reproduces the same train/dev split.
SPLIT_SEED = 0

# Get test indices from the IBM paper: the example index is read from column 3
# of every row of the IBM test file.
with open(IBM_TEST_TSV_PATH, 'r') as input_file:
    reader = csv.reader(input_file, delimiter='\t')
    test_indices = np.array([int(line[3]) for line in reader])
print("Num test indices: %d" % len(test_indices))

# Now get the train/dev split. setdiff1d returns the remaining indices sorted
# and unique, so the starting order does not depend on set iteration order.
all_indices = range(NUM_DATA)
non_test_indices = np.setdiff1d(all_indices, test_indices)
print("Num non-test indices: %d" % len(non_test_indices))

# Shuffle with a private, seeded generator: the split is reproducible and the
# global numpy RNG state is left untouched.
np.random.RandomState(SPLIT_SEED).shuffle(non_test_indices)

# Use the named fraction instead of repeating the literal 0.15.
max_dev_index = int(DEV_SPLIT_FRACTION * len(non_test_indices))
dev_indices = non_test_indices[0:max_dev_index]
train_indices = non_test_indices[max_dev_index:]

np.savez_compressed("data_split_indices.npz", train=train_indices, dev=dev_indices, test=test_indices)

Num test indices: 10000
Num non-test indices: 394351


In [2]:
# Script for creating the same train/dev/test set splits as the IBM Paper
from __future__ import print_function
import csv
import numpy as np

IBM_TEST_TSV_PATH = "../data/quora/test_ibm.tsv"
IBM_DEV_TSV_PATH = "../data/quora/dev_ibm.tsv"
NUM_DATA = 404351
# Fixed seed so that re-running this cell reproduces the same training order.
SPLIT_SEED = 0


def _read_ibm_indices(tsv_path):
    """Return the example indices stored in column 3 of an IBM split TSV file."""
    with open(tsv_path, 'r') as input_file:
        reader = csv.reader(input_file, delimiter='\t')
        return np.array([int(line[3]) for line in reader])


# Get test indices from the IBM paper.
test_indices = _read_ibm_indices(IBM_TEST_TSV_PATH)
print("Num test indices: %d" % len(test_indices))

# Get dev indices from the IBM paper.
dev_indices = _read_ibm_indices(IBM_DEV_TSV_PATH)
print("Num dev indices: %d" % len(dev_indices))

# Now get the train set: every index that is in neither the test nor the dev
# split. setdiff1d returns sorted, unique values, so the starting order does
# not depend on set iteration order.
all_indices = range(NUM_DATA)
held_out_indices = np.union1d(test_indices, dev_indices)
train_indices = np.setdiff1d(all_indices, held_out_indices)
print("Num train indices: %d" % len(train_indices))

# Shuffle with a private, seeded generator: the order is reproducible and the
# global numpy RNG state is left untouched.
np.random.RandomState(SPLIT_SEED).shuffle(train_indices)

np.savez_compressed("data_split_indices_ibm.npz", train=train_indices, dev=dev_indices, test=test_indices)

Num test indices: 10000
Num dev indices: 10000
Num train indices: 384351
