In [1]:
%matplotlib inline

import sys, os
sys.path.append("../data/glove/")

import numpy as np
import matplotlib.pyplot as plt
import csv
import string
from scipy import spatial
import sklearn as sk
from sklearn import linear_model
import re
import time
import pickle

from glove import loadWordVectors

In [2]:
num_dim = 200
train_size = 323482
test_size = 80870

In [3]:
tok2id = {}
header = []
count = 0
max_length = 0
with open('../data/quora/quora_duplicate_questions.tsv') as f, open('../data/quora/train.tsv', 'w') as g, open('../data/quora/test.tsv', 'w') as h:
    reader = csv.reader(f, delimiter='\t')
    train_writer = csv.writer(g, delimiter='\t')
    test_writer = csv.writer(h, delimiter='\t')
    
    index = 0
    isHeader = True
    for line in reader:
        if index <= 1:
            print(line)
        
        if isHeader:
            header = line
            isHeader = False
            count += 1
            continue
        
        sent1 = re.findall(r"\w+(?=n't)|n't|\w+(?=')|'\w+|\w+|[.,!?;]", line[3].lower())
        sent2 = re.findall(r"\w+(?=n't)|n't|\w+(?=')|'\w+|\w+|[.,!?;]", line[4].lower())
        max_length = max([len(sent1), len(sent2), max_length])
                        
        for word in sent1 + sent2:
            if word not in tok2id:
                tok2id[word] = index
                index += 1
        
        
        if count <= train_size:
            train_writer.writerow(sent1)
            train_writer.writerow(sent2)
            train_writer.writerow(line[5])
        else:
            test_writer.writerow(sent1)
            test_writer.writerow(sent2)
            test_writer.writerow(line[5])
        
        count += 1
            
    tok2id['UNK'] = index

['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']
['0', '1', '2', 'What is the step by step guide to invest in share market in india?', 'What is the step by step guide to invest in share market?', '0']


In [4]:
path = 'dependencies'
with open(os.path.join(path, "features.pkl"), "wb") as f:
    pickle.dump([tok2id, max_length], f)

In [5]:
word_vectors = loadWordVectors(tok2id)
print(len(tok2id))
print(len(word_vectors))

88015
88015


In [6]:
def sentence_to_vec(sentence, word_vectors):
    vec_sum = np.zeros(num_dim)
    count = .1
    for word in sentence:
        if word in tok2id:
            vec = word_vectors[tok2id[word], :]
            if np.sum(vec) != 0:
                vec_sum += vec
                count += 1
                
    if np.sum(vec_sum) == 0:
        vec_sum = np.random.rand(num_dim)
                
    return vec_sum / count

In [7]:
def cosine_distance(sentence1, sentence2, word_vectors):
    vec1 = sentence_to_vec(sentence1, word_vectors)
    vec2 = sentence_to_vec(sentence2, word_vectors)
    
    return spatial.distance.cosine(vec1, vec2)

In [8]:
def euclidean_distance(sentence1, sentence2, word_vectors):
    vec1 = sentence_to_vec(sentence1, word_vectors)
    vec2 = sentence_to_vec(sentence2, word_vectors)
    
    return np.linalg.norm(vec1 - vec2)

In [9]:
def manhattan_distance(sentence1, sentence2, word_vectors):
    vec1 = sentence_to_vec(sentence1, word_vectors)
    vec2 = sentence_to_vec(sentence2, word_vectors)
    
    return np.sum(np.abs(vec1 - vec2))

In [12]:
X_train = np.zeros([train_size, 1])
Y_train = np.zeros([train_size, 1])

X_test = np.zeros([test_size, 1])
Y_test = np.zeros([test_size, 1])

index = 0
with open('../data/quora/train.tsv') as f:
    reader = csv.reader(f, delimiter='\t')
    sent1, sent2 = [], []
    
    for line in reader:
        if index % 3 == 0:
            sent1 = line
        elif index % 3 == 1:
            sent2 = line
        else:
            dist = cosine_distance(sent1, sent2, word_vectors)
#         dist = manhattan_distance(sent1, sent2, word_vectors)
        
            X_train[index] = dist
            Y_train[index] = int(line[0])
        
        if index % 10000 == 0:
            print(index)
            
        index += 1

index = 0
with open('../data/quora/test.tsv') as f:
    reader = csv.reader(f, delimiter='\t')
    
    sent1, sent2 = [], []
    
    for line in reader:
        if index % 3 == 0:
            sent1 = line
        elif index % 3 == 1:
            sent2 = line
        else:
            dist = cosine_distance(sent1, sent2, word_vectors)
#         dist = manhattan_distance(sent1, sent2, word_vectors)
        
            X_test[index] = dist
            Y_test[index] = int(line)
        
        if index % 10000 == 0:
            print(index)
            
        index += 1

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000


IndexError: index 323483 is out of bounds for axis 0 with size 323482

In [12]:
regr = linear_model.LinearRegression()
regr.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [13]:
preds = [0.0 if x < 0.5 else 1.0 for x in regr.predict(X_test)]
preds = np.array(preds)

In [14]:
num_correct = 0
num_wrong = 0
for i in range(len(preds)):
    if preds[i] == Y_test[i]:
        num_correct += 1
    else:
        num_wrong += 1

In [15]:
acc = float(num_correct) / (num_correct + num_wrong)
acc * 100

63.89637690119946

In [16]:
zero_preds = np.zeros(len(Y_test))
zero_preds[:10]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [17]:
num_correct_z = 0
num_wrong_z = 0
for i in range(len(preds)):
    if zero_preds[i] == Y_test[i]:
        num_correct_z += 1
    else:
        num_wrong_z += 1

In [18]:
float(num_correct_z) / (num_correct_z + num_wrong_z)

0.6402126870285644