In [1]:
%matplotlib inline

import sys, os
sys.path.append("../data/glove/")

import numpy as np
import matplotlib.pyplot as plt
import csv
import string
from scipy import spatial
import sklearn as sk
from sklearn import linear_model

from glove import loadWordVectors

In [2]:
# Dimensionality of a single word vector.
# NOTE(review): presumably this must equal the number of columns of the matrix
# returned by loadWordVectors -- confirm against the GloVe file in use.
num_dim = 200
# Number of data rows (header excluded) that the split cell routes to train.tsv;
# every later row goes to test.tsv.
train_size = 323482
# Expected number of held-out rows.
# NOTE(review): presumably the row count of test.tsv -- confirm against the data.
test_size = 80870

In [3]:
# Split the raw Quora dump into train/test TSV files and, in the same pass,
# build the vocabulary: `tokens` maps every distinct lowercased,
# punctuation-stripped word to a consecutive integer id.
tokens = {}
header = []
count = 0   # rows consumed so far, header row included
index = 0   # next unused vocabulary id

# Filtering characters against a set behaves identically on Python 2 and 3;
# the two-argument str.translate(None, chars) form raises TypeError on Python 3.
punctuation = set(string.punctuation)

with open('../data/quora_data/quora_duplicate_questions.tsv') as f, \
        open('../data/quora_data/train.tsv', 'w') as g, \
        open('../data/quora_data/test.tsv', 'w') as h:
    reader = csv.reader(f, delimiter='\t')
    train_writer = csv.writer(g, delimiter='\t')
    test_writer = csv.writer(h, delimiter='\t')

    # The first row is the column header: remember it, count it, and do not
    # copy it into either split.
    first_row = next(reader, None)
    if first_row is not None:
        header = first_row
        count += 1

    for line in reader:
        # `count` is 1 for the first data row, so exactly `train_size` rows
        # land in train.tsv and everything after that goes to test.tsv.
        if count <= train_size:
            train_writer.writerow(line)
        else:
            test_writer.writerow(line)

        # Columns 3 and 4 hold the two question texts; words are registered in
        # order of first appearance (question 1 before question 2).
        for question in (line[3], line[4]):
            cleaned = ''.join(ch for ch in question if ch not in punctuation)
            for word in cleaned.lower().split():
                if word not in tokens:
                    tokens[word] = index
                    index += 1

        count += 1

    # Reserve the last id for the unknown-word placeholder. Real words are
    # lowercased, so the upper-case key cannot collide with one of them.
    tokens["UNK"] = index

In [4]:
# Build the embedding matrix for the vocabulary with the project's glove helper.
# NOTE(review): presumably row i holds the vector of the word whose id in
# `tokens` is i -- confirm against glove.loadWordVectors.
word_vectors = loadWordVectors(tokens)
# Print both sizes so the vocabulary size and the number of embedding rows can
# be compared by eye.
print(len(tokens))
print(len(word_vectors))

111682
111682


In [5]:
# Show the embedding matrix. In the recorded output the final row is all zeros.
# NOTE(review): presumably that is the row of "UNK", which receives the last
# vocabulary id -- confirm against glove.loadWordVectors.
word_vectors

array([[ 0.39396  ,  0.44185  , -0.0042279, ...,  0.47576  ,  0.20978  ,
        -0.11687  ],
       [ 0.32928  ,  0.25526  ,  0.26753  , ...,  0.074621 ,  0.012001 ,
        -0.21952  ],
       [-0.071549 ,  0.093459 ,  0.023738 , ...,  0.33617  ,  0.030591 ,
         0.25577  ],
       ..., 
       [ 0.91682  , -0.36737  , -0.32286  , ..., -0.3297   , -0.66926  ,
        -0.75765  ],
       [ 0.39356  ,  0.18569  ,  0.011526 , ...,  0.4215   ,  0.087896 ,
         1.094    ],
       [ 0.       ,  0.       ,  0.       , ...,  0.       ,  0.       ,
         0.       ]])

In [6]:
# zero_words = np.where(np.sum(word_vectors, axis=1) == 0)[0]
# for i in range(len(zero_words)):
#     print(word_vectors[i, :])
# word_vectors

In [7]:
def sentence_to_vec(sentence, word_vectors, token_index=None):
    """Average the word vectors of the known words in a sentence.

    Parameters
    ----------
    sentence : iterable of str
        The words of one sentence.
    word_vectors : 2-D numpy array
        Embedding matrix; row ``i`` is looked up for the word whose id is ``i``.
    token_index : dict, optional
        Mapping from word to row id. Defaults to the notebook-level ``tokens``
        dictionary when omitted.

    Returns
    -------
    numpy.ndarray
        1-D array with one entry per column of ``word_vectors``: the mean of
        the vectors of all words that are in the vocabulary and whose vector
        is not all zeros. If no word contributes (or the contributions cancel
        out exactly), a vector drawn from ``np.random.rand`` is returned
        instead, to avoid handing an all-zero vector to the distance
        functions. That fallback is not seeded here, so it is not reproducible
        unless the caller seeds numpy's global generator.
    """
    if token_index is None:
        token_index = tokens

    # Take the dimensionality from the matrix itself so it cannot drift out of
    # sync with a separately maintained constant.
    dim = word_vectors.shape[1]
    vec_sum = np.zeros(dim)
    used = 0

    for word in sentence:
        row = token_index.get(word)
        if row is None:
            continue
        vec = word_vectors[row, :]
        # np.any tests for "has a non-zero component"; comparing the sum of
        # the components with 0 would also discard vectors such as [1, -1].
        if np.any(vec):
            vec_sum += vec
            used += 1

    if used == 0 or not np.any(vec_sum):
        return np.random.rand(dim)

    return vec_sum / used

In [8]:
def cosine_distance(sentence1, sentence2, word_vectors):
    """Cosine distance between the sentence vectors of two tokenised sentences.

    Both sentences are converted with ``sentence_to_vec`` (first sentence
    first) and the resulting pair is passed to ``spatial.distance.cosine``.
    """
    embedded = [sentence_to_vec(words, word_vectors)
                for words in (sentence1, sentence2)]
    return spatial.distance.cosine(*embedded)

In [9]:
def euclidean_distance(sentence1, sentence2, word_vectors):
    """Euclidean (L2) distance between the sentence vectors of two sentences.

    Both sentences are converted with ``sentence_to_vec``; the norm of their
    difference is returned.
    """
    first = sentence_to_vec(sentence1, word_vectors)
    second = sentence_to_vec(sentence2, word_vectors)
    offset = first - second
    return np.linalg.norm(offset)

In [10]:
def manhattan_distance(sentence1, sentence2, word_vectors):
    """Manhattan (L1) distance between the sentence vectors of two sentences.

    Both sentences are converted with ``sentence_to_vec``; the absolute
    component-wise differences are summed.
    """
    gap = np.subtract(
        sentence_to_vec(sentence1, word_vectors),
        sentence_to_vec(sentence2, word_vectors),
    )
    return np.abs(gap).sum()

In [None]:
def load_distance_features(path, word_vectors, distance=None, report_every=10000):
    """Turn one split file into a (feature, label) pair of column vectors.

    Parameters
    ----------
    path : str
        Tab-separated file whose columns 3 and 4 hold the two question texts
        and whose column 5 holds the integer label.
    word_vectors : 2-D numpy array
        Embedding matrix forwarded to the distance function.
    distance : callable, optional
        ``distance(words1, words2, word_vectors) -> float``. Defaults to
        ``cosine_distance``; ``euclidean_distance`` and ``manhattan_distance``
        have the same signature.
    report_every : int
        A progress line (the row number) is printed every this many rows.

    Returns
    -------
    (X, Y) : tuple of numpy.ndarray
        Two float arrays of shape ``(n_rows, 1)``. They are sized from the
        rows actually read, so a file that is shorter or longer than expected
        neither leaves zero-filled phantom samples nor overruns a buffer.
    """
    if distance is None:
        distance = cosine_distance

    punctuation = set(string.punctuation)

    def tokenize(text):
        # Works on Python 2 and 3 alike; text.translate(None, chars) is
        # Python 2 only and raises TypeError on Python 3.
        return ''.join(ch for ch in text if ch not in punctuation).lower().split()

    distances = []
    labels = []
    with open(path) as f:
        for row_number, line in enumerate(csv.reader(f, delimiter='\t')):
            distances.append(distance(tokenize(line[3]), tokenize(line[4]), word_vectors))
            labels.append(int(line[5]))

            if row_number % report_every == 0:
                print(row_number)

    X = np.array(distances, dtype=float).reshape(-1, 1)
    Y = np.array(labels, dtype=float).reshape(-1, 1)
    return X, Y


# Pass distance=manhattan_distance (or euclidean_distance) to try another feature.
X_train, Y_train = load_distance_features('../data/quora_data/train.tsv', word_vectors)
X_test, Y_test = load_distance_features('../data/quora_data/test.tsv', word_vectors)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000


In [None]:
# Ordinary least-squares fit of the label on the single distance feature.
# The model's output is continuous; it is turned into a 0/1 class by the
# 0.5 threshold applied in the following cell.
regr = linear_model.LinearRegression()
regr.fit(X_train, Y_train)

In [None]:
# Threshold the regression output at 0.5: scores below it become 0.0,
# everything else becomes 1.0.
raw_scores = np.asarray(regr.predict(X_test)).ravel()
preds = np.where(raw_scores < 0.5, 0.0, 1.0)

In [None]:
# Tally how many thresholded predictions agree with the test labels.
matches = preds == Y_test.ravel()
num_correct = int(np.count_nonzero(matches))
num_wrong = int(matches.size - num_correct)

In [None]:
# Fraction of correct predictions, displayed as a percentage.
total_scored = num_correct + num_wrong
acc = num_correct / float(total_scored)
100 * acc

In [None]:
# Baseline that predicts 0 for every test row; show the first ten entries.
zero_preds = np.zeros(Y_test.shape[0])
zero_preds[:10]

In [None]:
# Count how often the all-zero baseline matches the test labels.
# The comparison spans zero_preds itself, so the baseline can be scored
# without the model's `preds` having been computed.
baseline_matches = zero_preds == Y_test.ravel()
num_correct_z = int(np.count_nonzero(baseline_matches))
num_wrong_z = int(baseline_matches.size - num_correct_z)

In [None]:
# Accuracy of the all-zero baseline, as a fraction of the scored rows.
num_correct_z / float(num_correct_z + num_wrong_z)