In [1]:
import pandas as pd
import numpy as np

# Column dtypes handed to pd.read_csv for both CSV files.
# The question columns use the builtin `str`: `np.str` was only ever an alias
# for it and no longer exists in recent NumPy releases.
dtypes = {
    'id': np.uint32,
    'qid1': np.uint32,
    'qid2': np.uint32,
    'question1': str,
    'question2': str,
    'is_duplicate': np.uint8,
}

# The training set is read into memory in one go.
df = pd.read_csv('../dataset/quora_train.csv', dtype=dtypes)

# The test set is exposed as a reader that yields 10000-row DataFrames.
# It is a one-shot iterator: once a loop has consumed it, this cell has to be
# re-run before the test set can be iterated again.
df_test = pd.read_csv(
    '../dataset/quora_test.csv',
    dtype=dtypes,
    iterator=True,
    chunksize=10000,
)

In [2]:
# Summary statistics (count, mean, std, quartiles) of the numeric training columns.
df.describe()

Unnamed: 0,id,qid1,qid2,is_duplicate
count,404290.0,404290.0,404290.0,404290.0
mean,202144.5,217243.942418,220955.655337,0.369198
std,116708.614503,157751.700002,159903.182629,0.482588
min,0.0,1.0,2.0,0.0
25%,101072.25,74437.5,74727.0,0.0
50%,202144.5,192182.0,197052.0,0.0
75%,303216.75,346573.5,354692.5,1.0
max,404289.0,537932.0,537933.0,1.0


In [3]:
import spacy

# English spaCy pipeline; compare() relies on the `tag_` it assigns to each token.
# NOTE(review): the 'en' shortcut name looks specific to spaCy 2.x and earlier —
# newer releases expect a full package name such as 'en_core_web_sm'; confirm
# against the installed version before changing it.
nlp = spacy.load('en')

In [7]:
from gensim.models import KeyedVectors

# Pre-trained word vectors in word2vec format; compare() queries them via model.similarity.
# presumably the fastText English Wikipedia vectors, judging by the file name — verify
model = KeyedVectors.load_word2vec_format('../dataset/wiki.en.vec')

In [23]:
def compare(q1, q2, pos_prefixes=('N',)):
    """Score two questions by the mean word-vector similarity of their tagged words.

    Each question is run through the module-level spaCy pipeline ``nlp``.
    Tokens whose fine-grained tag starts with one of ``pos_prefixes`` are kept,
    and every cross-question pair of kept tokens that shares the same first tag
    letter is scored with ``model.similarity`` on the lower-cased token text.

    Parameters
    ----------
    q1, q2 : object
        Question texts. Any value that is not a string (for example a NaN
        standing in for a missing question) is treated as an empty question.
    pos_prefixes : tuple of str, optional
        Tag prefixes selecting which tokens take part. The default ``('N',)``
        keeps only tags starting with "N"; ``('N', 'V', 'J')`` would also let
        tags starting with "V" and "J" through.

    Returns
    -------
    float
        Mean similarity over all pairs that could be scored, or ``0.0`` when
        there is no such pair (empty question, no matching tags, or every pair
        contains a word the vector model does not know).
    """
    # isinstance (rather than an exact type check) also accepts str subclasses;
    # str() then hands a plain string to the pipeline.
    q1 = str(q1) if isinstance(q1, str) else ''
    q2 = str(q2) if isinstance(q2, str) else ''
    if not q1 or not q2:
        # Nothing to pair up, so skip the tagging work entirely.
        return 0.0

    prefixes = tuple(pos_prefixes)

    def tagged_words(text):
        # (first tag letter, lower-cased text) for every token that passes the
        # filter; lower-casing happens once per token, not once per pair.
        return [(t.tag_[:1], t.text.lower())
                for t in nlp(text) if t.tag_.startswith(prefixes)]

    words1 = tagged_words(q1)
    words2 = tagged_words(q2)

    total = 0
    scored = 0
    for tag1, word1 in words1:
        for tag2, word2 in words2:
            # Only compare like with like; always true with the default
            # single-prefix filter, meaningful once several prefixes are used.
            if tag1 != tag2:
                continue
            try:
                total += model.similarity(word1, word2)
            except KeyError:
                # At least one word is missing from the vector model.
                continue
            scored += 1

    return total / scored if scored else 0.0

In [24]:
from multiprocessing import Pool

# Four worker processes; the same pool is used again later when scoring the test set.
# NOTE(review): the workers appear to rely on inheriting `nlp`, `model` and
# `compare` from this kernel — confirm the start method in use supports that.
pool = Pool(processes=4)

# One (question1, question2) pair per training row, scored in parallel.
question_pairs = zip(df.question1, df.question2)
df['compare'] = pool.starmap(compare, question_pairs)

In [25]:
from sklearn.linear_model import LogisticRegression

# Single-feature baseline: predict is_duplicate from the similarity score alone.
# The column is read with bracket indexing because attribute access
# (`df.compare`) resolves to the DataFrame.compare method on pandas >= 1.1
# instead of the 'compare' column.
# NOTE(review): the fitted repr reports solver='liblinear', for which
# scikit-learn documents n_jobs as ignored — confirm whether n_jobs=4 matters.
train_features = df['compare'].values.reshape(-1, 1)
train_labels = df['is_duplicate'].values

clf = LogisticRegression(n_jobs=4)
clf.fit(train_features, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=4,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [None]:
from tqdm import tqdm

# Score the test set chunk by chunk and write one "test_id,probability" row per
# question pair. The `with` block closes the file even if a chunk fails midway;
# `output` stays bound afterwards, so a later `output.close()` is a harmless no-op.
with open('result.csv', 'w') as output:
    output.write('test_id,is_duplicate\n')

    for chunk in tqdm(df_test):
        chunk['compare'] = pool.starmap(compare, tuple(zip(chunk.question1, chunk.question2)))
        # Reshape the underlying array, exactly as done when fitting the model:
        # Series.reshape is deprecated and absent from later pandas releases.
        prediction = clf.predict_proba(chunk['compare'].values.reshape(-1, 1))

        # Column 1 of predict_proba is the probability of the second class
        # (is_duplicate == 1 for the 0/1 labels used in training).
        for i, p in zip(chunk['test_id'], prediction):
            output.write('{},{}\n'.format(i, p[1]))

228it [14:42,  3.75s/it]

In [None]:
# Close the results file so any buffered rows are flushed to disk.
output.close()

### Result: 0.53113