In [5]:
import pandas as pd
import numpy as np
import gensim
from pyemd import emd
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from tqdm import tqdm_notebook
from nltk import word_tokenize
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from nltk import word_tokenize
stop_words = stopwords.words('english')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score



In [4]:
pip install pyemd

Processing /home/jovyan/.cache/pip/wheels/f9/f0/23/aefbdde40e915c67830ebecb55be2344a8b6e95fe3ce3ccf96/pyemd-0.5.1-cp36-cp36m-linux_x86_64.whl
Installing collected packages: pyemd
Successfully installed pyemd-0.5.1
Note: you may need to restart the kernel to use updated packages.


In [6]:
df = pd.read_csv('train.csv')

In [7]:
def wmd(q1, q2):
    q1 = str(q1).lower().split()
    q2 = str(q2).lower().split()
    stop_words = stopwords.words('english')
    q1 = [w for w in q1 if w not in stop_words]
    q2 = [w for w in q2 if w not in stop_words]
    return model.wmdistance(q1, q2)

In [8]:
def norm_wmd(q1, q2):
    q1 = str(q1).lower().split()
    q2 = str(q2).lower().split()
    stop_words = stopwords.words('english')
    q1 = [w for w in q1 if w not in stop_words]
    q2 = [w for w in q2 if w not in stop_words]
    return norm_model.wmdistance(q1, q2)

In [9]:
def sent2vec(s):
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())

In [10]:
# BASIC FEATURES
df['len_q1'] = df.question1.apply(lambda x: len(str(x)))
df['len_q2'] = df.question2.apply(lambda x: len(str(x)))
df['diff_len'] = df.len_q1 - df.len_q2
df['len_char_q1'] = df.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
df['len_char_q2'] = df.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
df['len_word_q1'] = df.question1.apply(lambda x: len(str(x).split()))
df['len_word_q2'] = df.question2.apply(lambda x: len(str(x).split()))
df['common_words'] = df.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
# FUZZY FEATURES
df['fuzz_ratio'] = df.apply(lambda x: fuzz.ratio(str(x['question1']), str(x['question2'])), axis=1)
df['fuzz_partial_ratio'] = df.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
df['fuzz_partial_token_set_ratio'] = df.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
df['fuzz_partial_token_sort_ratio'] = df.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
df['fuzz_token_set_ratio'] = df.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
df['fuzz_token_sort_ratio'] = df.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

In [11]:
df.head(2)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,len_q1,len_q2,diff_len,len_char_q1,len_char_q2,len_word_q1,len_word_q2,common_words,fuzz_ratio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,66,57,9,20,20,14,12,10,93,98,100,88,100,93
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,51,88,-37,21,29,8,13,4,65,73,100,73,86,63


In [12]:
# word2vec Modeling (WORD2VEC FEATURES)
model = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
df['wmd'] = df.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)

In [13]:
# Normalized Word2vec Modeling
norm_model = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
norm_model.init_sims(replace=True)
df['norm_wmd'] = df.apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)

In [14]:
question1_vectors = np.zeros((df.shape[0], 300))

for i, q in enumerate(tqdm_notebook(df.question1.values)):
    question1_vectors[i, :] = sent2vec(q)
    
question2_vectors  = np.zeros((df.shape[0], 300))
for i, q in enumerate(tqdm_notebook(df.question2.values)):
    question2_vectors[i, :] = sent2vec(q)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=404290.0), HTML(value='')))

  





Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=404290.0), HTML(value='')))




In [15]:
# DISTANCE MEASURES (WORD2VEC FEATURES)
df['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]

  dist = 1.0 - uv / np.sqrt(uu * vv)
  return l1_diff.sum() / l1_sum.sum()


In [17]:
df['is_duplicate'].value_counts()

0    255027
1    149263
Name: is_duplicate, dtype: int64

In [18]:
df.isnull().sum()

id                                  0
qid1                                0
qid2                                0
question1                           1
question2                           2
is_duplicate                        0
len_q1                              0
len_q2                              0
diff_len                            0
len_char_q1                         0
len_char_q2                         0
len_word_q1                         0
len_word_q2                         0
common_words                        0
fuzz_ratio                          0
fuzz_partial_ratio                  0
fuzz_partial_token_set_ratio        0
fuzz_partial_token_sort_ratio       0
fuzz_token_set_ratio                0
fuzz_token_sort_ratio               0
wmd                                 0
norm_wmd                            0
cosine_distance                  1775
cityblock_distance                  0
jaccard_distance                    0
canberra_distance                   0
euclidean_di

In [19]:
df.drop(['question1', 'question2'], axis=1, inplace=True)
df = df[pd.notnull(df['cosine_distance'])]
df = df[pd.notnull(df['jaccard_distance'])]

In [20]:
print(df)

            id    qid1    qid2  is_duplicate  len_q1  len_q2  diff_len  \
0            0       1       2             0      66      57         9   
1            1       3       4             0      51      88       -37   
2            2       5       6             0      73      59        14   
3            3       7       8             0      50      65       -15   
4            4       9      10             0      76      39        37   
...        ...     ...     ...           ...     ...     ...       ...   
404285  404285  433578  379845             0      85      79         6   
404286  404286   18840  155606             1      41      42        -1   
404287  404287  537928  537929             0      17      17         0   
404288  404288  537930  537931             0      94     127       -33   
404289  404289  537932  537933             0      37      45        -8   

        len_char_q1  len_char_q2  len_word_q1  ...  fuzz_token_sort_ratio  \
0                20           20  

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score

In [22]:
X = df.loc[:, df.columns != 'is_duplicate']
y = df.loc[:, df.columns == 'is_duplicate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0)

In [23]:
# TRAIN XGBOOST MODEL & PREDICT
import xgboost as xgb

model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train.values.ravel()) 
prediction = model.predict(X_test)
cm = confusion_matrix(y_test, prediction)  
print(cm)  
print('Accuracy', accuracy_score(y_test, prediction))
print(classification_report(y_test, prediction))

Import requested from: 'numba.numpy_support', please update to use 'numba.np.numpy_support' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba import cuda, numpy_support
Import requested from: 'numba.utils', please update to use 'numba.core.utils' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.[0m
  from numba.utils import exec_, pysignature


[[10972  1734]
 [ 1984  5525]]
Accuracy 0.8160771704180064
              precision    recall  f1-score   support

           0       0.85      0.86      0.86     12706
           1       0.76      0.74      0.75      7509

    accuracy                           0.82     20215
   macro avg       0.80      0.80      0.80     20215
weighted avg       0.82      0.82      0.82     20215



In [29]:
print(y_train)

        is_duplicate
284713             0
66021              0
192943             0
387513             0
119525             1
...              ...
359783             0
358083             0
152315             1
117952             1
305711             0

[384075 rows x 1 columns]
