In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import gensim
from tqdm import tqdm
import time
import re
import distance
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
stop_words = stopwords.words('english')

In [2]:
train_data = pd.read_csv("data/preprocessed_train.csv")
#test_data = pd.read_csv("data/preprocessed_test.csv")

In [3]:
print(train_data.head())

                                     question1_final  \
0  what is the step by step guide to invest in sh...   
1   what is the story of kohinoor koh i noor diamond   
2  how can i increase the speed of my internet co...   
3   why am i mentally very lonely how can i solve it   
4  which one dissolve in water quikly sugar salt ...   

                                     question2_final  
0  what is the step by step guide to invest in sh...  
1  what would happen if the indian government ste...  
2  how can internet speed be increase by hack thr...  
3  find the remainder when math 23 24 math is div...  
4             which fish would survive in salt water  


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404287 entries, 0 to 404286
Data columns (total 2 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   question1_final  404274 non-null  object
 1   question2_final  404280 non-null  object
dtypes: object(2)
memory usage: 6.2+ MB


## Word Mover's Distance

In [5]:
def wmd(s1, s2):
    s1 = str(s1).lower().split()
    s2 = str(s2).lower().split()
    stop_words = stopwords.words('english')
    s1 = [w for w in s1 if w not in stop_words]
    s2 = [w for w in s2 if w not in stop_words]
    return model.wmdistance(s1, s2)

## Normalized Word Mover's Distance

In [6]:
def norm_wmd(s1, s2):
    s1 = str(s1).lower().split()
    s2 = str(s2).lower().split()
    stop_words = stopwords.words('english')
    s1 = [w for w in s1 if w not in stop_words]
    s2 = [w for w in s2 if w not in stop_words]
    return norm_model.wmdistance(s1, s2)

## Generates vectors for each sentence

In [7]:
def sent2vec(s):
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())

## Feature Extraction

In [8]:
model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)

In [9]:
train_data['wmd'] = train_data.apply(lambda x: wmd(x['question1_final'], x['question2_final']), axis=1)
train_data

Unnamed: 0,question1_final,question2_final,wmd
0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0.198042
1,what is the story of kohinoor koh i noor diamond,what would happen if the indian government ste...,0.879870
2,how can i increase the speed of my internet co...,how can internet speed be increase by hack thr...,0.587479
3,why am i mentally very lonely how can i solve it,find the remainder when math 23 24 math is div...,1.274354
4,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,0.972994
...,...,...,...
404282,how many keywords are there in the racket prog...,how many keywords are there in perl program la...,0.194770
404283,do you believe there is life after death,is it true that there is life after death,0.393355
404284,what is one coin,what is this coin,0.627237
404285,what is the approx annual cost of live while s...,i am have little hairfall problem but i want t...,1.304485


In [8]:
norm_model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)
norm_model.init_sims(replace=True)
train_data['norm_wmd'] = train_data.apply(lambda x: norm_wmd(x['question1_final'], x['question2_final']), axis=1)
train_data

  norm_model.init_sims(replace=True)


Unnamed: 0,question1_final,question2_final,norm_wmd
0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0.198042
1,what is the story of kohinoor koh i noor diamond,what would happen if the indian government ste...,0.879870
2,how can i increase the speed of my internet co...,how can internet speed be increase by hack thr...,0.587479
3,why am i mentally very lonely how can i solve it,find the remainder when math 23 24 math is div...,1.274354
4,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,0.972994
...,...,...,...
404282,how many keywords are there in the racket prog...,how many keywords are there in perl program la...,0.194770
404283,do you believe there is life after death,is it true that there is life after death,0.393355
404284,what is one coin,what is this coin,0.627237
404285,what is the approx annual cost of live while s...,i am have little hairfall problem but i want t...,1.304485


In [11]:
question1_train_vectors = np.zeros((train_data.shape[0], 300))
error_count = 0

for i, q in tqdm(enumerate(train_data.question1_final.values)):
    question1_train_vectors[i, :] = sent2vec(q)
    
question2_train_vectors  = np.zeros((train_data.shape[0], 300))
for i, q in tqdm(enumerate(train_data.question2_final.values)):
    question2_train_vectors[i, :] = sent2vec(q)

  return v / np.sqrt((v ** 2).sum())
404287it [01:06, 6050.36it/s]
404287it [01:08, 5878.38it/s]


In [12]:
def feature_set3(data, question1_vectors, question2_vectors):
    data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                              np.nan_to_num(question2_vectors))]

    data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                              np.nan_to_num(question2_vectors))]

    data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                              np.nan_to_num(question2_vectors))]

    data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                              np.nan_to_num(question2_vectors))]

    data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                              np.nan_to_num(question2_vectors))]

    data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                              np.nan_to_num(question2_vectors))]

    data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                              np.nan_to_num(question2_vectors))]

    data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
    data['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
    data['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
    data['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]

    return data

In [13]:
train_data = feature_set3(train_data, question1_train_vectors, question2_train_vectors)
train_data

  dist = 1.0 - uv / np.sqrt(uu * vv)
  return l1_diff.sum() / l1_sum.sum()


Unnamed: 0,question1_final,question2_final,wmd,norm_wmd,cosine_distance,cityblock_distance,jaccard_distance,canberra_distance,euclidean_distance,minkowski_distance,braycurtis_distance,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec
0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0.198042,0.198042,0.068972,5.081614,1.0,94.023324,0.371408,0.168999,0.186557,0.031817,-0.091902,0.050416,0.337301
1,what is the story of kohinoor koh i noor diamond,what would happen if the indian government ste...,0.879870,0.879870,0.308687,11.028160,1.0,158.811137,0.785732,0.353440,0.437108,0.004801,0.163020,0.185670,0.112646
2,how can i increase the speed of my internet co...,how can internet speed be increase by hack thr...,0.587479,0.587479,0.202094,8.829532,1.0,131.681688,0.635757,0.286115,0.329345,0.226704,0.060504,0.089026,-0.465365
3,why am i mentally very lonely how can i solve it,find the remainder when math 23 24 math is div...,1.274354,1.274354,0.662905,16.075622,1.0,195.966982,1.151438,0.513992,0.708427,-0.002527,0.009567,-0.244560,0.074111
4,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,0.972994,0.972994,0.369993,12.103178,1.0,161.408435,0.860225,0.382770,0.480633,-0.133849,0.114777,0.217900,-0.338876
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404282,how many keywords are there in the racket prog...,how many keywords are there in perl program la...,0.194770,0.194770,0.151803,7.691880,1.0,124.332006,0.551004,0.248370,0.286366,0.061066,0.168470,-0.065888,-0.432547
404283,do you believe there is life after death,is it true that there is life after death,0.393355,0.393355,0.129039,7.146889,1.0,119.485637,0.508014,0.226945,0.263738,-0.002918,0.038642,-0.147208,-0.260159
404284,what is one coin,what is this coin,0.627237,0.627237,0.069016,5.065351,1.0,91.936365,0.371527,0.170819,0.182876,-0.193922,-0.147340,-0.279527,-0.397618
404285,what is the approx annual cost of live while s...,i am have little hairfall problem but i want t...,1.304485,1.304485,0.707867,16.378793,1.0,190.421741,1.189846,0.538765,0.730613,-0.012639,-0.068610,-0.319382,-0.247332


In [14]:
fs2_train = pd.read_csv("data/train_set2_features.csv")

In [15]:
fs2_train

Unnamed: 0,q1_char_num,q2_char_num,q1_word_num,q2_word_num,total_word_num,differ_word_num,same_first_word,same_last_word,total_unique_word_num,total_unique_word_withoutstopword_num,...,common_word_withoutstopword_ratio_max,fuzz_ratio,fuzz_QRatio,fuzz_WRatio,fuzz_partial_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,longest_substr_ratio
0,65,56,14,12,26,2,1,0,12,6,...,0.833333,93,93,95,100,100,93,100,89,0.982456
1,48,85,10,15,25,5,1,0,17,11,...,0.400000,63,63,86,73,86,62,100,73,0.571429
2,70,54,14,10,24,4,1,0,19,8,...,0.500000,52,52,69,56,73,65,100,69,0.181818
3,48,58,11,13,24,2,0,0,20,9,...,0.000000,36,36,36,40,37,36,37,38,0.040816
4,73,38,13,7,20,6,1,0,16,13,...,0.200000,45,45,86,55,67,47,100,63,0.153846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404282,80,74,14,13,27,1,1,1,14,8,...,0.857143,91,91,91,86,96,87,100,82,0.413333
404283,40,41,8,9,17,1,0,1,11,4,...,0.666667,72,72,75,72,79,69,100,70,0.634146
404284,16,17,4,4,8,0,1,1,5,2,...,0.500000,79,79,82,75,86,79,100,75,0.470588
404285,87,120,17,25,42,8,0,0,39,22,...,0.000000,42,42,45,46,46,47,100,45,0.056818


In [16]:
fs2_train.shape

(404287, 28)

In [17]:
train_data.drop(["question1_final", "question2_final"], axis=1, inplace=True)

In [18]:
train_data.shape

(404287, 13)

In [19]:
train_features = pd.concat([fs2_train, train_data], axis=1)

In [20]:
train_features

Unnamed: 0,q1_char_num,q2_char_num,q1_word_num,q2_word_num,total_word_num,differ_word_num,same_first_word,same_last_word,total_unique_word_num,total_unique_word_withoutstopword_num,...,cityblock_distance,jaccard_distance,canberra_distance,euclidean_distance,minkowski_distance,braycurtis_distance,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec
0,65,56,14,12,26,2,1,0,12,6,...,5.081614,1.0,94.023324,0.371408,0.168999,0.186557,0.031817,-0.091902,0.050416,0.337301
1,48,85,10,15,25,5,1,0,17,11,...,11.028160,1.0,158.811137,0.785732,0.353440,0.437108,0.004801,0.163020,0.185670,0.112646
2,70,54,14,10,24,4,1,0,19,8,...,8.829532,1.0,131.681688,0.635757,0.286115,0.329345,0.226704,0.060504,0.089026,-0.465365
3,48,58,11,13,24,2,0,0,20,9,...,16.075622,1.0,195.966982,1.151438,0.513992,0.708427,-0.002527,0.009567,-0.244560,0.074111
4,73,38,13,7,20,6,1,0,16,13,...,12.103178,1.0,161.408435,0.860225,0.382770,0.480633,-0.133849,0.114777,0.217900,-0.338876
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404282,80,74,14,13,27,1,1,1,14,8,...,7.691880,1.0,124.332006,0.551004,0.248370,0.286366,0.061066,0.168470,-0.065888,-0.432547
404283,40,41,8,9,17,1,0,1,11,4,...,7.146889,1.0,119.485637,0.508014,0.226945,0.263738,-0.002918,0.038642,-0.147208,-0.260159
404284,16,17,4,4,8,0,1,1,5,2,...,5.065351,1.0,91.936365,0.371527,0.170819,0.182876,-0.193922,-0.147340,-0.279527,-0.397618
404285,87,120,17,25,42,8,0,0,39,22,...,16.378793,1.0,190.421741,1.189846,0.538765,0.730613,-0.012639,-0.068610,-0.319382,-0.247332


In [21]:
train_features.shape

(404287, 41)

In [22]:
train_features.to_csv("data/train_set3_features.csv", index=False)

In [23]:
print(train_features.columns)

Index(['q1_char_num', 'q2_char_num', 'q1_word_num', 'q2_word_num',
       'total_word_num', 'differ_word_num', 'same_first_word',
       'same_last_word', 'total_unique_word_num',
       'total_unique_word_withoutstopword_num', 'total_unique_word_num_ratio',
       'common_word_num', 'common_word_ratio', 'common_word_ratio_min',
       'common_word_ratio_max', 'common_word_withoutstopword_num',
       'common_word_withoutstopword_ratio',
       'common_word_withoutstopword_ratio_min',
       'common_word_withoutstopword_ratio_max', 'fuzz_ratio', 'fuzz_QRatio',
       'fuzz_WRatio', 'fuzz_partial_ratio', 'fuzz_token_set_ratio',
       'fuzz_token_sort_ratio', 'fuzz_partial_token_set_ratio',
       'fuzz_partial_token_sort_ratio', 'longest_substr_ratio', 'wmd',
       'norm_wmd', 'cosine_distance', 'cityblock_distance', 'jaccard_distance',
       'canberra_distance', 'euclidean_distance', 'minkowski_distance',
       'braycurtis_distance', 'skew_q1vec', 'skew_q2vec', 'kur_q1vec',
       

### Saving the word2vec vector features

In [25]:
import pickle
pickle.dump(question1_train_vectors, open('data/q1_word2vec.pkl', 'wb'), -1)
pickle.dump(question2_train_vectors, open('data/q2_word2vec.pkl', 'wb'), -1)