In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from fuzzywuzzy import fuzz
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import time
import re
import distance

In [2]:
train_data = pd.read_csv("data/preprocessed_train.csv")
test_data = pd.read_csv("data/preprocessed_test.csv")

In [3]:
print(train_data.head())

                                     question1_final  \
0  what is the step by step guide to invest in sh...   
1   what is the story of kohinoor koh i noor diamond   
2  how can i increase the speed of my internet co...   
3   why am i mentally very lonely how can i solve it   
4  which one dissolve in water quikly sugar salt ...   

                                     question2_final  
0  what is the step by step guide to invest in sh...  
1  what would happen if the indian government ste...  
2  how can internet speed be increase by hack thr...  
3  find the remainder when math 23 24 math is div...  
4             which fish would survive in salt water  


In [9]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404287 entries, 0 to 404286
Data columns (total 2 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   question1_final  404274 non-null  object
 1   question2_final  404280 non-null  object
dtypes: object(2)
memory usage: 6.2+ MB


## Feature Extraction

In [4]:
def doesMatch (q, match):
    q1, q2 = q['question1_final'], q['question2_final']
    q1 = q1.split()
    q2 = q2.split()
    if len(q1)>0 and len(q2)>0 and q1[match]==q2[match]:
        return 1
    else:
        return 0

In [5]:
def get_longest_substr_ratio(a, b):
    strs = list(distance.lcsubstrings(a, b))
    if len(strs) == 0:
        return 0
    else:
        return len(strs[0]) / (min(len(a), len(b)) + 1)

In [10]:
def feature_set2(data):
    data["fuzz_ratio"] = data.apply(lambda x: fuzz.ratio(str(x['question1_final']), str(x['question2_final'])), axis=1)
    data['fuzz_QRatio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1_final']), str(x['question2_final'])), axis=1)
    data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1_final']), str(x['question2_final'])), axis=1)
    data["fuzz_partial_ratio"] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1_final']), str(x['question2_final'])), axis=1)
    data["fuzz_token_set_ratio"] = data.apply(lambda x: fuzz.token_set_ratio(str(x['question1_final']), str(x['question2_final'])), axis=1)
    data["fuzz_token_sort_ratio"] = data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1_final']), str(x['question2_final'])), axis=1)
    data['fuzz_partial_token_set_ratio'] = data.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1_final']), str(x['question2_final'])), axis=1)
    data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1_final']), str(x['question2_final'])), axis=1)
    data["longest_substr_ratio"]  = data.apply(lambda x: get_longest_substr_ratio(str(x["question1_final"]), str(x["question2_final"])), axis=1)
    return data

In [11]:
train_data = feature_set2(train_data)
train_data

Unnamed: 0,question1_final,question2_final,fuzz_ratio,fuzz_QRatio,fuzz_WRatio,fuzz_partial_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,longest_substr_ratio
0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,93,93,95,100,100,93,100,89,0.982456
1,what is the story of kohinoor koh i noor diamond,what would happen if the indian government ste...,63,63,86,73,86,62,100,73,0.571429
2,how can i increase the speed of my internet co...,how can internet speed be increase by hack thr...,52,52,69,56,73,65,100,69,0.181818
3,why am i mentally very lonely how can i solve it,find the remainder when math 23 24 math is div...,36,36,36,40,37,36,37,38,0.040816
4,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,45,45,86,55,67,47,100,63,0.153846
...,...,...,...,...,...,...,...,...,...,...,...
404282,how many keywords are there in the racket prog...,how many keywords are there in perl program la...,91,91,91,86,96,87,100,82,0.413333
404283,do you believe there is life after death,is it true that there is life after death,72,72,75,72,79,69,100,70,0.634146
404284,what is one coin,what is this coin,79,79,82,75,86,79,100,75,0.470588
404285,what is the approx annual cost of live while s...,i am have little hairfall problem but i want t...,42,42,45,46,46,47,100,45,0.056818


In [12]:
test_data = feature_set2(test_data)
test_data.head()

Unnamed: 0,question1_final,question2_final,fuzz_ratio,fuzz_QRatio,fuzz_WRatio,fuzz_partial_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,longest_substr_ratio
0,how doe the surface pro himself 4 compare with...,why did microsoft choose core m3 and not core ...,44,44,55,45,58,56,100,60,0.25
1,should i have a hair transplant at age 24 how ...,how much cost doe hair transplant require,50,50,86,59,83,59,100,63,0.404762
2,what but is the best way to send money from ch...,what you send money to china,60,60,86,82,92,55,100,68,0.413793
3,which food not emulsifier,what food fibre,55,55,86,53,55,55,100,67,0.375
4,how aberystwyth start read,how their can i start read,65,65,66,65,70,62,100,62,0.407407


In [21]:
fs1_train = pd.read_csv("data/train_basic_features.csv")
fs1_test = pd.read_csv("data/test_basic_features.csv")

In [22]:
fs1_train

Unnamed: 0,question1_final,question2_final,q1_char_num,q2_char_num,q1_word_num,q2_word_num,total_word_num,differ_word_num,same_first_word,same_last_word,...,total_unique_word_withoutstopword_num,total_unique_word_num_ratio,common_word_num,common_word_ratio,common_word_ratio_min,common_word_ratio_max,common_word_withoutstopword_num,common_word_withoutstopword_ratio,common_word_withoutstopword_ratio_min,common_word_withoutstopword_ratio_max
0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,65,56,14,12,26,2,1,0,...,6,0.461538,11,0.916667,1.000000,0.916667,5,0.833333,1.000000,0.833333
1,what is the story of kohinoor koh i noor diamond,what would happen if the indian government ste...,48,85,10,15,25,5,1,0,...,11,0.680000,7,0.411765,0.700000,0.500000,4,0.363636,0.800000,0.400000
2,how can i increase the speed of my internet co...,how can internet speed be increase by hack thr...,70,54,14,10,24,4,1,0,...,8,0.791667,5,0.263158,0.500000,0.357143,3,0.375000,0.600000,0.500000
3,why am i mentally very lonely how can i solve it,find the remainder when math 23 24 math is div...,48,58,11,13,24,2,0,0,...,9,0.833333,0,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000
4,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,73,38,13,7,20,6,1,0,...,13,0.800000,4,0.250000,0.571429,0.307692,2,0.153846,0.400000,0.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404282,how many keywords are there in the racket prog...,how many keywords are there in perl program la...,80,74,14,13,27,1,1,1,...,8,0.518519,11,0.785714,0.916667,0.846154,6,0.750000,0.857143,0.857143
404283,do you believe there is life after death,is it true that there is life after death,40,41,8,9,17,1,0,1,...,4,0.647059,5,0.454545,0.625000,0.625000,2,0.500000,0.666667,0.666667
404284,what is one coin,what is this coin,16,17,4,4,8,0,1,1,...,2,0.625000,3,0.600000,0.750000,0.750000,1,0.500000,1.000000,0.500000
404285,what is the approx annual cost of live while s...,i am have little hairfall problem but i want t...,87,120,17,25,42,8,0,0,...,22,0.928571,1,0.025641,0.058824,0.043478,0,0.000000,0.000000,0.000000


In [23]:
fs1_train.shape

(404287, 21)

In [24]:
fs1_train.drop(["question1_final", "question2_final"], axis=1, inplace=True)
fs1_test.drop(["question1_final", "question2_final"], axis=1, inplace=True)

In [25]:
fs1_train

Unnamed: 0,q1_char_num,q2_char_num,q1_word_num,q2_word_num,total_word_num,differ_word_num,same_first_word,same_last_word,total_unique_word_num,total_unique_word_withoutstopword_num,total_unique_word_num_ratio,common_word_num,common_word_ratio,common_word_ratio_min,common_word_ratio_max,common_word_withoutstopword_num,common_word_withoutstopword_ratio,common_word_withoutstopword_ratio_min,common_word_withoutstopword_ratio_max
0,65,56,14,12,26,2,1,0,12,6,0.461538,11,0.916667,1.000000,0.916667,5,0.833333,1.000000,0.833333
1,48,85,10,15,25,5,1,0,17,11,0.680000,7,0.411765,0.700000,0.500000,4,0.363636,0.800000,0.400000
2,70,54,14,10,24,4,1,0,19,8,0.791667,5,0.263158,0.500000,0.357143,3,0.375000,0.600000,0.500000
3,48,58,11,13,24,2,0,0,20,9,0.833333,0,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000
4,73,38,13,7,20,6,1,0,16,13,0.800000,4,0.250000,0.571429,0.307692,2,0.153846,0.400000,0.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404282,80,74,14,13,27,1,1,1,14,8,0.518519,11,0.785714,0.916667,0.846154,6,0.750000,0.857143,0.857143
404283,40,41,8,9,17,1,0,1,11,4,0.647059,5,0.454545,0.625000,0.625000,2,0.500000,0.666667,0.666667
404284,16,17,4,4,8,0,1,1,5,2,0.625000,3,0.600000,0.750000,0.750000,1,0.500000,1.000000,0.500000
404285,87,120,17,25,42,8,0,0,39,22,0.928571,1,0.025641,0.058824,0.043478,0,0.000000,0.000000,0.000000


In [26]:
fs1_train.shape

(404287, 19)

In [16]:
train_data.drop(["question1_final", "question2_final"], axis=1, inplace=True)
test_data.drop(["question1_final", "question2_final"], axis=1, inplace=True)

In [20]:
train_data.shape

(404287, 9)

In [27]:
fuzzy_train_features = pd.concat([fs1_train, train_data], axis=1)
fuzzy_test_features = pd.concat([fs1_test, test_data], axis=1)

In [28]:
fuzzy_train_features

Unnamed: 0,q1_char_num,q2_char_num,q1_word_num,q2_word_num,total_word_num,differ_word_num,same_first_word,same_last_word,total_unique_word_num,total_unique_word_withoutstopword_num,...,common_word_withoutstopword_ratio_max,fuzz_ratio,fuzz_QRatio,fuzz_WRatio,fuzz_partial_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,longest_substr_ratio
0,65,56,14,12,26,2,1,0,12,6,...,0.833333,93,93,95,100,100,93,100,89,0.982456
1,48,85,10,15,25,5,1,0,17,11,...,0.400000,63,63,86,73,86,62,100,73,0.571429
2,70,54,14,10,24,4,1,0,19,8,...,0.500000,52,52,69,56,73,65,100,69,0.181818
3,48,58,11,13,24,2,0,0,20,9,...,0.000000,36,36,36,40,37,36,37,38,0.040816
4,73,38,13,7,20,6,1,0,16,13,...,0.200000,45,45,86,55,67,47,100,63,0.153846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404282,80,74,14,13,27,1,1,1,14,8,...,0.857143,91,91,91,86,96,87,100,82,0.413333
404283,40,41,8,9,17,1,0,1,11,4,...,0.666667,72,72,75,72,79,69,100,70,0.634146
404284,16,17,4,4,8,0,1,1,5,2,...,0.500000,79,79,82,75,86,79,100,75,0.470588
404285,87,120,17,25,42,8,0,0,39,22,...,0.000000,42,42,45,46,46,47,100,45,0.056818


In [29]:
fuzzy_train_features.shape

(404287, 28)

In [30]:
fuzzy_train_features.to_csv("data/train_set2_features.csv", index=False)
fuzzy_test_features.to_csv("data/test_set2_features.csv", index=False)