In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load the question-pair dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('train.csv')

In [6]:
# (rows, columns) of the full dataset.
df.shape

(404290, 6)

In [7]:
# Preview the first five rows: two question ids, the two question texts, and the is_duplicate label.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [8]:
# The full dataset has roughly 404k question pairs. The goal here is only to see how
# well a model performs without any feature engineering, so the experiment runs on a
# 30,000-row random sample; using every row would make each iteration far too slow.
# random_state pins the sample so that Restart & Run All reproduces the same rows,
# and therefore the same downstream features and scores.

new_df = df.sample(30000, random_state=1)

In [9]:
# Count missing values per column in the sample.
new_df.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [10]:
# Number of rows in the sample that are exact duplicates of an earlier row.
new_df.duplicated().sum()

0

In [11]:
# Keep only the two text columns; the original row index of the sample is preserved.
ques_df = new_df.loc[:, ['question1', 'question2']]
ques_df.head()

Unnamed: 0,question1,question2
171604,What are some great achievements of you?,What are some great achievements of the UN?
388476,Why haven't humans gone beyond low-Earth orbit...,Why hasn't anyone landed on the Moon in over 4...
218751,Where can I find free online coaching for IAS?,Which is the best online coaching for IAS irre...
243026,Who would win in a war between China and Russia?,"Who would win in a Twitter war, NATO, Russia o..."
321165,What are some simple and good topics for resea...,How big is the uni?


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Build one corpus from both columns so a single shared vocabulary is learned:
# every question1 first, followed by every question2 in the same row order.
questions = ques_df['question1'].tolist() + ques_df['question2'].tolist()

# Bag-of-words counts, with the vocabulary capped at 3000 terms.
cv = CountVectorizer(max_features=3000)
bow_counts = cv.fit_transform(questions).toarray()

# The corpus was stacked as [question1 rows, question2 rows], so cutting the
# dense matrix into two equal halves recovers one count matrix per column.
q1_arr, q2_arr = np.vsplit(bow_counts, 2)

In [13]:
# Wrap each count matrix in a frame that reuses the sampled rows' index, so the
# question1 and question2 features line up row-for-row.
temp_df1, temp_df2 = (
    pd.DataFrame(counts, index=ques_df.index) for counts in (q1_arr, q2_arr)
)

# Place the two feature sets side by side. Both frames carry the same integer
# column labels, so the combined frame has each label twice.
temp_df = pd.concat([temp_df1, temp_df2], axis=1)
temp_df.shape

(30000, 6000)

In [14]:
# Display the combined feature frame (pandas abbreviates the rendering of a frame this large).
temp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
171604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
388476,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
218751,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
243026,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
321165,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263487,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
105080,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
152050,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
216444,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Attach the target as the last column. The assignment aligns on the row index,
# which temp_df shares with new_df. This mutates temp_df in place, so re-running
# earlier cells that display temp_df will show the extra column.
temp_df['is_duplicate'] = new_df['is_duplicate']

In [16]:
# Confirm that the label column is now the rightmost column.
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2991,2992,2993,2994,2995,2996,2997,2998,2999,is_duplicate
171604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
388476,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
218751,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
243026,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
321165,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
from sklearn.model_selection import train_test_split

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Features are every column except the last; the target is the last column
# (is_duplicate). Hold out 20% of the rows for evaluation with a fixed split seed.
X_train, X_test, y_train, y_test = train_test_split(
    temp_df.iloc[:, 0:-1].values,
    temp_df.iloc[:, -1].values,
    test_size=0.2,
    random_state=1,
)

# Seed the forest as well: its bootstrap sampling and feature selection are
# random, so without random_state the accuracy changes on every run even though
# the train/test split is fixed.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.7388333333333333

In [19]:
from xgboost import XGBClassifier

# Gradient-boosted trees with the library's default settings, trained and scored
# on the same train/test split used for the Random Forest.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# NOTE: y_pred is reassigned here, replacing the Random Forest predictions.
y_pred = xgb.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6816666666666666

**Even without any feature engineering, a plain bag-of-words (BoW) representation gives about 74% accuracy with Random Forest and about 68% with XGBoost on the 30,000-row sample.**