# 1) Data Acquisition

In [1]:
!kaggle datasets download quora/question-pairs-dataset

Dataset URL: https://www.kaggle.com/datasets/quora/question-pairs-dataset
License(s): other
Downloading question-pairs-dataset.zip to /content
 53% 11.0M/20.8M [00:00<00:00, 54.0MB/s]
100% 20.8M/20.8M [00:00<00:00, 84.3MB/s]


In [2]:
!unzip question-pairs-dataset.zip

Archive:  question-pairs-dataset.zip
  inflating: questions.csv           


In [3]:
!rm -rf question-pairs-dataset.zip

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
data = pd.read_csv('questions.csv')

In [6]:
df = data.sample(30000, random_state = 4)

# 2) Data Inspection

In [7]:
df.isnull().sum()

Unnamed: 0,0
id,0
qid1,0
qid2,0
question1,0
question2,0
is_duplicate,0


In [9]:
df.duplicated().sum()

0

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30000 entries, 373844 to 322461
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            30000 non-null  int64 
 1   qid1          30000 non-null  int64 
 2   qid2          30000 non-null  int64 
 3   question1     30000 non-null  object
 4   question2     30000 non-null  object
 5   is_duplicate  30000 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 1.6+ MB


# 3) Data preparation

In [13]:
ques_df = df.iloc[:, [3,4]]

In [14]:
ques_df.head()

Unnamed: 0,question1,question2
373844,What is the average pay for a good freelancing...,Is freelancing feasible as a fresh-out-of-coll...
193268,Can I use a grill microwave oven to bake a piz...,How do you use a BBQ grill indoors?
24365,When should I lose my virginity to a guy?,How does a guy lose his virginity?
364159,What are some very specific verbs?,Are main verbs always dynamic verbs?
336821,Which is the best hatch back under RS 6 lakh?,Is it useful to get into PGP QSCM at NICMAR im...


In [15]:
ques_df.shape

(30000, 2)

# 4) Bag of words

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 3000)

In [18]:
questions = list(ques_df.question1) + list(ques_df.question2)

In [19]:
questions

['What is the average pay for a good freelancing iOS developer?',
 "Can I use a grill microwave oven to bake a pizza? It doesn't have preheating and it is a grill only microwave, with Micro Grill, Grill & Combination Grill cooking methods.",
 'When should I lose my virginity to a guy?',
 'What are some very specific verbs?',
 'Which is the best hatch back under RS 6 lakh?',
 'Which smartphone has the best camera?',
 'How can someone become rich?',
 'If I had a car with 15 inch rims and changed them to 17 inch would I save fuel by having a larger rim, how does the different rim sizes affect fuel economy?',
 'Why do some people tailgate  and refuse to pass anyone?',
 'What is the future of MBBS doctor without FMGE MCI?',
 'What is difference between faith and belief?',
 'How long does it take for ear piercings to heal?',
 'What are some of the awesome places to visit in Konkan, Maharashtra?',
 'I am financially stuck in a half baked relationship. Can someone help, how can I get out of th

In [20]:
len(questions)

60000

In [22]:
q1_arr, q2_arr = np.vsplit(cv.fit_transform(questions).toarray(), 2)

In [25]:
q1_df = pd.DataFrame(q1_arr, index = ques_df.index)
q2_df = pd.DataFrame(q2_arr, index = ques_df.index)

q_df = pd.concat([q1_df, q2_df], axis = 1)

In [26]:
q_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
373844,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
193268,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
24365,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
364159,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
336821,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
q_df['is_duplicate'] = df.is_duplicate

In [28]:
q_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2991,2992,2993,2994,2995,2996,2997,2998,2999,is_duplicate
373844,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
193268,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
24365,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
364159,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
336821,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
q_df.shape

(30000, 6001)

# 5) Model

## Random Forest

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
x_train, x_test, y_train, y_test = train_test_split(q_df.iloc[:, 0:-1].values, q_df.iloc[:, -1].values, test_size = 0.2, random_state = 4)

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [33]:
rf = RandomForestClassifier()

In [34]:
rf.fit(x_train, y_train)

In [35]:
y_pred = rf.predict(x_test)

In [36]:
acc = accuracy_score(y_test, y_pred)

In [42]:
acc

0.7418333333333333

## XGBoost

In [37]:
from xgboost import XGBClassifier

In [38]:
xgb = XGBClassifier()

In [39]:
xgb.fit(x_train, y_train)

In [40]:
y_pred_2 = xgb.predict(x_test)

In [41]:
acc_2 = accuracy_score(y_test, y_pred_2)

In [43]:
acc_2

0.7273333333333334

In [44]:
print(f'Random Forest Accuracy: {acc}')
print(f'XGBoost Accuracy: {acc_2}')

Random Forest Accuracy: 0.7418333333333333
XGBoost Accuracy: 0.7273333333333334
