In [17]:
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
nltk.download('wordnet') #For using WordNetLemmatizer
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /home/saikat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/saikat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
data=pd.read_csv(r'questions.csv')
print(data.info())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404351 entries, 0 to 404350
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            404351 non-null  int64 
 1   qid1          404351 non-null  int64 
 2   qid2          404351 non-null  int64 
 3   question1     404350 non-null  object
 4   question2     404349 non-null  object
 5   is_duplicate  404351 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 18.5+ MB
None
   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   

                                           question2  is_

In [7]:
data.isnull().value_counts()

id     qid1   qid2   question1  question2  is_duplicate
False  False  False  False      False      False           404348
                                True       False                2
                     True       False      False                1
Name: count, dtype: int64

In [15]:
duplicate_values=data[data['is_duplicate']==0].value_counts()
dissimilar_values=data[data['is_duplicate']!=0].value_counts()

print(f'Duplicate question:{duplicate_values}\nDissimilar Questions:{dissimilar_values}')

Duplicate question:id      qid1    qid2    question1                                                                                                                          question2                                                                                                                                  is_duplicate
0       1       2       what is the step by step guide to invest in share market in india                                                                  what is the step by step guide to invest in share market                                                                                   0               1
270616  532248  532249  do indian family in the earn more than jewish family                                                                               are nris and second generation immigrant family more casteist than regular indian                                                          0               1
270601  482374  532219  what is that business that

In [8]:
#Removing any rows with even a single null value.
data=data.dropna(axis=0)  #axis:0 in pandas mean along the row.
data.isnull().value_counts()

id     qid1   qid2   question1  question2  is_duplicate
False  False  False  False      False      False           404348
Name: count, dtype: int64

In [9]:
def text_preprocessor(text):
    lemmatizer=WordNetLemmatizer()
    tokens=word_tokenize(text)
    tokens=[word.lower() for word in tokens if word.isalpha() and len(word)>=2]
    tokens=[lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

#Checking the function.
text_preprocessor("Hello what a lovely Morning!")

'hello what lovely morning'

***Now we will apply the previous function in our question columns.***

In [10]:
data[['question1', 'question2']] = data[['question1', 'question2']].applymap(text_preprocessor)

data.head()

  data[['question1', 'question2']] = data[['question1', 'question2']].applymap(text_preprocessor)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0
1,1,3,4,what is the story of kohinoor diamond,what would happen if the indian government sto...,0
2,2,5,6,how can increase the speed of my internet conn...,how can internet speed be increased by hacking...,0
3,3,7,8,why am mentally very lonely how can solve it,find the remainder when math is divided by,0
4,4,9,10,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,0


***Now we will extract features using TF-IDF.***

In [11]:
'''Since the vocab length of question1 and question2 column could be different so the to creating same length feature vector for all questions
I created a stack of all the question and thereafter created the feature vector.After that sliced the qs1 and qs2 coulmns from this stacked column of features.'''
#Here first we are creating the question stack.
question_stack=pd.concat([data['question1'],data['question2']])
print(question_stack.shape)

#Here I am considering an unigram model.
tf_vectorizor=TfidfVectorizer(ngram_range=(1,1))
qs_vector=tf_vectorizor.fit_transform(question_stack)

qs1_vector,qs2_vector=qs_vector[:data['question1'].shape[0]],qs_vector[data['question1'].shape[0]:]

print(f'Question1 column feature space shape:{qs1_vector.shape}\nQuestion2 column feature space shape:{qs2_vector.shape}')

(808696,)
Question1 column feature space shape:(404348, 68844)
Question2 column feature space shape:(404348, 68844)


***Now we will be applying cosine similarity to measure the similarity of the questions.***

In [12]:
#Scikit-learn's cosine similarity returns similarity scores for each i th row of X matrix to all row of Y matrix.
#That is why I'will iterate through each row and then perform similarity.
similarity_score_ls=[]
for i in range(qs1_vector.shape[0]):
    similarity_score=cosine_similarity(qs1_vector[i],qs2_vector[i])
    similarity_score_ls.append(similarity_score)
print(similarity_score_ls)    

[array([[0.97621781]]), array([[0.58910645]]), array([[0.25586544]]), array([[0.]]), array([[0.25900889]]), array([[0.49499375]]), array([[0.]]), array([[0.74844242]]), array([[0.9671208]]), array([[0.418497]]), array([[0.02132997]]), array([[0.60995915]]), array([[0.94441307]]), array([[0.95050191]]), array([[0.93514125]]), array([[0.28774338]]), array([[1.]]), array([[0.26416568]]), array([[0.38068313]]), array([[0.48421359]]), array([[0.53250332]]), array([[0.51058294]]), array([[0.82058701]]), array([[0.]]), array([[0.28313272]]), array([[0.90525784]]), array([[0.75144062]]), array([[0.4509908]]), array([[0.74704422]]), array([[0.62319545]]), array([[0.52895848]]), array([[0.54995238]]), array([[0.85468427]]), array([[0.00700222]]), array([[0.73632397]]), array([[0.46221359]]), array([[0.38650255]]), array([[0.38501382]]), array([[0.40858094]]), array([[0.18921776]]), array([[0.35710639]]), array([[0.98246055]]), array([[1.]]), array([[0.47117102]]), array([[0.75879054]]), array([[

In [23]:
similarity_score_ls[:5]

[array([[0.97621781]]),
 array([[0.58910645]]),
 array([[0.25586544]]),
 array([[0.]]),
 array([[0.25900889]])]

Checking the accuracy.

In [26]:
similarity_score_arr=np.array(similarity_score_ls).reshape(-1)
similarity_score_arr.shape
similarity_score_arr[:5]

true_label=np.array(data['is_duplicate'])

print(f'Predicted similarty matrix shape:{similarity_score_arr.shape}\nTrue Similarity matrix shape:{label.shape}')

Predicted similarty matrix shape:(404348,)
True Similarity matrix shape:(404348,)


In [28]:
#Changing the similarity score to 0 and 1 binary form.
similarity_score_arr[similarity_score_arr<=0.5],similarity_score_arr[similarity_score_arr>0.5]=1,0

similarity_score_arr[:5]

array([0., 0., 0., 0., 0.])

In [29]:
#Checking the accuracy.
print(f"Accuracy using TF-IDF features is:{accuracy_score(true_label,similarity_score_arr)}")

Accuracy using TF-IDF features is:0.6307487609682748
