In [132]:
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
nltk.download('wordnet') #For using WordNetLemmatizer
nltk.download('punkt')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
data=pd.read_csv(r'/kaggle/input/question-pair-dataset/questions.csv')
print(data.info())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404351 entries, 0 to 404350
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            404351 non-null  int64 
 1   qid1          404351 non-null  int64 
 2   qid2          404351 non-null  int64 
 3   question1     404350 non-null  object
 4   question2     404349 non-null  object
 5   is_duplicate  404351 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 18.5+ MB
None
   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   

                                           question2  is_

In [4]:
data.isnull().value_counts()

id     qid1   qid2   question1  question2  is_duplicate
False  False  False  False      False      False           404348
                                True       False                2
                     True       False      False                1
Name: count, dtype: int64

In [5]:
duplicate_values=data[data['is_duplicate']==0].value_counts()
dissimilar_values=data[data['is_duplicate']!=0].value_counts()

print(f'Duplicate question:{duplicate_values}\nDissimilar Questions:{dissimilar_values}')

Duplicate question:id      qid1    qid2    question1                                                                                                                                             question2                                                                                                                                             is_duplicate
0       1       2       What is the step by step guide to invest in share market in india?                                                                                    What is the step by step guide to invest in share market?                                                                                             0               1
270616  532248  532249  Do Indian families in the U.S. earn more than Jewish families?                                                                                        Are NRIs and second generation immigrant families more casteist than regular Indians?                                           

In [5]:
#Removing any rows with even a single null value.
data=data.dropna(axis=0)  #axis:0 in pandas mean along the row.
data.isnull().value_counts()

id     qid1   qid2   question1  question2  is_duplicate
False  False  False  False      False      False           404348
Name: count, dtype: int64

In [6]:
def text_preprocessor(text):
#     lemmatizer=WordNetLemmatizer()
    stop_words=set(stopwords.words('english'))
    tokens=word_tokenize(text)
    tokens=[word.lower() for word in tokens if word.isalpha() and len(word)>=2]
    tokens=[word for word in tokens if word not in stop_words]

    return ' '.join(tokens)

#Checking the function.
text_preprocessor("Hello what a lovely Morning!")

'hello lovely morning'

***Now we will apply the previous function in our question columns.***

In [7]:
data[['question1', 'question2']] = data[['question1', 'question2']].applymap(text_preprocessor)

data.head()

  data[['question1', 'question2']] = data[['question1', 'question2']].applymap(text_preprocessor)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,step step guide invest share market india,step step guide invest share market,0
1,1,3,4,story kohinoor diamond,would happen indian government stole kohinoor ...,0
2,2,5,6,increase speed internet connection using vpn,internet speed increased hacking dns,0
3,3,7,8,mentally lonely solve,find remainder math divided,0
4,4,9,10,one dissolve water quikly sugar salt methane c...,fish would survive salt water,0


***Now we will extract features using TF-IDF.***

In [8]:
'''Since the vocab length of question1 and question2 column could be different so the to creating same length feature vector for all questions
I created a stack of all the question and thereafter created the feature vector.After that sliced the qs1 and qs2 coulmns from this stacked column of features.'''
#Here first we are creating the question stack.
question_stack=pd.concat([data['question1'],data['question2']])
print(question_stack.shape)

#Here I am considering an unigram model.
tf_vectorizor=TfidfVectorizer(ngram_range=(1,1))
qs_vector=tf_vectorizor.fit_transform(question_stack)

qs1_vector,qs2_vector=qs_vector[:data['question1'].shape[0]],qs_vector[data['question1'].shape[0]:]

print(f'Question1 column feature space shape:{qs1_vector.shape}\nQuestion2 column feature space shape:{qs2_vector.shape}')

(808696,)
Question1 column feature space shape:(404348, 76093)
Question2 column feature space shape:(404348, 76093)


***Now we will be applying cosine similarity to measure the similarity of the questions.***

In [30]:
#Scikit-learn's cosine similarity returns similarity scores for each i th row of X matrix to all row of Y matrix.
#That is why I'will iterate through each row and then perform similarity.
similarity_score_ls=[]
for i in range(qs1_vector.shape[0]):
    similarity_score=cosine_similarity(qs1_vector[i],qs2_vector[i])
    similarity_score_ls.append(similarity_score)
print(similarity_score_ls[:5])    

[array([[0.97621781]]), array([[0.58910645]]), array([[0.25586544]]), array([[0.]]), array([[0.25900889]])]


***Checking the accuracy.***

In [26]:
similarity_score_arr=np.array(similarity_score_ls).reshape(-1)
similarity_score_arr.shape
similarity_score_arr[:5]

true_label=np.array(data['is_duplicate'])

print(f'Predicted similarty matrix shape:{similarity_score_arr.shape}\nTrue Similarity matrix shape:{label.shape}')

Predicted similarty matrix shape:(404348,)

True Similarity matrix shape:(404348,)


In [28]:
#Changing the similarity score to 0 and 1 binary form.
similarity_score_arr[similarity_score_arr<=0.5],similarity_score_arr[similarity_score_arr>0.5]=1,0

similarity_score_arr[:5]

array([0., 0., 0., 0., 0.])

In [29]:
#Checking the accuracy.
print(f"Accuracy using TF-IDF features is:{accuracy_score(true_label,similarity_score_arr)}")

Accuracy using TF-IDF features is:0.6307487609682748


***Now we will create word2vec features.***

In [17]:
print(question_stack.shape[0])
question_stack=pd.DataFrame({'questions':pd.concat([data['question1'],data['question2']])})
print(question_stack.head())

808697
                                           questions
0          step step guide invest share market india
1                             story kohinoor diamond
2       increase speed internet connection using vpn
3                              mentally lonely solve
4  one dissolve water quikly sugar salt methane c...


In [19]:
#Creating tokenized version of all the question from question stack to pass into the word2vec function.
question_stack['tokens']=question_stack['questions'].apply(word_tokenize)

question_stack.head()

Unnamed: 0,questions,tokens
0,step step guide invest share market india,"[step, step, guide, invest, share, market, india]"
1,story kohinoor diamond,"[story, kohinoor, diamond]"
2,increase speed internet connection using vpn,"[increase, speed, internet, connection, using,..."
3,mentally lonely solve,"[mentally, lonely, solve]"
4,one dissolve water quikly sugar salt methane c...,"[one, dissolve, water, quikly, sugar, salt, me..."


In [20]:
import gensim
from gensim.models import Word2Vec

In [118]:
#Training Word2vec model.
model=Word2Vec(sentences=question_stack['tokens'],workers=4,vector_size=2000)

Creating a function to train and use the word2vec model.

In [119]:
def create_features(sentence:list):
    word_embeddings = [model.wv[word] for word in sentence if word in model.wv]
    if not word_embeddings:
        return np.zeros(model.vector_size)

    return np.mean(word_embeddings, axis=0)

Applying the previous function on the question column.

In [120]:
qs_vector=[]
for question in question_stack['tokens']:
    qs_vector.append(create_features(question))

qs_vector=np.array(qs_vector)    
    
qs_vector.shape    

(808696, 2000)

In [121]:
question_stack['token_feature']=list(qs_vector)

question_stack.head()

Unnamed: 0,questions,tokens,token_feature
0,step step guide invest share market india,"[step, step, guide, invest, share, market, india]","[0.1264660656452179, -0.01943221502006054, 0.0..."
1,story kohinoor diamond,"[story, kohinoor, diamond]","[0.15774165093898773, -0.039083659648895264, 0..."
2,increase speed internet connection using vpn,"[increase, speed, internet, connection, using,...","[-0.053026679903268814, 0.10246127843856812, -..."
3,mentally lonely solve,"[mentally, lonely, solve]","[-0.017327310517430305, 0.02706994116306305, 0..."
4,one dissolve water quikly sugar salt methane c...,"[one, dissolve, water, quikly, sugar, salt, me...","[0.11025995761156082, -0.09714525938034058, -0..."


In [122]:
# Calculate midpoint
midpoint = qs_vector.shape[0] // 2


# Slice the DataFrame
qs1_vector = qs_vector[:midpoint]
qs2_vector = qs_vector[midpoint:]

print(f'Question1 column feature space shape:{qs1_vector.shape}\nQuestion2 column feature space shape:{qs2_vector.shape}')

Question1 column feature space shape:(404348, 2000)
Question2 column feature space shape:(404348, 2000)


***Performing Similarity Score openration on word2vec features.***

In [124]:
qs1_vector[1].reshape(1,2000).shape

(1, 2000)

In [125]:
similarity_score_ls=[]
for i in range(qs1_vector.shape[0]):
    similarity_score=cosine_similarity(qs1_vector[i].reshape(1,2000),qs2_vector[i].reshape(1,2000))
    similarity_score_ls.append(similarity_score)
print(similarity_score_ls[:5]) 

[array([[0.96068549]]), array([[0.07982276]]), array([[0.87050168]]), array([[0.33990515]]), array([[0.81002504]])]


In [126]:
np.isnan(true_label).sum()
np.isnan(similarity_score_arr).sum()

0

In [128]:
# Flatten the list of arrays into a 1D array
similarity_score_arr = np.array([score[0][0] for score in similarity_score_ls])
similarity_score_arr.shape

(404348,)

In [129]:
similarity_score_arr[:5]

array([0.96068549, 0.07982276, 0.87050168, 0.33990515, 0.81002504])

In [146]:
true_label=np.array(data['is_duplicate'])

# Changing the similarity score to 0 and 1 binary form.
similarity_score_arr_binary = np.where(similarity_score_arr <= 0.5, 1, 0)

similarity_score_arr[:5]
print(f'Predicted similarty matrix shape:{similarity_score_arr_binary.shape}\nTrue Similarity matrix shape:{true_label.shape}')

#Checking the accuracy.
print(f"Accuracy using Word2Vec features is:{accuracy_score(true_label,similarity_score_arr_binary)}")
print(f'\nClassifaction Repport:{classification_report(true_label,similarity_score_arr_binary)}')

Predicted similarty matrix shape:(404348,)
True Similarity matrix shape:(404348,)
Accuracy using Word2Vec features is:0.473807215566789

Classifaction Repport:              precision    recall  f1-score   support

           0       0.56      0.74      0.64    255042
           1       0.05      0.03      0.04    149306

    accuracy                           0.47    404348
   macro avg       0.31      0.38      0.34    404348
weighted avg       0.38      0.47      0.42    404348



In [141]:
duplicate=np.sum(similarity_score_arr_binary==0)
duplicate

211855

In [87]:
print(np.sum(true_label==0))

255042
