In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/question-pairs-dataset/questions.csv


# 1. Question 1: Vector Embeddings

## 1.1 Data Setup

In [2]:
#Load the Quora Question Pairs dataset
df = pd.read_csv("/kaggle/input/question-pairs-dataset/questions.csv")

In [3]:
#view the first few records
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
#Handle the missing data by dropping the NaN values in Question1 and Question2
df = df.dropna(subset=["question1", "question2"])

## 1.2 Text Preprocessing & Embedding

### 1. Preprocessing:
- Preprocess your data with whichever methods you deem fit for cleaning text

In [5]:
import re
def preporcessing(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Apply preprocessing
df["question1"] = df["question1"].apply(preporcessing)
df["question2"] = df["question2"].apply(preporcessing)

In [6]:
#After preprocessing
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0
1,1,3,4,what is the story of kohinoor kohinoor diamond,what would happen if the indian government sto...,0
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when math2324math is divide...,0
4,4,9,10,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,0


### 2. Choose an Embedding Method:
- You may use pretrained embeddings (e.g., GloVe, Tf-idf, fastText, etc.) via gensim or any other
library

In [7]:
import gensim.downloader as api
from gensim.models import KeyedVectors
import numpy as np


In [8]:
# Load pretrained GloVe embeddings
print("Loading pre-trained Glove embeddings")
glove_vectors = api.load("glove-wiki-gigaword-50")
print("Model Loaded successfully")

Loading pre-trained Glove embeddings
Model Loaded successfully


### 3. Vector Representation:
- For each question (you can combine question1 and question2 into one combined column, or continue
to treat them separately), embed the question into a vector.
- Construct a matrix (num questions, embedding dimension) representing all the questions

In [9]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

def get_sentence_embedding(sentence, model):
    words = word_tokenize(sentence)  # More robust tokenization
    word_vectors = [model[word] for word in words if word in model]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        
        return np.zeros(model.vector_size)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# Combine questions into a single column
df["combined_question"] = df["question1"] + " " + df["question2"]

In [11]:
# Generate embeddings for combined questions
df["combined_embedding"] = df["combined_question"].apply(lambda x: get_sentence_embedding(x, glove_vectors))

In [12]:
# Construct embedding matrix
embedding_matrix = np.vstack(df["combined_embedding"].values)

# Display sample results
print("Embedding Matrix Shape:", embedding_matrix.shape)

Embedding Matrix Shape: (404348, 50)


In [13]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,combined_question,combined_embedding
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,what is the step by step guide to invest in sh...,"[0.42206758, 0.030950079, 0.04003307, 0.052157..."
1,1,3,4,what is the story of kohinoor kohinoor diamond,what would happen if the indian government sto...,0,what is the story of kohinoor kohinoor diamond...,"[0.16032724, 0.052564774, -0.21682666, 0.15706..."
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0,how can i increase the speed of my internet co...,"[0.431272, 0.06882063, 0.42122915, 0.0619536, ..."
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when math2324math is divide...,0,why am i mentally very lonely how can i solve ...,"[0.47216088, -0.022894742, 0.02107759, -0.2982..."
4,4,9,10,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,0,which one dissolve in water quikly sugar salt ...,"[0.45246294, 0.21187137, -0.13217072, 0.040367..."


## 1.3 Cosine Similarity and Nearest Neighbors


### 1. Choose 5 “query” questions from your dataset.


In [14]:
# Choose 5 query questions (random selection or manual selection)
query_indices = np.random.choice(len(df), 5, replace=False)
query_embeddings = embedding_matrix[query_indices]
query_questions = df.iloc[query_indices]["combined_question"].values
print(query_questions)

['what are the best ways to transition from negative plane to positive in your life if a person want to take a indian citizenship but there is no relation in india can he apply for indian citizenship'
 'i am expecting 430 neet 2016i am genral will i get college in uttar pradesh im 13 and my penis is 12 inches is that unnatural and should i be embarrassed by it'
 'which is a good solar panel installation provider near meadow vista california ca which is a good solar panel installation provider near joshua tree california ca'
 'if i have root access to a managed chromebook can i manually change my desktop background should i play dota 2csgo or league of legends'
 'why does it rain so heavy why does the road become bad after a heavy rain']


### 2. For each query question, compute cosine similarity between its vector embedding and the embedding of every other question in the dataset.


In [15]:
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
# Compute cosine similarity between query embeddings and all embeddings
similarity_matrix = cosine_similarity(query_embeddings, embedding_matrix)
print(similarity_matrix)

[[0.9621157  0.9211901  0.94247156 ... 0.87754947 0.9718386  0.96158016]
 [0.8992047  0.87576747 0.8848873  ... 0.8647532  0.9539095  0.9319092 ]
 [0.8923907  0.85025847 0.88389    ... 0.81002593 0.91239125 0.8715531 ]
 [0.90294784 0.87889487 0.9466192  ... 0.89385116 0.95213276 0.9480389 ]
 [0.9078862  0.903724   0.9073947  ... 0.86748594 0.94491893 0.9282828 ]]


### 3. Retrieve the top-5 most similar questions for each query question, based on cosine similarity.


In [17]:
# Retrieve top-5 most similar questions for each query
for i, query in enumerate(query_questions):
    similar_indices = np.argsort(similarity_matrix[i])[::-1][1:6]  # Exclude self-match (highest score)
    similar_questions = df.iloc[similar_indices]["combined_question"].values
    print(f"Query {i+1}: {query}")
    print("Top 5 Similar Questions:")
    for j, sim_q in enumerate(similar_questions):
        print(f"{j+1}. {sim_q}")
    print("-" * 80)

Query 1: what are the best ways to transition from negative plane to positive in your life if a person want to take a indian citizenship but there is no relation in india can he apply for indian citizenship
Top 5 Similar Questions:
1. what is the greatest number of citizenships ever held by one person if a person want to take a indian citizenship but there is no relation in india can he apply for indian citizenship
2. how would it affect the election if trump said i want to be a better person let me start by apologizing for making an issue of obamas birth what is the main difference between indian schooling and european schooling
3. how many days it takes for the issue of a passport in india how long does it take to get a passport in india
4. what are the best activities to do with children when visiting ogbomosho oyo nigeria is it possible that a person with all the necessary qualifications is not admitted to any inns of court
5. how easy is it to obtain a fake scst certificate in ind

### 4. Discuss whether these nearest neighbors make sense. Are they semantically similar or related in any obvious way?

- The embeddings do a reasonable job of capturing broad topical similarities but struggle with fine-grained semantic distinctions.
- Queries about widely discussed topics (e.g., education, Quora ranking) return better matches than niche topics (e.g., business ideas, astrologers).
- Some irrelevant matches likely arise due to token-based similarity rather than true semantic understanding.
- 
### Let's see the queries

#### Query 1: Social Media Experts

- The retrieved questions are somewhat relevant, as they discuss social media marketing, tools, and agencies.
- However, some results are loosely connected, such as discussions about social media campaigns rather than experts specifically.
- The model does capture some level of topical similarity, but it's not perfectly refined.
#### Query 2: Technology & Education

- The top three retrieved questions are highly relevant, as they directly address how technology impacts education.
- The last two results diverge slightly into social development and computer education, which are related but not exact matches.
- Overall, this set of nearest neighbors demonstrates strong semantic similarity.
#### Query 3: Quora Ranking System

- Some retrieved questions are related to Quora, answer ranking, and responses, which align well with the query.
- Others (e.g., job interview questions) deviate significantly, suggesting the model is picking up on general themes of ranking or responses rather than the specific context of Quora.


# Question 2: Neural Networks

## 2.1 Data Preparation & Splitting

### 1. Keep the original structure of the dataset

In [18]:
df_new = df[['question1', 'question2', 'is_duplicate']]
df_new.head()

Unnamed: 0,question1,question2,is_duplicate
0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0
1,what is the story of kohinoor kohinoor diamond,what would happen if the indian government sto...,0
2,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0
3,why am i mentally very lonely how can i solve it,find the remainder when math2324math is divide...,0
4,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,0


### 2. Construct your feature vectors:
- Use the same embeddings you created in Question 1. For each question, you have a vector;
- you can then concatenate (or subtract, or otherwise combine) the embeddings of question1 and question2

In [19]:
# Generate embeddings for each question using the embedding used in question 1
df_new.loc[:, 'q1_vector'] = df_new['question1'].apply(lambda x: get_sentence_embedding(x, glove_vectors))
df_new.loc[:, 'q2_vector'] = df_new['question2'].apply(lambda x: get_sentence_embedding(x, glove_vectors))

# Function to combine embeddings using three strategies
def combine_embeddings(row):
    q1_vec = row['q1_vector']
    q2_vec = row['q2_vector']
    
    concatenation = np.concatenate((q1_vec, q2_vec))  # [q1, q2]
    subtraction = np.abs(q1_vec - q2_vec)             # [q1 - q2]
    multiplication = q1_vec * q2_vec                  # [q1 * q2]
    
    return np.concatenate((concatenation, subtraction, multiplication))  # Final combined vector

# Apply feature combination
df_new['combined_vector'] = df_new.apply(combine_embeddings, axis=1)

# Convert list of arrays into a 2D NumPy array for model training
X = np.stack(df_new['combined_vector'].values)  # Feature matrix
y = df_new['is_duplicate'].values  # Labels

# Print shape of final dataset
print(f"Feature Vector Shape: {X.shape}")  
print(f"Label Vector Shape: {y.shape}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.loc[:, 'q1_vector'] = df_new['question1'].apply(lambda x: get_sentence_embedding(x, glove_vectors))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.loc[:, 'q2_vector'] = df_new['question2'].apply(lambda x: get_sentence_embedding(x, glove_vectors))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

Feature Vector Shape: (404348, 200)
Label Vector Shape: (404348,)


### 3. Split your dataset into train, validation, and test sets. A typical split might be 80%–10%–10%, bu you may adjust as needed

In [20]:
from sklearn.model_selection import train_test_split

# Split the dataset (80% train, 10% validation, 10% test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Print dataset shapes
print(f"Training Set: {X_train.shape}, Validation Set: {X_val.shape}, Test Set: {X_test.shape}")

Training Set: (323478, 200), Validation Set: (40435, 200), Test Set: (40435, 200)
