Read in the training set

In [None]:
import pandas as pd
Train = pd.read_csv("/content/train.csv")
#Use this line when we just want to train on a subset of the training set.
Train = Train.iloc[1:4000,]

Pre-Processing of the training set

In [None]:
#1. Lowercase all questions and remove punctuation

#Remark: We need to remove puntations before removing empty columns, because some questions only contains puntuations and should be considered missing values. However, they will not be dropped if the puntutations are present when we executed the drop command

##To remove punctuation: We will use a code provided by CHATGPT

import string

def remove_punctuation(text):
    # Create a translation table to remove punctuation characters
    translator = str.maketrans('', '', string.punctuation)

    # Use the translation table to remove punctuation
    text_without_punctuation = text.translate(translator)

    return text_without_punctuation

#For each question: Set all to lowercase then remove puntuation, and replace it in the original Train dataframe

for i in range(len(Train["question1"])):
    Train["question1"].iloc[i] = remove_punctuation(Train["question1"].iloc[i].lower())
    Train["question2"].iloc[i] = remove_punctuation(Train["question2"].iloc[i].lower())

#2. Remove rows with missing data
Train = Train.dropna()
Train = Train[~Train.apply(lambda row: row.str.strip().str.len().eq(0).any(), axis=1)]

#2. Extract the questions
##Extract the first questions and store it in a variable
Q1 = Train["question1"].copy()
##Extract the second questions and store it in a variable
Q2 = Train["question2"].copy()





A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Train["question1"].iloc[i] = remove_punctuation(Train["question1"].iloc[i].lower())
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Train["question2"].iloc[i] = remove_punctuation(Train["question2"].iloc[i].lower())


Tokenize the questions in each question pair

In [None]:
#To tokenize the questions, we will use a pre-trained tokenizer model from the spaCy library
import spacy
#From the spacy library: Import a pre-trained tokenizer
Tokenizer = spacy.load("en_core_web_sm")
#Extract the tokens from each of the questions

##Create a copy of each of the questions
Q1_Tokens = Q1.copy()
Q2_Tokens = Q2.copy()
#For each question:
for i in range(len(Q1_Tokens)):
    #Tokenize each question and overwrite each question with its token
    Q1_Tokens.iloc[i] = [token.text for token in Tokenizer(Q1_Tokens.iloc[i])]
    Q2_Tokens.iloc[i] = [token.text for token in Tokenizer(Q2_Tokens.iloc[i])]

Create embeddings using Word2Vec

In [None]:
from gensim.models import Word2Vec
Q1_Embedded = Q1_Tokens.copy()
Q2_Embedded = Q2_Tokens.copy()
#For each question: Embed each word using Word2Vec
for i in range(len(Q1_Embedded)):
    #print(i)
    Q1_Embedded.iloc[i] = Word2Vec([Q1_Embedded.iloc[i]],vector_size=100, window=5, min_count=1, sg=0)
    Q2_Embedded.iloc[i] = Word2Vec([Q2_Embedded.iloc[i]],vector_size=100, window=5, min_count=1, sg=0)

#Aggregate each question to get a vector representation of each question, using the average method

#For each question pair
for i in range(len(Q1_Embedded)):

    #For each question: Extract the vector representation for each word and average them

    #Create a tracker that will store the running sum of the word's vectors in the first question in this pair
    Sum_Q1 = 0
    #For each word in the first question in this pair
    for word in Q1_Tokens.iloc[i]:
        #Extract the vector and add it to the running sum
        Sum_Q1 = Sum_Q1 + Q1_Embedded.iloc[i].wv[word]
    #Find the average of these sums ~ This is the vector representation of the first question in this pair
    Sum_Q1 = Sum_Q1/len(Q1_Tokens)
    #Overwrite the list with this new vector
    Q1_Embedded.iloc[i] = Sum_Q1
    #Create a tracker that will store the running sum of the word's vectors in the second question in this pair
    Sum_Q2 = 0
    #For each word in the second question in this pair
    for word in Q2_Tokens.iloc[i]:
        #Extract the vector and add it to the running sum
        Sum_Q2 = Sum_Q2 + Q2_Embedded.iloc[i].wv[word]
    #Find the average of these sums ~ This is the vector representation of the second question in this pair
    Sum_Q2 = Sum_Q2/len(Q2_Tokens)
    #Overwrite the list wih this vector
    Q2_Embedded.iloc[i] = Sum_Q2


Initialization of what is the underlying model we will use in the Siamese architecture


In [None]:
#Creating a multi-layer perceptron neural network that will be used as the underlying architecture

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.losses import cosine_similarity

#Identify the length of the vectors used in the embeddings
embedding_dim = 100

#Identify the number of questions. This will indicate how many nodes we will use
Questions_Count = Q2_Embedded.shape[0]

#This indicates that we will be using 4 layers in our underlying architecture. ~ Can be increased/decreased
shared_network = keras.Sequential([
    layers.Dense(Questions_Count, activation='relu', input_shape=(embedding_dim,)),
    layers.Dense(Questions_Count, activation='relu'),
    layers.Dense(Questions_Count, activation='relu'),
    layers.Dense(Questions_Count, activation='relu'),
    layers.Dense(Questions_Count, activation='relu'),
    layers.Dense(Questions_Count, activation='relu'),
    layers.Dense(Questions_Count, activation='relu'),
    layers.Dense(Questions_Count, activation='relu'),

])


Initialize the Siamese network architecture with the above model as the underlying architecture

In [None]:
#Initialize the Siamese network architecture

#Define the left and right inputs for the question pair ~ Initializing how long the vectors that represent each question is
left_input = layers.Input(shape=(embedding_dim,))
right_input = layers.Input(shape=(embedding_dim,))

# Encode the question pair using the shared network ~ Indicate that we will input the questions both question in the pair into the shared network
encoded_left = shared_network(left_input)
encoded_right = shared_network(right_input)

# Calculate the Euclidean distance between the encodings ~ I.e: We will use this distance function to determine the similarity between the outputs
distance = layers.Lambda(lambda x: tf.norm(x[0] - x[1], axis=1, keepdims=True))([encoded_left, encoded_right])
#distance=layers.Lambda(lambda x: cosine_similarity(x[0], x[1]))([encoded_left, encoded_right])

# We have initialized the model. Now, just create the Siamese model
siamese_model = keras.Model(inputs=[left_input, right_input], outputs=distance)


#Compile the model ~ Specifies how the training should be done etc.....
siamese_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#siamese_model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])


Training of the model

In [None]:
#Inputs has to be an array of a list of lists -> Convert from array of list to a list

Q1_Inputs = []

for i in range(len(Q1_Embedded)):
    Q1_Inputs.append(Q1_Embedded.iloc[i].tolist())

Q1_Inputs = np.array(Q1_Inputs)

Q2_Inputs = []

for i in range(len(Q2_Embedded)):
    Q2_Inputs.append(Q2_Embedded.iloc[i].tolist())

Q2_Inputs = np.array(Q2_Inputs)


#Training of the model

siamese_model.fit(
    [Q1_Inputs, Q2_Inputs],  # Your question embeddings
    np.array(Train["is_duplicate"]),  # Similarity labels (0 for dissimilar, 1 for similar)
    batch_size=32,
    epochs=10,
    validation_split=0.2  # You can adjust the validation split
)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7ab470203970>

Finding the fitted values/Predicting the training points

In [None]:
Results = (siamese_model.predict([Q1_Inputs, Q2_Inputs]))
Prediction = []
for i in range(len(Results)):
    if Results[i]>0.5:
        Prediction.append(1)
    else:
        Prediction.append(0)

#Calculating training accuracy:

Score = 0

for i in range(len(Prediction)):
    if Prediction[i] == Train["is_duplicate"].tolist()[i]:
        Score = Score + 1

print("Training Accuracy is")
print(Score/len(Prediction))

Training Accuracy is
0.5932966483241621


In [None]:
print(sum(Prediction))

502
