In [2]:
import pandas as pd
import numpy as np


import tensorflow as tf
# Ask TensorFlow to grow GPU memory on demand.
# Iterating over the device list (instead of indexing [0]) keeps the notebook
# runnable on CPU-only machines, where the list is empty, and applies the same
# setting to every GPU when more than one is present.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras import backend as K

In [3]:
# Load the raw question-pair table and print its column / dtype summary.
train_csv_path = 'dataset/train.csv'
df = pd.read_csv(train_csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            404290 non-null  int64 
 1   qid1          404290 non-null  int64 
 2   qid2          404290 non-null  int64 
 3   question1     404289 non-null  object
 4   question2     404288 non-null  object
 5   is_duplicate  404290 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


In [4]:
# Count the missing values in each column.
df.isnull().sum()

id              0
qid1            0
qid2            0
question1       1
question2       2
is_duplicate    0
dtype: int64

In [5]:
# Drop every row that has a missing value, showing the shape before and after,
# then re-check the per-column null counts.
shape_before = df.shape
df = df.dropna(axis=0)
print(shape_before)
print(df.shape)
df.isna().sum()

(404290, 6)
(404287, 6)


id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [6]:
# Positional 90/10 split: the first 90% of rows are used for training, the
# remainder is held out and split again into validation / test afterwards.
split = int(0.9 * len(df))
df_train, df_test_val = df.iloc[:split], df.iloc[split:]
del df
print(df_train.shape, df_test_val.shape)

(363858, 6) (40429, 6)


In [7]:
# Halve the held-out rows positionally: first half -> validation, second half -> test.
split = int(0.5 * len(df_test_val))
df_val, df_test = df_test_val.iloc[:split], df_test_val.iloc[split:]
del df_test_val
print(df_test.shape, df_val.shape)

(20215, 6) (20214, 6)


In [8]:
def process_dataset(df):
    '''
    Pull the question pairs and their labels out of a dataframe.

    Reads the `question1`, `question2` and `is_duplicate` columns and
    returns their underlying value arrays as a 3-tuple, in that order.
    '''
    wanted_columns = ('question1', 'question2', 'is_duplicate')
    return tuple(df[column].values for column in wanted_columns)

In [9]:
# Extract the training arrays and print the first two question pairs with their labels.
Q1_train, Q2_train, Y_train = process_dataset(df_train)
for first_q, second_q, label in zip(Q1_train[:2], Q2_train[:2], Y_train[:2]):
    print('Question1 :', first_q)
    print('Question2 :', second_q)
    print('Is_Duplicate :', label)

Question1 : What is the step by step guide to invest in share market in india?
Question2 : What is the step by step guide to invest in share market?
Is_Duplicate : 0
Question1 : What is the story of Kohinoor (Koh-i-Noor) Diamond?
Question2 : What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?
Is_Duplicate : 0


In [10]:
# Extract question pairs and labels for the test and validation splits.
Q1_test,Q2_test,Y_test = process_dataset(df_test)
Q1_val,Q2_val,Y_val = process_dataset(df_val)

In [11]:
# Build the vocabulary from the training and validation questions; words that
# are not seen here are mapped to the '<UNK>' token.
# NOTE(review): validation text is part of the fit corpus, so the vocabulary is
# not derived from training data alone - confirm this is intended.
fit_corpus = np.concatenate([Q1_train, Q2_train, Q1_val, Q2_val])
tokenizer = Tokenizer(oov_token='<UNK>')
tokenizer.fit_on_texts(fit_corpus)

In [12]:
# Keep the word -> index mapping and convert every split's questions to
# integer id sequences with the same fitted tokenizer.
vocab = tokenizer.word_index
train_Q1, train_Q2, val_Q1, val_Q2, test_Q1, test_Q2 = (
    tokenizer.texts_to_sequences(questions)
    for questions in (Q1_train, Q2_train, Q1_val, Q2_val, Q1_test, Q2_test)
)

In [13]:
# Pad / truncate every sequence at the end to a fixed length of 128 tokens so
# all splits share one input shape.
train_Q1, train_Q2, val_Q1, val_Q2, test_Q1, test_Q2 = (
    pad_sequences(sequences, maxlen=128, padding='post', truncating='post')
    for sequences in (train_Q1, train_Q2, val_Q1, val_Q2, test_Q1, test_Q2)
)
print(train_Q1.shape, train_Q2.shape, val_Q1.shape, val_Q2.shape, test_Q1.shape, test_Q2.shape)

(363858, 128) (363858, 128) (20214, 128) (20214, 128) (20215, 128) (20215, 128)


In [14]:
def initialize_base_network(num_words = None,embed_dim = 64,seq_len = 128):
    '''
    Build the encoder that both branches of the Siamese model share.

    Architecture: Embedding -> LSTM(128, return_sequences=True)
    -> GlobalAveragePooling1D, giving one 128-dimensional vector per input
    sequence.

    Parameters
    ----------
    num_words : int or None
        Size of the embedding table. When None (the default) it is resolved
        at call time to `len(vocab)+1`, so re-fitting the tokenizer is
        picked up without re-running this definition.
        (The +1 presumably reserves index 0 for padding - confirm.)
    embed_dim : int
        Width of the embedding vectors.
    seq_len : int
        Length of the (padded) input sequences.

    Returns
    -------
    tf.keras.models.Model mapping a (batch, seq_len) batch of token ids to
    the pooled LSTM output.
    '''
    if num_words is None:
        num_words = len(vocab)+1

    # Named `base_input` rather than `input` to avoid shadowing the builtin.
    base_input = tf.keras.layers.Input(shape=(seq_len,), name="base_input")
    x = tf.keras.layers.Embedding(input_dim = num_words,output_dim=embed_dim)(base_input)
    x = tf.keras.layers.LSTM(128,return_sequences=True)(x)
    # Average the per-timestep LSTM outputs into one fixed-size vector.
    # (A deeper variant with Dropout, a second LSTM and a Dense head was left
    # disabled in the original cell and has been removed here.)
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    return tf.keras.models.Model(inputs=base_input, outputs=x)


def euclidean_distance(vects):
    '''
    Euclidean distance between two batches of vectors.

    `vects` is a pair (left, right); the result keeps a trailing axis of
    size 1 (keepdims=True on axis 1).
    '''
    left, right = vects
    squared_diff = K.square(left - right)
    squared_dist = K.sum(squared_diff, axis=1, keepdims=True)
    # Clamp to at least K.epsilon() so the sqrt argument is never exactly zero.
    clamped = K.maximum(squared_dist, K.epsilon())
    return K.sqrt(clamped)


def eucl_dist_output_shape(shapes):
    '''
    Output shape of the distance layer: the first input's leading (batch)
    dimension followed by a single column.
    '''
    first_shape, _ = shapes
    batch_dim = first_shape[0]
    return (batch_dim, 1)

In [15]:
# Build the shared encoder with the default arguments.
base_network = initialize_base_network()


In [16]:
# Print the layer-by-layer summary of the shared encoder.
base_network.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
base_input (InputLayer)      [(None, 128)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 128, 64)           5973888   
_________________________________________________________________
lstm (LSTM)                  (None, 128, 128)          98816     
_________________________________________________________________
global_average_pooling1d (Gl (None, 128)               0         
Total params: 6,072,704
Trainable params: 6,072,704
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Two inputs, one per question of a pair, each a sequence of 128 token ids.
input_a = tf.keras.layers.Input(shape=(128,), name="left_input")
input_b = tf.keras.layers.Input(shape=(128,), name="right_input")

# Both inputs are passed through the same base_network instance.
vect_output_a = base_network(input_a)
vect_output_b = base_network(input_b)

# The model's output is the Euclidean distance between the two encoded vectors.
distance_layer = tf.keras.layers.Lambda(
    euclidean_distance,
    name="output_layer",
    output_shape=eucl_dist_output_shape,
)
output = distance_layer([vect_output_a, vect_output_b])

# Siamese model: (left, right) token sequences -> distance.
model = tf.keras.models.Model([input_a, input_b], output)


In [18]:
# Print the summary of the full two-input model.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
left_input (InputLayer)         [(None, 128)]        0                                            
__________________________________________________________________________________________________
right_input (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
model (Model)                   (None, 128)          6072704     left_input[0][0]                 
                                                                 right_input[0][0]                
__________________________________________________________________________________________________
output_layer (Lambda)           (None, 1)            0           model[1][0]                

In [19]:
def contrastive_loss_with_margin(margin):
    '''
    Return a contrastive loss function bound to the given `margin`.
    '''
    def contrastive_loss(y_true, y_pred):
        '''Contrastive loss (Hadsell et al., 2006)
        http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

        y_pred is the predicted distance. Pairs with y_true == 1 are
        penalised by the squared distance; pairs with y_true == 0 are
        penalised by the squared shortfall of the distance below `margin`.
        '''
        similar_term = y_true * K.square(y_pred)
        shortfall = K.maximum(margin - y_pred, 0)
        dissimilar_term = (1 - y_true) * K.square(shortfall)
        return K.mean(similar_term + dissimilar_term)
    return contrastive_loss

In [20]:
# Convert the label arrays to float32 and display the resulting dtypes.
Y_train, Y_val, Y_test = (
    np.array(labels, dtype=np.float32) for labels in (Y_train, Y_val, Y_test)
)
Y_train.dtype, Y_val.dtype, Y_test.dtype

(dtype('float32'), dtype('float32'), dtype('float32'))

In [23]:
# Adam optimizer constructed with 0.001, and contrastive loss with a margin of 3.
adam_optimizer = tf.keras.optimizers.Adam(0.001)
margin_loss = contrastive_loss_with_margin(3)
model.compile(optimizer=adam_optimizer, loss=margin_loss)

In [24]:
# Train for 10 epochs in batches of 2048 pairs, evaluating on the validation
# split after each epoch.
validation_pairs = ([val_Q1, val_Q2], Y_val)
model.fit(
    x=[train_Q1, train_Q2],
    y=Y_train,
    batch_size=2048,
    epochs=10,
    validation_data=validation_pairs,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a93596d490>

In [49]:
def predict(model,Q1,Q2,threshold=0.7):
    '''
    Predict whether each (Q1[i], Q2[i]) pair is a duplicate.

    Both inputs are encoded with `model.predict`, the cosine similarity of
    each pair of encodings is computed, and a pair is labelled 1 when the
    similarity is strictly greater than `threshold`, otherwise 0.

    Parameters
    ----------
    model : object exposing `predict(batch)` that returns one vector per row.
    Q1, Q2 : array-like batches of equal length holding the paired inputs.
    threshold : float
        Cosine-similarity cut-off above which a pair counts as duplicate.

    Returns
    -------
    numpy.ndarray of 0/1 integers, one entry per pair.
    '''
    v1 = np.asarray(model.predict(Q1))
    v2 = np.asarray(model.predict(Q2))

    # Row-wise dot products and norm products, computed for the whole batch
    # at once instead of one pair per Python loop iteration.
    dot_product = np.sum(v1 * v2, axis=1)
    norm_product = np.linalg.norm(v1, axis=1) * np.linalg.norm(v2, axis=1)

    # A zero-norm encoding has no defined cosine similarity. Those rows are
    # skipped in the division (no 0/0 warning) and always predicted as 0,
    # which matches what the NaN comparison produced before.
    has_direction = norm_product > 0
    cos_sim = np.zeros(dot_product.shape, dtype=float)
    np.divide(dot_product, norm_product, out=cos_sim, where=has_direction)

    return (has_direction & (cos_sim > threshold)).astype(int)

In [30]:
# Show the first rows of the test split.
df_test.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
384075,384075,408489,304764,What is the biggest lie you ever told to yours...,What is the biggest lie you have told yourself?,1
384076,384076,65043,277644,What are some tips on making it through the jo...,What are some tips on making it through the jo...,0
384077,384077,283606,276977,What are the questions asked in SSB interviews?,What are some questions asked in SSB interview?,1
384078,384078,516074,516075,My boyfriend bought two mice from a pet store ...,What is acoustic emanation?,0
384079,384079,516076,516077,Is there any chance of Eminem coming to India ...,What are the chances of Eminem coming and perf...,1


In [32]:
# Rebind Y_test to the raw `is_duplicate` column values of the test split.
# NOTE(review): this replaces the float32 copy of Y_test created earlier in the
# notebook - confirm that is intended.
Y_test = df_test.is_duplicate.values

In [69]:
# Sweep cosine-similarity thresholds on the test split and record, for each
# one, the accuracy and the number of false positives / false negatives.
# NOTE(review): the threshold is chosen on the same test data that the final
# accuracy is reported on - consider tuning it on the validation split.
# Each predict() call below runs the encoder on the full test set again.
false_pos = []
false_neg = []
accuracy = []
values = [0.8,0.825,0.85,0.875,0.9,0.925,0.94,0.95,0.96,0.97]
for thresh in values:
    Y_hat = predict(base_network,test_Q1,test_Q2,thresh)
    accuracy.append(np.sum(Y_hat == Y_test)/len(Y_test))

    # Vectorized error counts: a wrong prediction is a false negative when
    # the true label is 1 and a false positive otherwise.
    wrong = Y_hat != Y_test
    false_neg.append(int(np.sum(wrong & (Y_test == 1))))
    false_pos.append(int(np.sum(wrong & (Y_test != 1))))
    
        

In [70]:
# Collect the sweep results into one table; total errors = FN + FP per threshold.
wrong_predictions = [n_fn + n_fp for n_fn, n_fp in zip(false_neg, false_pos)]
analysis_columns = {
    'Threshold_Value': values,
    'False_Positives': false_pos,
    'False_Negatives': false_neg,
    'Wrong Predictions': wrong_predictions,
    'Accuracy': accuracy,
}
df_analyse = pd.DataFrame(analysis_columns)

In [71]:
# Display the per-threshold error and accuracy table.
df_analyse

Unnamed: 0,Threshold_Value,False_Positives,False_Negatives,Wrong Predictions,Accuracy
0,0.8,2755,1069,3824,0.810834
1,0.825,2406,1186,3592,0.82231
2,0.85,2070,1317,3387,0.832451
3,0.875,1761,1455,3216,0.84091
4,0.9,1443,1661,3104,0.846451
5,0.925,1168,1902,3070,0.848133
6,0.94,1005,2084,3089,0.847193
7,0.95,894,2220,3114,0.845956
8,0.96,780,2404,3184,0.842493
9,0.97,651,2609,3260,0.838734


### From the table above, accuracy peaks at a threshold of 0.925, so the best threshold value for the model lies in the range [0.92, 0.94).
### At that threshold the model reaches an accuracy of approximately 84.8% on the test set.