In [1]:
import random
import numpy as np
import pandas as pd
import trax
import trax.layers as tl
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from trax.fastmath import numpy as fastnp

In [2]:
# Load the question-pair dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('dataset/train.csv')
# Report (rows, columns) and preview the first five rows.
print(df.shape)
df.head(5)

(404290, 6)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
# Keep the first 80% of the rows in `df` and hold the remaining rows out as `df_test`.
# Both slices are taken from the same (pre-assignment) frame.
# NOTE: `df` is overwritten here, so re-running this cell without reloading shrinks it again.
split = int(len(df) * 0.8)
df, df_test = df.iloc[:split], df.iloc[split:]
print(df.shape)
df.head(5)

(323432, 6)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
# Keep only the rows labelled as duplicates (is_duplicate == 1).
df_filtered = df.loc[df['is_duplicate'] == 1]
print('Shape of the new dataframe is ', df_filtered.shape)
print('Number of rows which have been filtered out are ', len(df) - len(df_filtered))
df_filtered.head(5)

Shape of the new dataframe is  (120176, 6)
Number of rows which have been filtered out are  203256


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
11,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1


In [5]:
# Raw question strings of the duplicate pairs, as NumPy arrays.
Q1_text = df_filtered['question1'].to_numpy()
Q2_text = df_filtered['question2'].to_numpy()
print(type(Q1_text), Q1_text.shape)

<class 'numpy.ndarray'> (120176,)


In [6]:
# Show the question1 strings of the duplicate pairs (NumPy abbreviates the middle of the array).
print(Q1_text)

['Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?'
 'How can I be a good geologist?'
 'How do I read and find my YouTube comments?' ...
 'What should be my strategy to earn coins in MiniClip 8 Ball Pool?'
 'What is the best question one has ever come across in Quora?'
 'How will black money and corruption be stopped by banning 500 and 1000 notes?']


In [7]:
# Fit one tokenizer on both question columns so the two sides share a vocabulary;
# words not seen during fitting map to the '<UNK>' token.
tokenizer = Tokenizer(oov_token='<UNK>')
tokenizer.fit_on_texts(np.concatenate([Q1_text, Q2_text]))
# word -> integer id mapping learned by the tokenizer
vocab = tokenizer.word_index
print(len(vocab))

28986


In [8]:
# Convert every question string into its list of integer token ids.
Q1_train, Q2_train = (
    tokenizer.texts_to_sequences(texts) for texts in (Q1_text, Q2_text)
)


In [9]:
# Split the tokenised pairs 80/20 into training and validation parts.
# Each right-hand side is evaluated before the names are rebound, so both
# halves are cut from the full (unsplit) list.
# NOTE: re-running this cell without re-running the previous one shrinks the training set again.
split = int(len(Q1_train) * 0.8)
Q1_train, Q1_val = Q1_train[:split], Q1_train[split:]
Q2_train, Q2_val = Q2_train[:split], Q2_train[split:]
print(Q1_train[:2])

[[1509, 6, 160, 10, 8207, 942, 5463, 658, 14, 5463, 7838, 3, 24, 32, 479, 45, 94], [5, 13, 6, 23, 10, 46, 16437]]


In [10]:
# Pad (with zeros, at the end) or cut every sequence to exactly 64 token ids.
# truncating='post' is set explicitly so over-long questions are cut at the end,
# the same way the test questions are prepared later in the notebook; the
# pad_sequences default ('pre') would instead drop the *beginning* of long
# training questions and make train and test preprocessing disagree.
Q1_train, Q1_val, Q2_train, Q2_val = (
    pad_sequences(seqs, maxlen=64, padding='post', truncating='post')
    for seqs in (Q1_train, Q1_val, Q2_train, Q2_val)
)
print(Q1_train.shape,Q1_val.shape,Q2_train.shape,Q2_val.shape)

(96140, 64) (24036, 64) (96140, 64) (24036, 64)


In [11]:
def stream(q1,q2,batch_size=128,shuffle=True):
    """Endlessly yield aligned batches of question pairs.

    The first pass over the data is in the original order. Every time the data
    is exhausted the visiting order is reshuffled (when ``shuffle`` is true) and
    that order is kept for the whole following pass, so each pass visits every
    pair exactly once. A batch may straddle two passes.

    Args:
        q1: Indexable collection of first questions (e.g. a 2-D array of token ids).
        q2: Indexable collection of second questions; ``q2[i]`` is paired with ``q1[i]``.
        batch_size (int, optional): Number of pairs per batch. Defaults to 128.
        shuffle (bool, optional): Reshuffle the order at the start of every pass
            after the first. Defaults to True.

    Yields:
        tuple: ``(np.ndarray, np.ndarray)`` holding ``batch_size`` rows of ``q1``
        and the matching rows of ``q2``.

    Raises:
        ValueError: If ``q1`` is empty (raised when the first batch is requested).
    """
    n_pairs = len(q1)
    if n_pairs == 0:
        raise ValueError('stream() needs at least one question pair')

    # Built once, outside the batch loop: re-creating this list for every batch
    # would throw away the shuffle and make a pass repeat or skip pairs.
    indexes = list(range(n_pairs))
    idx = 0
    while True:
        b1, b2 = [], []
        for _ in range(batch_size):
            if idx >= n_pairs:
                # End of a pass: start over, optionally in a new random order.
                idx = 0
                if shuffle:
                    np.random.shuffle(indexes)
            position = indexes[idx]
            b1.append(q1[position])
            b2.append(q2[position])
            idx += 1

        yield (np.array(b1), np.array(b2))

In [12]:
# Draw a single two-pair batch from a throw-away generator to inspect its format.
(res1, res2) = next(stream(Q1_train, Q2_train, batch_size=2))
print('Fist Question \n', res1, sep=' ')
print('Second Question \n', res2, sep=' ')

Fist Question 
 [[ 1509     6   160    10  8207   942  5463   658    14  5463  7838     3
     24    32   479    45    94     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]
 [    5    13     6    23    10    46 16437     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]]
Second Question 
 [[  261    10  3290  8207   942   658    14 12319     9  8207     3    24
    112   479    45    94     0     0     0     0     0     0     0     0
      0     0     0    

In [13]:
# Infinite batch generators (default batch size) for training and evaluation.
train_gen, eval_gen = stream(Q1_train, Q2_train), stream(Q1_val, Q2_val)

In [14]:
def normalize(x, eps=1e-12):
    """Scale vectors along the last axis to unit L2 norm.

    Args:
        x: Array whose last axis holds the vector components.
        eps (float, optional): Lower bound applied to the norm. Vectors whose
            norm is above ``eps`` are unaffected; an all-zero vector is returned
            as zeros instead of NaN. Defaults to 1e-12.

    Returns:
        Array of the same shape as ``x``.
    """
    squared_norm = fastnp.sum(x * x, axis=-1, keepdims=True)
    # Clamp *before* the square root: this avoids 0/0 for zero vectors and also
    # keeps the derivative of sqrt finite there.
    return x / fastnp.sqrt(fastnp.maximum(squared_norm, eps * eps))
# One encoder branch: token ids -> embeddings -> LSTM -> mean over axis 1 -> unit-length vector.
branch = tl.Serial(
    tl.Embedding(vocab_size=len(vocab) + 1, d_feature=128),
    tl.LSTM(n_units=128),
    tl.Mean(axis=1),
    # Wrapped in a one-argument lambda so the layer always sees a plain single-input function.
    tl.Fn('Normalize', lambda vectors: normalize(vectors)),
)

# The same branch object is used for both inputs of the Siamese model.
model = tl.Parallel(branch, branch)
model

Parallel_in2_out2[
  Serial[
    Embedding_28987_128
    LSTM_128
    Mean
    Normalize
  ]
  Serial[
    Embedding_28987_128
    LSTM_128
    Mean
    Normalize
  ]
]

In [15]:
def TripletLossFn(v1, v2, margin=0.0):
    """Triplet loss with in-batch hard and mean negatives.

    Row ``i`` of ``v1`` and row ``i`` of ``v2`` form a positive (duplicate) pair;
    every other row of ``v2`` acts as a negative for row ``i`` of ``v1``.
    Similarities are plain dot products, which equal cosine similarities when
    the rows are unit-normalised.

    Args:
        v1 (numpy.ndarray): Array with dimension (batch_size, model_dimension) associated to Q1.
        v2 (numpy.ndarray): Array with dimension (batch_size, model_dimension) associated to Q2.
        margin (float, optional): Desired margin. Defaults to 0.0.

    Returns:
        jax.interpreters.xla.DeviceArray: Triplet Loss.
    """
    # Pairwise similarity matrix: scores[i, j] = v1[i] . v2[j]
    scores = fastnp.dot(v1, v2.T)
    batch_size = scores.shape[0]
    identity = fastnp.eye(batch_size)

    # Similarity of each true pair sits on the diagonal.
    positive = fastnp.diagonal(scores)

    # Push the diagonal down by 2.0 so a positive can never be picked as the
    # hardest negative (cosine similarities lie in [-1, 1]).
    negative_without_positive = scores - identity * 2.0
    closest_negative = negative_without_positive.max(axis=1)

    # Zero the diagonal and average the remaining (negative) similarities.
    negative_zero_on_duplicate = (1.0 - identity) * scores
    # A batch of a single pair has no negatives; dividing by max(..., 1) keeps
    # the mean at 0 there instead of producing 0/0 = NaN.
    n_negatives = max(batch_size - 1, 1)
    mean_negative = fastnp.sum(negative_zero_on_duplicate, axis=1) / n_negatives

    # Hinge on the hardest negative and on the mean negative.
    triplet_loss1 = fastnp.maximum(margin - positive + closest_negative, 0)
    triplet_loss2 = fastnp.maximum(margin - positive + mean_negative, 0)

    # Average the summed per-example losses over the batch.
    triplet_loss = fastnp.mean(triplet_loss1 + triplet_loss2)

    return triplet_loss


In [16]:
from functools import partial
def TripletLoss(margin=0.25):
    """Build a trax layer that applies ``TripletLossFn`` with a fixed margin.

    Args:
        margin (float, optional): Margin forwarded to ``TripletLossFn``. Defaults to 0.25.

    Returns:
        A ``tl.Fn`` layer named 'TripletLoss' taking the two embedding batches.
    """
    def _loss_with_margin(v1, v2):
        # Two-argument wrapper with the margin baked in.
        return TripletLossFn(v1, v2, margin=margin)

    return tl.Fn('TripletLoss', _loss_with_margin)

In [17]:
def TrainModel(Model,train_generator,eval_generator,TripletLoss,out_dir='./model'):
    """Assemble a trax training loop for the Siamese model.

    Args:
        Model: The trax model to train.
        train_generator: Generator yielding training batches.
        eval_generator: Generator yielding evaluation batches.
        TripletLoss: Zero-argument callable returning the loss layer; it is
            called once for the training loss and once for the evaluation metric.
        out_dir (str, optional): Directory passed to the loop as ``output_dir``.
            Defaults to './model'.

    Returns:
        trax.supervised.training.Loop: The configured (not yet run) training loop.
    """
    training = trax.supervised.training

    # Training objective: triplet loss optimised with Adam at a fixed learning
    # rate of 0.01, reporting every 50 steps.
    train_task = training.TrainTask(
        labeled_data=train_generator,
        loss_layer=TripletLoss(),
        optimizer=trax.optimizers.Adam(learning_rate=0.01),
        n_steps_per_checkpoint=50,
    )

    # Evaluation reports the same loss on the held-out generator.
    eval_task = training.EvalTask(
        labeled_data=eval_generator,
        metrics=[TripletLoss()],
    )

    return training.Loop(
        Model,
        tasks=[train_task],
        eval_tasks=[eval_task],
        output_dir=out_dir,
    )

In [18]:
# Delete the 'model' directory (the default out_dir of TrainModel) before building the loop.
# NOTE(review): presumably done so training starts without an earlier run's files — confirm.
!rm -rf 'model'
# Build the training loop; nothing is trained until loop.run(...) is called.
loop = TrainModel(model,train_gen,eval_gen,TripletLoss)

In [19]:
# Train for 2000 steps; TrainModel configures a report every 50 steps.
# NOTE(review): the recorded log stays at ~0.5, i.e. 2 * margin, which is the value
# TripletLossFn gives when all embeddings in a batch are identical — worth
# checking whether the model is actually learning.
loop.run(2000)


Step      1: Total number of trainable weights: 3841920
Step      1: Ran 1 train steps in 2.93 secs
Step      1: train TripletLoss |  0.49999785
Step      1: eval  TripletLoss |  0.49999791

Step     50: Ran 49 train steps in 2.99 secs
Step     50: train TripletLoss |  0.49998072
Step     50: eval  TripletLoss |  0.49999845

Step    100: Ran 50 train steps in 2.15 secs
Step    100: train TripletLoss |  0.49997699
Step    100: eval  TripletLoss |  0.49999872

Step    150: Ran 50 train steps in 2.22 secs
Step    150: train TripletLoss |  0.49999672
Step    150: eval  TripletLoss |  0.49999535

Step    200: Ran 50 train steps in 2.36 secs
Step    200: train TripletLoss |  0.49999416
Step    200: eval  TripletLoss |  0.49999815

Step    250: Ran 50 train steps in 2.44 secs
Step    250: train TripletLoss |  0.49999428
Step    250: eval  TripletLoss |  0.49999604

Step    300: Ran 50 train steps in 2.36 secs
Step    300: train TripletLoss |  0.49998459
Step    300: eval  TripletLoss |  0.49

In [39]:
# Keep only the first 80384 test rows.
# NOTE(review): 80384 = 1256 * 64, a multiple of classify()'s default batch_size,
# so this presumably exists to avoid a short final batch — confirm.
df_test = df_test.iloc[:80384]

In [40]:
# Count missing values per column of the test frame.
df_test.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [41]:
# Replace any missing value with the '<UNK>' placeholder.
# The result is rebound instead of using inplace=True: `df_test` was produced by
# .iloc slicing, and in-place modification of such a frame is the pattern pandas
# warns about (SettingWithCopyWarning).
df_test = df_test.fillna('<UNK>')
df_test.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [42]:
# Raw test questions and their duplicate labels as NumPy arrays.
Q1_test = df_test['question1'].to_numpy()
Q2_test = df_test['question2'].to_numpy()
Y = df_test['is_duplicate'].to_numpy()

In [43]:
# Encode the test questions with the tokenizer fitted on the training questions.
Q1_test, Q2_test = (
    tokenizer.texts_to_sequences(texts) for texts in (Q1_test, Q2_test)
)

In [44]:
# Pad or cut every test sequence at the end so each has exactly 64 token ids.
Q1_test, Q2_test = (
    pad_sequences(seqs, maxlen=64, padding='post', truncating='post')
    for seqs in (Q1_test, Q2_test)
)

In [26]:
def classify(test_Q1, test_Q2, y, threshold, model, vocab, data_generator=stream, batch_size=64):
    """Function to test the accuracy of the model.

    A pair is predicted to be a duplicate when the dot product of its two
    embeddings is greater than ``threshold``.

    The inputs are expected to be already padded to a common length, so they
    are sliced into batches directly. ``vocab`` and ``data_generator`` are kept
    only for backward compatibility and are not used: the previous version
    called ``data_generator`` with ``pad=``/``shuffle=`` keywords and looked up
    ``vocab['<PAD>']``, neither of which the objects defined in this notebook
    provide.

    Args:
        test_Q1 (numpy.ndarray): Array of padded Q1 questions.
        test_Q2 (numpy.ndarray): Array of padded Q2 questions.
        y (numpy.ndarray): Array of actual target.
        threshold (float): Desired threshold.
        model (trax.layers.combinators.Parallel): The Siamese model.
        vocab: Unused; kept for backward compatibility.
        data_generator (function): Unused; kept for backward compatibility.
        batch_size (int, optional): Size of the batches. Defaults to 64.

    Returns:
        float: Accuracy of the model.
    """
    n_examples = len(test_Q1)
    n_correct = 0

    for start in range(0, n_examples, batch_size):
        stop = start + batch_size
        # Slicing keeps questions and labels aligned and naturally yields a
        # shorter final batch when n_examples is not a multiple of batch_size.
        q1 = np.asarray(test_Q1[start:stop])
        q2 = np.asarray(test_Q2[start:stop])
        y_test = np.asarray(y[start:stop])

        v1, v2 = model((q1, q2))

        # Row-wise dot product of the two embedding batches.
        similarity = np.asarray(fastnp.sum(v1 * v2, axis=-1))
        predicted_duplicate = similarity > threshold

        n_correct += int(np.sum(predicted_duplicate == y_test))

    return n_correct / n_examples

In [45]:
# Evaluate on the held-out pairs with a similarity threshold of 0.75.
pred = classify(Q1_test, Q2_test, Y, threshold=0.75, model=model, vocab=vocab)

In [47]:
# `classify` is documented to return the overall accuracy as a single number,
# whereas this cell was written for an array of per-example predictions and
# fails when zipping over a scalar. Handle both forms.
if np.ndim(pred) == 0:
    # Already an accuracy value.
    accuracy = float(pred)
else:
    # Per-example predictions: count the matches against the true labels.
    accu = 0
    for x,y in zip(Y,pred):
        if x==y:
            accu += 1
    accuracy = accu/len(Q1_test)
accuracy

0.7294610867834395