# Duplicate Questions: Siamese Network
- Siamese Network
- Triplet Loss
- Accuracy Evaluation
- Cosine Similarity between Model's Vector
- Data Generators for Question Batches
- Predict

In [1]:
import os
import nltk
import trax
from trax import layers as tl
from trax.supervised import training
from trax.fastmath import numpy as fastnp
import numpy as np
import pandas as pd
import random as rnd

# Set random seeds
trax.supervised.trainer_lib.init_random_number_generators(34)
rnd.seed(34)

INFO:tensorflow:tokens_length=568 inputs_length=512 targets_length=114 noise_density=0.15 mean_noise_span_length=3.0 




In [2]:
data = pd.read_csv("questions.csv")
N = len(data)
print(f'Number of question pairs: {N}')
data.head()

Number of question pairs: 404351


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
N_train = 300000
N_test = 10 * 1024
data_train = data[:N_train]
data_test = data[N_train:N_train+N_test]
print("Train set:", len(data_train), "Test set:", len(data_test))
del(data) # remove to free memory

Train set: 300000 Test set: 10240


In [4]:
td_index = (data_train['is_duplicate'] == 1).to_numpy()
td_index = [i for i, x in enumerate(td_index) if x]

print('number of duplicate questions: ', len(td_index))
print('indexes of first ten duplicate questions:', td_index[:10])

number of duplicate questions:  111486
indexes of first ten duplicate questions: [5, 7, 11, 12, 13, 15, 16, 18, 20, 29]


In [5]:
print(data_train['question1'][5])  #  Example of question duplicates (first one in data)
print(data_train['question2'][5])
print('is_duplicate: ', data_train['is_duplicate'][5])

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?
is_duplicate:  1


In [6]:
print(data_train['question1'][25])  #  Example of question duplicates (first one in data)
print(data_train['question2'][25])
print('is_duplicate: ', data_train['is_duplicate'][25])

What are some tips on making it through the job interview process at Medicines?
What are some tips on making it through the job interview process at Foundation Medicine?
is_duplicate:  0


In [7]:
print(data_train['question1'][125])  #  Example of question duplicates (first one in data)
print(data_train['question2'][125])
print('is_duplicate: ', data_train['is_duplicate'][125])

Why does China block sanctions at the UN against the Jaish-e-Mohammad (JeM) chief, Masood Azhar?
Why does China support Masood Azhar?
is_duplicate:  1


In [8]:
Q1_train_words = np.array(data_train['question1'][td_index])
Q2_train_words = np.array(data_train['question2'][td_index])

Q1_test_words = np.array(data_test['question1'])
Q2_test_words = np.array(data_test['question2'])
y_test  = np.array(data_test['is_duplicate'])

In [9]:
print('TRAINING QUESTIONS:\n')
print('Question 1: ', Q1_train_words[0])
print('Question 2: ', Q2_train_words[0], '\n')
print('Question 1: ', Q1_train_words[5])
print('Question 2: ', Q2_train_words[5], '\n')

print('TESTING QUESTIONS:\n')
print('Question 1: ', Q1_test_words[0])
print('Question 2: ', Q2_test_words[0], '\n')
print('is_duplicate =', y_test[0], '\n')

TRAINING QUESTIONS:

Question 1:  Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
Question 2:  I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me? 

Question 1:  What would a Trump presidency mean for current international master’s students on an F1 visa?
Question 2:  How will a Trump presidency affect the students presently in US or planning to study in US? 

TESTING QUESTIONS:

Question 1:  How do I prepare for interviews for cse?
Question 2:  What is the best way to prepare for cse? 

is_duplicate = 0 



- Encode each word of the selected duplicate pairs with an index
- It is encoded as list of numbers
- Tokenize the question using `nltk.word.tokenize`
- Use a dictionary, during inference, assing 0 to alll out of vocab OOV words
- Encode each word of the selected duplicate pairs with an index
- Given a question, you can encode it as a list of numbers


In [10]:
# Create Arrays
Q1_train = np.empty_like(Q1_train_words)
Q2_train = np.empty_like(Q2_train_words)

Q1_test = np.empty_like(Q1_test_words)
Q2_test = np.empty_like(Q2_test_words)

In [11]:
Q1_train
print(Q1_train.shape)
print(type(Q1_train))

(111486,)
<class 'numpy.ndarray'>


In [12]:
# Building vocabulary with the train set
from collections import defaultdict

vocab = defaultdict(lambda: 0)
vocab['<PAD>'] = 1

vocab

defaultdict(<function __main__.<lambda>()>, {'<PAD>': 1})

In [13]:
for idx in range(len(Q1_train_words)):
    Q1_train[idx] = nltk.word_tokenize(Q1_train_words[idx])
    Q2_train[idx] = nltk.word_tokenize(Q2_train_words[idx])
    
    q = Q1_train[idx] + Q2_train[idx]
    for word in q:
        if word not in vocab:
            vocab[word] = len(vocab) + 1
print(f'The length of the vocabulary is {len(vocab)}')

The length of the vocabulary is 36306


In [14]:
print(vocab['<PAD>'])
print(vocab['Astrology'])
print(vocab['Astronomy'])  #not i

1
2
0


In [15]:
for idx in range(len(Q1_test_words)): 
    Q1_test[idx] = nltk.word_tokenize(Q1_test_words[idx])
    Q2_test[idx] = nltk.word_tokenize(Q2_test_words[idx])

In [16]:
print('Train set has reduced to: ', len(Q1_train) ) 
print('Test set length: ', len(Q1_test) ) 

Train set has reduced to:  111486
Test set length:  10240


## Questions to Tensors: CONVERT

In [17]:
for i in range(len(Q1_train)):
    Q1_train[i] = [vocab[word] for word in Q1_train[i]]
    Q2_train[i] = [vocab[word] for word in Q2_train[i]]

        
for i in range(len(Q1_test)):
    Q1_test[i] = [vocab[word] for word in Q1_test[i]]
    Q2_test[i] = [vocab[word] for word in Q2_test[i]]

In [18]:
print('first question in the train set:\n')
print(Q1_train_words[0], '\n') 
print('encoded version:')
print(Q1_train[0],'\n')

print('first question in the test set:\n')
print(Q1_test_words[0], '\n')
print('encoded version:')
print(Q1_test[0]) 

first question in the train set:

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me? 

encoded version:
[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] 

first question in the test set:

How do I prepare for interviews for cse? 

encoded version:
[32, 38, 4, 107, 65, 1015, 65, 11514, 21]


In [19]:
# Splitting the data
cut_off = int(len(Q1_train)*.8)
train_Q1, train_Q2 = Q1_train[:cut_off], Q2_train[:cut_off]
val_Q1, val_Q2 = Q1_train[cut_off: ], Q2_train[cut_off:]
print('Number of duplicate questions: ', len(Q1_train))
print("The length of the training set is:  ", len(train_Q1))
print("The length of the validation set is: ", len(val_Q1))

Number of duplicate questions:  111486
The length of the training set is:   89188
The length of the validation set is:  22298


## Understanding the Iterator
- In AI/NLP, we uses batches of data while training
- SGD with one example, will train for ever
- Build a generator takes Q1 and Q2 and returns batches in the format $([q1_1, q1_2, q1_3, ...]$, $[q2_1, q2_2,q2_3, ...])$
- The tuple consists of two arrays and each array has `batch_size` questions
- $q1_i$ and $q2_i$ are duplicates, but they are not duplicates with any other elements in the batch.

In [20]:
def data_generator(Q1, Q2, batch_size, pad=1, shuffle=True):
    """
    Q1: List of transformed questions(tensors)
    Q2: List of transformed questions(tensors)
    batch_size: Num elems per batch
    pad(int): Pad char
    shuffle(bool): If the batchs should be randomized or not. 
    
    Yields:
    tuple: Of the form (input1, input2) with types (numpy.ndarray, numpy.ndarray)
    NOTE: input1: inputs to your model [q1a, q2a, q3a, ...] i.e. (q1a,q1b) are duplicates
              input2: targets to your model [q1b, q2b,q3b, ...] i.e. (q1a,q2i) i!=a are not duplicates
    """
    
    input1 = []
    input2 = []
    
    idx = 0
    len_q = len(Q1)
    question_indexes = [*range(len_q)]
    
    if shuffle:
        rnd.shuffle(question_indexes)
        
    while(True):
        if(idx >= len_q):
            idx = 0
            
        if(shuffle):
            rnd.shuffle(question_indexes)
            
        # Get questions at the `question_indexes[idx]` position in Q1 and Q2
        q1 = Q1[question_indexes[idx]]
        q2 = Q2[question_indexes[idx]]
        
        idx += 1
        input1.append(q1)
        input2.append(q2)
        
        if(len(input1) == batch_size):
            max_1 = max([len(input1[i]) for i in range(batch_size)])
            max_2 = max([len(input2[i]) for i in range(batch_size)])
            max_len = max(max_1, max_2)
            max_len = 2**int(np.ceil(np.log2(max_len)))
            
            b1 = []
            b2 = []
            for q1, q2 in zip(input1, input2):
                q1 = q1 + ((max_len - len(q1)) * [pad])
                q2 = q2 + ((max_len - len(q2)) * [pad])
                
                b1.append(q1)
                # append q2
                b2.append(q2)
                
            yield np.array(b1), np.array(b2)
            
            input1, input2 = [], []

In [21]:
batch_size = 2
res1, res2 = next(data_generator(train_Q1, train_Q2, batch_size))
print("First questions  : ",'\n', res1, '\n')
print("Second questions : ",'\n', res2)

First questions  :  
 [[   30    87   116   516   517   990    39   389 14767    21     1     1
      1     1     1     1]
 [   30    87    78   216   685    65  1715   208    21     1     1     1
      1     1     1     1]] 

Second questions :  
 [[ 1611    53 22783     6   516   612    17   254   990    39   619   389
   4306   622    21     1]
 [   30    87   116    35   685    65  1715   208    21     1     1     1
      1     1     1     1]]


## Defining a Siamese Network

- Get the question embedding, run it through an LSTM layer, 
- Normalize $v_1$ and $v_2$, and 
- Finally use a triplet loss (explained below) to get the corresponding cosine similarity for each pair of questions. 

As usual, you will start by importing the data set. 
- The triplet loss makes use of a baseline (anchor) input that is compared to a positive (truthy) input and a negative (falsy) input. 
- The distance from the baseline (anchor) input to the positive (truthy) input is minimized, and the distance from the baseline (anchor) input to the negative (falsy) input is maximized. 

In math equations, you are trying to maximize the following.

$$\mathcal{L}(A, P, N)=\max \left(\|\mathrm{f}(A)-\mathrm{f}(P)\|^{2}-\|\mathrm{f}(A)-\mathrm{f}(N)\|^{2}+\alpha, 0\right)$$

$A$ is the anchor input, for example $q1_1$, $P$ the duplicate input, for example, $q2_1$, and $N$ the negative input (the non duplicate question), for example $q2_2$.<br>
$\alpha$ is a margin; you can think about it as a safety net, or by how much you want to push the duplicates from the non duplicates. 
<br>


In [22]:
def Siamese(vocab_size=len(vocab), d_model=128, mode='train'):
    
    def normalize(x):
        return x / fastnp.sqrt(fastnp.sum(x * x, axis=-1, keepdims=True))
    
    q_processor = tl.Serial(
        tl.Embedding(vocab_size=vocab_size, d_feature=d_model),
        tl.LSTM(d_model),
        tl.Mean(axis=1),
        tl.Fn('Normalize', lambda x: normalize(x))
    )
    
    model = tl.Parallel(q_processor, q_processor)
    return model

In [23]:
model = Siamese()
print(model)

Parallel_in2_out2[
  Serial[
    Embedding_41748_128
    LSTM_128
    Mean
    Normalize
  ]
  Serial[
    Embedding_41748_128
    LSTM_128
    Mean
    Normalize
  ]
]


## Hard Negative Mining

You will now implement the `TripletLoss`.<br>
As explained in the lecture, loss is composed of two terms. One term utilizes the mean of all the non duplicates, the second utilizes the *closest negative*. Our loss expression is then:
 
\begin{align}
 \mathcal{Loss_1(A,P,N)} &=\max \left( -cos(A,P)  + mean_{neg} +\alpha, 0\right) \\
 \mathcal{Loss_2(A,P,N)} &=\max \left( -cos(A,P)  + closest_{neg} +\alpha, 0\right) \\
\mathcal{Loss(A,P,N)} &= mean(Loss_1 + Loss_2) \\
\end{align}


In [24]:
def TripletLossFn(v1, v2, margin=0.25):
    # Pairwise cosine similarity
    scores = fastnp.dot(v1, v2.T)
    # Calculate new batch_size
    batch_size = len(scores)
    # The positive ones in the diagonal
    positive = fastnp.diag(scores)
    # multiply `fnp.eye(batch_size) with 2.0 and sub it out of `scores`
    negative_without_positive = scores - fastnp.multiply(fastnp.eye(batch_size), 2.0)
    # Take row by row max of nwp
    closest_negative = negative_without_positive.max(axis=1)
    # Sub fnp.eye(batch_size) out of 1.0 and do element wise mul woth scores
    negative_zero_on_duplicate = fastnp.multiply((1.0 - fastnp.eye(batch_size) ), scores)
    # use fnp.sum on neg_0_on_dup for axis=1 and divide it by batch_size - 1
    mean_negative = fastnp.sum(negative_zero_on_duplicate, axis=1) / (batch_size - 1)
    # compute fnp.max among 0.0 and A
    # A = sub positive from margin and add clossest negative
    #print(margin.shape)
    triplet_loss1 = fastnp.maximum(0.0,  margin - positive + closest_negative)
    # Compute fnp.max among 0.0 and B
    # B = sub positive from margin and add mean_negative
    triplet_loss2 = fastnp.maximum(0.0, margin - positive + mean_negative)
    triplet_loss = fastnp.mean(triplet_loss1 + triplet_loss2)
    
    return triplet_loss

In [25]:
v1 = np.array([[0.26726124, 0.53452248, 0.80178373],[0.5178918 , 0.57543534, 0.63297887]])
v2 = np.array([[ 0.26726124,  0.53452248,  0.80178373],[-0.5178918 , -0.57543534, -0.63297887]])
TripletLossFn(v2,v1)
print("Triplet Loss:", TripletLossFn(v2,v1))

Triplet Loss: 0.5


In [26]:
from functools import partial

In [27]:
def TripletLoss(margin=0.25):
    triplet_loss_fn = partial(TripletLossFn, margin=margin)
    return tl.Fn('TripletLoss', triplet_loss_fn)

## Training
- Train a model 
- Define a cost function
- An optimizer
- Feed data to the model
- Build a dataset using data generator
- Lambda function acts as a seed to remember the last batch that was given


In [28]:
batch_size = 256
train_generator = data_generator(train_Q1, train_Q2, batch_size, vocab['<PAD>'])
val_generator = data_generator(val_Q1, val_Q2, batch_size, vocab['<PAD>'])
print('train_Q1.shape ', train_Q1.shape)
print('val_Q1.shape   ', val_Q1.shape)

train_Q1.shape  (89188,)
val_Q1.shape    (22298,)


In [29]:
lr_schedule = trax.lr.warmup_and_rsqrt_decay(400, 0.01)

def train_model(
    Siamese, TripletLoss, lr_schedule, 
    train_generator=train_generator, 
    val_generator=val_generator, output_dir='model/'
):
    output_dir = os.path.expanduser(output_dir)
    
    train_task = training.TrainTask(
        labeled_data=train_generator,
        loss_layer=TripletLoss(),
        optimizer=trax.optimizers.Adam(0.01),
        lr_schedule=lr_schedule
    )
    
    eval_task = training.EvalTask(
        labeled_data=val_generator,
        metrics=[TripletLoss()]
    )
    
    training_loop = training.Loop(
        Siamese(),
        train_task,
        eval_task=eval_task,
        output_dir=output_dir
    )
    
    return training_loop

In [30]:
train_steps = 5
training_loop = train_model(Siamese, TripletLoss, lr_schedule)
training_loop.run(train_steps)

Step      1: train TripletLoss |  0.49900973
Step      1: eval  TripletLoss |  0.49906957


## Evaluation

In [31]:
# Loading in the saved model
model = Siamese()
model.init_from_file('model.pkl.gz')

In [32]:
def classify(
    test_Q1, test_Q2, y, threshold, 
    model, vocab, data_generator=data_generator, 
    batch_size=64
):
    
    accuracy = 0
    for i in range(0, len(test_Q1), batch_size):
        # Call the data generator with shuffle=False using next()
        # Use batch_size chunks of questions as Q1 and Q2 arguments of the data generator
        q1, q2 = next(data_generator(
            test_Q1[i:i+batch_size], test_Q2[i:i+batch_size], 
            batch_size, pad=vocab['<PAD>'], shuffle=False
        ))
        y_test = y[i:i+batch_size]
        v1, v2 = model([q1, q2])
        
        for j in range(batch_size):
            # Take dot product to compute cos similarity of each pair
            # of entries v1[j], v2[j]
            d = fastnp.dot(v1[j], v2[j].T)
            # id d greate than the threshold
            res = d > threshold
            if(y_test[j] == res):
                accuracy += 1
    accuracy = accuracy / y.shape[0]
    
    return accuracy

In [33]:
# this takes around 1 minute
accuracy = classify(Q1_test,Q2_test, y_test, 0.7, model, vocab, batch_size = 512) 
print("Accuracy", accuracy)

Accuracy 0.7044921875


In [41]:
def predict(
    question1, question2, threshold, 
    model, vocab, data_generator=data_generator, 
    verbose=False
):
    q1 = nltk.word_tokenize(question1)
    q2 = nltk.word_tokenize(question2)
    Q1, Q2 = [], []
    for word in q1:
        Q1.append(vocab[word])
    for word in q2:
        Q2.append(vocab[word])
        
    Q1, Q2 = next(data_generator(
        [Q1], [Q2], batch_size, pad=vocab['<PAD>'], shuffle=False
    ))
    v1, v2 = model([Q1, Q2])
    d = fastnp.dot(v1, v2.T)
    res = (d > threshold)
    
    if(verbose):
        print("Q1  = ", Q1, "\nQ2  = ", Q2)
        print("d   = ", d)
        print("res = ", res)

    return res

In [42]:
question1 = "When will I see you?"
question2 = "When can I see you again?"
# 1 means it is duplicated, 0 otherwise
predict(question1 , question2, 0.7, model, vocab, verbose = True)

Q1  =  [[585  76   4 ...  21   1   1]
 [585  76   4 ...  21   1   1]
 [585  76   4 ...  21   1   1]
 ...
 [585  76   4 ...  21   1   1]
 [585  76   4 ...  21   1   1]
 [585  76   4 ...  21   1   1]] 
Q2  =  [[ 585   33    4 ... 7282   21    1]
 [ 585   33    4 ... 7282   21    1]
 [ 585   33    4 ... 7282   21    1]
 ...
 [ 585   33    4 ... 7282   21    1]
 [ 585   33    4 ... 7282   21    1]
 [ 585   33    4 ... 7282   21    1]]
d   =  [[0.8561271 0.8561271 0.8561271 ... 0.8561271 0.8561271 0.8561271]
 [0.8561271 0.8561271 0.8561271 ... 0.8561271 0.8561271 0.8561271]
 [0.8561271 0.8561271 0.8561271 ... 0.8561271 0.8561271 0.8561271]
 ...
 [0.8561271 0.8561271 0.8561271 ... 0.8561271 0.8561271 0.8561271]
 [0.8561271 0.8561271 0.8561271 ... 0.8561271 0.8561271 0.8561271]
 [0.8561271 0.8561271 0.8561271 ... 0.8561271 0.8561271 0.8561271]]
res =  [[ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 ...
 [ True  

DeviceArray([[ True,  True,  True, ...,  True,  True,  True],
             [ True,  True,  True, ...,  True,  True,  True],
             [ True,  True,  True, ...,  True,  True,  True],
             ...,
             [ True,  True,  True, ...,  True,  True,  True],
             [ True,  True,  True, ...,  True,  True,  True],
             [ True,  True,  True, ...,  True,  True,  True]], dtype=bool)

In [40]:
# Feel free to try with your own questions
question1 = "Do they enjoy eating the dessert?"
question2 = "Do they like hiking in the desert?"
# 1 means it is duplicated, 0 otherwise
predict(question1 , question2, 0.7, model, vocab, verbose=True)

['Do', 'they', 'enjoy', 'eating', 'the', 'dessert', '?'] ['Do', 'they', 'like', 'hiking', 'in', 'the', 'desert', '?']
Q1  =  [[  443  1145  3158 ... 29039    21     1]
 [  443  1145  3158 ... 29039    21     1]
 [  443  1145  3158 ... 29039    21     1]
 ...
 [  443  1145  3158 ... 29039    21     1]
 [  443  1145  3158 ... 29039    21     1]
 [  443  1145  3158 ... 29039    21     1]] 
Q2  =  [[ 443 1145   60 ...   78 7433   21]
 [ 443 1145   60 ...   78 7433   21]
 [ 443 1145   60 ...   78 7433   21]
 ...
 [ 443 1145   60 ...   78 7433   21]
 [ 443 1145   60 ...   78 7433   21]
 [ 443 1145   60 ...   78 7433   21]]
d   =  [[0.542211 0.542211 0.542211 ... 0.542211 0.542211 0.542211]
 [0.542211 0.542211 0.542211 ... 0.542211 0.542211 0.542211]
 [0.542211 0.542211 0.542211 ... 0.542211 0.542211 0.542211]
 ...
 [0.542211 0.542211 0.542211 ... 0.542211 0.542211 0.542211]
 [0.542211 0.542211 0.542211 ... 0.542211 0.542211 0.542211]
 [0.542211 0.542211 0.542211 ... 0.542211 0.542211 0.54221

DeviceArray([[False, False, False, ..., False, False, False],
             [False, False, False, ..., False, False, False],
             [False, False, False, ..., False, False, False],
             ...,
             [False, False, False, ..., False, False, False],
             [False, False, False, ..., False, False, False],
             [False, False, False, ..., False, False, False]], dtype=bool)