<a href="https://colab.research.google.com/github/gupta24789/siamese-networks/blob/main/siamese_networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !wget https://github.com/gupta24789/siamese-networks/raw/main/data.zip
# !unzip data.zip

In [None]:
import os
# Expose only the CUDA device with index 1 to this process (the Lightning log
# further down reports "CUDA_VISIBLE_DEVICES: [1]").
# NOTE(review): machine-specific — on a single-GPU host (e.g. Colab) index 1
# presumably hides the only GPU; confirm and use "0" or drop this line there.
os.environ['CUDA_VISIBLE_DEVICES'] = "1"

In [None]:
import random
import pandas as pd
import numpy as np
import itertools
from nltk import tokenize
from tqdm import tqdm, tqdm_notebook
from sklearn.metrics import classification_report, confusion_matrix

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import DataLoader, Dataset


import pytorch_lightning as pl

## Set Seed

In [None]:
# Single seed shared by every random number generator used in the notebook.
seed = 34
random.seed(seed)
torch.manual_seed(seed)
# NOTE(review): per the Lightning docs `seed_everything` already seeds Python's
# `random`, NumPy and torch, which would make the two calls above redundant —
# confirm before removing them. Its return value (the seed) is the cell output.
pl.seed_everything(seed)

Seed set to 34


34

## Read Data

In [None]:
# Load the train / test splits of the question-pair data.
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
# Drop test rows with missing values and renumber them so that positional
# indexing lines up with the encoded lists built from `test_df` later on.
# NOTE(review): `train_df` is not cleaned the same way; a NaN question in a
# duplicate pair would presumably break tokenization — confirm none exist.
test_df = test_df.dropna().reset_index(drop = True)

print(f'train shape : {train_df.shape}')
print(f'test shape : {test_df.shape}')

train shape : (283045, 6)
test shape : (121305, 6)


In [None]:
train_df.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,27186,54210,54211,What will happen if Google starts charging for...,Is it normal to Google search every question y...,0
1,246439,485308,485309,Why are bats associated with vampires?,Do vampires get periods?,0
2,298392,586093,586094,How can I start learning data science?,How can I start learning data science and beco...,1


In [None]:
test_df.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,291513,572739,572740,What Rolling Stone song has the lyrics “oh and...,What's the point in living if you're so depres...,0
1,58652,116675,76915,Do you have a Business-Plan to help the poor p...,Suppose you run a mobile business which genera...,0
2,118294,139357,234460,Where's a good university to study Computer Sc...,What are some good UK universities in computer...,1


## Prepare Data


The data is set up so that $v_{1\_1}$ and $v_{2\_1}$ (the two questions in the same row) are duplicates of each other, but are not duplicates of any other row in the batch. This means $v_{1\_1}$ and $v_{2\_1}$ (green and green) should have more similar vectors than, say, $v_{1\_1}$ and $v_{2\_2}$ (green and magenta).

<img src="images/v1v2_stacked.png" alt="Stacked v1 / v2 question vectors, one duplicate pair per row" style="height:250px;"/>

In [None]:
## Keep only the duplicate pairs (is_duplicate == 1). These aligned lists are
## used both to build the vocabulary and as the training pairs: row i of Q1 is
## a duplicate of row i of Q2.
Q1_train_sents = train_df.loc[train_df.is_duplicate==1,'question1'].tolist()
Q2_train_sents = train_df.loc[train_df.is_duplicate==1,'question2'].tolist()

In [None]:
Q1_train_sents[:3]

['How can I start learning data science?',
 'Which is the best book to understand tensors?',
 'Who viewed my profile on Instagram?']

In [None]:
Q2_train_sents[:3]

['How can I start learning data science and become master in it?',
 'Which is the best book to study TENSOR for general relativity from basic?',
 'Can people see if you have viewed their instagram?']

In [None]:
## Build the vocabulary from every question of the duplicate pairs (q1 + q2).
train_sents = Q1_train_sents + Q2_train_sents
tokens = [tokenize.word_tokenize(sent) for sent in train_sents]

# Ids 0 and 1 are reserved for padding and out-of-vocabulary tokens.
special_tokens = ['__PAD__', '__UNK__']

# Sort the unique tokens: iterating a bare set of strings follows hash order,
# which varies between interpreter runs, so token ids would change on every
# kernel restart and no longer match previously saved checkpoints.
# The special tokens are excluded from the corpus set so that they can never
# appear twice and shift PAD/UNK away from ids 0/1.
tokens = sorted(set(itertools.chain.from_iterable(tokens)) - set(special_tokens))
tokens = special_tokens + tokens

# word -> id and id -> word lookups
vocab = {w:i for i,w in enumerate(tokens)}
idx2word = {i:w for w,i in vocab.items()}

UNK_ID = vocab['__UNK__']
PAD_ID = vocab['__PAD__']

print(f"Vocab {len(vocab)}")
print(f"PAD ID : {PAD_ID}")
print(f"UNK ID : {UNK_ID}")

Vocab 35289
PAD ID : 0
UNK ID : 1


In [None]:
def encode_sent_to_number(sent):
    """Convert a sentence into a list of vocabulary ids.

    The sentence is split with ``tokenize.word_tokenize``; every token is
    looked up in the module-level ``vocab`` and tokens that are not present
    are replaced by ``UNK_ID``.

    Args:
        sent: the raw sentence string.

    Returns:
        A list with one integer id per token, in token order.
    """
    return [vocab.get(token, UNK_ID) for token in tokenize.word_tokenize(sent)]

In [None]:
# Encode the training (duplicate-only) questions as lists of vocabulary ids;
# the two lists stay index-aligned with Q1_train_sents / Q2_train_sents.
Q1_train_encoded = [encode_sent_to_number(sent) for sent in Q1_train_sents]
Q2_train_encoded = [encode_sent_to_number(sent) for sent in Q2_train_sents]

In [None]:
# Encode every test pair (duplicates and non-duplicates alike) and keep the
# labels in the same row order, so position i refers to the same pair in all
# three lists.
Q1_test_encoded = [encode_sent_to_number(sent) for sent in test_df.question1.tolist()]
Q2_test_encoded = [encode_sent_to_number(sent) for sent in test_df.question2.tolist()]
y_test = test_df.is_duplicate.tolist()

## Data Loader

In [None]:
def custom_collate(batch):
    """Collate a list of ``(q1_ids, q2_ids)`` pairs into padded tensors.

    Each side of the pair is padded independently (with ``PAD_ID``) to the
    longest sequence of that side within the batch.

    Args:
        batch: list of 2-item samples; item 0 holds the ids of question 1 and
            item 1 the ids of question 2.

    Returns:
        dict with keys ``"q1"`` / ``"q2"`` (padded id tensors, batch first) and
        ``"q1_lengths"`` / ``"q2_lengths"`` (the unpadded lengths).
    """

    def _pad_side(side):
        # Build the per-sample tensors and their true lengths for one side.
        sequences = [torch.tensor(sample[side]) for sample in batch]
        lengths = torch.tensor([len(sample[side]) for sample in batch])
        padded = pad_sequence(sequences, batch_first=True, padding_value=PAD_ID)
        return padded, lengths

    padded_q1, q1_lengths = _pad_side(0)
    padded_q2, q2_lengths = _pad_side(1)

    return {
        "q1": padded_q1,
        "q2": padded_q2,
        "q1_lengths": q1_lengths,
        "q2_lengths": q2_lengths,
    }

In [None]:
# Tiny, unshuffled loader used only to inspect what one collated batch looks
# like. `batch_size` and `train_dl` are overwritten with the real values in
# the "dataloaders" cell below; `example` is reused by the model smoke test.
batch_size = 3
train_dl = DataLoader(list(zip(Q1_train_encoded,Q2_train_encoded)), batch_size = batch_size, shuffle=False, collate_fn= custom_collate)
example = next(iter(train_dl))
# q1 and q2 are padded separately, so their second dimensions may differ.
example['q1'].shape, example['q2'].shape, example['q1_lengths'].shape, example['q2_lengths'].shape

(torch.Size([3, 9]), torch.Size([3, 14]), torch.Size([3]), torch.Size([3]))

In [None]:
example['q1']

tensor([[14883, 14283,  5318,   852,   850, 19756, 27515, 25929,     0],
        [ 1071, 14812, 13212, 23460, 15598, 13934, 31872,  3933, 25929],
        [13617, 25576,   959,  9670, 12207,  3173, 25929,     0,     0]])

In [None]:
example['q1_lengths']

tensor([8, 9, 7])

In [None]:
## dataloaders
batch_size = 256
# Training batches contain duplicate pairs only and are reshuffled each epoch.
train_dl = DataLoader(list(zip(Q1_train_encoded,Q2_train_encoded)), batch_size = batch_size, shuffle=True, collate_fn= custom_collate)
# Must stay unshuffled: the evaluation loop matches predictions to `y_test`
# purely by position.
# NOTE(review): this loader holds every test pair, including non-duplicates,
# while the triplet loss treats each row's (q1, q2) as a positive pair — the
# logged val_loss is therefore computed partly on non-duplicate "positives".
val_dl = DataLoader(list(zip(Q1_test_encoded,Q2_test_encoded)), batch_size = batch_size, shuffle=False, collate_fn= custom_collate)

### Pretrained Embedding

In [None]:
def load_pretrain_emb(filepath):
    """Load space-separated word vectors (GloVe text format) into a dict.

    Every non-empty line is expected to look like ``word v1 v2 ... vn``.

    Args:
        filepath: path to the UTF-8 encoded embedding text file.

    Returns:
        dict mapping each word to a 1-D float ``numpy`` array.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component is not a valid float.
    """
    embedd_dict = {}
    # Stream the file inside a context manager: the handle is always closed
    # and the (large) file is never held in memory as a list of lines.
    with open(filepath, "r", encoding="utf-8") as emb_file:
        for line in emb_file:
            fields = line.strip().split(" ")
            # Skip blank lines and lines without any vector component; they
            # would otherwise register a bogus entry with an empty vector.
            if len(fields) < 2:
                continue
            embedd_dict[fields[0]] = np.array(fields[1:], dtype=float)

    return embedd_dict

def build_pretrain_embedding(filepath, vocab, emb_dim):
    """Build the embedding matrix for ``vocab`` from a pretrained vector file.

    For each word the exact spelling is looked up first, then its lower-cased
    form; words found under neither get a vector drawn from a standard normal
    distribution.

    Args:
        filepath: path of the embedding text file (see ``load_pretrain_emb``).
        vocab: dict mapping word -> row id; ids are expected to cover
            ``0 .. len(vocab) - 1``.
        emb_dim: dimensionality of the vectors stored in the file.

    Returns:
        A float64 tensor of shape ``(len(vocab), emb_dim)`` whose row ``i``
        holds the vector of the word with id ``i``.
    """
    embedd_dict = load_pretrain_emb(filepath)

    # Fill one pre-allocated array instead of collecting a list that mixes
    # tensors and ndarrays: converting such a list with torch.tensor() is very
    # slow and triggers a UserWarning. Writing at the word's own id also keeps
    # rows aligned with ids regardless of the dict's iteration order.
    matrix = np.zeros((len(vocab), emb_dim), dtype=np.float64)

    for w, i in vocab.items():
        if w in embedd_dict:
            matrix[i] = embedd_dict[w]
        elif w.lower() in embedd_dict:
            matrix[i] = embedd_dict[w.lower()]
        else:
            matrix[i] = np.random.normal(size=emb_dim)

    return torch.from_numpy(matrix)



# Embedding matrix read by SiameseModel when it is built with
# use_pretrained=True (the class references this module-level name directly).
# NOTE(review): the download cell at the top only fetches data.zip; the
# 100-d GloVe file presumably has to be placed under embeddings/ by hand.
weights = build_pretrain_embedding("embeddings/glove.6B.100d.txt", vocab, emb_dim=100)
weights.shape

  return torch.tensor(df_list)


torch.Size([35289, 100])

## Model

You will now implement the `TripletLoss`.<br>
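As explained in the lecture, the loss is composed of two terms: one uses the mean of all the non-duplicates in the batch ($mean_{neg}$), while the other uses the *closest negative* ($closest_{neg}$), i.e. the non-duplicate most similar to the anchor. With anchor $A$, positive $P$, negatives $N$ and margin $\alpha$, the loss expression is: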

\begin{align}
\mathcal{L}_1(A,P,N) &= \max\left(-\cos(A,P) + mean_{neg} + \alpha,\ 0\right) \\
\mathcal{L}_2(A,P,N) &= \max\left(-\cos(A,P) + closest_{neg} + \alpha,\ 0\right) \\
\mathcal{L}(A,P,N) &= \mathrm{mean}\left(\mathcal{L}_1 + \mathcal{L}_2\right)
\end{align}

In [None]:
class SiameseModel(pl.LightningModule):
    """Siamese LSTM sentence encoder trained with a batch-wise triplet loss.

    Both questions of a pair go through the same embedding + LSTM stack; the
    final hidden state is L2-normalised, so the dot product of two encodings
    is their cosine similarity.

    Args:
        vocab_size: number of rows of the embedding table.
        emb_dim: embedding dimensionality.
        hidden_dim: LSTM hidden size.
        margin: triplet-loss margin.
        threshold: similarity threshold stored on the module (not used by the
            training code itself).
        learning_rate: Adam learning rate.
        bidirectional: whether the LSTM is bidirectional.
        dropout: dropout between stacked LSTM layers.
        num_layers: number of stacked LSTM layers.
        use_pretrained: if True, initialise the embedding from the
            module-level ``weights`` tensor, otherwise from
            ``random_embedding``.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, margin, threshold, learning_rate, bidirectional, dropout, num_layers, use_pretrained):
        super().__init__()
        self.bidirectional = bidirectional
        self.use_pretrained = use_pretrained
        self.learning_rate = learning_rate
        self.margin = margin
        self.threshold = threshold
        # Backward-compatible alias: the attribute used to be misspelled.
        self.threhold = threshold
        # Per-batch validation losses of the current epoch (reset every epoch).
        self.val_loss = []

        # layers
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=PAD_ID)
        if use_pretrained:
            self.embedding.weight.data.copy_(weights)
        else:
            self.embedding.weight.data.copy_(torch.from_numpy(self.random_embedding(vocab_size, emb_dim)))

        # nn.LSTM only applies dropout between stacked layers; passing a
        # non-zero value with a single layer has no effect and only raises a
        # warning, so it is disabled in that case.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first = True, num_layers = num_layers, bidirectional = bidirectional, dropout = lstm_dropout)
        # NOTE: `linear` and `relu` are not used by the forward pass; they are
        # kept so that existing checkpoints (state dicts) still load.
        self.linear = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, hidden_dim)
        self.relu = nn.ReLU()

    def random_embedding(self, vocab_size, embedding_dim):
        """Return a ``(vocab_size, embedding_dim)`` float64 array of initial weights.

        Row 0 (the padding row) is all zeros; every other row is drawn
        uniformly from ``[-sqrt(3 / embedding_dim), sqrt(3 / embedding_dim)]``.
        """
        # Start from zeros rather than np.empty so that row 0 never contains
        # uninitialised memory (possibly NaN/inf).
        pretrain_emb = np.zeros([vocab_size, embedding_dim])
        scale = np.sqrt(3.0 / embedding_dim)
        pretrain_emb[1:] = np.random.uniform(-scale, scale, [max(vocab_size - 1, 0), embedding_dim])
        return pretrain_emb

    def loss_fn(self, scores):
        """Triplet loss over a square similarity matrix.

        Args:
            scores: ``(B, B)`` tensor where ``scores[i, j]`` is the similarity
                between question-1 ``i`` and question-2 ``j``; the diagonal
                holds the positive pairs, every other entry is a negative.

        Returns:
            Scalar tensor: mean over the batch of the "closest negative" hinge
            term plus the "mean negative" hinge term.
        """
        batch_size = scores.shape[0]
        # Build the identity directly on the device/dtype of `scores` instead
        # of creating it on the CPU and moving it afterwards.
        identity = torch.eye(batch_size, device=scores.device, dtype=scores.dtype)

        positive = torch.diagonal(scores)

        # Mean of the off-diagonal entries of each row. The denominator is
        # clamped so a batch with a single pair (no negatives) yields 0
        # instead of 0/0 = NaN.
        negative_zero_on_duplicate = scores * (1.0 - identity)
        mean_negative = negative_zero_on_duplicate.sum(dim=1) / max(batch_size - 1, 1)

        # Similarities lie in [-1, 1]; subtracting 2 on the diagonal guarantees
        # the positive is never selected as the closest negative.
        negative_without_positive = scores - 2.0 * identity
        closest_negative, _ = negative_without_positive.max(dim=1)

        # Hinge terms; clamp avoids mixing a CPU scalar tensor with GPU data.
        triplet_loss1 = torch.clamp(self.margin - positive + closest_negative, min=0.0)
        triplet_loss2 = torch.clamp(self.margin - positive + mean_negative, min=0.0)
        return torch.mean(triplet_loss1 + triplet_loss2)

    def get_normalize_vector(self, q, q_lengths):
        """Encode a padded batch of questions into unit-length vectors.

        Args:
            q: ``(batch, seq_len)`` tensor of token ids.
            q_lengths: ``(batch,)`` tensor with the unpadded length of each row.

        Returns:
            ``(batch, hidden_dim * num_directions)`` tensor, L2-normalised
            along the feature dimension.
        """
        embedded = self.embedding(q)
        ## embedded : [batch size, sent len, emb dim]
        # Packing makes the LSTM stop at each sequence's true length, so the
        # final hidden state is not polluted by padding positions.
        packed_input = pack_padded_sequence(embedded, lengths= q_lengths.to('cpu'), batch_first = True, enforce_sorted=False)
        # Only the final hidden state is needed; the per-step outputs are not
        # unpacked.
        _, (hidden, _) = self.lstm(packed_input)
        # hidden : [num_layers * num_directions, batch size, hidden dim]

        if self.bidirectional:
            # last layer: forward direction is hidden[-2], backward is hidden[-1]
            out = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            out = hidden[-1,:,:]

        ## out : [batch size, hidden_dim * num_directions]
        return F.normalize(out, p = 2, dim = 1)

    def forward(self, q1, q2, q1_lengths, q2_lengths):
        """Encode both questions with the shared encoder; returns ``(q1_vec, q2_vec)``."""
        q1_vec = self.get_normalize_vector(q1, q1_lengths)
        q2_vec = self.get_normalize_vector(q2, q2_lengths)
        return (q1_vec, q2_vec)

    def _shared_step(self, batch):
        """Compute the triplet loss of one collated batch (train and validation)."""
        q1,q2,q1_lengths, q2_lengths = batch['q1'], batch['q2'],batch['q1_lengths'], batch['q2_lengths']
        q1_vec,q2_vec = self(q1,q2,q1_lengths, q2_lengths)
        # All-pairs cosine similarity: [batch size, batch size]
        scores = torch.matmul(q1_vec, q2_vec.T)
        return self.loss_fn(scores)

    def training_step(self, batch, batch_idx=None):
        """Lightning hook: return the training loss and log it per epoch."""
        loss = self._shared_step(batch)
        self.log_dict({"train_loss": loss}, on_step = False, on_epoch = True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx=None):
        """Lightning hook: compute, record and log the validation loss."""
        loss = self._shared_step(batch)
        self.val_loss.append(loss.item())
        self.log_dict({"val_loss": loss}, on_step = False, on_epoch = True, prog_bar=True)
        return loss

    def on_validation_epoch_end(self):
        """Print the mean of the recorded validation losses and reset the buffer."""
        print(f"Current Epoch : {self.current_epoch} Valiadtion Loss : {np.mean(self.val_loss)}")
        self.val_loss = []

    def configure_optimizers(self):
        """Lightning hook: Adam over all parameters with ``self.learning_rate``."""
        optimizer = optim.Adam(self.parameters(), lr = self.learning_rate)
        return optimizer

In [None]:
## test model architecture
# Smoke test on the CPU: build an untrained model and push the 3-pair
# `example` batch through the forward pass and the loss.
model = SiameseModel(len(vocab), emb_dim = 100, hidden_dim = 150, margin = 0.25, threshold = 0.7, learning_rate = 1e-3,
                     bidirectional= False, num_layers = 1, dropout = 0.25, use_pretrained = False)
print(model)
q1, q2, q1_lengths, q2_lengths = example['q1'], example['q2'],example['q1_lengths'],example['q2_lengths']
v1,v2 = model(q1, q2, q1_lengths, q2_lengths)
# scores[i, j]: similarity between question-1 i and question-2 j
scores = torch.matmul(v1,v2.T)
loss = model.loss_fn(scores)
print(loss)
print(scores)

SiameseModel(
  (embedding): Embedding(35289, 100, padding_idx=0)
  (lstm): LSTM(100, 150, batch_first=True, dropout=0.25)
  (linear): Linear(in_features=150, out_features=150, bias=True)
  (relu): ReLU()
)
tensor(0.4927, grad_fn=<MeanBackward0>)
tensor([[0.9736, 0.9567, 0.9573],
        [0.9556, 0.9637, 0.9618],
        [0.9681, 0.9634, 0.9581]], grad_fn=<MmBackward0>)




In [None]:
## Model Training
# Fresh model for the real run (replaces the smoke-test instance above).
# NOTE(review): per the PyTorch nn.LSTM docs dropout is applied between stacked
# layers only, so with num_layers = 1 the dropout value presumably has no
# effect — confirm.
model= SiameseModel(len(vocab), emb_dim = 100, hidden_dim = 150, margin = 0.25, threshold = 0.7,
                    learning_rate = 1e-3, bidirectional= False, num_layers = 1, dropout = 0.5,use_pretrained = False)

# A single checkpoint callback (despite the plural name). save_top_k=-1 keeps
# a checkpoint for every epoch, named after the epoch and its val_loss, and
# save_last additionally keeps a "last" checkpoint.
callbacks = pl.callbacks.ModelCheckpoint(dirpath = "checkpoints_logs",
                                         filename = '{epoch}-{val_loss:.2f}',
                                          mode = "min",
                                          monitor = "val_loss",
                                          save_last = True,
                                          save_top_k=-1)


# NOTE(review): accelerator is hard-coded to "gpu"; this cell presumably fails
# on a CPU-only machine — confirm, or switch to "auto".
trainer = pl.Trainer(accelerator= "gpu",
           max_epochs=5,
           check_val_every_n_epoch = 1,
           callbacks = [callbacks])

trainer.fit(model, train_dl, val_dl)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/saurabh/anaconda3/envs/lighting/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
/home/saurabh/anaconda3/envs/lighting/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /home/saurabh/mydata/checkpoints_logs exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]

  | Name      | Type      | Params
-----------------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/saurabh/anaconda3/envs/lighting/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Current Epoch : 0 Valiadtion Loss : 0.49436189234256744


/home/saurabh/anaconda3/envs/lighting/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Current Epoch : 0 Valiadtion Loss : 0.2566767697721594


Validation: |          | 0/? [00:00<?, ?it/s]

Current Epoch : 1 Valiadtion Loss : 0.2353735192038339


Validation: |          | 0/? [00:00<?, ?it/s]

Current Epoch : 2 Valiadtion Loss : 0.22896539784559217


Validation: |          | 0/? [00:00<?, ?it/s]

Current Epoch : 3 Valiadtion Loss : 0.22397989572226246


Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


Current Epoch : 4 Valiadtion Loss : 0.22066892089345788


## Test the accuracy

In [None]:
# Evaluate duplicate detection on the test pairs: a pair is predicted as a
# duplicate when the cosine similarity of its two encodings exceeds `threshold`.
model = model.eval()

correct = 0
total = 0
threshold = 0.7
y_pred_list = []
y_true_list = []
# Use the GPU when one is visible, otherwise fall back to the CPU instead of
# failing on model.to("cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"

model.to(device)

# Inference only: without no_grad every batch would build an autograd graph,
# wasting memory and time.
with torch.no_grad():
    for batch in tqdm(val_dl):
        q1,q2, q1_lengths, q2_lengths = batch['q1'], batch['q2'],batch['q1_lengths'], batch['q2_lengths']
        # Local name on purpose: the global `batch_size` (DataLoader setting)
        # must not be overwritten with the size of the last batch.
        n_pairs = q1.shape[0]

        q1 = q1.to(device)
        q2 = q2.to(device)
        v1,v2 = model(q1,q2,q1_lengths, q2_lengths)

        # Only the similarity of each aligned pair is needed, i.e. the diagonal
        # of v1 @ v2.T; a row-wise dot product avoids the full B x B matrix.
        res = (v1 * v2).sum(dim = 1)
        y_pred = (res>threshold).long().cpu().numpy()
        # Labels are matched by position, which relies on val_dl being
        # unshuffled and built in the same row order as y_test.
        y_true = np.array(y_test[total: total + n_pairs])

        y_pred_list += y_pred.tolist()
        y_true_list += y_true.tolist()
        correct += int((y_true == y_pred).sum())
        total += n_pairs

print(f"Test Accuracy : {correct/total}")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm_notebook(val_dl):


  0%|          | 0/474 [00:00<?, ?it/s]

Test Accuracy : 0.7442397263097151


In [None]:
print(classification_report(y_true_list, y_pred_list))

              precision    recall  f1-score   support

           0       0.85      0.73      0.78     76513
           1       0.62      0.77      0.69     44792

    accuracy                           0.74    121305
   macro avg       0.73      0.75      0.74    121305
weighted avg       0.76      0.74      0.75    121305



In [None]:
print(confusion_matrix(y_true=y_true_list, y_pred = y_pred_list))

[[55608 20905]
 [10120 34672]]


## Predict

In [None]:
# Switch to inference mode and move the model back to the CPU for the
# single-pair predictions below.
model = model.eval()
model.to("cpu")

def predict(q1, q2, threshold = 0.7):
    """Decide whether two questions are duplicates of each other.

    Both questions are encoded with the module-level ``model`` and compared by
    the dot product of their normalised encodings (cosine similarity).

    Args:
        q1: first question (raw string).
        q2: second question (raw string).
        threshold: similarity strictly above which the pair is reported as
            duplicated.

    Returns:
        Tuple ``(label, score)`` where ``label`` is ``"Duplicated"`` or
        ``"Not Duplicated"`` and ``score`` is the similarity as a float.

    Raises:
        ValueError: if either question contains no token.
    """
    # Reuse the same tokenization / id mapping as the training data.
    q1_encoded = encode_sent_to_number(q1)
    q2_encoded = encode_sent_to_number(q2)
    if not q1_encoded or not q2_encoded:
        # A zero-length sequence cannot be packed for the LSTM; fail early
        # with a clear message instead of an obscure error inside the model.
        raise ValueError("Both questions must contain at least one token.")

    # Put the inputs on whatever device the model currently lives on, so the
    # function works whether or not the model was moved back to the CPU.
    device = next(model.parameters()).device
    q1_ids = torch.tensor(q1_encoded, dtype = torch.long, device = device).view(1,-1)
    q2_ids = torch.tensor(q2_encoded, dtype = torch.long, device = device).view(1,-1)

    # Lengths stay on the CPU, as required for packing the sequences.
    q1_lengths = torch.tensor([len(q1_encoded)], dtype = torch.long)
    q2_lengths = torch.tensor([len(q2_encoded)], dtype = torch.long)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        v1, v2 = model(q1_ids, q2_ids, q1_lengths, q2_lengths)

    score = torch.matmul(v1, v2.T).item()
    label = "Duplicated" if score > threshold else "Not Duplicated"
    return (label, score)

In [None]:
question1 = "When will I see you?"
question2 = "When can I see you again?"
predict(question1 , question2)

('Duplicated', 0.7624601125717163)

In [None]:
question1 = "Do they enjoy eating the dessert?"
question2 = "Do they like hiking in the desert?"
predict(question1 , question2)

('Not Duplicated', 0.24493733048439026)