# Imports

In [None]:
import numpy as np
import pandas as pd

In [None]:
%pip install --upgrade transformers datasets accelerate deepspeed 
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
import datasets

In [None]:
from tqdm.autonotebook import tqdm
from collections import Counter
import nltk
import gensim.downloader
from IPython.display import clear_output
import matplotlib.pyplot as plt

# Read data

In [None]:
# Mount Google Drive (Colab-only helper) under /content/drive/ so the CSV paths
# used by the data-loading cell resolve.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Pick the device once for the whole notebook: GPU when PyTorch can see one,
# CPU otherwise. Models and batches are moved to this device later on.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Device: {device}')

In [None]:
# Load the Quora Question Pairs CSVs from the mounted Drive. Both results are
# indexed with ['train'] -- presumably the split name the CSV loader assigns when
# data_files is a single path (confirm against the datasets documentation).
train = datasets.load_dataset('csv', data_files='/content/drive/MyDrive/data/qqp/train.csv')['train']
test = datasets.load_dataset('csv', data_files='/content/drive/MyDrive/data/qqp/test.csv')['train']

# Functions

In [None]:
def token_counter(train, tokenizer):
    """Count token occurrences over both questions of every pair.

    Args:
        train: Iterable of mappings with 'question1' and 'question2' entries.
        tokenizer: Object exposing ``tokenize(text)`` that returns an iterable
            of tokens.

    Returns:
        A ``Counter`` mapping token -> number of occurrences. A missing or
        empty question is counted as the placeholder text 'Not a question'.
    """
    placeholder = 'Not a question'
    counts = Counter()
    for row in train:
        for field in ('question1', 'question2'):
            # Falsy values (None, '') fall back to the placeholder sentence.
            text = row[field] or placeholder
            counts.update(tokenizer.tokenize(text))
    return counts


def get_tokens(token_counts, min_c, max_c):
    """Return, sorted, the tokens whose count lies in ``[min_c, max_c]`` (inclusive)."""
    kept = [token for token, count in token_counts.items() if min_c <= count <= max_c]
    kept.sort()
    return kept


def get_embeddings(tokens, name, emb_size, PAD = '<pad>', UNK = '<unk>'):
    """Build an embedding matrix and matching vocabulary for ``tokens``.

    Args:
        tokens: Sequence of vocabulary tokens (strings).
        name: Model name passed to ``gensim.downloader.load``.
        emb_size: Embedding dimensionality; used for the zero vector given to
            tokens the pretrained model does not know.
        PAD: Padding token, placed at index 0 with an all-zero vector.
        UNK: Unknown token, placed at index 1 with the mean of all token rows.

    Returns:
        ``(embs_npa, vocab_npa)``: a ``(len(tokens) + 2, dim)`` float array and
        the array of tokens in the same row order (PAD, UNK, then ``tokens``).
    """
    embeddings = gensim.downloader.load(name)
    # Dict lookup instead of rebuilding a list of the whole pretrained
    # vocabulary for every token (which made this accidentally quadratic).
    known = embeddings.key_to_index
    missing = np.zeros(emb_size)
    rows = [embeddings.get_vector(tok) if tok in known else missing for tok in tokens]

    if rows:
        embs_npa = np.array(rows)
        unk_emb_npa = np.mean(embs_npa, axis=0, keepdims=True)
    else:
        # No tokens: still return well-formed PAD/UNK rows instead of crashing.
        embs_npa = np.zeros((0, emb_size))
        unk_emb_npa = np.zeros((1, emb_size))
    pad_emb_npa = np.zeros((1, embs_npa.shape[1]))
    embs_npa = np.vstack((pad_emb_npa, unk_emb_npa, embs_npa))

    # Build the vocabulary in one go so numpy sizes the string dtype for the
    # special tokens too; np.insert into an existing fixed-width array would
    # truncate PAD/UNK whenever every token is shorter than them.
    vocab_npa = np.array([PAD, UNK, *tokens])
    return embs_npa, vocab_npa

In [None]:
MAX_LENGTH = 40


def tokenize_nltk(qqp, tokenizer, vocab=None, max_length=MAX_LENGTH):
    """Turn every question pair into two fixed-length id sequences.

    Args:
        qqp: Dataset-like object exposing ``map(fn)``; each example provides
            'question1', 'question2' and 'is_duplicate'.
        tokenizer: Object exposing ``tokenize(text)``.
        vocab: Mapping token -> id. Defaults to the notebook-level
            ``token_to_id`` (looked up at call time) so existing two-argument
            calls keep working.
        max_length: Length every sequence is truncated / right-padded to.

    Returns:
        Whatever ``qqp.map`` returns for a function producing 'q1', 'q2'
        (lists of ``max_length`` ids) and 'label'. Unknown tokens map to id 1
        and padding uses id 0; a ``None`` question is encoded as the text
        'Not a question'.
    """
    if vocab is None:
        vocab = token_to_id

    def encode(text):
        if text is None:
            text = 'Not a question'
        ids = [vocab.get(tok, 1) for tok in tokenizer.tokenize(text)][:max_length]
        return ids + [0] * (max_length - len(ids))

    def preprocess_function(examples):
        return {
            'q1': encode(examples['question1']),
            'q2': encode(examples['question2']),
            'label': examples['is_duplicate'],
        }

    return qqp.map(preprocess_function)

In [None]:
def train_model(model, criterion, optimizer, calculate_loss, calculate_val_loss, scheduler=None, *args):
    """Train ``model`` for ``EPOCHS`` epochs, plotting train/dev loss as it goes.

    Reads the notebook-level globals ``EPOCHS``, ``train_preprocessed`` and
    ``val_preprocessed``.

    Args:
        model: Module called as ``model(batch, *args)``.
        criterion: Loss object handed to the two loss callbacks.
        optimizer: Optimizer stepped once per training batch.
        calculate_loss: ``(criterion, pred, labels) -> loss tensor``.
        calculate_val_loss: ``(model, val_loader, criterion) -> float``.
        scheduler: Optional LR scheduler, stepped once per epoch.
        *args: Extra positional arguments forwarded to ``model``.

    Returns:
        The last ``(step, dev_loss)`` tuple, or ``None`` when training was too
        short for any validation pass to run.
    """
    train_history = []
    dev_history = []
    count = 0
    # Loaders are re-iterable (the training one reshuffles on every pass), so
    # build them once instead of once per epoch / per validation round.
    train_loader = torch.utils.data.DataLoader(
        train_preprocessed,
        batch_size=512,
        shuffle=True,
        collate_fn=transformers.default_data_collator,
        num_workers=2)
    val_loader = torch.utils.data.DataLoader(
        val_preprocessed,
        batch_size=32,
        shuffle=False,
        collate_fn=transformers.default_data_collator,
        num_workers=2)
    for epoch in range(EPOCHS):
        print(f"epoch: {epoch}")
        model.train()
        for batch in tqdm(train_loader):
            count += 1
            pred = model(batch, *args)
            loss = calculate_loss(criterion, pred, batch['labels'])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_history.append((count, loss.item()))
            if (count + 1) % 100 == 0:
                clear_output(True)
                plt.scatter(*zip(*train_history), alpha=0.1, label='train_loss')
                if len(dev_history):
                    plt.plot(*zip(*dev_history), color='red', label='dev_loss')
                plt.legend(); plt.grid(); plt.show()
            if (count + 1) % 300 == 0:
                print("Scoring dev...")
                # Score in eval mode so layers such as BatchNorm use their
                # running statistics and are not updated by dev data.
                model.eval()
                dev_loss = calculate_val_loss(model, val_loader, criterion)
                model.train()
                dev_history.append((count, dev_loss))
                print('#%i Dev loss: %.3f' % dev_history[-1])

        # The scheduler is optional (default None); only step it when given.
        if scheduler is not None:
            scheduler.step()
    return dev_history[-1] if dev_history else None

# Embeddings model

In [None]:
# Hold out 10% of the labelled data. From here on `train` is the split result,
# read below as train['train'] (fitting) and train['test'] (validation).
# NOTE(review): no seed is passed, so the split presumably differs between runs.
train = train.train_test_split(test_size = 0.1)

In [None]:
# Build the vocabulary from the training portion only: count tokens, then keep
# those seen between 50 and 10000 times (both bounds inclusive in get_tokens).
tokenizer = nltk.tokenize.WordPunctTokenizer()
token_counts = token_counter(train['train'], tokenizer)
tokens = get_tokens(token_counts, 50, 10000)

In [None]:
# Embedding matrix plus vocabulary; get_embeddings puts PAD at row 0 and UNK at row 1.
embs_npa, vocab_npa = get_embeddings(tokens, name='glove-wiki-gigaword-300', emb_size=300)
# token -> row index in embs_npa; read as a global by tokenize_nltk.
token_to_id = {val: idx for idx, val in enumerate(vocab_npa)}

In [None]:
# Encode both portions of the split; train_model reads these two names as globals.
train_preprocessed = tokenize_nltk(train['train'], tokenizer)
val_preprocessed = tokenize_nltk(train['test'], tokenizer)

In [None]:
class EmbeddingModel(nn.Module):
    """Baseline: cosine similarity between mean-pooled token embeddings.

    The embedding table is initialised from the notebook-level ``embs_npa`` and
    fine-tuned. Row 0 (the padding id) is registered as ``padding_idx`` so it
    receives no gradient: padding positions take part in the mean, and a
    trainable pad vector would let padding leak into the sentence vector.

    Args:
        n_tokens: Unused; kept for signature compatibility.
        emb_len: Unused; kept for signature compatibility.
    """

    def __init__(self, n_tokens=len(tokens), 
                 emb_len=300):
        super().__init__()
        self.emb = nn.Embedding.from_pretrained(
            torch.from_numpy(embs_npa).float(), freeze=False, padding_idx=0)

    def forward(self, batch):
        """Return one cosine similarity per pair in ``batch``.

        ``batch`` must provide id tensors under 'q1' and 'q2'; the indexing
        below assumes they are (batch, seq_len) -- TODO confirm against the
        collator output.
        """
        q1 = batch['q1'].to(device)
        q2 = batch['q2'].to(device)

        # Average over dimension 1 of the embedded ids.
        q1 = torch.mean(self.emb(q1), dim=1)
        q2 = torch.mean(self.emb(q2), dim=1)

        return nn.functional.cosine_similarity(q1, q2)

In [None]:
def calculate_val_loss_emb_model(model, val_loader, criterion, device=device):
    """Return the mean per-batch validation loss for a model with 1-D outputs.

    The model is switched to eval mode for the duration of the pass and put
    back into its previous mode afterwards, so the result does not depend on
    the mode the caller left the model in.

    Args:
        model: Module called as ``model(batch)``.
        val_loader: Iterable of batches providing 'labels'.
        criterion: Called as ``criterion(predicted, labels)``.
        device: Device the labels are moved to before the loss is computed.

    Returns:
        The unweighted mean of the per-batch losses.
    """
    was_training = model.training
    model.eval()
    loss_list = []
    try:
        with torch.no_grad():
            for batch in tqdm(val_loader):
                predicted = model(batch)
                batch_loss = criterion(predicted, batch['labels'].to(device).float())
                loss_list.append(batch_loss.item())
    finally:
        model.train(was_training)
    return np.mean(loss_list)

    
def calculate_loss_emb_model(criterion, pred, y_true):
    """Apply ``criterion`` to ``pred`` and the labels cast to float on ``device``."""
    targets = y_true.to(device).float()
    return criterion(pred, targets)


In [None]:
# Sanity check before training: validation loss of a freshly constructed
# EmbeddingModel, scored with the same criterion that training will use.
criterion = nn.BCEWithLogitsLoss()
val_loader = torch.utils.data.DataLoader(
                            val_preprocessed, 
                            batch_size=32, 
                            shuffle=False, 
                            collate_fn=transformers.default_data_collator, 
                            num_workers=2
                        ) 
calculate_val_loss_emb_model(EmbeddingModel().to(device), val_loader, criterion)

In [None]:
# Train the averaged-embedding baseline. EPOCHS is read as a global by train_model.
EPOCHS = 15
model = EmbeddingModel().to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=8e-5, weight_decay=1e-3)
# train_model steps the scheduler once per epoch.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

train_model(model,
            criterion, 
            optimizer,
            calculate_loss_emb_model,
            calculate_val_loss_emb_model, 
            scheduler)

# Siamese LSTM

In [None]:
# `train` may already be the DatasetDict produced by the split in the embeddings
# section; a DatasetDict has no train_test_split, so split only while `train`
# is still the flat dataset and otherwise reuse the existing split.
if not isinstance(train, datasets.DatasetDict):
    train = train.train_test_split(test_size = 0.1)

In [None]:
# Rebuild the vocabulary for this section from the training portion: keep tokens
# seen between 50 and 10000 times (both bounds inclusive in get_tokens).
tokenizer = nltk.tokenize.WordPunctTokenizer()
token_counts = token_counter(train['train'], tokenizer)
tokens = get_tokens(token_counts, 50, 10000)

In [None]:
# Embedding matrix plus vocabulary; get_embeddings puts PAD at row 0 and UNK at row 1.
embs_npa, vocab_npa = get_embeddings(tokens, name='glove-wiki-gigaword-300', emb_size=300)
# token -> row index in embs_npa; read as a global by tokenize_nltk.
token_to_id = {val: idx for idx, val in enumerate(vocab_npa)}

In [None]:
class ContrastiveLoss(nn.Module):
    """Margin-based contrastive loss over pairs of encodings.

    For a pair at Euclidean distance ``d`` with label ``y`` (1 = duplicate):

        0.5 * (y * d**2 + (1 - y) * max(0, delta - d)**2)

    averaged over the batch. Duplicates are pulled together; non-duplicates
    are pushed apart until they are at least ``delta`` away.

    Args:
        delta: Margin for non-duplicate pairs. With the default of 0 the
            non-duplicate term is always zero, so pass a positive margin to
            actually separate negatives.
    """

    def __init__(self, delta=0):
        super().__init__()
        self.delta = delta

    def forward(self, x, ans):
        """Compute the loss.

        Args:
            x: Tuple ``(q1, q2)`` of encodings, each (batch, features).
            ans: Labels with one entry per pair (any shape that flattens to
                ``batch``); 1 marks duplicates, 0 non-duplicates.

        Returns:
            Scalar tensor that stays attached to the autograd graph.
        """
        q1, q2 = x
        sq_dist = (q1 - q2).pow(2).sum(dim=1)
        # Clamp before the square root: its gradient is infinite at exactly 0.
        dist = torch.sqrt(sq_dist.clamp_min(1e-12))
        is_dup = ans.reshape(-1).to(sq_dist.dtype)
        pull = is_dup * sq_dist
        # The hinge goes inside the square; clamping a squared value is a no-op.
        push = (1.0 - is_dup) * torch.clamp(self.delta - dist, min=0.0).pow(2)
        return 0.5 * (pull + push).mean()

In [None]:
class AbsoluteDistance(nn.Module):
    """Merge two encodings into their element-wise absolute difference."""

    def __init__(self):
        super().__init__()

    # Implemented as forward (not __call__) so nn.Module's call machinery,
    # including registered hooks, keeps working.
    def forward(self, q1, q2):
        return torch.abs(q1-q2)

class Identity(nn.Module):
    """Pass both encodings through unchanged, as the tuple ``(q1, q2)``."""

    def __init__(self):
        super().__init__()

    # Implemented as forward (not __call__) so nn.Module's call machinery,
    # including registered hooks, keeps working.
    def forward(self, q1, q2):
        return q1, q2

class Concat(nn.Module):
    """Merge two encodings by concatenating them along dimension 1."""

    def __init__(self):
        super().__init__()

    # Implemented as forward (not __call__) so nn.Module's call machinery,
    # including registered hooks, keeps working.
    def forward(self, q1, q2):
        return torch.cat((q1,q2), dim=1)


class SiameseLSTM(nn.Module):
    """Siamese encoder: one shared LSTM encodes both questions, then merges them.

    Each question is embedded (table initialised from the notebook-level
    ``embs_npa``, fine-tuned, row 0 kept as a fixed padding vector) and run
    through the shared LSTM starting from the default all-zero state, so the
    same input always yields the same encoding. The encoding is the LSTM
    output at the last non-padding position rather than at the final time
    step, which would be the state after the trailing padding has been read.

    Args:
        n_tokens: Unused; kept for signature compatibility.
        emb_len: Embedding width, i.e. the LSTM input size.
        rec: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
        func: Class (not instance) of the module that merges the two
            encodings, e.g. AbsoluteDistance, Concat or Identity.
        fc: When true, the merged vector goes through the classification
            head and one logit per pair is returned; otherwise the output of
            ``func`` is returned as is.
        cat: Set to True when ``func`` doubles the feature width (Concat), so
            the head is sized for ``2 * rec`` inputs.
    """

    def __init__(self, n_tokens=len(tokens), 
                 emb_len=300, 
                 rec = 80, 
                 num_layers = 1,
                 func = AbsoluteDistance,
                 fc = True,
                 cat= False):
        super().__init__()
        self.rec = rec
        self.num_layers = num_layers
        self.emb = nn.Embedding.from_pretrained(
            torch.from_numpy(embs_npa).float(), freeze=False, padding_idx=0)
        self.lstm = nn.LSTM(input_size=emb_len, 
                            hidden_size=rec, 
                            num_layers=num_layers,
                            batch_first=True)
        self.bn1 = nn.BatchNorm1d(rec + cat*rec)
        self.bn2 = nn.BatchNorm1d(32)
        self.fc1 = nn.Linear(rec + cat*rec, 32)
        self.fc2 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.func = func()
        self.fc_flag = fc

    def _encode(self, ids):
        """Encode (batch, seq_len) ids as the LSTM output at the last real token."""
        # Padding uses id 0 and is appended at the end, so the number of
        # non-zero ids is the sequence length; clamp so an all-padding row
        # still selects position 0.
        lengths = (ids != 0).sum(dim=1).clamp(min=1)
        outputs, _ = self.lstm(self.emb(ids))
        last = (lengths - 1).view(-1, 1, 1).expand(-1, 1, outputs.size(2))
        return outputs.gather(1, last).squeeze(1)

    def forward(self, batch):
        """Return logits of shape (batch, 1) when ``fc`` is set, else ``func``'s output."""
        q1 = self._encode(batch['q1'].to(device))
        q2 = self._encode(batch['q2'].to(device))

        x = self.func(q1, q2)
        if self.fc_flag:
            return self.fc2(self.bn2(self.relu(self.fc1(self.bn1(x)))))
        return x

In [None]:
# Encode both portions of the split; train_model reads these two names as globals.
train_preprocessed = tokenize_nltk(train['train'], tokenizer)
val_preprocessed = tokenize_nltk(train['test'], tokenizer)

In [None]:
def calculate_val_loss(model, val_loader, criterion, device=device):
    """Return the mean per-batch validation loss for a model with (batch, 1) outputs.

    The model is switched to eval mode for the duration of the pass and put
    back into its previous mode afterwards. Without this, BatchNorm layers
    would normalise with per-batch statistics, update their running averages
    from validation data, and fail on a final batch of size one.

    Args:
        model: Module called as ``model(batch)``.
        val_loader: Iterable of batches providing 'labels'.
        criterion: Called as ``criterion(predicted, labels)`` with labels of
            shape (batch, 1).
        device: Device the labels are moved to before the loss is computed.

    Returns:
        The unweighted mean of the per-batch losses.
    """
    was_training = model.training
    model.eval()
    loss_list = []
    try:
        with torch.no_grad():
            for batch in tqdm(val_loader):
                predicted = model(batch)
                batch_loss = criterion(predicted, batch['labels'].to(device).float().unsqueeze(1))
                loss_list.append(batch_loss.item())
    finally:
        model.train(was_training)
    return np.mean(loss_list)

def calculate_loss(criterion, pred, y_true):
    """Apply ``criterion`` to ``pred`` and the float labels reshaped to a column on ``device``."""
    targets = y_true.to(device).float().unsqueeze(1)
    return criterion(pred, targets)

In [None]:
# Sanity check: validation loss of a freshly constructed SiameseLSTM.
# NOTE(review): `criterion` is not defined in this cell; it relies on the
# binding left behind by an earlier cell.
val_loader = torch.utils.data.DataLoader(
                            val_preprocessed, 
                            batch_size=32, 
                            shuffle=False, 
                            collate_fn=transformers.default_data_collator, 
                            num_workers=2
                        ) 
calculate_val_loss(SiameseLSTM().to(device), val_loader, criterion, device=device)

In [None]:
# Train the siamese model with concatenated encodings. cat=True sizes the
# classification head for the doubled feature width that Concat produces.
# EPOCHS is read as a global by train_model.
EPOCHS = 20 
siamese_model = SiameseLSTM(func = Concat,
                 fc = True,
                 cat= True).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(siamese_model.parameters(), lr=1e-4, weight_decay=1e-3)
# train_model steps the scheduler once per epoch.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
train_model(siamese_model,
            criterion, 
            optimizer,
            calculate_loss,
            calculate_val_loss, 
            scheduler)

In [None]:
# Final validation loss of the trained siamese model over the whole held-out split.
val_loader = torch.utils.data.DataLoader(
                            val_preprocessed, 
                            batch_size=32, 
                            shuffle=False, 
                            collate_fn=transformers.default_data_collator, 
                            num_workers=2
                        ) 
calculate_val_loss(siamese_model, val_loader, nn.BCEWithLogitsLoss(), device=device)