In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports all loaded modules
# before each cell runs, so edits to the local .py files imported below are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import math

import numpy as np
import pandas as pd
import torch
from dataset import Collator, Dataset
from metrics import compute_metrics_on_df_contrastive
from model import MeanPooler, SiameseContrastiveBERT
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm, trange
from train_contrastive import train
from transformers import AutoModel, AutoTokenizer
from utils import chunks, set_global_seed

2022-05-31 23:49:40.851700: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1


In [3]:
# reproducibility: fix the seed before any split, shuffling or model setup.
# NOTE(review): set_global_seed comes from the local utils module; presumably it
# seeds the Python / NumPy / PyTorch RNGs — confirm there.
set_global_seed(42)

In [4]:
# Run hyperparameters; the cells below look them up by key.
config = dict(
    MODEL_NAME='distilroberta-base',  # checkpoint used for both tokenizer and encoder
    BATCH_SIZE=128,                   # dataloaders, metric computation and submission batching
    LEARNING_RATE=1e-5,               # learning rate handed to Adam
    N_EPOCHS=10,                      # number of epochs passed to train()
    MARGIN=2,                         # margin of the contrastive loss
)

In [5]:
# TensorBoard: the run name encodes the hyperparameters so that runs with
# different settings land in separate sub-directories of ./runs.
experiment_name = (
    f"MODEL_CONTRASTIVE_{config['MODEL_NAME']}"
    f"_BATCH_{config['BATCH_SIZE']}"
    f"_LR_{config['LEARNING_RATE']}"
    f"_MARGIN_{config['MARGIN']}"
)

writer = SummaryWriter(log_dir=f"runs/{experiment_name}")

In [6]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

### prepare data

In [7]:
# Load the labelled question pairs, replace missing values with empty strings
# and lower-case both question columns.
df = (
    pd.read_csv('./data/train.csv', index_col='id')
    .fillna('')
    .assign(
        question1=lambda frame: frame['question1'].str.lower(),
        question2=lambda frame: frame['question2'].str.lower(),
    )
)

In [8]:
# Load the unlabelled submission pairs with the same cleaning as the training
# frame: empty strings for missing values, lower-cased questions.
df_submission = (
    pd.read_csv('./data/test.csv', index_col='test_id')
    .fillna('')
    .assign(
        question1=lambda frame: frame['question1'].str.lower(),
        question2=lambda frame: frame['question2'].str.lower(),
    )
)

In [9]:
# Preview the cleaned training frame (pandas truncates the display).
df

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0
1,3,4,what is the story of kohinoor (koh-i-noor) dia...,what would happen if the indian government sto...,0
2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0
3,7,8,why am i mentally very lonely? how can i solve...,find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"which one dissolve in water quikly sugar, salt...",which fish would survive in salt water?,0
...,...,...,...,...,...
404285,433578,379845,how many keywords are there in the racket prog...,how many keywords are there in perl programmin...,0
404286,18840,155606,do you believe there is life after death?,is it true that there is life after death?,1
404287,537928,537929,what is one coin?,what's this coin?,0
404288,537930,537931,what is the approx annual cost of living while...,i am having little hairfall problem but i want...,0


In [10]:
# Preview the cleaned submission frame; it has no is_duplicate column yet.
df_submission

Unnamed: 0_level_0,question1,question2
test_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,how does the surface pro himself 4 compare wit...,why did microsoft choose core m3 and not core ...
1,should i have a hair transplant at age 24? how...,how much cost does hair transplant require?
2,what but is the best way to send money from ch...,what you send money to china?
3,which food not emulsifiers?,what foods fibre?
4,"how ""aberystwyth"" start reading?",how their can i start reading?
...,...,...
2345791,how do peaks (tv series): why did leland kill ...,what is the most study scene in twin peaks?
2345792,"what does be ""in transit"" mean on fedex tracking?",how question fedex packages delivered?
2345793,what are some famous romanian drinks (alcoholi...,can a non-alcoholic restaurant be a huge success?
2345794,what were the best and worst things about publ...,what are the best and worst things examination...


In [11]:
# Class balance check: the labels are imbalanced (more 0s than 1s), which is
# why the split below is stratified on this column.
df['is_duplicate'].value_counts()

0    255027
1    149263
Name: is_duplicate, dtype: int64

In [12]:
# Hold out 25% of the labelled pairs for evaluation. The fixed random_state
# makes the split repeatable; stratifying keeps the is_duplicate ratio the
# same in both parts.
df_train, df_test = train_test_split(
    df,
    test_size=0.25,
    random_state=42,
    shuffle=True,
    stratify=df['is_duplicate'],
)

In [13]:
# Report the number of rows in each split, one per line.
print(
    f'Train size: {df_train.shape[0]}',
    f'Test size: {df_test.shape[0]}',
    sep='\n',
)

Train size: 303217
Test size: 101073


In [14]:
# Wrap each split in the project's Dataset class (train first, then test) for
# the dataloaders built further down.
train_dataset, test_dataset = (
    Dataset(df=split_frame) for split_frame in (df_train, df_test)
)

### load bert model

In [15]:
# Load the tokenizer and the pretrained encoder for the configured checkpoint.
# AutoModel loads the bare encoder, so the warning about unused lm_head weights
# printed by this cell is expected.
tokenizer = AutoTokenizer.from_pretrained(config['MODEL_NAME'])
bert_model = AutoModel.from_pretrained(config['MODEL_NAME'])

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Tokenizer options shared by the collator, the metric helper and the
# submission vectorisation below.
# NOTE(review): presumably forwarded verbatim to the tokenizer call — confirm
# in dataset.Collator and model.vectorize.
tokenizer_kwargs = dict(
    return_tensors='pt',  # PyTorch tensors
    padding=True,         # pad each batch to a common length
    truncation=True,      # cut inputs longer than max_length
    max_length=512,
)

collate_fn = Collator(tokenizer, tokenizer_kwargs)

In [17]:
# Batch both splits with the shared collate function. Training batches are
# reshuffled; the held-out split is iterated in a fixed order.
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=config['BATCH_SIZE'],
    shuffle=True,
    collate_fn=collate_fn,
)
test_dataloader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=config['BATCH_SIZE'],
    shuffle=False,
    collate_fn=collate_fn,
)

### train model

In [18]:
# Assemble the siamese model from the pretrained encoder and a mean pooler,
# then move it to the selected device.
pooler = MeanPooler()

model = SiameseContrastiveBERT(bert_model=bert_model, pooler=pooler)
model = model.to(device)

In [19]:
# Adam over every parameter exposed by the model, with the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=config['LEARNING_RATE'])

In [20]:
class ContrastiveLoss(torch.nn.Module):
    """Margin-based contrastive loss on the Euclidean distance of embedding pairs.

    For a pair labelled ``1`` the loss is the distance itself, so such pairs
    are pulled together. For a pair labelled ``0`` the loss is
    ``relu(margin - distance)``, so such pairs are pushed apart until they are
    at least ``margin`` away. The distance enters linearly (it is not squared).

    Args:
        margin: distance beyond which a pair labelled ``0`` contributes no loss.
    """

    def __init__(self, margin: float):
        super().__init__()
        self.margin = margin

    def forward(self, x1: torch.Tensor, x2: torch.Tensor, tgt: torch.LongTensor) -> torch.Tensor:
        """Return the mean contrastive loss over the batch.

        Args:
            x1: embeddings of the first element of each pair.
            x2: embeddings of the second element of each pair, same shape as ``x1``.
            tgt: pair labels (1 or 0), one per pair. Integer, bool and float
                labels are accepted; a trailing singleton dimension such as
                ``(batch, 1)`` is tolerated.

        Returns:
            Scalar tensor with the loss averaged over all pairs.
        """
        distance = torch.nn.functional.pairwise_distance(x1, x2)

        # Labels shaped (batch, 1) would silently broadcast against the (batch,)
        # distance vector into a (batch, batch) matrix and produce a wrong loss.
        # Only reshape when the element counts agree, so deliberate broadcasting
        # (e.g. a scalar label) keeps working as before.
        if tgt.shape != distance.shape and tgt.numel() == distance.numel():
            tgt = tgt.reshape(distance.shape)

        # `1 - tgt` is not defined for bool tensors; cast non-float labels to
        # the distance dtype (the same dtype integer labels were promoted to).
        if not tgt.is_floating_point():
            tgt = tgt.to(distance.dtype)

        pull_term = tgt * distance
        push_term = (1 - tgt) * torch.relu(self.margin - distance)

        return (pull_term + push_term).mean()

    
# Loss instance used for training, with the margin taken from the run config.
criterion = ContrastiveLoss(margin=config['MARGIN'])

In [21]:
# Run the training loop from the local train_contrastive module. Per the
# logged output, each epoch iterates over the train batches and then the test
# batches, printing loss and metrics for both.
# NOTE(review): presumably the same values are written to TensorBoard through
# `writer` — confirm in train_contrastive.train.
train(
    n_epochs=config['N_EPOCHS'],
    model=model,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    optimizer=optimizer,
    criterion=criterion,
    writer=writer,
    device=device,
)

Epoch [1 / 10]



loop over train batches: 100%|██████████| 2369/2369 [17:05<00:00,  2.31it/s]


Train loss: 0.5440973055267092

Train metrics:
{'accuracy': 0.6310892858909626, 'precision': 0.967741935483871, 'recall': 0.0008039518700813778, 'f1': 0.0016065690824705464, 'roc_auc': 0.8956519850783511, 'log_loss': 5.606496789302467}



loop over test batches: 100%|██████████| 790/790 [01:38<00:00,  8.04it/s]


Test loss:  0.4183670246525656

Test metrics:
{'accuracy': 0.6318304591730729, 'precision': 0.9482758620689655, 'recall': 0.002947797191553221, 'f1': 0.005877324214575763, 'roc_auc': 0.9108382207397486, 'log_loss': 4.78596155847873}

Epoch [2 / 10]



loop over train batches: 100%|██████████| 2369/2369 [17:05<00:00,  2.31it/s]


Train loss: 0.44650214005996025

Train metrics:
{'accuracy': 0.6436281606901988, 'precision': 0.9376547377897817, 'recall': 0.03721403878621133, 'f1': 0.07158690609158863, 'roc_auc': 0.9151327518922898, 'log_loss': 4.146628668694763}



loop over test batches: 100%|██████████| 790/790 [01:37<00:00,  8.08it/s]


Test loss:  0.40534631821173656

Test metrics:
{'accuracy': 0.6667656050577305, 'precision': 0.9393280154701474, 'recall': 0.10413763533068925, 'f1': 0.18748944587846478, 'roc_auc': 0.9086321018335889, 'log_loss': 3.6290640577155857}

Epoch [3 / 10]



loop over train batches: 100%|██████████| 2369/2369 [17:08<00:00,  2.30it/s]


Train loss: 0.40917006958985236

Train metrics:
{'accuracy': 0.6870129313330058, 'precision': 0.9273392839233778, 'recall': 0.16519424370461022, 'f1': 0.28043278817793754, 'roc_auc': 0.924837822828506, 'log_loss': 3.6894847433261804}



loop over test batches: 100%|██████████| 790/790 [01:38<00:00,  8.05it/s]


Test loss:  0.369601476079301

Test metrics:
{'accuracy': 0.7023042751278779, 'precision': 0.9245682058512513, 'recall': 0.2108746918212027, 'f1': 0.34342200013092716, 'roc_auc': 0.9176680967764886, 'log_loss': 3.9701220194862787}

Epoch [4 / 10]



loop over train batches:  80%|███████▉  | 1895/2369 [13:37<03:24,  2.32it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

loop over train batches: 100%|██████████| 2369/2369 [17:08<00:00,  2.30it/s]


Train loss: 0.3366064605719432

Train metrics:
{'accuracy': 0.7790724134860513, 'precision': 0.8941471454621966, 'recall': 0.4555280623866651, 'f1': 0.6035661236011576, 'roc_auc': 0.9462425647231006, 'log_loss': 2.8996173670369143}



loop over test batches: 100%|██████████| 790/790 [01:37<00:00,  8.08it/s]


Test loss:  0.34601320041508615

Test metrics:
{'accuracy': 0.7715116796770651, 'precision': 0.8751318843637899, 'recall': 0.4445546146425126, 'f1': 0.5896005117998294, 'roc_auc': 0.9224930834428118, 'log_loss': 3.375259546974777}

Epoch [8 / 10]



loop over train batches:  81%|████████  | 1910/2369 [13:47<03:47,  2.02it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [22]:
# Persist the trained weights under the experiment name. `.cpu()` is applied
# before taking the state dict, so the checkpoint holds CPU tensors.
# NOTE(review): for an nn.Module, `.cpu()` moves the live `model` off the GPU
# as a side effect; the next cell moves the reloaded model back to `device`.
torch.save(model.cpu().state_dict(), f'{experiment_name}.pth')

### evaluate model

In [23]:
# Rebuild the model wrapper and restore the checkpoint saved under the
# experiment name.
# NOTE(review): this reuses the same `bert_model` object that was handed to
# the trained model, so its weights may already be the fine-tuned ones —
# confirm whether SiameseContrastiveBERT copies or shares the encoder.
model = SiameseContrastiveBERT(
    bert_model=bert_model,
    pooler=pooler,
)

model.load_state_dict(torch.load(f'{experiment_name}.pth'))
model.to(device)
model.eval();  # evaluation mode; the trailing ';' suppresses the cell's repr output

In [24]:
# Score the restored model on the training split with the helper from the
# local metrics module. Per the progress output, it vectorises question1 and
# question2 in batches of BATCH_SIZE.
train_metrics = compute_metrics_on_df_contrastive(
    model=model,
    df=df_train,
    tokenizer=tokenizer,
    tokenizer_kwargs=tokenizer_kwargs,
    batch_size=config['BATCH_SIZE'],
)

vectorize question1: 100%|██████████| 2369/2369 [01:50<00:00, 21.40it/s]
vectorize question2: 100%|██████████| 2369/2369 [02:06<00:00, 18.67it/s]


In [25]:
# Show the metrics dict for the training split.
train_metrics

{'accuracy': 0.8075074946325569,
 'precision': 0.8956345807366276,
 'recall': 0.5417474340536147,
 'f1': 0.6751270448233061,
 'roc_auc': 0.9565895236466382,
 'log_loss': 2.502801084654033}

In [26]:
# Same evaluation on the held-out split, for comparison with the training
# split metrics above.
test_metrics = compute_metrics_on_df_contrastive(
    model=model,
    df=df_test,
    tokenizer=tokenizer,
    tokenizer_kwargs=tokenizer_kwargs,
    batch_size=config['BATCH_SIZE']
)

vectorize question1: 100%|██████████| 790/790 [00:36<00:00, 21.77it/s]
vectorize question2: 100%|██████████| 790/790 [00:42<00:00, 18.72it/s]


In [27]:
# Show the metrics dict for the held-out split.
test_metrics

{'accuracy': 0.7872923530517547,
 'precision': 0.8616306186839819,
 'recall': 0.5049576589130668,
 'f1': 0.6367491763115655,
 'roc_auc': 0.9230472721477609,
 'log_loss': 3.2070163058813783}

### submission

In [28]:
# Number of batches needed to cover the submission frame (the last batch may
# be partial); used as the progress-bar total and loop bound below.
length = math.ceil(len(df_submission) / config['BATCH_SIZE'])

In [29]:
def _vectorize_in_batches(texts, desc):
    """Embed ``texts`` batch by batch with the evaluated model.

    Args:
        texts: list of question strings to embed.
        desc: label shown on the progress bar.

    Returns:
        List with one entry per batch, each being whatever ``model.vectorize``
        returns for that batch, in input order.
    """
    batch_size = config['BATCH_SIZE']
    batch_embeddings = []
    # Inference only: make sure no autograd state is kept alive across the
    # many batches, whether or not model.vectorize disables it itself.
    with torch.no_grad():
        for batch in tqdm(
            chunks(texts, n=batch_size),
            total=math.ceil(len(texts) / batch_size),
            desc=desc,
        ):
            batch_embeddings.append(
                model.vectorize(batch, tokenizer, tokenizer_kwargs)
            )
    return batch_embeddings


# Both question columns go through the same routine; the per-batch lists stay
# index-aligned, so batch i of q1_emb pairs with batch i of q2_emb.
q1_emb = _vectorize_in_batches(
    df_submission['question1'].to_list(),
    desc='vectorize question1',
)
q2_emb = _vectorize_in_batches(
    df_submission['question2'].to_list(),
    desc='vectorize question2',
)

vectorize question1:  48%|████▊     | 8800/18327 [07:07<07:55, 20.04it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [30]:
# Turn each pair of batch embeddings into similarity scores and stitch the
# batches into one flat array aligned with df_submission's rows.
# NOTE(review): scoring uses model.exponent_neg_manhattan_distance while the
# ContrastiveLoss defined in this notebook is based on Euclidean distance —
# confirm this is the intended scoring function.
with torch.no_grad():
    y_score = np.concatenate([
        model.exponent_neg_manhattan_distance(q1_emb[idx], q2_emb[idx]).cpu().numpy()
        for idx in trange(length)
    ])

100%|██████████| 18327/18327 [00:01<00:00, 11053.29it/s]


In [31]:
# Attach the scores to the submission frame and write only that column to
# submission.csv; the test_id index is written alongside it.
df_submission['is_duplicate'] = y_score
df_submission['is_duplicate'].to_csv('submission.csv')