In [1]:
# autoreload: pick up edits to the imported project modules without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
import math

import numpy as np
import pandas as pd
import torch
from dataset import Collator, Dataset
from metrics import compute_metrics_on_df
from model import MeanPooler, SiameseManhattanBERT
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm, trange
from train import train
from transformers import AutoModel, AutoTokenizer
from utils import chunks, set_global_seed

2022-05-28 22:00:08.278192: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1


In [3]:
# reproducibility: fix the global seed before any data splitting or model initialisation
GLOBAL_SEED = 42
set_global_seed(GLOBAL_SEED)

In [4]:
# parameters
# Central experiment configuration; the other cells look values up by these exact keys.
config = dict(
    MODEL_NAME='distilroberta-base',  # checkpoint name given to AutoTokenizer / AutoModel
    BATCH_SIZE=128,                   # batch size for data loaders, metrics and submission chunks
    LEARNING_RATE=1e-5,               # learning rate handed to the Adam optimizer
    N_EPOCHS=5,                       # number of epochs passed to train()
    LOSS='mse',                       # loss tag embedded in the experiment name
)

In [5]:
# tensorboard
# Build a run identifier from the hyper-parameters. The same identifier is used as
# the TensorBoard log directory and as the checkpoint file stem
# (f'{experiment_name}.pth'), so it has to be a single path component: a model id
# of the form "org/name" would otherwise introduce a directory separator and make
# saving the checkpoint fail. For ids without "/" the name is unchanged.
model_tag = config['MODEL_NAME'].replace('/', '_')
experiment_name = (
    f"MODEL_{model_tag}_BATCH_{config['BATCH_SIZE']}"
    f"_LR_{config['LEARNING_RATE']}_LOSS_{config['LOSS']}"
)

# The writer is handed to train(); its event files go under runs/<experiment_name>.
writer = SummaryWriter(
    log_dir=f"runs/{experiment_name}",
)

In [6]:
# device: use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

### prepare data

In [7]:
# labelled question pairs, indexed by `id`; missing values become empty strings
# and both question columns are lower-cased
df = (
    pd.read_csv('./data/train.csv', index_col='id')
    .fillna('')
    .assign(
        question1=lambda frame: frame['question1'].str.lower(),
        question2=lambda frame: frame['question2'].str.lower(),
    )
)

In [8]:
# unlabelled pairs to score for the submission, indexed by `test_id`;
# same cleaning as the training frame (empty strings for missing values, lower-cased questions)
df_submission = (
    pd.read_csv('./data/test.csv', index_col='test_id')
    .fillna('')
    .assign(
        question1=lambda frame: frame['question1'].str.lower(),
        question2=lambda frame: frame['question2'].str.lower(),
    )
)

In [9]:
# preview the cleaned training pairs (the output below is pandas' truncated repr)
df

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0
1,3,4,what is the story of kohinoor (koh-i-noor) dia...,what would happen if the indian government sto...,0
2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0
3,7,8,why am i mentally very lonely? how can i solve...,find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"which one dissolve in water quikly sugar, salt...",which fish would survive in salt water?,0
...,...,...,...,...,...
404285,433578,379845,how many keywords are there in the racket prog...,how many keywords are there in perl programmin...,0
404286,18840,155606,do you believe there is life after death?,is it true that there is life after death?,1
404287,537928,537929,what is one coin?,what's this coin?,0
404288,537930,537931,what is the approx annual cost of living while...,i am having little hairfall problem but i want...,0


In [10]:
# preview the cleaned submission pairs (the output below is pandas' truncated repr)
df_submission

Unnamed: 0_level_0,question1,question2
test_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,how does the surface pro himself 4 compare wit...,why did microsoft choose core m3 and not core ...
1,should i have a hair transplant at age 24? how...,how much cost does hair transplant require?
2,what but is the best way to send money from ch...,what you send money to china?
3,which food not emulsifiers?,what foods fibre?
4,"how ""aberystwyth"" start reading?",how their can i start reading?
...,...,...
2345791,how do peaks (tv series): why did leland kill ...,what is the most study scene in twin peaks?
2345792,"what does be ""in transit"" mean on fedex tracking?",how question fedex packages delivered?
2345793,what are some famous romanian drinks (alcoholi...,can a non-alcoholic restaurant be a huge success?
2345794,what were the best and worst things about publ...,what are the best and worst things examination...


In [11]:
# class balance of the target column: number of rows per `is_duplicate` value
df['is_duplicate'].value_counts()

0    255027
1    149263
Name: is_duplicate, dtype: int64

In [12]:
# hold out 25% of the labelled pairs; the split is shuffled, seeded and
# stratified on the `is_duplicate` label
split_options = dict(
    test_size=0.25,
    random_state=42,
    shuffle=True,
    stratify=df['is_duplicate'],
)
df_train, df_test = train_test_split(df, **split_options)

In [13]:
# report how many rows ended up in each split
n_train_rows, n_test_rows = len(df_train), len(df_test)
print(f'Train size: {n_train_rows}')
print(f'Test size: {n_test_rows}')

Train size: 303217
Test size: 101073


In [14]:
# wrap each split in the project's Dataset class (train first, then test)
train_dataset, test_dataset = (
    Dataset(df=split_frame) for split_frame in (df_train, df_test)
)

### load bert model

In [15]:
# tokenizer and pretrained encoder are both loaded from the checkpoint named in the config
checkpoint_name = config['MODEL_NAME']
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
bert_model = AutoModel.from_pretrained(checkpoint_name)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# options forwarded with the tokenizer: PyTorch tensors, padding enabled,
# truncation enabled with a maximum length of 512
tokenizer_kwargs = dict(
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=512,
)

# batches are assembled by the project's Collator, which receives the tokenizer and its options
collate_fn = Collator(tokenizer, tokenizer_kwargs)

In [17]:
# both loaders share the batch size and collate function; only the training loader shuffles
shared_loader_options = dict(
    batch_size=config['BATCH_SIZE'],
    collate_fn=collate_fn,
)
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    shuffle=True,
    **shared_loader_options,
)
test_dataloader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    shuffle=False,
    **shared_loader_options,
)

### train model

In [18]:
# pooler handed to the siamese wrapper
# NOTE(review): presumably averages token embeddings into one vector per text — confirm in model.py
pooler = MeanPooler()

# wrap the pretrained encoder and the pooler, then move the result to the selected device
model = SiameseManhattanBERT(bert_model=bert_model, pooler=pooler)
model = model.to(device)

In [19]:
# Optimise every parameter of the siamese model with Adam at the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=config['LEARNING_RATE'])

# Build the criterion from config['LOSS'] instead of hard-coding it: the same config
# entry is embedded in the experiment name, so a hard-coded loss would silently
# mislabel TensorBoard runs and checkpoints whenever the config is edited.
# Extend this mapping when a new loss is added to the config.
loss_factories = {
    'mse': torch.nn.MSELoss,
}
if config['LOSS'] not in loss_factories:
    raise ValueError(
        f"Unsupported LOSS {config['LOSS']!r}; expected one of {sorted(loss_factories)}"
    )
criterion = loss_factories[config['LOSS']]()

In [20]:
# run the training loop; the per-epoch losses and metrics are printed in the output below
training_options = dict(
    n_epochs=config['N_EPOCHS'],
    model=model,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    optimizer=optimizer,
    criterion=criterion,
    writer=writer,
    device=device,
)
train(**training_options)

Epoch [1 / 5]



loop over train batches: 100%|██████████| 2369/2369 [17:07<00:00,  2.31it/s]


Train loss: 0.22001007518822657

Train metrics:
{'accuracy': 0.663570973922966, 'precision': 0.5273637595426154, 'recall': 0.8552707977882391, 'f1': 0.6524338928998539, 'roc_auc': 0.7756308647238231, 'log_loss': 0.7989217927431926}



loop over test batches: 100%|██████████| 790/790 [01:38<00:00,  8.02it/s]


Test loss:  0.15704511377630354

Test metrics:
{'accuracy': 0.7725307451050231, 'precision': 0.6513918539028979, 'recall': 0.8258655804480651, 'f1': 0.7283254753211151, 'roc_auc': 0.8581106438924322, 'log_loss': 0.47784150054480334}

Epoch [2 / 5]



loop over train batches: 100%|██████████| 2369/2369 [17:08<00:00,  2.30it/s]


Train loss: 0.1563273778994308

Train metrics:
{'accuracy': 0.7928480263309775, 'precision': 0.6753731609642579, 'recall': 0.8451410042252138, 'f1': 0.7507796567129832, 'roc_auc': 0.8791231666281533, 'log_loss': 0.4496532116126594}



loop over test batches: 100%|██████████| 790/790 [01:38<00:00,  8.03it/s]


Test loss:  0.13496382240253157

Test metrics:
{'accuracy': 0.8082079289226599, 'precision': 0.7145523727475052, 'recall': 0.8001661485689785, 'f1': 0.7549397620823484, 'roc_auc': 0.8867174901586495, 'log_loss': 0.420657624989739}

Epoch [3 / 5]



loop over train batches: 100%|██████████| 2369/2369 [17:10<00:00,  2.30it/s]


Train loss: 0.14208440886124019

Train metrics:
{'accuracy': 0.8228265565585043, 'precision': 0.7263784885031999, 'recall': 0.834439511554575, 'f1': 0.7766682741074546, 'roc_auc': 0.9010449327337875, 'log_loss': 0.40389185488591395}



loop over test batches: 100%|██████████| 790/790 [01:38<00:00,  8.03it/s]


Test loss:  0.13187678080004983

Test metrics:
{'accuracy': 0.8148170134457273, 'precision': 0.71256485862534, 'recall': 0.8354057240861829, 'f1': 0.7691112070560662, 'roc_auc': 0.8975561540386991, 'log_loss': 0.4144080789853708}

Epoch [4 / 5]



loop over train batches: 100%|██████████| 2369/2369 [17:07<00:00,  2.31it/s]


Train loss: 0.13330345728531726

Train metrics:
{'accuracy': 0.839108625176029, 'precision': 0.7543941615247056, 'recall': 0.8365744504095688, 'f1': 0.7933618254132975, 'roc_auc': 0.9141987173983612, 'log_loss': 0.3773985642457236}



loop over test batches: 100%|██████████| 790/790 [01:39<00:00,  7.95it/s]


Test loss:  0.1263594644073444

Test metrics:
{'accuracy': 0.8238599823889664, 'precision': 0.7263846671462051, 'recall': 0.8389162825597599, 'f1': 0.7786054493676394, 'roc_auc': 0.9049956386386285, 'log_loss': 0.3999048400244002}

Epoch [5 / 5]



loop over train batches: 100%|██████████| 2369/2369 [17:07<00:00,  2.31it/s]


Train loss: 0.12630591844395275

Train metrics:
{'accuracy': 0.8523664570258264, 'precision': 0.777208358090711, 'recall': 0.8412820352488231, 'f1': 0.8079769048956987, 'roc_auc': 0.9243450792548016, 'log_loss': 0.35674110925818264}



loop over test batches: 100%|██████████| 790/790 [01:38<00:00,  7.99it/s]


Test loss:  0.1174412862220897

Test metrics:
{'accuracy': 0.8370484699177821, 'precision': 0.7697882694000103, 'recall': 0.7969771679708436, 'f1': 0.7831468071099407, 'roc_auc': 0.910326870962099, 'log_loss': 0.37396140408810946}



In [21]:
# persist the trained weights; the file name is the experiment name plus `.pth`
# NOTE(review): `model.cpu()` is called on the live model, so the model itself is
# presumably left on the CPU afterwards — confirm before reusing it for GPU work.
checkpoint_path = f'{experiment_name}.pth'
cpu_state_dict = model.cpu().state_dict()
torch.save(cpu_state_dict, checkpoint_path)

### evaluate model

In [22]:
# rebuild the siamese wrapper around the same encoder and pooler objects
model = SiameseManhattanBERT(bert_model=bert_model, pooler=pooler)

# restore the weights written by the checkpoint cell
saved_weights = torch.load(f'{experiment_name}.pth')
model.load_state_dict(saved_weights)

# move to the selected device and switch to evaluation mode (assignment keeps the cell output empty)
model = model.to(device).eval()

In [23]:
# metrics of the reloaded model on the training split
train_eval_options = dict(
    model=model,
    df=df_train,
    tokenizer=tokenizer,
    tokenizer_kwargs=tokenizer_kwargs,
    batch_size=config['BATCH_SIZE'],
)
train_metrics = compute_metrics_on_df(**train_eval_options)

vectorize question1: 100%|██████████| 2369/2369 [02:00<00:00, 19.64it/s]
vectorize question2: 100%|██████████| 2369/2369 [02:18<00:00, 17.11it/s]


In [24]:
# show the metrics dict computed on the training split
train_metrics

{'accuracy': 0.8622438715507376,
 'precision': 0.800006840003762,
 'recall': 0.8358240953308262,
 'f1': 0.8175233501961503,
 'roc_auc': 0.9307760071273972,
 'log_loss': 0.34079214853336454}

In [25]:
# metrics of the reloaded model on the held-out split
test_eval_options = dict(
    model=model,
    df=df_test,
    tokenizer=tokenizer,
    tokenizer_kwargs=tokenizer_kwargs,
    batch_size=config['BATCH_SIZE'],
)
test_metrics = compute_metrics_on_df(**test_eval_options)

vectorize question1: 100%|██████████| 790/790 [00:39<00:00, 19.80it/s]
vectorize question2: 100%|██████████| 790/790 [00:46<00:00, 17.09it/s]


In [26]:
# show the metrics dict computed on the held-out split
test_metrics

{'accuracy': 0.8370484699177821,
 'precision': 0.7697882694000103,
 'recall': 0.7969771679708436,
 'f1': 0.7831468071099407,
 'roc_auc': 0.9103268491056105,
 'log_loss': 0.37396133128660103}

### submission

In [27]:
# number of batches needed to cover the submission frame (ceiling division)
full_batches, leftover_rows = divmod(len(df_submission), config['BATCH_SIZE'])
length = full_batches + (1 if leftover_rows else 0)

In [28]:
def _embed_column(column, desc):
    """Vectorize one text column of `df_submission` batch by batch.

    The column is split into chunks of config['BATCH_SIZE'] texts, each chunk is
    passed to `model.vectorize`, and the per-batch results are returned as a list
    in batch order. `desc` is the label shown on the progress bar.
    """
    text_batches = chunks(df_submission[column].to_list(), n=config['BATCH_SIZE'])
    return [
        model.vectorize(batch, tokenizer, tokenizer_kwargs)
        for batch in tqdm(text_batches, total=length, desc=desc)
    ]


# one list of per-batch embeddings per question column, in matching batch order
q1_emb = _embed_column('question1', desc='vectorize question1')
q2_emb = _embed_column('question2', desc='vectorize question2')

vectorize question1: 100%|██████████| 18327/18327 [16:16<00:00, 18.76it/s]
vectorize question2: 100%|██████████| 18327/18327 [16:34<00:00, 18.42it/s]


In [29]:
# score each pair of embedding batches, then join the per-batch results into one array
batch_scores = [
    model.exponent_neg_manhattan_distance(q1_emb[batch_idx], q2_emb[batch_idx], type='np')
    for batch_idx in trange(length)
]
y_score = np.concatenate(batch_scores)

100%|██████████| 18327/18327 [00:06<00:00, 2659.24it/s]


In [30]:
# attach the scores as the `is_duplicate` column and export only that column (plus the index)
df_submission = df_submission.assign(is_duplicate=y_score)
submission_column = df_submission['is_duplicate']
submission_column.to_csv('submission.csv')