In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import math

import numpy as np
import pandas as pd
import torch
from dataset import Collator, Dataset
from metrics import compute_metrics_on_df
from nn_modules.poolers import MeanPooler
from nn_modules.models import SiameseManhattanBERT
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm, trange
from train import train
from transformers import AutoModel, AutoTokenizer
from utils import chunks, set_global_seed

In [3]:
# reproducibility: fix the global seed (42) through the project helper before
# any data split, model initialisation or training code runs
set_global_seed(42)

In [4]:
# parameters
# Single place for the knobs of this run: pretrained checkpoint, batch size,
# optimizer learning rate, number of epochs and the loss label.
config = dict(
    MODEL_NAME='distilroberta-base',
    BATCH_SIZE=128,
    LEARNING_RATE=1e-5,
    N_EPOCHS=2,
    LOSS='mse',
)

In [5]:
# tensorboard
# The experiment name doubles as the TensorBoard run directory and, later in
# the notebook, as the checkpoint file name. Hub checkpoints are often
# namespaced ("org/model"); a raw "/" would add an unintended directory level
# to both paths, so it is replaced before the name is assembled.
model_tag = config['MODEL_NAME'].replace('/', '_')
experiment_name = f"MODEL_{model_tag}_BATCH_{config['BATCH_SIZE']}_LR_{config['LEARNING_RATE']}_LOSS_{config['LOSS']}"

writer = SummaryWriter(
    log_dir=f"runs/{experiment_name}",
)

In [6]:
# device
# Keep preferring the second GPU (index 1), but fall back to GPU 0 on
# single-GPU hosts and to the CPU when CUDA is unavailable, instead of
# selecting a device that does not exist.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')
device

device(type='cuda', index=1)

### prepare data

In [7]:
# Load the labelled question pairs (indexed by `id`) and replace missing
# values with empty strings.
df = pd.read_csv('./data/train.csv', index_col='id').fillna('')

# Lower-case both question columns.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].str.lower()

In [8]:
# Load the unlabelled submission pairs (indexed by `test_id`) and replace
# missing values with empty strings.
# NOTE(review): the saved output of this cell holds a fragment of a warning
# whose message was not preserved — cause unverified, check on re-run.
df_submission = pd.read_csv('./data/test.csv', index_col='test_id').fillna('')

# Lower-case both question columns, mirroring the training frame.
for question_col in ('question1', 'question2'):
    df_submission[question_col] = df_submission[question_col].str.lower()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [9]:
# preview the labelled training pairs (pandas abbreviates the rendering of large frames)
df

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0
1,3,4,what is the story of kohinoor (koh-i-noor) dia...,what would happen if the indian government sto...,0
2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0
3,7,8,why am i mentally very lonely? how can i solve...,find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"which one dissolve in water quikly sugar, salt...",which fish would survive in salt water?,0
...,...,...,...,...,...
404285,433578,379845,how many keywords are there in the racket prog...,how many keywords are there in perl programmin...,0
404286,18840,155606,do you believe there is life after death?,is it true that there is life after death?,1
404287,537928,537929,what is one coin?,what's this coin?,0
404288,537930,537931,what is the approx annual cost of living while...,i am having little hairfall problem but i want...,0


In [10]:
# preview the unlabelled submission pairs (pandas abbreviates the rendering of large frames)
df_submission

Unnamed: 0_level_0,question1,question2
test_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,how does the surface pro himself 4 compare wit...,why did microsoft choose core m3 and not core ...
1,should i have a hair transplant at age 24? how...,how much cost does hair transplant require?
2,what but is the best way to send money from ch...,what you send money to china?
3,which food not emulsifiers?,what foods fibre?
4,"how ""aberystwyth"" start reading?",how their can i start reading?
...,...,...
2345791,how do peaks (tv series): why did leland kill ...,what is the most study scene in twin peaks?
2345792,"what does be ""in transit"" mean on fedex tracking?",how question fedex packages delivered?
2345793,what are some famous romanian drinks (alcoholi...,can a non-alcoholic restaurant be a huge success?
2345794,what were the best and worst things about publ...,what are the best and worst things examination...


In [11]:
# label distribution: number of pairs for each is_duplicate value
df['is_duplicate'].value_counts()

0    255027
1    149263
Name: is_duplicate, dtype: int64

In [12]:
# Stratified 75/25 hold-out split: stratifying on the label keeps the share of
# duplicate pairs the same in both parts, and the fixed random_state makes
# the shuffle repeatable.
df_train, df_test = train_test_split(
    df,
    stratify=df['is_duplicate'],
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

In [13]:
# Report the number of rows in each split, one per line.
print(
    f'Train size: {df_train.shape[0]}',
    f'Test size: {df_test.shape[0]}',
    sep='\n',
)

Train size: 303217
Test size: 101073


In [14]:
# Wrap each split in the project's Dataset class.
train_dataset, test_dataset = Dataset(df=df_train), Dataset(df=df_test)

### load bert model

In [15]:
# Tokenizer and encoder are both loaded from the checkpoint named in the config.
checkpoint_name = config['MODEL_NAME']
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
bert_model = AutoModel.from_pretrained(checkpoint_name)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Tokenization options shared by training and later vectorisation: PyTorch
# tensors as output, padding enabled, truncation enabled with a 512 cap.
# presumably the Collator forwards them to the tokenizer call — TODO confirm
tokenizer_kwargs = dict(
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=512,
)

# Batch-building callable handed to the data loaders.
collate_fn = Collator(tokenizer, tokenizer_kwargs)

In [17]:
# Both loaders share the batch size and the collate function; only the
# training loader shuffles its samples.
loader_options = dict(
    batch_size=config['BATCH_SIZE'],
    collate_fn=collate_fn,
)

train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **loader_options)
test_dataloader = torch.utils.data.DataLoader(test_dataset, shuffle=False, **loader_options)

### train model

In [18]:
# Build the siamese model from the pretrained encoder and a MeanPooler
# instance, then place it on the selected device.
pooler = MeanPooler()

model = SiameseManhattanBERT(pooler=pooler, bert_model=bert_model)
model = model.to(device)

In [19]:
# Adam over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=config['LEARNING_RATE'])

# Build the criterion from config['LOSS'] so that the loss label written into
# the experiment name is always the loss that is actually optimised. An
# unsupported label fails here instead of silently training with MSE.
loss_factories = {
    'mse': torch.nn.MSELoss,
}
if config['LOSS'] not in loss_factories:
    raise ValueError(
        f"Unsupported LOSS {config['LOSS']!r}; expected one of {sorted(loss_factories)}"
    )
criterion = loss_factories[config['LOSS']]()

In [20]:
# Run the project's training routine for the configured number of epochs.
# The recorded output shows loss and classification metrics for the train
# and test loaders after every epoch.
train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    n_epochs=config['N_EPOCHS'],
    device=device,
    writer=writer,
)

Epoch [1 / 2]



loop over train batches: 100%|██████████| 2369/2369 [07:46<00:00,  5.07it/s]


Train loss: 0.21991332367836247

Train accuracy: 0.6625240010160741

Train precision: 0.5211587395873062

Train recall: 0.856315755987575

Train f1: 0.6386560737365088

Train roc_auc: 0.8083892036587743

Train log_loss: 0.7948418949703847



loop over test batches: 100%|██████████| 790/790 [00:50<00:00, 15.61it/s]


Test loss:  0.1637208505710469

Test accuracy: 0.7613203723238007

Test precision: 0.6294062070953276

Test recall: 0.8594910060093512

Test f1: 0.7249311310510309

Test roc_auc: 0.8597037457484802

Test log_loss: 0.4961717344653053

Epoch [2 / 2]



loop over train batches: 100%|██████████| 2369/2369 [07:45<00:00,  5.08it/s]


Train loss: 0.156595030025044

Train accuracy: 0.7917284980780509

Train precision: 0.6755341433494803

Train recall: 0.8420931848561625

Train f1: 0.747427995996896

Train roc_auc: 0.8790165307712895

Train log_loss: 0.4490912747380957



loop over test batches: 100%|██████████| 790/790 [00:50<00:00, 15.52it/s]

Test loss:  0.1359457102002977

Test accuracy: 0.8056330823175496

Test precision: 0.7067904329394109

Test recall: 0.8093467883394405

Test f1: 0.7527916828539939

Test roc_auc: 0.8857503403054725

Test log_loss: 0.42221644105392425






In [21]:
# Persist the weights as a state dict taken after moving the model to the CPU.
# NOTE(review): for a torch.nn.Module, .cpu() moves the model itself, so it
# stays on the CPU after this cell — confirm the model class is one.
checkpoint_path = f'{experiment_name}.pth'
cpu_weights = model.cpu().state_dict()
torch.save(cpu_weights, checkpoint_path)

### evaluate model

In [22]:
# Rebuild the model around the same encoder and pooler objects, restore the
# saved weights, move it to the selected device and switch to eval mode.
model = SiameseManhattanBERT(pooler=pooler, bert_model=bert_model)

saved_weights = torch.load(f'{experiment_name}.pth')
model.load_state_dict(saved_weights)
model.to(device)
model.eval();

In [23]:
# Score the reloaded model on the training split.
metric_inputs = dict(
    model=model,
    tokenizer=tokenizer,
    tokenizer_kwargs=tokenizer_kwargs,
    batch_size=config['BATCH_SIZE'],
)
train_metrics = compute_metrics_on_df(df=df_train, **metric_inputs)

vectorize question1: 100%|██████████| 2369/2369 [00:44<00:00, 53.19it/s]
vectorize question2: 100%|██████████| 2369/2369 [00:50<00:00, 46.80it/s]


In [24]:
# show the metrics dict computed on the training split
train_metrics

{'accuracy': 0.8154028303162422,
 'precision': 0.7174862453762706,
 'recall': 0.8247563579193725,
 'f1': 0.7673907351920576,
 'roc_auc': 0.8949298493907045,
 'log_loss': 0.41018909422128913}

In [25]:
# Score the reloaded model on the held-out split.
metric_inputs = dict(
    model=model,
    tokenizer=tokenizer,
    tokenizer_kwargs=tokenizer_kwargs,
    batch_size=config['BATCH_SIZE'],
)
test_metrics = compute_metrics_on_df(df=df_test, **metric_inputs)

vectorize question1: 100%|██████████| 790/790 [00:14<00:00, 55.35it/s]
vectorize question2: 100%|██████████| 790/790 [00:16<00:00, 49.23it/s]


In [26]:
# show the metrics dict computed on the held-out split
test_metrics

{'accuracy': 0.8056058492376797,
 'precision': 0.7067399953194476,
 'recall': 0.8092775217065066,
 'f1': 0.7545411388451639,
 'roc_auc': 0.8861379180847163,
 'log_loss': 0.4222247869476497}

### submission

In [27]:
# Number of batches needed to cover every submission row; the final batch
# may be smaller than BATCH_SIZE.
n_submission_rows = len(df_submission)
length = math.ceil(n_submission_rows / config['BATCH_SIZE'])

In [28]:
def _embed_column(column, desc):
    """Vectorize one text column of ``df_submission`` batch by batch.

    Args:
        column: name of the column whose texts are embedded.
        desc: label shown on the progress bar.

    Returns:
        A list with one ``model.vectorize`` result per batch, in row order.
    """
    batches = chunks(df_submission[column].to_list(), n=config['BATCH_SIZE'])
    return [
        model.vectorize(batch, tokenizer, tokenizer_kwargs)
        for batch in tqdm(batches, total=length, desc=desc)
    ]


# Embeddings for both sides of every submission pair, kept as per-batch lists.
q1_emb = _embed_column('question1', desc='vectorize question1')
q2_emb = _embed_column('question2', desc='vectorize question2')

vectorize question1: 100%|██████████| 27840/27840 [09:11<00:00, 50.51it/s]
vectorize question2: 100%|██████████| 27840/27840 [09:24<00:00, 49.32it/s]


In [29]:
# Similarity score for each pair, computed batch by batch from the stored
# embeddings and converted to NumPy on the CPU.
batch_scores = [
    model.similarity(q1_emb[batch_idx], q2_emb[batch_idx]).cpu().numpy()
    for batch_idx in trange(length)
]

# One flat array with a score per submission row.
y_score = np.concatenate(batch_scores)

100%|██████████| 27840/27840 [00:04<00:00, 6084.27it/s]


In [30]:
# Attach the scores to the submission frame, then export only that column;
# the frame's index is written alongside it.
df_submission['is_duplicate'] = y_score
duplicate_scores = df_submission['is_duplicate']
duplicate_scores.to_csv('submission.csv')