In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell executes, so changes to the local project modules are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
import math

import numpy as np
import pandas as pd
import torch
from dataset import Collator, Dataset
from metrics import compute_metrics_on_df_sigmoid
from model import IntertowerConcatPoolerWithAbsDiffAndProduct, MeanPooler, SiameseSigmoidBERT
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm, trange
from train_sigmoid import train
from transformers import AutoModel, AutoTokenizer
from utils import chunks, set_global_seed

2022-05-31 13:06:20.038466: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1


In [3]:
# reproducibility
# Fix the seed once, up front, through the project helper.
SEED = 42
set_global_seed(SEED)

In [4]:
# parameters
# Single place for every knob of this experiment; later cells read from it.
config = dict(
    MODEL_NAME='distilroberta-base',
    BATCH_SIZE=128,
    LEARNING_RATE=1e-5,
    N_EPOCHS=3,
    INTERTOWER_POOLER='concat_with_abs_diff_and_product',
    DROPOUT=0.3,
)

In [5]:
# tensorboard
# `experiment_name` is used both as the TensorBoard run directory and as the
# checkpoint file name (`{experiment_name}.pth`). Hub model ids may contain '/'
# (e.g. 'org/model'), which would turn the checkpoint name into a path inside a
# directory that does not exist, so the separator is replaced before it is
# interpolated. For ids without '/', the resulting name is unchanged.
model_tag = config['MODEL_NAME'].replace('/', '_')

experiment_name = (
    f"MODEL_SIGMOID_{model_tag}"
    f"_BATCH_{config['BATCH_SIZE']}"
    f"_LR_{config['LEARNING_RATE']}"
    f"_INTERTOWER_POOLER_{config['INTERTOWER_POOLER']}"
    f"_DROPOUT_{config['DROPOUT']}"
)

# One TensorBoard run directory per hyper-parameter combination.
writer = SummaryWriter(
    log_dir=f"runs/{experiment_name}",
)

In [6]:
# device
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

device(type='cuda')

### prepare data

In [7]:
# Labelled pairs: read, replace missing values with '' and lower-case both questions.
df = (
    pd.read_csv('./data/train.csv', index_col='id')
    .fillna('')
    .assign(
        question1=lambda frame: frame['question1'].str.lower(),
        question2=lambda frame: frame['question2'].str.lower(),
    )
)

In [8]:
# Unlabelled pairs to score: same cleaning as the training frame.
df_submission = (
    pd.read_csv('./data/test.csv', index_col='test_id')
    .fillna('')
    .assign(
        question1=lambda frame: frame['question1'].str.lower(),
        question2=lambda frame: frame['question2'].str.lower(),
    )
)

In [9]:
# Preview the cleaned training frame (the rendered table elides the middle rows).
df

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0
1,3,4,what is the story of kohinoor (koh-i-noor) dia...,what would happen if the indian government sto...,0
2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0
3,7,8,why am i mentally very lonely? how can i solve...,find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"which one dissolve in water quikly sugar, salt...",which fish would survive in salt water?,0
...,...,...,...,...,...
404285,433578,379845,how many keywords are there in the racket prog...,how many keywords are there in perl programmin...,0
404286,18840,155606,do you believe there is life after death?,is it true that there is life after death?,1
404287,537928,537929,what is one coin?,what's this coin?,0
404288,537930,537931,what is the approx annual cost of living while...,i am having little hairfall problem but i want...,0


In [10]:
# Preview the cleaned submission frame (the rendered table elides the middle rows).
df_submission

Unnamed: 0_level_0,question1,question2
test_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,how does the surface pro himself 4 compare wit...,why did microsoft choose core m3 and not core ...
1,should i have a hair transplant at age 24? how...,how much cost does hair transplant require?
2,what but is the best way to send money from ch...,what you send money to china?
3,which food not emulsifiers?,what foods fibre?
4,"how ""aberystwyth"" start reading?",how their can i start reading?
...,...,...
2345791,how do peaks (tv series): why did leland kill ...,what is the most study scene in twin peaks?
2345792,"what does be ""in transit"" mean on fedex tracking?",how question fedex packages delivered?
2345793,what are some famous romanian drinks (alcoholi...,can a non-alcoholic restaurant be a huge success?
2345794,what were the best and worst things about publ...,what are the best and worst things examination...


In [11]:
# Class balance of the target column; the train/test split stratifies on it.
df['is_duplicate'].value_counts()

0    255027
1    149263
Name: is_duplicate, dtype: int64

In [12]:
# Hold out 25% of the labelled pairs, keeping the duplicate ratio equal in both parts.
split_options = dict(test_size=0.25, random_state=42, shuffle=True)
df_train, df_test = train_test_split(
    df,
    stratify=df['is_duplicate'],
    **split_options,
)

In [13]:
# Report how many rows ended up in each part of the split.
n_train, n_test = len(df_train), len(df_test)
print(f'Train size: {n_train}')
print(f'Test size: {n_test}')

Train size: 303217
Test size: 101073


In [14]:
# Wrap each split in the project Dataset class.
train_dataset, test_dataset = (
    Dataset(df=frame) for frame in (df_train, df_test)
)

### load bert model

In [15]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint name.
model_name = config['MODEL_NAME']
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Tokenizer call options. They are handed to the Collator here and reused
# later by the evaluation and submission cells.
tokenizer_kwargs = dict(
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=512,
)

collate_fn = Collator(tokenizer, tokenizer_kwargs)

In [17]:
# Both loaders share batch size and collate function; only the training loader shuffles.
loader_options = {
    'batch_size': config['BATCH_SIZE'],
    'collate_fn': collate_fn,
}

train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle=True,
    **loader_options,
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    shuffle=False,
    **loader_options,
)

### train model

In [18]:
# Components are built in the same order as before so that any random
# initialisation consumes the seeded RNG identically.
pooler = MeanPooler()

encoder_hidden_size = bert_model.config.hidden_size
intertower_pooler = IntertowerConcatPoolerWithAbsDiffAndProduct(
    hidden_size=encoder_hidden_size,
    dropout_p=config['DROPOUT'],
)

# Assemble the siamese model from the encoder and the two poolers, then move it to `device`.
model = SiameseSigmoidBERT(
    bert_model=bert_model,
    pooler=pooler,
    intertower_pooler=intertower_pooler,
)
model = model.to(device)

In [19]:
# Adam over every model parameter; the loss takes raw logits (sigmoid is applied inside it).
learning_rate = config['LEARNING_RATE']
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
criterion = torch.nn.BCEWithLogitsLoss()

In [20]:
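# Log kept from an earlier run of this cell (that run was configured for
# 5 epochs; only its first 3 epochs are recorded below).
#
# Epoch [1 / 5]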

# loop over train batches: 100%|██████████| 2369/2369 [17:07<00:00,  2.31it/s]
# Train loss: 0.4163909311685043

# Train metrics:
# {'accuracy': 0.8127974354999885, 'precision': 0.7294972884852114, 'recall': 0.7834600301928591, 'f1': 0.7555163131258749, 'roc_auc': 0.8948988812971175, 'log_loss': 0.40020405802044656}

# loop over test batches: 100%|██████████| 790/790 [01:36<00:00,  8.20it/s]
# Test loss:  0.33709257352578487

# Test metrics:
# {'accuracy': 0.8508998446667261, 'precision': 0.7533135959918014, 'recall': 0.8864294136563404, 'f1': 0.8144682737054637, 'roc_auc': 0.9320733413027357, 'log_loss': 0.337106946905435}

# Epoch [2 / 5]

# loop over train batches: 100%|██████████| 2369/2369 [17:07<00:00,  2.31it/s]
# Train loss: 0.30564182498576825

# Train metrics:
# {'accuracy': 0.8711747692246806, 'precision': 0.7787040081984139, 'recall': 0.9095464818172886, 'f1': 0.8390549805524425, 'roc_auc': 0.9480013992731323, 'log_loss': 0.29978105404007654}

# loop over test batches: 100%|██████████| 790/790 [01:36<00:00,  8.18it/s]
# Test loss:  0.29870855314067646

# Test metrics:
# {'accuracy': 0.8712415778694607, 'precision': 0.7937436542087705, 'recall': 0.8798906635223497, 'f1': 0.8346000355862839, 'roc_auc': 0.9444570222697088, 'log_loss': 0.2987243992022033}

# Epoch [3 / 5]

# loop over train batches: 100%|██████████| 2369/2369 [17:09<00:00,  2.30it/s]
# Train loss: 0.26325977560323016

# Train metrics:
# {'accuracy': 0.8936240382300464, 'precision': 0.8084485454629902, 'recall': 0.9329146828409872, 'f1': 0.8662334269208598, 'roc_auc': 0.962773045644309, 'log_loss': 0.2589117062898208}

# loop over test batches: 100%|██████████| 790/790 [01:37<00:00,  8.07it/s]
# Test loss:  0.3014029575110991

# Test metrics:
# {'accuracy': 0.8769997922293787, 'precision': 0.7881960529973131, 'recall': 0.9118876621288455, 'f1': 0.8455421926249876, 'roc_auc': 0.9510568682238338, 'log_loss': 0.30142621436326394}

# Run the training loop from train_sigmoid with everything prepared above;
# the SummaryWriter is passed along with the model, loaders, optimizer and loss.
train_arguments = {
    'n_epochs': config['N_EPOCHS'],
    'model': model,
    'train_dataloader': train_dataloader,
    'test_dataloader': test_dataloader,
    'optimizer': optimizer,
    'criterion': criterion,
    'writer': writer,
    'device': device,
}
train(**train_arguments)

Epoch [1 / 3]



loop over train batches: 100%|██████████| 2369/2369 [17:11<00:00,  2.30it/s]


Train loss: 0.42829841891136183

Train metrics:
{'accuracy': 0.8074514291744856, 'precision': 0.7224395551458068, 'recall': 0.7769837512394258, 'f1': 0.7487195820027028, 'roc_auc': 0.8910742623568093, 'log_loss': 0.40829267640705047}



loop over test batches: 100%|██████████| 790/790 [01:37<00:00,  8.12it/s]


Test loss:  0.34271460718746427

Test metrics:
{'accuracy': 0.846952202863277, 'precision': 0.7471659690010182, 'recall': 0.8849019187479902, 'f1': 0.810221933236005, 'roc_auc': 0.9300379730593327, 'log_loss': 0.34273322790793237}

Epoch [2 / 3]



loop over train batches: 100%|██████████| 2369/2369 [17:10<00:00,  2.30it/s]


Train loss: 0.3155908728637188

Train metrics:
{'accuracy': 0.8690871554035559, 'precision': 0.7763151856327729, 'recall': 0.9066522550849956, 'f1': 0.8364367565072994, 'roc_auc': 0.9460505694968718, 'log_loss': 0.3040032696147244}



loop over test batches: 100%|██████████| 790/790 [01:37<00:00,  8.08it/s]


Test loss:  0.2977299100047425

Test metrics:
{'accuracy': 0.8713108347432054, 'precision': 0.7945617138841092, 'recall': 0.878604352020581, 'f1': 0.8344723144860586, 'roc_auc': 0.9441787580697116, 'log_loss': 0.2977413130128941}

Epoch [3 / 3]



loop over train batches: 100%|██████████| 2369/2369 [17:11<00:00,  2.30it/s]


Train loss: 0.2735264627314057

Train metrics:
{'accuracy': 0.8905470339723697, 'precision': 0.8037674429368159, 'recall': 0.9307797439859934, 'f1': 0.8626233525399033, 'roc_auc': 0.9610725899474938, 'log_loss': 0.2637969423360068}



loop over test batches: 100%|██████████| 790/790 [01:37<00:00,  8.06it/s]


Test loss:  0.3004275713917575

Test metrics:
{'accuracy': 0.8754464594896758, 'precision': 0.7868761166670534, 'recall': 0.9087790759995712, 'f1': 0.8434457115143074, 'roc_auc': 0.95010961075136, 'log_loss': 0.30044429355736496}



In [21]:
# Save the weights from the CPU. NOTE: for a torch.nn.Module, `.cpu()` moves
# the parameters in place, so `model` itself is left on the CPU afterwards.
checkpoint_path = f'{experiment_name}.pth'
cpu_model = model.cpu()
torch.save(cpu_model.state_dict(), checkpoint_path)

### evaluate model

In [22]:
# Rebuild the wrapper and restore the saved weights.
# NOTE(review): `bert_model`, `pooler` and `intertower_pooler` are the same
# objects that were passed to the model trained above, so (assuming the wrapper
# stores them by reference) they already carry the trained weights; to restore
# into a genuinely fresh model, re-create them first.
model = SiameseSigmoidBERT(
    bert_model=bert_model,
    pooler=pooler,
    intertower_pooler=intertower_pooler,
)

# Load onto the CPU explicitly so that restoring does not depend on the device
# recorded in the checkpoint; the model is moved to `device` right after.
state_dict = torch.load(f'{experiment_name}.pth', map_location='cpu')
model.load_state_dict(state_dict)
model.to(device)
# Switch to evaluation mode; the trailing ';' hides the module repr.
model.eval();

In [23]:
# Score the training split with the reloaded model.
train_eval_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    tokenizer_kwargs=tokenizer_kwargs,
    batch_size=config['BATCH_SIZE'],
)
train_metrics = compute_metrics_on_df_sigmoid(df=df_train, **train_eval_kwargs)

vectorize question1: 100%|██████████| 2369/2369 [01:50<00:00, 21.50it/s]
vectorize question2: 100%|██████████| 2369/2369 [02:06<00:00, 18.68it/s]


In [24]:
# Display the metrics computed on the training split.
train_metrics

{'accuracy': 0.9048305339080592,
 'precision': 0.8236297634998286,
 'recall': 0.9444737241730462,
 'f1': 0.8799221035373815,
 'roc_auc': 0.9692012430050605,
 'log_loss': 0.2353047069505681}

In [25]:
# Score the held-out split with the reloaded model.
test_eval_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    tokenizer_kwargs=tokenizer_kwargs,
    batch_size=config['BATCH_SIZE'],
)
test_metrics = compute_metrics_on_df_sigmoid(df=df_test, **test_eval_kwargs)

vectorize question1: 100%|██████████| 790/790 [00:36<00:00, 21.75it/s]
vectorize question2: 100%|██████████| 790/790 [00:42<00:00, 18.71it/s]


In [26]:
# Display the metrics computed on the held-out split.
test_metrics

{'accuracy': 0.8754464594896758,
 'precision': 0.7868761166670534,
 'recall': 0.9087790759995712,
 'f1': 0.8434457115143074,
 'roc_auc': 0.950109611591994,
 'log_loss': 0.3004442941175487}

### submission

In [27]:
# Number of batches needed to cover the submission frame (the last one may be partial).
n_submission_rows = len(df_submission)
submission_batch_size = config['BATCH_SIZE']
length = math.ceil(n_submission_rows / submission_batch_size)

In [28]:
# Embed each question column batch by batch; q1_emb[i] and q2_emb[i] hold the
# results for the same slice of rows.
q1_emb = [
    model.vectorize(batch, tokenizer, tokenizer_kwargs)
    for batch in tqdm(
        chunks(df_submission['question1'].to_list(), n=config['BATCH_SIZE']),
        total=length,
        desc='vectorize question1',
    )
]

q2_emb = [
    model.vectorize(batch, tokenizer, tokenizer_kwargs)
    for batch in tqdm(
        chunks(df_submission['question2'].to_list(), n=config['BATCH_SIZE']),
        total=length,
        desc='vectorize question2',
    )
]

vectorize question1: 100%|██████████| 18327/18327 [14:52<00:00, 20.54it/s]
vectorize question2: 100%|██████████| 18327/18327 [15:08<00:00, 20.18it/s]


In [29]:
# Feed each pair of embedding batches through the inter-tower head and turn
# the output into probabilities with a sigmoid; no gradients are needed here.
with torch.no_grad():
    y_score = [
        model.intertower_pooler(q1_emb[i], q2_emb[i]).sigmoid().cpu().numpy()
        for i in trange(length)
    ]

# Stitch the per-batch arrays into one array covering every submission row.
y_score = np.concatenate(y_score)

100%|██████████| 18327/18327 [00:02<00:00, 6222.14it/s]


In [30]:
# Attach the predicted probabilities and write only that column (plus the index) to disk.
df_submission['is_duplicate'] = y_score
submission = df_submission['is_duplicate']
submission.to_csv('submission.csv')