In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules before
# each cell executes so edits to the imported .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import math

import numpy as np
import pandas as pd
import torch
from dataset import TripletCollator, TripletDataset
from metrics import compute_metrics_on_df
from nn_modules.poolers import MeanPooler
from nn_modules.triplet_models import SiameseTripletBERT
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm, trange
from train_triplet import train
from transformers import AutoModel, AutoTokenizer
from utils import chunks, set_global_seed

In [None]:
# Fix the global seed once, up front, via the project's `utils.set_global_seed` helper.
GLOBAL_SEED = 42
set_global_seed(GLOBAL_SEED)

In [None]:
# Experiment hyper-parameters, read by the cells below through `config[...]`.
config = dict(
    MODEL_NAME='distilroberta-base',  # passed to AutoTokenizer / AutoModel
    BATCH_SIZE=32,                    # used by the data loaders, metrics and vectorisation
    LEARNING_RATE=1e-5,               # Adam learning rate
    N_EPOCHS=5,                       # forwarded to train()
    MARGIN=2,                         # TripletMarginLoss margin
    N_NEGATIVE_SAMPLES=8,             # forwarded to TripletDataset
)

In [None]:
# tensorboard
# The experiment name encodes the hyper-parameters; it names both the TensorBoard run
# directory below and the checkpoint file written later as f'{experiment_name}.pth'.
# Hugging Face model ids may contain '/' (e.g. 'org/model'); replace it so the name stays
# a single path component and the checkpoint does not land in a non-existent directory.
# For ids without '/' the resulting name is unchanged.
model_tag = config['MODEL_NAME'].replace('/', '-')
experiment_name = (
    f"MODEL_CONTRASTIVE_{model_tag}"
    f"_BATCH_{config['BATCH_SIZE']}"
    f"_LR_{config['LEARNING_RATE']}"
    f"_MARGIN_{config['MARGIN']}"
    f"_N_NEGATIVE_SAMPLES_{config['N_NEGATIVE_SAMPLES']}"
)

writer = SummaryWriter(
    log_dir=f"runs/{experiment_name}",
)

In [None]:
# device
# Prefer the second GPU ('cuda:1') when the host has more than one, otherwise fall back
# to the first GPU, and finally to the CPU. Hard-coding 'cuda:1' breaks on single-GPU
# hosts: CUDA is available there, but device ordinal 1 does not exist.
if torch.cuda.device_count() > 1:
    device = torch.device('cuda:1')
elif torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

### prepare data

In [None]:
# Load the unlabeled submission pairs (indexed by `test_id`), replace missing values
# with empty strings and lower-case both question columns.
df_submission = (
    pd.read_csv('./data/test.csv', index_col='test_id')
    .fillna('')
    .assign(
        question1=lambda frame: frame['question1'].str.lower(),
        question2=lambda frame: frame['question2'].str.lower(),
    )
)

In [None]:
# Preview only the first rows instead of rendering the whole submission frame.
df_submission.head()

In [None]:
# Load the labeled question pairs (indexed by `id`), replace missing values with
# empty strings and lower-case both question columns.
df = (
    pd.read_csv('./data/train.csv', index_col='id')
    .fillna('')
    .assign(
        question1=lambda frame: frame['question1'].str.lower(),
        question2=lambda frame: frame['question2'].str.lower(),
    )
)

In [None]:
# Preview only the first rows instead of rendering the whole training frame.
df.head()

In [None]:
# Label balance: number of rows for each distinct `is_duplicate` value.
df['is_duplicate'].value_counts()

In [None]:
# Hold out 25% of the labeled pairs for evaluation. The split is shuffled with a fixed
# seed and stratified on the label so both parts keep the same duplicate ratio.
duplicate_labels = df['is_duplicate']
df_train, df_test = train_test_split(
    df,
    stratify=duplicate_labels,
    shuffle=True,
    random_state=42,
    test_size=0.25,
)

In [None]:
# Report the number of rows in each split, one per line.
print(
    f'Train size: {df_train.shape[0]}',
    f'Test size: {df_test.shape[0]}',
    sep='\n',
)

In [None]:
# Keep only the rows labeled as duplicates (`is_duplicate == 1`) in each split;
# these subsets are what the triplet datasets are built from.
df_train_triplet = df_train.loc[lambda frame: frame['is_duplicate'].eq(1)]
df_test_triplet = df_test.loc[lambda frame: frame['is_duplicate'].eq(1)]

In [None]:
# Report how many duplicate pairs remain in each split, one per line.
print(
    f'Train triplet size: {df_train_triplet.shape[0]}',
    f'Test triplet size: {df_test_triplet.shape[0]}',
    sep='\n',
)

In [None]:
# Wrap each duplicate-only frame in a TripletDataset, using the configured number of
# negative samples. The train dataset is built first, then the test dataset.
train_dataset, test_dataset = (
    TripletDataset(df=triplet_frame, n_negative_samples=config['N_NEGATIVE_SAMPLES'])
    for triplet_frame in (df_train_triplet, df_test_triplet)
)

### load bert model

In [None]:
# Load the tokenizer and the encoder from the same pretrained checkpoint name.
pretrained_name = config['MODEL_NAME']
tokenizer = AutoTokenizer.from_pretrained(pretrained_name)
bert_model = AutoModel.from_pretrained(pretrained_name)

In [None]:
# Tokenizer call options shared by the collator, the metric computation and the
# submission vectorisation: PyTorch tensors, padded, truncated to 512 tokens.
tokenizer_kwargs = dict(
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=512,
)

# Collator handed to both data loaders as their `collate_fn`.
collate_fn = TripletCollator(tokenizer, tokenizer_kwargs)

In [None]:
# Both loaders share the batch size and the collator; only the training loader shuffles.
loader_options = dict(
    batch_size=config['BATCH_SIZE'],
    collate_fn=collate_fn,
)

train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, shuffle=True, **loader_options)
test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, shuffle=False, **loader_options)

### train model

In [None]:
# Assemble the siamese model from the pretrained encoder and a mean pooler,
# then place it on the selected device.
pooler = MeanPooler()
model = SiameseTripletBERT(bert_model=bert_model, pooler=pooler)
model = model.to(device)

In [None]:
# Triplet margin loss with the configured margin, optimised with Adam over all model parameters.
criterion = torch.nn.TripletMarginLoss(margin=config['MARGIN'])
optimizer = torch.optim.Adam(params=model.parameters(), lr=config['LEARNING_RATE'])

In [None]:
# Run the training loop imported from `train_triplet`.
train(
    # what is optimised and how
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    n_epochs=config['N_EPOCHS'],
    # data
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    # logging and placement
    writer=writer,
    device=device,
)

In [None]:
# Persist the trained weights as a CPU state dict named after the experiment.
# NOTE(review): assuming the model is a torch.nn.Module, `.cpu()` moves the live `model`
# object itself (not a copy), so it sits on the CPU after this cell.
cpu_state_dict = model.cpu().state_dict()
torch.save(cpu_state_dict, f'{experiment_name}.pth')

### evaluate model

In [None]:
# Rebuild the siamese wrapper around the existing `bert_model` / `pooler` objects and
# restore the weights from the checkpoint named after the experiment.
model = SiameseTripletBERT(
    bert_model=bert_model,
    pooler=pooler,
)

# map_location='cpu' makes the load independent of the device the checkpoint was saved
# from (e.g. a GPU ordinal that does not exist on this host); the model is moved to the
# target device afterwards.
state_dict = torch.load(f'{experiment_name}.pth', map_location='cpu')
model.load_state_dict(state_dict)
model.to(device)
# Switch to evaluation mode; the trailing ';' suppresses the cell's output.
model.eval();

In [None]:
# Evaluate the model on the full training split (df_train, not the duplicate-only subset).
train_metrics = compute_metrics_on_df(
    df=df_train,
    model=model,
    batch_size=config['BATCH_SIZE'],
    tokenizer=tokenizer,
    tokenizer_kwargs=tokenizer_kwargs,
)

In [None]:
# Show what compute_metrics_on_df returned for the training split.
train_metrics

In [None]:
# Evaluate the model on the full held-out split (df_test, not the duplicate-only subset).
test_metrics = compute_metrics_on_df(
    df=df_test,
    model=model,
    batch_size=config['BATCH_SIZE'],
    tokenizer=tokenizer,
    tokenizer_kwargs=tokenizer_kwargs,
)

In [None]:
# Show what compute_metrics_on_df returned for the held-out split.
test_metrics

### submission

In [None]:
# Score only a random subset of the submission rows (presumably to keep this run short).
# NOTE(review): as a consequence, submission.csv will contain at most 5000 rows; remove
# the sampling before producing a complete submission.
# - min(...) avoids the ValueError that sample() raises when n exceeds the frame length;
# - random_state pins the subset, so it does not depend on how much of the global RNG
#   stream earlier cells consumed or on how often this cell is re-run.
df_submission = df_submission.sample(
    n=min(5000, len(df_submission)),
    random_state=42,
)

In [None]:
# Number of batches needed to cover every submission row (ceiling division).
full_batches, leftover_rows = divmod(len(df_submission), config['BATCH_SIZE'])
length = full_batches + (1 if leftover_rows else 0)

In [None]:
def embed_questions(column, desc):
    """Vectorize one text column of `df_submission` batch by batch.

    Args:
        column: name of the `df_submission` column holding the texts.
        desc: label shown on the progress bar.

    Returns:
        A list with one `model.vectorize(...)` result per batch, in row order.
    """
    batches = chunks(df_submission[column].to_list(), n=config['BATCH_SIZE'])
    # Gradients are not needed for inference; this guard changes nothing if
    # `model.vectorize` already disables autograd itself.
    with torch.no_grad():
        return [
            model.vectorize(texts, tokenizer, tokenizer_kwargs)
            for texts in tqdm(batches, total=length, desc=desc)
        ]


# Both question columns go through the same routine, so batch i of q1_emb lines up
# with batch i of q2_emb.
q1_emb = embed_questions('question1', desc='vectorize question1')
q2_emb = embed_questions('question2', desc='vectorize question2')

In [None]:
# Score every pair batch by batch: similarity between matching question1 / question2
# embedding batches, moved to the CPU as numpy and joined into one array.
y_score = np.concatenate([
    model.similarity(q1_batch, q2_batch).cpu().numpy()
    for q1_batch, q2_batch in tqdm(zip(q1_emb, q2_emb), total=length)
])

In [None]:
# Attach the scores as the `is_duplicate` column and write that column
# (with the frame's index) to submission.csv.
df_submission = df_submission.assign(is_duplicate=y_score)
df_submission['is_duplicate'].to_csv('submission.csv')