In [1]:
import torch
import os
import sys
import pandas as pd
from tqdm.autonotebook import tqdm, trange
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset

from datetime import datetime

from transformers.transformers import AdamW, WarmupLinearSchedule
from transformers.transformers import BertTokenizer, BertModel, BertConfig, BertForSequenceClassification
from transformers import transformers
from transformers.transformers import RobertaTokenizer, RobertaModel, RobertaConfig

from layers import VectorAttention, NNAttention, Seq2SeqAttention
from utils import QuoraSentences, collate_fn, collate_fn_test, evaluate
from models import SentenceClf



In [62]:
# model_weights = 'roberta-base'
model_weights = 'bert-base-uncased'

In [3]:
# tokenizer = RobertaTokenizer.from_pretrained(model_weights)
tokenizer = BertTokenizer.from_pretrained(model_weights, do_lower_case=False)

In [4]:
# model = RobertaModel.from_pretrained(model_weights, output_hidden_states=True, output_attentions=True).cuda()
model=BertForSequenceClassification.from_pretrained(model_weights, output_hidden_states=True).cuda()
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [36]:
train = pd.read_csv('./data/train.csv', index_col='id')
train.dropna(axis=0, inplace=True)
test = pd.read_csv('data/test.csv', index_col='test_id')

  mask |= (ar1 == a)


In [86]:
def encode_str(string, *args, **kwargs):
    return tokenizer.encode(string, *args, **kwargs)

class QuoraSentences(torch.utils.data.Dataset):
    def __init__(self, df, tk, train=True):
        self.train = train
        self.df = df
        if self.train:
            self.df.dropna(inplace=True, axis=0)
        self.enc = tk.encode
    
    def __getitem__(self, idx):
        q_1, q_2 = self.df.iloc[idx][['question1', 'question2']]
        enc_1 = self.enc(q_1.lower(), add_special_tokens=True, return_tensors='pt').squeeze()        
        enc_2 = self.enc(q_2.lower(), add_special_tokens=True, return_tensors='pt').squeeze()
        if self.train:
            is_dup = self.df.iloc[idx]['is_duplicate']
            return enc_1, enc_2, is_dup
        return enc_1, enc_2
        
    def __len__(self):
        return len(self.df)

In [88]:
def collate_fn(batch):
    #calculate max length
    max1 = max([item[0].size() for item in batch])
    max2 = max([item[1].size() for item in batch])
    
    q1_batch, q1_mask, q2_batch, q2_mask = [], [], [], []
    y = []
    
    for enc_1, enc_2, is_dup in batch:
        padded_1 = enc_1.new_zeros(max1)
        padded_1[:len(enc_1)] = enc_1
        att_mask_1 = enc_1.new_zeros(max1, dtype=torch.float)
        att_mask_1[:len(enc_1)] = 1
        q1_batch.append(padded_1)
        q1_mask.append(att_mask_1)
        
        padded_2 = enc_2.new_zeros(max2)
        padded_2[:len(enc_2)] = enc_2
        att_mask_2 = enc_2.new_zeros(max2, dtype=torch.float)
        att_mask_2[:len(enc_2)] = 1
        q2_batch.append(padded_2)
        q2_mask.append(att_mask_2)
        
        y.append(is_dup)
        
    return torch.stack(q1_batch), torch.stack(q1_mask), torch.stack(q2_batch), torch.stack(q2_mask), torch.tensor(y)

In [89]:
def collate_fn_test(batch):
    #calculate max length
    max1 = max([item[0].size() for item in batch])
    max2 = max([item[1].size() for item in batch])
    
    q1_batch, q1_mask, q2_batch, q2_mask = [], [], [], []
    
    for enc_1, enc_2 in batch:
        padded_1 = enc_1.new_zeros(max1)
        padded_1[:len(enc_1)] = enc_1
        att_mask_1 = enc_1.new_zeros(max1, dtype=torch.float)
        att_mask_1[:len(enc_1)] = 1
        q1_batch.append(padded_1)
        q1_mask.append(att_mask_1)
        
        padded_2 = enc_2.new_zeros(max2)
        padded_2[:len(enc_2)] = enc_2
        att_mask_2 = enc_2.new_zeros(max2, dtype=torch.float)
        att_mask_2[:len(enc_2)] = 1
        q2_batch.append(padded_2)
        q2_mask.append(att_mask_2)
    return torch.stack(q1_batch), torch.stack(q1_mask), torch.stack(q2_batch), torch.stack(q2_mask)

In [90]:
class SentenceClf(torch.nn.Module):
    def __init__(self, emb_model):
        super(SentenceClf, self).__init__()
        self.emb_model = emb_model
        self.emb_size = 768
        self.clf = torch.nn.Sequential(
            torch.nn.Linear(self.emb_size * 2, 512),
            torch.nn.Dropout(),
            torch.nn.LayerNorm(512),
            torch.nn.LeakyReLU(),
            torch.nn.Dropout(),
            torch.nn.Linear(512, 512),
            torch.nn.LayerNorm(512),
            torch.nn.LeakyReLU(),
            torch.nn.Dropout(),
            torch.nn.Linear(512, 256),
            torch.nn.LayerNorm(256),
            torch.nn.LeakyReLU(),
            torch.nn.Dropout(),
            torch.nn.Linear(256, 2)
        ).cuda()
        self.attn_block = attn_block.cuda()
        
    
    def forward(self, enc_1, mask_1, enc_2, mask_2):
        #average, concatenate, process with mlp
        
        with torch.no_grad():
            hidden_1 = self.emb_model(enc_1, attention_mask = mask_1)[2][-2]
            hidden_2 = self.emb_model(enc_2, attention_mask = mask_2)[2][-2]

            hidden_1_count = mask_1.sum(axis=1, keepdims=True)
            hidden_2_count = mask_2.sum(axis=1, keepdims=True)
        
            first = (hidden_1 * mask_1.unsqueeze(2)).sum(axis=1) / hidden_1_count
            second = (hidden_2 * mask_2.unsqueeze(2)).sum(axis=1) / hidden_2_count
        
#         first, second = self.attn_block(hidden_1, mask_1, hidden_2, mask_2)
    
        #input: batch_size x word_size x embed_dim
        mlp_input = torch.cat(
            (first, second),
            axis = 1
        )

        return self.clf(mlp_input)

In [11]:
attn = VectorAttention(768)

sc = SentenceClf(model, attn)
#sc.clf.load_state_dict(torch.load('models/clf_head_weight'))

In [12]:
ds_train = QuoraSentences(train[:-5000], tokenizer)
ds_val = QuoraSentences(train.iloc[-5000:], tokenizer)

train_loader = DataLoader(ds_train, batch_size=100, collate_fn=collate_fn)
val_loader = DataLoader(ds_val, batch_size=50, collate_fn=collate_fn)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [13]:
N_EPOCHS = 100

In [6]:
from torch.optim import Adam, Adadelta, SGD
from torch.nn import CrossEntropyLoss

In [7]:
from itertools import chain

In [16]:
optim = Adam(chain(sc.clf.parameters(), sc.attn_block.parameters()), lr=0.001)

In [8]:
from transformers.transformers import WarmupCosineSchedule
from torch.optim.lr_scheduler import MultiStepLR, ReduceLROnPlateau

In [18]:
scheduler = MultiStepLR(optim, milestones=[20, 40], gamma=0.5)

In [19]:
loss = CrossEntropyLoss()

In [9]:
from tensorboardX import SummaryWriter

writer = SummaryWriter()

In [24]:
start_epoch = 0 if iter_num is None else iter_num

In [26]:
for iter_num in tqdm(range(start_epoch, N_EPOCHS), position=0):
    sc.clf.eval()
    val_list = []
    for q1, m1, q2, m2, target in val_loader:
        with torch.no_grad():
            outs = sc(q1.cuda(), m1.cuda(), q2.cuda(), m2.cuda())
            val_loss = loss(outs, target.cuda()).mean().item()
            val_list.append(val_loss)
    writer.add_scalar('data/val_logloss', sum(val_list) / len(val_list), iter_num)
    
    if iter_num > 0 and iter_num % 5 == 0:
        torch.save(
        {
            'epoch': iter_num,
            'model_state_dict': sc.clf.state_dict(),
            'optimizer_state_dict': optim.state_dict(),
            'loss': loss,
            'val_metric': sum(val_list) / len(val_list)
        }, 'models/checkpoint_iter_{}_{}'.format(iter_num, datetime.now()))
    
    
    sc.clf.train()
    acc_loss = 0
    n_batches = 0
    for q1, m1, q2, m2, target in train_loader:
        optim.zero_grad()
        outs = sc(q1.cuda(), m1.cuda(), q2.cuda(), m2.cuda())
        lv = loss(outs, target.cuda()).mean()
        #writer.add_scalar('data/train_logloss', lv.item(), iter_num)
        acc_loss+=lv.item()
        n_batches+=1
        lv.backward()
        optim.step()
    writer.add_scalar('data/train_logloss', acc_loss / n_batches, iter_num)
    
    scheduler.step()
            
writer.export_scalars_to_json('./scalars.json')
writer.close()

  "type " + obj.__name__ + ". It won't be checked "
 10%|█         | 10/100 [6:25:35<57:51:27, 2314.31s/it]

KeyboardInterrupt: 

In [24]:
val_list = []
sc.eval()
for q1, m1, q2, m2, target in val_loader:
        with torch.no_grad():
            outs = sc(q1.cuda(), m1.cuda(), q2.cuda(), m2.cuda())
            val_loss = loss(outs, target.cuda()).mean().item()
            val_list.append(val_loss)

In [25]:
torch.save(sc.clf.state_dict(), './models/clf_head_weight')

In [None]:
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 5))

sns.distplot(lens, ax=ax1)
sns.distplot(lens2, ax=ax2)

In [35]:
test_ds = QuoraSentences(test.dropna(), tokenizer, train=False)
test_dl = DataLoader(test_ds, batch_size=100, collate_fn=collate_fn_test, num_workers=3)

ans = []
def prepare_submission():
    test_ds = QuoraSentences(test.dropna(), tokenizer, train=False)
    test_dl = DataLoader(test_ds, batch_size=100, collate_fn=collate_fn_test, num_workers=3)
    
    sc.clf.eval()
    for q1, m1, q2, m2 in tqdm(test_dl, position=0):
        with torch.no_grad():
            ans.append(sc(q1.cuda(), m1.cuda(), q2.cuda(), m2.cuda()).softmax(dim=1)[:, 1].cpu())
    test['is_duplicate'] = 0
    res_cpu = torch.cat(ans)
    test.loc[test.dropna().index, 'is_duplicate'] = res_cpu.tolist()
prepare_submission()

100%|██████████| 23458/23458 [2:53:05<00:00,  2.26it/s]  


In [37]:
test[['test_id', 'is_duplicate']].to_csv('sub_1.csv', index=None)

In [38]:
pd.read_csv('sub_1.csv', nrows=10)

Unnamed: 0,test_id,is_duplicate
0,0,0.042836
1,1,0.192015
2,2,0.326681
3,3,0.136623
4,4,0.215123
5,5,0.447753
6,6,0.87828
7,7,0.987539
8,8,0.580995
9,9,0.058586


## NEW

In [6]:
def cache_ds(df, save=None, train=True, tokenizer):
    def process(str_1, str_2):
        max_len=128
        inputs = tokenizer.encode_plus(str_1, str_2, max_length=max_len, add_special_tokens=True)
        pad_token_id = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
        pad_token_segment_id = 0
        input_ids = inputs['input_ids']
        token_type_ids = inputs['token_type_ids']
        pad_len = max_len - len(input_ids)
        attn_mask = [1] * len(input_ids)
        input_ids += [pad_token_id] * pad_len
        attn_mask += [0] * pad_len
        token_type_ids += [pad_token_segment_id] * pad_len
        return input_ids, attn_mask, token_type_ids

    id_list = []
    qid1_list = []
    qid2_list = []
    input_id_list = []
    attn_mask_list = []
    token_type_ids_list = []
    
    if train:
        is_duplicate_list = []

        for id_, (qid1, qid2, question1, question2, is_dup) in tqdm(df.iterrows(), desc='Progress', position=0):
            input_ids, attn_mask, token_type_ids = process(question1, question2)
            id_list.append(id_)
            qid1_list.append(qid1)
            qid2_list.append(qid2)
            input_id_list.append(input_ids)
            attn_mask_list.append(attn_mask)
            token_type_ids_list.append(token_type_ids)
            is_duplicate_list.append(is_dup)

        ds = torch.utils.data.TensorDataset(
            torch.tensor(id_list),
            torch.tensor(qid1_list),
            torch.tensor(qid2_list),
            torch.tensor(input_id_list),
            torch.tensor(attn_mask_list),
            torch.tensor(token_type_ids_list),
            torch.tensor(is_duplicate_list),
        )
    
    else:
        for id_, (question1, question2) in tqdm(df.iterrows(), desc='Progress', position=0):
            input_ids, attn_mask, token_type_ids = process(question1, question2)
            id_list.append(id_)
            input_id_list.append(input_ids)
            attn_mask_list.append(attn_mask)
            token_type_ids_list.append(token_type_ids)

        ds = torch.utils.data.TensorDataset(
            torch.tensor(id_list),
            torch.tensor(input_id_list),
            torch.tensor(attn_mask_list),
            torch.tensor(token_type_ids_list),
        )
    
    if save is not None:
        torch.save(ds, save)
    return ds

In [71]:
cache_ds(train, './data/train_ds_CASED_cached')

NameError: name 'train' is not defined

In [7]:
train_tensor_data = torch.load('./data/train_ds_CASED_cached')

In [8]:
train_ds, val_ds = torch.utils.data.random_split(train_tensor_data, [len(train_tensor_data) - 10000, 10000])

In [57]:
sum(nums_train) / len(nums_train)

0.368995173566463

In [59]:
sum(nums) / len(nums)

0.3773

In [10]:
dl_train = DataLoader(train_ds, batch_size=32, sampler=train_sampler)
dl_val = DataLoader(val_ds, batch_size=8, sampler=val_sampler)

In [12]:
n_epochs = 3
t_total = n_epochs * len(dl_train)



optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-8)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0, t_total=t_total)

In [13]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

In [14]:
def evaluate(model, dl_val):
    loss = 0
    steps = 0
    preds = None
    labels = None
    for batch in dl_val:
        model.eval()
        with torch.no_grad():
            tup = tuple(item.cuda() for item in batch[3:])
            model_input = dict(zip(['input_ids', 'attention_mask', 'token_type_ids', 'labels'], tup))
            logloss, logits = model(**model_input)[:2]
            loss += logloss.mean().item()
        steps+=1
        
        if preds is None:
            preds = logits.detach().cpu().numpy()
            labels = model_input['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            labels = np.append(labels, model_input['labels'].detach().cpu().numpy(), axis=0)
    
    y_pred = np.argmax(preds, axis=1)
    keys = ['logloss', 'accuracy', 'f1']
    ll = logloss / steps
    accuracy = accuracy_score(labels, y_pred)
    f1 = f1_score(labels, y_pred)
    return zip(keys, (ll, accuracy, f1))

In [None]:
max_grad_norm = 1
global_step = 0
acc_loss = 0.0
logging_loss = acc_loss
model.zero_grad()
log_step = 500
max_steps = 15000

epoch_range = trange(n_epochs, desc='Epoch', position=0, leave=True)

for epoch_num in epoch_range:
    epoch_iter = tqdm(dl_train, desc='Inside epoch {}'.format(epoch_num), position=1, leave=True)
    for step, batch in enumerate(epoch_iter):
        model.train()
        
        tup = tuple(item.cuda() for item in batch[3:])
        model_input = dict(zip(['input_ids', 'attention_mask', 'token_type_ids', 'labels'], tup))
        logloss, logits = model(**model_input)[:2]
        
        logloss.backward()
        acc_loss += logloss.item()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        
        optimizer.step()
        scheduler.step()
        model.zero_grad()
        global_step += 1
        
        if global_step % log_step == 0:
            eval_res = evaluate(model)
            for key, value in eval_res:
                writer.add_scalar('eval_{}'.format(key), value, global_step=global_step)
            writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
            writer.add_scalar('loss', (acc_loss - logging_loss) / log_step, global_step)
            logging_loss = acc_loss
    
        if global_step >= max_steps:
            break
    
    torch.save(
        {
            'epoch': epoch_num,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
            #'loss': loss,
            'val_metric': list(evaluate(model))
        }, 'models/checkpoint_iter_{}_{}'.format(global_step, datetime.now()))
    
    if global_step >= max_steps:
        break

        
        
writer.export_scalars_to_json('./scalars_{}.json'.format(datetime.now()))
writer.close()

In [29]:
model.load_state_dict(torch.load('./models/checkpoint_iter_15000_2019-11-04 14:43:10.972407')['model_state_dict'])

<All keys matched successfully>

In [30]:
list(evaluate(model))

[('logloss', tensor(2.8814e-05, device='cuda:0')),
 ('accuracy', 0.9354),
 ('f1', 0.9105263157894737)]

In [27]:
# torch.save(
#     {
#         'epoch': global_step,
#         'model_state_dict': model.state_dict(),
#         'optimizer_state_dict': optimizer.state_dict(),
#         'scheduler': scheduler.state_dict(),
#         #'loss': loss,
#         'val_metric': list(evaluate(model))
#     }, 'models/checkpoint_iter_{}_{}'.format(global_step, datetime.now()))

### TEST

In [16]:
#cached_test_ds = cache_ds(test.dropna(), save='./data/test_ds_CASED_cached', train=False)
test_ds = torch.load('./data/test_ds_CASED_cached')

In [17]:
from scipy.special import softmax

In [18]:
#torch.utils.data.WeightedRandomSampler()

In [55]:
test_ds_subset = torch.utils.data.Subset(test_ds, torch.arange(0, 10000))

In [56]:
test_sampler = torch.utils.data.SequentialSampler(test_ds_subset)
dl_test = DataLoader(test_ds, batch_size=100, sampler=test_sampler)

def process_test(model, dl_test):
    preds = []
    model.eval()
    for batch in tqdm(dl_test, desc='Test Progress', position=0):
        with torch.no_grad():
            tup = tuple(item.cuda() for item in batch[1:])
            model_input = dict(zip(['input_ids', 'attention_mask', 'token_type_ids'], tup))
            logits = model(**model_input)[0]
            
        if preds is None:
            preds = [logits.detach().cpu().softmax(axis=1).numpy()[:, 1]]
        else:
            preds.append(logits.detach().cpu().softmax(axis=1).numpy()[:, 1])
      #  print(len(preds))
    return preds

In [57]:
test_predictions = process_test(model)

HBox(children=(IntProgress(value=0, description='Test Progress', style=ProgressStyle(description_width='initia…




In [58]:
answers = np.concatenate(test_predictions)

In [59]:
answers

array([0.00547964, 0.25859526, 0.00457108, ..., 0.00125954, 0.9906185 ,
       0.11264461], dtype=float32)

In [60]:
cool_sub= pd.read_csv('./sub11.csv', index_col='test_id')

  mask |= (ar1 == a)


In [61]:
cool_sub

Unnamed: 0_level_0,is_duplicate
test_id,Unnamed: 1_level_1
0,0.004199
1,0.135937
2,0.029179
3,0.011838
4,0.025330
...,...
2345791,0.000462
2345792,0.002003
2345793,0.000318
2345794,0.013929


In [36]:
prev_answers = np.load('./test_answers.npy')
prev_answers[:10000] = answers

In [39]:
np.sum(prev_answers > 0.5)

278240

In [45]:
#np.save('./test_answers.npy', answers)

In [40]:
test['is_duplicate'] = 0
test.loc[test.dropna().index, 'is_duplicate'] = prev_answers

In [41]:
test[['is_duplicate']].to_csv('sub_15.csv')

In [24]:
second = pd.read_csv('sub_2.csv', index_col='test_id')

first = pd.read_csv('sub_1.csv', index_col='test_id')

  mask |= (ar1 == a)
