### Using pooling from bert second to last hidden state as features for classification

This notebook expores using pooling from bert second to last hidden state as features for classification. Apart from average pooling, some additional parametric pooling strategies are provided

In [1]:
import torch
import os
import sys
import pandas as pd
from tqdm.autonotebook import tqdm, trange
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset

from datetime import datetime

from transformers.transformers import AdamW, WarmupLinearSchedule
from transformers.transformers import BertTokenizer, BertModel, BertConfig, BertForSequenceClassification
from transformers import transformers
from transformers.transformers import RobertaTokenizer, RobertaModel, RobertaConfig

from layers import VectorAttention, NNAttention, Seq2SeqAttention
from utils import QuoraSentences, collate_fn, collate_fn_test, evaluate, prepare_submission
from models import SentenceClf



### Load pretrained bert and dataframe

In [8]:
# model_weights = 'roberta-base'
model_weights = 'bert-base-uncased'

In [9]:
# tokenizer = RobertaTokenizer.from_pretrained(model_weights)
tokenizer = BertTokenizer.from_pretrained(model_weights, do_lower_case=False)

In [10]:
# model = RobertaModel.from_pretrained(model_weights, output_hidden_states=True, output_attentions=True).cuda()
model=BertModel.from_pretrained(model_weights, output_hidden_states=True).cuda()
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [11]:
train = pd.read_csv('./data/train.csv', index_col='id')
train.dropna(axis=0, inplace=True)
test = pd.read_csv('data/test.csv', index_col='test_id')

  mask |= (ar1 == a)


In [12]:
#attn = VectorAttention(768)
sc = SentenceClf(model)

### Train the model

In [12]:
ds_train = QuoraSentences(train[:-5000], tokenizer)
ds_val = QuoraSentences(train.iloc[-5000:], tokenizer)

train_loader = DataLoader(ds_train, batch_size=100, collate_fn=collate_fn)
val_loader = DataLoader(ds_val, batch_size=50, collate_fn=collate_fn)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [13]:
N_EPOCHS = 100

In [6]:
from torch.optim import Adam, Adadelta, SGD
from torch.nn import CrossEntropyLoss

In [7]:
from itertools import chain

In [16]:
optim = Adam(chain(sc.clf.parameters(), sc.attn_block.parameters()), lr=0.001)

In [8]:
from transformers.transformers import WarmupCosineSchedule
from torch.optim.lr_scheduler import MultiStepLR, ReduceLROnPlateau

In [18]:
scheduler = MultiStepLR(optim, milestones=[20, 40], gamma=0.5)

In [19]:
loss = CrossEntropyLoss()

In [9]:
from tensorboardX import SummaryWriter

writer = SummaryWriter()

In [24]:
#safety net
start_epoch = 0 if iter_num is None else iter_num

In [26]:
for iter_num in tqdm(range(start_epoch, N_EPOCHS), position=0):
    sc.clf.eval()
    val_list = []
    for q1, m1, q2, m2, target in val_loader:
        with torch.no_grad():
            outs = sc(q1.cuda(), m1.cuda(), q2.cuda(), m2.cuda())
            val_loss = loss(outs, target.cuda()).mean().item()
            val_list.append(val_loss)
    writer.add_scalar('data/val_logloss', sum(val_list) / len(val_list), iter_num)
    
    if iter_num > 0 and iter_num % 5 == 0:
        torch.save(
        {
            'epoch': iter_num,
            'model_state_dict': sc.clf.state_dict(),
            'optimizer_state_dict': optim.state_dict(),
            'loss': loss,
            'val_metric': sum(val_list) / len(val_list)
        }, 'models/checkpoint_iter_{}_{}'.format(iter_num, datetime.now()))
    
    
    sc.clf.train()
    acc_loss = 0
    n_batches = 0
    for q1, m1, q2, m2, target in train_loader:
        optim.zero_grad()
        outs = sc(q1.cuda(), m1.cuda(), q2.cuda(), m2.cuda())
        lv = loss(outs, target.cuda()).mean()
        #writer.add_scalar('data/train_logloss', lv.item(), iter_num)
        acc_loss+=lv.item()
        n_batches+=1
        lv.backward()
        optim.step()
    writer.add_scalar('data/train_logloss', acc_loss / n_batches, iter_num)
    
    scheduler.step()
            
writer.export_scalars_to_json('./scalars.json')
writer.close()

  "type " + obj.__name__ + ". It won't be checked "
 10%|█         | 10/100 [6:25:35<57:51:27, 2314.31s/it]

KeyboardInterrupt: 

### Process test

In [13]:
#torch.save(sc.clf.state_dict(), './models/clf_head_weight')
sc.clf.load_state_dict(torch.load('models/clf_head_weight'))

<All keys matched successfully>

In [14]:
test_ds = QuoraSentences(test.dropna(), tokenizer, train=False)
test_dl = DataLoader(test_ds, batch_size=100, collate_fn=collate_fn_test, num_workers=3)
res_cpu = prepare_submission(sc, test_dl)

HBox(children=(IntProgress(value=0), HTML(value='')))




In [None]:
test['is_duplicate'] = 0
test.loc[test.dropna().index, 'is_duplicate'] = res_cpu.tolist()

In [37]:
your_name = #INSERT SUBMISSION NAME HERE
test[['is_duplicate']].to_csv(your_name)