In [1]:
import torch
import os
import sys
import pandas as pd
from tqdm import tqdm

In [2]:
from transformers import transformers
from transformers.transformers import RobertaTokenizer, RobertaModel, RobertaConfig
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader

In [3]:
from transformers.transformers import AdamW

In [4]:
from transformers.transformers import BertTokenizer, BertModel, BertConfig

In [5]:
from torch.utils.data import Dataset

In [6]:
# model_weights = 'roberta-base'
model_weights = 'bert-base-uncased'

In [7]:
# tokenizer = RobertaTokenizer.from_pretrained(model_weights)
tokenizer = BertTokenizer.from_pretrained(model_weights)

In [8]:
# model = RobertaModel.from_pretrained(model_weights, output_hidden_states=True, output_attentions=True).cuda()
model=BertModel.from_pretrained(model_weights, output_hidden_states=True).cuda()
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [9]:
train = pd.read_csv('./data/train.csv')
train.dropna(axis=0, inplace=True)
test = pd.read_csv('data/test.csv')

In [10]:
def encode_str(string, *args, **kwargs):
    return tokenizer.encode(string, *args, **kwargs)

class QuoraSentences(torch.utils.data.Dataset):
    def __init__(self, df, tk, train=True):
        self.train = train
        self.df = df
        if self.train:
            self.df.dropna(inplace=True, axis=0)
        self.enc = tk.encode
    
    def __getitem__(self, idx):
        q_1, q_2 = self.df.iloc[idx][['question1', 'question2']]
        enc_1 = self.enc(q_1.lower(), add_special_tokens=True, return_tensors='pt').squeeze()        
        enc_2 = self.enc(q_2.lower(), add_special_tokens=True, return_tensors='pt').squeeze()
        if self.train:
            is_dup = self.df.iloc[idx]['is_duplicate']
            return enc_1, enc_2, is_dup
        return enc_1, enc_2
        
    def __len__(self):
        return len(self.df)

In [11]:
def collate_fn(batch):
    #calculate max length
    max1 = max([item[0].size() for item in batch])
    max2 = max([item[1].size() for item in batch])
    
    q1_batch, q1_mask, q2_batch, q2_mask = [], [], [], []
    y = []
    
    for enc_1, enc_2, is_dup in batch:
        padded_1 = enc_1.new_zeros(max1)
        padded_1[:len(enc_1)] = enc_1
        att_mask_1 = enc_1.new_zeros(max1, dtype=torch.float)
        att_mask_1[:len(enc_1)] = 1
        q1_batch.append(padded_1)
        q1_mask.append(att_mask_1)
        
        padded_2 = enc_2.new_zeros(max2)
        padded_2[:len(enc_2)] = enc_2
        att_mask_2 = enc_2.new_zeros(max2, dtype=torch.float)
        att_mask_2[:len(enc_2)] = 1
        q2_batch.append(padded_2)
        q2_mask.append(att_mask_2)
        
        y.append(is_dup)
        
    
    return torch.stack(q1_batch), torch.stack(q1_mask), torch.stack(q2_batch), torch.stack(q2_mask), torch.tensor(y)

In [12]:
def collate_fn_test(batch):
    #calculate max length
    max1 = max([item[0].size() for item in batch])
    max2 = max([item[1].size() for item in batch])
    
    q1_batch, q1_mask, q2_batch, q2_mask = [], [], [], []
    
    for enc_1, enc_2 in batch:
        padded_1 = enc_1.new_zeros(max1)
        padded_1[:len(enc_1)] = enc_1
        att_mask_1 = enc_1.new_zeros(max1, dtype=torch.float)
        att_mask_1[:len(enc_1)] = 1
        q1_batch.append(padded_1)
        q1_mask.append(att_mask_1)
        
        padded_2 = enc_2.new_zeros(max2)
        padded_2[:len(enc_2)] = enc_2
        att_mask_2 = enc_2.new_zeros(max2, dtype=torch.float)
        att_mask_2[:len(enc_2)] = 1
        q2_batch.append(padded_2)
        q2_mask.append(att_mask_2)
    return torch.stack(q1_batch), torch.stack(q1_mask), torch.stack(q2_batch), torch.stack(q2_mask)

In [13]:
class SentenceClf(torch.nn.Module):
    def __init__(self, emb_model):
        super(SentenceClf, self).__init__()
        self.emb_model = emb_model
        self.emb_size = 768
        self.clf = torch.nn.Sequential(
            torch.nn.Linear(self.emb_size * 2, 512),
            torch.nn.Dropout(),
            torch.nn.LayerNorm(512),
            torch.nn.LeakyReLU(),
            torch.nn.Dropout(),
            torch.nn.Linear(512, 512),
            torch.nn.LayerNorm(512),
            torch.nn.LeakyReLU(),
            torch.nn.Dropout(),
            torch.nn.Linear(512, 256),
            torch.nn.LayerNorm(256),
            torch.nn.LeakyReLU(),
            torch.nn.Dropout(),
            torch.nn.Linear(256, 2)
        ).cuda()
    
    def forward(self, enc_1, mask_1, enc_2, mask_2):
        #average, concatenate, process with mlp
        
        with torch.no_grad():
            hidden_1 = self.emb_model(enc_1, attention_mask = mask_1)[2][-2]
            hidden_2 = self.emb_model(enc_2, attention_mask = mask_2)[2][-2]

            hidden_1_count = mask_1.sum(axis=1, keepdims=True)
            hidden_2_count = mask_2.sum(axis=1, keepdims=True)
        
        #input: batch_size x word_size x embed_dim
            mlp_input = torch.cat(
                (hidden_1.sum(axis=1) / hidden_1_count, hidden_2.sum(axis=1) / hidden_2_count),
                axis = 1
            )
        
        return self.clf(mlp_input)

In [14]:
sc = SentenceClf(model)
sc.clf.load_state_dict(torch.load('models/clf_head_weight'))

In [19]:
ds_train = QuoraSentences(train.iloc[:-5000], tokenizer)
ds_val = QuoraSentences(train.iloc[-5000:], tokenizer)

train_loader = DataLoader(ds_train, batch_size=100, collate_fn=collate_fn)
val_loader = DataLoader(ds_val, batch_size=100, collate_fn=collate_fn)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [20]:
N_EPOCHS = 100

In [21]:
from torch.optim import Adam, Adadelta, SGD
from torch.nn import CrossEntropyLoss

In [17]:
optim = Adam(sc.clf.parameters())

In [23]:
loss = CrossEntropyLoss()

In [19]:
from tensorboardX import SummaryWriter

In [20]:
writer = SummaryWriter()

In [21]:
for iter_num in tqdm(range(N_EPOCHS), position=0):
    
    sc.clf.eval()
    val_list = []
    for q1, m1, q2, m2, target in val_loader:
        with torch.no_grad():
            outs = sc(q1.cuda(), m1.cuda(), q2.cuda(), m2.cuda())
            val_loss = loss(outs, target.cuda()).mean().item()
            val_list.append(val_loss)
    writer.add_scalar('data/val_logloss', sum(val_list) / len(val_list), iter_num)
    
    sc.clf.train()
    acc_loss = 0
    n_batches = 0
    for q1, m1, q2, m2, target in train_loader:
        optim.zero_grad()
        outs = sc(q1.cuda(), m1.cuda(), q2.cuda(), m2.cuda())
        lv = loss(outs, target.cuda()).mean()
        #writer.add_scalar('data/train_logloss', lv.item(), iter_num)
        acc_loss+=lv.item()
        n_batches+=1
        lv.backward()
        optim.step()
    writer.add_scalar('data/train_logloss', acc_loss / n_batches, iter_num)
            
writer.export_scalars_to_json('./scalars.json')
writer.close()

 20%|██        | 20/100 [13:07:15<52:28:08, 2361.10s/it]

KeyboardInterrupt: 

In [24]:
val_list = []
sc.eval()
for q1, m1, q2, m2, target in val_loader:
        with torch.no_grad():
            outs = sc(q1.cuda(), m1.cuda(), q2.cuda(), m2.cuda())
            val_loss = loss(outs, target.cuda()).mean().item()
            val_list.append(val_loss)

In [25]:
torch.save(sc.clf.state_dict(), './models/clf_head_weight')

In [None]:
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 5))

sns.distplot(lens, ax=ax1)
sns.distplot(lens2, ax=ax2)

In [None]:
test_ds = QuoraSentences(test.dropna(), tokenizer, train=False)
test_dl = DataLoader(test_ds, batch_size=100, collate_fn=collate_fn_test, num_workers=3)

ans = []
def prepare_submission():
    test_ds = QuoraSentences(test.dropna(), tokenizer, train=False)
    test_dl = DataLoader(test_ds, batch_size=100, collate_fn=collate_fn_test, num_workers=3)
    
    sc.clf.eval()
    for q1, m1, q2, m2 in tqdm(test_dl, position=0):
        with torch.no_grad():
            ans.append(sc(q1.cuda(), m1.cuda(), q2.cuda(), m2.cuda()).softmax(dim=1)[:, 1].cpu())
    test['is_duplicate'] = 0
    res_cpu = torch.cat(ans)
    test.loc[test.dropna().index, 'is_duplicate'] = res_cpu.tolist()
prepare_submission()

  1%|▏         | 339/23458 [02:27<3:08:01,  2.05it/s]

In [152]:
test[['test_id', 'is_duplicate']].to_csv('sub.csv', index=None)

In [155]:
pd.read_csv('sub.csv', nrows=10)

Unnamed: 0,test_id,is_duplicate
0,0,0.957164
1,1,0.807985
2,2,0.673319
3,3,0.863377
4,4,0.784877
5,5,0.552247
6,6,0.87828
7,7,0.987539
8,8,0.580995
9,9,0.941414
