In [1]:
import torch
import os
import sys
import pandas as pd
from tqdm import tqdm

In [2]:
from transformers import transformers
from transformers.transformers import RobertaTokenizer, RobertaModel, RobertaConfig
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader

In [3]:
from torch.utils.data import Dataset

In [4]:
model_weights = 'roberta-base'

In [5]:
tokenizer = RobertaTokenizer.from_pretrained(model_weights)

In [6]:
model = RobertaModel.from_pretrained(model_weights, output_hidden_states=True).cuda()

In [7]:
train = pd.read_csv('./data/train.csv')
train.dropna(axis=0, inplace=True)
test = pd.read_csv('data/test.csv')

In [8]:
def encode_str(string, *args, **kwargs):
    return tokenizer.encode(string, *args, **kwargs)

class QuoraSentences(torch.utils.data.Dataset):
    def __init__(self, df, tk, train=True):
        self.train = train
        self.df = df
        if self.train:
            self.df.dropna(inplace=True, axis=0)
        self.enc = tk.encode
    
    def __getitem__(self, idx):
        q_1, q_2 = self.df.iloc[idx][['question1', 'question2']]
        enc_1 = self.enc(q_1, add_special_tokens=True, return_tensors='pt').squeeze()        
        enc_2 = self.enc(q_2, add_special_tokens=True, return_tensors='pt').squeeze()
        if self.train:
            is_dup = self.df.iloc[idx]['is_duplicate']
            return enc_1, enc_2, is_dup
        return enc_1, enc_2
        
    def __len__(self):
        return len(self.df)

In [9]:
def collate_fn(batch):
    #calculate max length
    max1 = max([item[0].size() for item in batch])
    max2 = max([item[1].size() for item in batch])
    
    q1_batch, q1_mask, q2_batch, q2_mask = [], [], [], []
    y = []
    
    for enc_1, enc_2, is_dup in batch:
        padded_1 = enc_1.new_zeros(max1)
        padded_1[:len(enc_1)] = enc_1
        att_mask_1 = enc_1.new_zeros(max1, dtype=torch.float)
        att_mask_1[:len(enc_1)] = 1
        q1_batch.append(padded_1)
        q1_mask.append(att_mask_1)
        
        padded_2 = enc_2.new_zeros(max2)
        padded_2[:len(enc_2)] = enc_2
        att_mask_2 = enc_2.new_zeros(max2, dtype=torch.float)
        att_mask_2[:len(enc_2)] = 1
        q2_batch.append(padded_2)
        q2_mask.append(att_mask_2)
        
        y.append(is_dup)
        
    
    return torch.stack(q1_batch), torch.stack(q1_mask), torch.stack(q2_batch), torch.stack(q2_mask), torch.tensor(y)

In [10]:
class SentenceClf(torch.nn.Module):
    def __init__(self, emb_model):
        super(SentenceClf, self).__init__()
        self.emb_model = emb_model
        self.emb_size = 768
        self.clf = torch.nn.Sequential(
            torch.nn.Linear(self.emb_size * 2, 512),
            torch.nn.Dropout(),
            torch.nn.LeakyReLU(),
            torch.nn.Dropout(),
            torch.nn.Linear(512, 512),
            torch.nn.LeakyReLU(),
            torch.nn.Dropout(),
            torch.nn.Linear(512, 256),
            torch.nn.LeakyReLU(),
            torch.nn.Dropout(),
            torch.nn.Linear(256, 2)
        ).cuda()
    
    def forward(self, enc_1, mask_1, enc_2, mask_2):
        #average, concatenate, process with mlp
        
        with torch.no_grad():
            hidden_1 = self.emb_model(enc_1, attention_mask = mask_1)[0]
            hidden_2 = self.emb_model(enc_2, attention_mask = mask_2)[0]

            hidden_1_count = mask_1.sum(axis=1, keepdims=True)
            hidden_2_count = mask_2.sum(axis=1, keepdims=True)
        
        #input: batch_size x word_size x embed_dim
            mlp_input = torch.cat(
                (hidden_1.sum(axis=1) / hidden_1_count, hidden_2.sum(axis=1) / hidden_2_count),
                axis = 1
            )
        
        return self.clf(mlp_input)

In [11]:
sc = SentenceClf(model)

In [12]:
ds_train = QuoraSentences(train.iloc[:10000], tokenizer)
ds_val = QuoraSentences(train.iloc[-5000:], tokenizer)

train_loader = DataLoader(ds_train, batch_size=100, collate_fn=collate_fn)
val_loader = DataLoader(ds_val, batch_size=100, collate_fn=collate_fn)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [13]:
N_EPOCHS = 100

In [14]:
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

In [15]:
optim = Adam(sc.clf.parameters())

In [16]:
loss = CrossEntropyLoss()

In [17]:
from tensorboardX import SummaryWriter

In [18]:
writer = SummaryWriter()

In [None]:
for iter_num in tqdm(range(N_EPOCHS), position=0):
    
    val_list = []
    for q1, m1, q2, m2, target in val_loader:
        with torch.no_grad():
            outs = sc(q1.cuda(), m1.cuda(), q2.cuda(), m2.cuda())
            val_loss = loss(outs, target.cuda()).mean().item()
            val_list.append(val_loss)
    writer.add_scalar('data/val_logloss', sum(val_list) / len(val_list), iter_num)
    
    acc_loss = 0
    n_batches = 0
    for q1, m1, q2, m2, target in train_loader:
        optim.zero_grad()
        outs = sc(q1.cuda(), m1.cuda(), q2.cuda(), m2.cuda())
        lv = loss(outs, target.cuda()).mean()
        #writer.add_scalar('data/train_logloss', lv.item(), iter_num)
        acc_loss+=lv.item()
        n_batches+=1
        lv.backward()
        optim.step()
    writer.add_scalar('data/train_logloss', acc_loss / n_batches, iter_num)
            
writer.export_scalart_to_json('./scalars.json')
writer.close()

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 5))

sns.distplot(lens, ax=ax1)
sns.distplot(lens2, ax=ax2)

In [53]:
mi1 = encode_str('He\'s good', add_special_tokens=True, return_tensors='pt')

mi2 = encode_str('That\'s pretty true', add_special_tokens=True, return_tensors='pt')

In [36]:
with torch.no_grad():
    output = model(mi2.cuda())

In [12]:
train = pd.read_csv('./data/train.csv')
train.dropna(axis=0, inplace=True)
test = pd.read_csv('data/test.csv')

In [185]:
tes

NameError: name 'test' is not defined

In [12]:
train.dropna(inplace=True, axis=0)

In [13]:
test[['question1', 'question2']].isna().sum()

question1    2
question2    4
dtype: int64