# Imports

In [26]:
!pip install transformers
#VERSION = "20200325"  #@param ["1.5" , "20200325", "nightly"]
#!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
#!python pytorch-xla-env-setup.py --version $VERSION

from google.colab import drive
drive.mount('/content/drive')

import os, time
import pandas as pd
from transformers import BertModel, BertTokenizer
import random
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from sklearn.metrics import roc_auc_score

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Set global variables

In [27]:
SEED = 42
MAX_LENGTH = 128
EPOCHS = 30
GPU_USE = True
PRINT_EVERY = 100

DATA_PATH =  "./drive/My Drive/toxic_jigsaw/"#toxic_data
CHECKPOINT_PATH = 'best_model.dat'
SST_TRAIN = "jigsaw_train_merged_shortest.csv"#"jigsaw_train_merged.csv"#"jigsaw-toxic-comment-train.csv"
SST_VALID = "validation.csv"
SST_TEST = "test.csv"

SENTENCE_LABEL = 'comment_text'#'sentence' # comment_text
TEST_LABEL = 'content'
TARGET_LABEL = 'toxic' # toxic

TOKENIZER_CLS=BertTokenizer
MODEL_CLS=BertModel
SHORTCUT_NAME='bert-base-multilingual-cased'

# Fix seed

In [28]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED)

# Examples

Load and look at examples from [our first competition](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/). These are comments from Wikipedia with a variety of annotations (toxic, obscene, threat, etc).

In [29]:
#train = pd.read_csv(os.path.join(DATA_PATH, SST_TRAIN))
#valid = pd.read_csv(os.path.join(DATA_PATH, SST_VALID))
test = pd.read_csv(os.path.join(DATA_PATH, SST_TEST))
test.head()

Unnamed: 0,id,content,lang
0,0,Doctor Who adlı viki başlığına 12. doctor olar...,tr
1,1,"Вполне возможно, но я пока не вижу необходимо...",ru
2,2,"Quindi tu sei uno di quelli conservativi , ...",it
3,3,Malesef gerçekleştirilmedi ancak şöyle bir şey...,tr
4,4,:Resim:Seldabagcan.jpg resminde kaynak sorunu ...,tr


# Dataset Loader

In [30]:
"""class ToxicDataset(Dataset):

    def __init__(self, filename, maxlen):
        self.df = pd.read_csv(filename)
        self.tokenizer = TOKENIZER_CLS.from_pretrained(SHORTCUT_NAME)
        self.maxlen = maxlen

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        #Selecting the sentence and label at the specified index in the data frame
        sentence = self.df.loc[index, SENTENCE_LABEL]
        label = self.df.loc[index, TARGET_LABEL]
        
        encoded = self.tokenizer.encode_plus(
            sentence, 
            add_special_tokens=True, 
            max_length=MAX_LENGTH, 
            pad_to_max_length=True,
            return_tensors="pt"
        )
        return encoded['input_ids'], encoded['attention_mask'], label


class ToxicTestDataset(Dataset):

    def __init__(self, filename, maxlen):
        self.df = pd.read_csv(filename)
        self.tokenizer = TOKENIZER_CLS.from_pretrained(SHORTCUT_NAME)
        self.maxlen = maxlen

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        #Selecting the sentence and label at the specified index in the data frame
        sentence = self.df.loc[index, TEST_LABEL]
        encoded = self.tokenizer.encode_plus(
            sentence, 
            add_special_tokens=True, 
            max_length=MAX_LENGTH, 
            pad_to_max_length=True,
            return_tensors="pt"
        )
        return encoded['input_ids'], encoded['attention_mask']
"""
class ToxicDataset(Dataset):

    def __init__(self, filename, maxlen):

        #Store the contents of the file in a pandas dataframe
        self.df = pd.read_csv(filename)

        #Initialize the BERT tokenizer
        self.tokenizer = TOKENIZER_CLS.from_pretrained(SHORTCUT_NAME)

        self.maxlen = maxlen

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        #Selecting the sentence and label at the specified index in the data frame
        sentence = self.df.loc[index, SENTENCE_LABEL]
        label = self.df.loc[index, TARGET_LABEL]
        #Preprocessing the text to be suitable for BERT
        tokens = self.tokenizer.tokenize(sentence) #Tokenize the sentence
        tokens = ['[CLS]'] + tokens + ['[SEP]'] #Insering the CLS and SEP token in the beginning and end of the sentence
        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))] #Padding sentences
        else:
            tokens = tokens[:self.maxlen-1] + ['[SEP]'] #Prunning the list to be of specified max length

        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens) #Obtaining the indices of the tokens in the BERT Vocabulary
        tokens_ids_tensor = torch.tensor(tokens_ids) #Converting the list to a pytorch tensor
        #Obtaining the attention mask i.e a tensor containing 1s for no padded tokens and 0s for padded ones
        attn_mask = (tokens_ids_tensor != 0).long()

        return tokens_ids_tensor, attn_mask, label

class ToxicTestDataset(Dataset):

    def __init__(self, filename, maxlen):

        #Store the contents of the file in a pandas dataframe
        self.df = pd.read_csv(filename)

        #Initialize the BERT tokenizer
        self.tokenizer = TOKENIZER_CLS.from_pretrained(SHORTCUT_NAME)

        self.maxlen = maxlen

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        #Selecting the sentence and label at the specified index in the data frame
        sentence = self.df.loc[index, TEST_LABEL]
        #Preprocessing the text to be suitable for BERT
        tokens = self.tokenizer.tokenize(sentence) #Tokenize the sentence
        tokens = ['[CLS]'] + tokens + ['[SEP]'] #Insering the CLS and SEP token in the beginning and end of the sentence
        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))] #Padding sentences
        else:
            tokens = tokens[:self.maxlen-1] + ['[SEP]'] #Prunning the list to be of specified max length

        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens) #Obtaining the indices of the tokens in the BERT Vocabulary
        tokens_ids_tensor = torch.tensor(tokens_ids) #Converting the list to a pytorch tensor
        #Obtaining the attention mask i.e a tensor containing 1s for no padded tokens and 0s for padded ones
        attn_mask = (tokens_ids_tensor != 0).long()
        return tokens_ids_tensor, attn_mask

# Model with Bert

In [31]:
class SentimentClassifier(nn.Module):

    def __init__(self, freeze_bert = True):
        super(SentimentClassifier, self).__init__()
        #Instantiating BERT model object 
        self.bert_layer = MODEL_CLS.from_pretrained(SHORTCUT_NAME)
        
        #Freeze bert layers
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False
        
        #Classification layer
        self.cls_layer = nn.Linear(768, 1)

    def forward(self, seq, attn_masks):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contibution of PAD tokens
        '''

        #Feeding the input to BERT model to obtain contextualized representations
        cont_reps, _ = self.bert_layer(seq, attention_mask = attn_masks)

        #Obtaining the representation of [CLS] head
        cls_rep = cont_reps[:, 0]

        #Feeding cls_rep to the classifier layer
        logits = self.cls_layer(cls_rep)

        return logits

# Training

In [32]:
def get_accuracy_from_logits(logits, labels):
    probs = torch.sigmoid(logits.unsqueeze(-1))
    soft_probs = (probs > 0.5).long()
    acc = (soft_probs.squeeze() == labels).float().mean()
    return acc

def get_auc_from_logits(logits, labels):
    probs = torch.sigmoid(logits.unsqueeze(-1))
    try:
      auc = roc_auc_score(labels.cpu().detach().numpy(), probs.squeeze().cpu().detach().numpy())
    except ValueError:
      auc = 0
    return auc

def evaluate(net, criterion, dataloader, gpu):
    net.eval()

    mean_acc, mean_auc, mean_loss = 0, 0, 0
    count = 0

    with torch.no_grad():
      for seq, attn_masks, labels in dataloader:
        seq, attn_masks, labels = seq.cuda(), attn_masks.cuda(), labels.cuda()
        #seq, attn_masks, labels = seq, attn_masks, labels
        logits = net(seq, attn_masks)
        mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
        mean_acc += get_accuracy_from_logits(logits, labels)
        mean_auc += get_auc_from_logits(logits, labels)
        count += 1

    return mean_acc / count, mean_auc / count, mean_loss / count


def predict_submit(net, test_loader):
  net.eval()
  with torch.no_grad():
    result = torch.tensor([])
    for seq, attn_masks in test_loader:
      seq, attn_masks = seq.cuda(), attn_masks.cuda()
      logits = net(seq, attn_masks)
      probs = torch.sigmoid(logits.unsqueeze(-1)).squeeze().cpu().detach()
      result = torch.cat((result, probs))
  return result


def train(net, criterion, opti, train_loader, val_loader, max_eps, gpu, print_every):
    best_auc = 0
    for ep in range(max_eps):
        
        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()  
            #Converting these to cuda tensors
            seq, attn_masks, labels = seq.cuda(), attn_masks.cuda(), labels.cuda()
            #seq, attn_masks, labels = seq, attn_masks, labels

            #Obtaining the logits from the model
            logits = net(seq, attn_masks)

            #Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()

            if (it + 1) % print_every == 0:
                #acc = get_accuracy_from_logits(logits, labels)
                auc = get_auc_from_logits(logits, labels)
                print(f"Iteration {it+1} of epoch {ep+1} complete. Loss : {loss.item()} AUC : {auc}")
         
        val_acc, val_auc, val_loss = evaluate(net, criterion, val_loader, gpu)
        print("Epoch {} complete! Validation AUC : {}, Validation Loss : {}".format(ep, val_auc, val_loss))
        if val_auc > best_auc:
            print("Best validation accuracy improved from {} to {}, saving model...".format(best_auc, val_auc))
            predicted = predict_submit(net, test_loader).numpy()
            subm = pd.read_csv(os.path.join(DATA_PATH, "sample_submission.csv"))
            subm['toxic'] = predicted
            subm.to_csv(os.path.join(DATA_PATH, f'submission_ep_{ep}_val_{int(val_auc*100)}.csv'), index = None)
            best_auc = val_auc
            torch.save(net.state_dict(), os.path.join(DATA_PATH, CHECKPOINT_PATH))

In [33]:
#Creating instances of training and validation set
train_set = ToxicDataset(filename = os.path.join(DATA_PATH, SST_TRAIN), maxlen = MAX_LENGTH)
val_set = ToxicDataset(filename = os.path.join(DATA_PATH, SST_VALID), maxlen = MAX_LENGTH)
test_set = ToxicTestDataset(filename = os.path.join(DATA_PATH, SST_TEST), maxlen = MAX_LENGTH)

#Creating intsances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size = 64, num_workers = 5)
val_loader = DataLoader(val_set, batch_size = 64, num_workers = 5)
test_loader = DataLoader(test_set, batch_size = 64, num_workers = 5)

net = SentimentClassifier(freeze_bert = True)
net.cuda()
criterion = nn.BCEWithLogitsLoss()
#opti = optim.AdamW(net.parameters(), lr = 2e-5, weight_decay=0.01)
opti = optim.Adam(net.parameters(), lr = 2e-5)

train(net, criterion, opti, train_loader, val_loader, EPOCHS, GPU_USE, PRINT_EVERY)

checkpoint = torch.load(CHECKPOINT_PATH, map_location=torch.device('cpu'))
net.load_state_dict(checkpoint);

checkpoint = None
del checkpoint
#predicted = predict_submit(net, test_loader).numpy()
#print('AUCC', roc_auc_score(valid['label'].values, predicted))
predicted = predict_submit(net, test_loader).numpy()
subm = pd.read_csv(os.path.join(DATA_PATH, 'sample_submission.csv'))
subm['toxic'] = predicted
subm.to_csv('submission.csv', index = None)

Iteration 100 of epoch 1 complete. Loss : 0.5465466976165771 AUC : 0.6451612903225807
Iteration 200 of epoch 1 complete. Loss : 0.5493518710136414 AUC : 0.6681749622926094
Iteration 300 of epoch 1 complete. Loss : 0.48416227102279663 AUC : 0.696969696969697
Iteration 400 of epoch 1 complete. Loss : 0.4484972357749939 AUC : 0.5647321428571428


KeyboardInterrupt: ignored