[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb)

# Import Libraries

At the time of our work, we used the following library versions
- numpy 1.18.1
- pandas 1.0.1
- torch 1.2.0
- Cuda 10.0
- python 3.7.0
- sklearn 0.22.1
- tqdm 4.42.1
- nltk 3.4.5

In [2]:
import sys
sys.path.append('../src/')
import collections
from typing import Callable
import numpy as np
np.random.seed(42)
import pandas as pd
from tqdm import notebook
import importlib
import pprint
import nltk
import datetime
import os
from argparse import Namespace
import re
from collections import Counter

In [3]:
import utils.general as general_utils
import utils.trac2020 as trac_utils
import utils.transformer.data as transformer_data_utils
import utils.transformer.general as transformer_general_utils
general_utils.set_seed_everywhere() #set the seed for reproducibility

In [5]:
import logging
logging.basicConfig(level=logging.INFO) 

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
torch.__version__


'1.1.0'

## Import Optimzer and XLM Models

In [7]:
# Import RAdam and Lookahead
from radam.radam import RAdam
from lookahead.optimizer import Lookahead

In [32]:
from transformers import XLMRobertaTokenizer, XLMRobertaModel

I0321 13:53:33.369260 4454915520 file_utils.py:41] PyTorch version 1.1.0 available.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Set up the argspace/important_variables

In [7]:
args = Namespace(
        #use cuda by default
        device = 'cuda' if torch.cuda.is_available() else 'cpu',
    
        #set batch size and number of epochs
        batch_size = 32,
        num_epochs = 20,
    
        #set the learning rate
        learning_rate = 0.0001,

        #location of the train, dev and test csv
        train_csv = '../data/hin/trac2_hin_train.csv',
        dev_csv = '../data/hin/trac2_hin_dev.csv',
        test_csv = '../data/test/trac2_hin_test.csv',
    
        #directory to save our models at
        directory = './' 
)

## Load the data csv into DataFrames

In [8]:
raw_train_df =  pd.read_csv(args.train_csv)
raw_train_df['split'] = 'train'
print(raw_train_df.columns)
print(raw_train_df['Sub-task A'].value_counts())
print(raw_train_df['Sub-task B'].value_counts())
print(f"Size of 'train' split: {len(raw_train_df)}")

Index(['ID', 'Text', 'Sub-task A', 'Sub-task B', 'split'], dtype='object')
NAG    2245
OAG     910
CAG     829
Name: Sub-task A, dtype: int64
NGEN    3323
GEN      661
Name: Sub-task B, dtype: int64
Size of 'train' split: 3984


In [9]:
raw_dev_df =  pd.read_csv(args.dev_csv)
raw_dev_df['split'] = 'dev'
print(raw_dev_df.columns)
print(raw_dev_df['Sub-task A'].value_counts())
print(raw_dev_df['Sub-task B'].value_counts())
print(f"Size of 'dev' split: {len(raw_dev_df)}")

Index(['ID', 'Text', 'Sub-task A', 'Sub-task B', 'split'], dtype='object')
NAG    578
CAG    211
OAG    208
Name: Sub-task A, dtype: int64
NGEN    845
GEN     152
Name: Sub-task B, dtype: int64
Size of 'dev' split: 997


In [10]:
# Concatinate both train and dev dfs together
data_df = pd.concat([raw_dev_df, raw_train_df], ignore_index= True)
data_df

Unnamed: 0,ID,Text,Sub-task A,Sub-task B,split
0,C38.9,bkl interviewers kuch jaada hi open minded bnt...,OAG,NGEN,dev
1,C4.1510,Bhaiya shaadi mein zaroor aana movie ka plot j...,NAG,NGEN,dev
2,C19.95,Section 375 hai kya??? .... Ye to batate kam s...,NAG,NGEN,dev
3,C4.281,कबीर सिंह hit Hui इससे पता चलता है आजकल के लोग...,OAG,NGEN,dev
4,C4.216,Maine itni kam dislike kbhi nhii dekhi,NAG,NGEN,dev
...,...,...,...,...,...
4976,C38.455,Asexual h.. bisexual... homosexual... bhai ase...,NAG,NGEN,train
4977,C4.203,Video pura dekne ke pahile hi mai bhai ke vide...,NAG,NGEN,train
4978,C45.709,konsa place hai bhai ...nam bolo,NAG,NGEN,train
4979,C4.420.1,Kuch zada hi likh diya 🙄,NAG,NGEN,train


### Samples given per label size

In [11]:
print(f'Total dev + train size = {len(data_df)}\n')
print(data_df['Sub-task A'].value_counts(),'\n')
print(data_df['Sub-task B'].value_counts(),'\n')

Total dev + train size = 4981

NAG    2823
OAG    1118
CAG    1040
Name: Sub-task A, dtype: int64 

NGEN    4168
GEN      813
Name: Sub-task B, dtype: int64 



### Map to labels to integer

In [12]:
task_b_label_dict = {'NGEN':0, 'GEN':1}
print(task_b_label_dict)

{'NGEN': 0, 'GEN': 1}


### Renaming the columns for our torch dataset class

In [13]:
data_df_task_b = data_df[['ID','Text','Sub-task B','split']].copy()
data_df_task_b.columns.values[1] = 'text'
data_df_task_b.columns.values[2] = 'label'
data_df_task_b.loc[:,'label'] = data_df_task_b.loc[:,'label'].map(task_b_label_dict) 
data_df_task_b

Unnamed: 0,ID,text,label,split
0,C38.9,bkl interviewers kuch jaada hi open minded bnt...,0,dev
1,C4.1510,Bhaiya shaadi mein zaroor aana movie ka plot j...,0,dev
2,C19.95,Section 375 hai kya??? .... Ye to batate kam s...,0,dev
3,C4.281,कबीर सिंह hit Hui इससे पता चलता है आजकल के लोग...,0,dev
4,C4.216,Maine itni kam dislike kbhi nhii dekhi,0,dev
...,...,...,...,...
4976,C38.455,Asexual h.. bisexual... homosexual... bhai ase...,0,train
4977,C4.203,Video pura dekne ke pahile hi mai bhai ke vide...,0,train
4978,C45.709,konsa place hai bhai ...nam bolo,0,train
4979,C4.420.1,Kuch zada hi likh diya 🙄,0,train


In [14]:
print("Num samples per class")
print(data_df_task_b.label.value_counts())

print("\nNum samples per split")
print(data_df_task_b.split.value_counts())

print("\nLabel counts in dev split")
print(data_df_task_b[data_df_task_b.split=='dev'].label.value_counts())

print("\nLabel counts in train split")
print(data_df_task_b[data_df_task_b.split=='train'].label.value_counts())

Num samples per class
0    4168
1     813
Name: label, dtype: int64

Num samples per split
train    3984
dev       997
Name: split, dtype: int64

Label counts in dev split
0    845
1    152
Name: label, dtype: int64

Label counts in train split
0    3323
1     661
Name: label, dtype: int64


### We split long samples into multiple samples
Each sample produced from from a single split will have the label of the original sample

In [15]:
#split long sentences into sentences of 200 words
data_df_task_b['text'] = data_df_task_b['text'].map(lambda x: trac_utils.chunk_sent(x,150,50))
exploded_df = data_df_task_b.explode('text').reset_index()

#### Notice how a single sample is split into two samples in the exploded_df

In [23]:
data_df_task_b[data_df_task_b.ID=='C7.849']

Unnamed: 0,ID,text,label,split
1706,C7.849,[**प्रशासक समिति✊🚩** 😡😡😡😡😡😡😡😡😡 **आर्यो को आंतक...,0,train


In [22]:
exploded_df[exploded_df.ID == 'C7.849']

Unnamed: 0,index,ID,text,label,split
1712,1706,C7.849,**प्रशासक समिति✊🚩** 😡😡😡😡😡😡😡😡😡 **आर्यो को आंतकी...,0,train
1713,1706,C7.849,"कर उन्हें दूर भी किया और अब भी कोशिश चालू है, ...",0,train


In [27]:
print("Samples before splitting")
print(data_df_task_b.split.value_counts())

print("\nSamples After splitting")
print(exploded_df.split.value_counts())

Samples before splitting
train    3984
dev       997
Name: split, dtype: int64

Samples After splitting
train    4006
dev       999
Name: split, dtype: int64


## Create the text preprocessor

In [30]:
class RobertaPreprocessor():
    """
    Preprocessor for adding special tokens into each sample
    NOTE: Doesn't work perfectly.
    """
    
    
    def __init__(self,transformer_tokenizer,sentence_detector):
        """
        Args:
            transformer_tokenizer: Tokenizer for the transformer model
            sentence_detector: Sentence tokenizer.
        """
        self.transformer_tokenizer = transformer_tokenizer
        self.sentence_detector = sentence_detector
        self.bos_token = transformer_tokenizer.bos_token
        self.sep_token = ' ' + transformer_tokenizer.sep_token + ' '
        
    def add_special_tokens(self, text):
        """
        Adds '</s>' between each sentence and at the end of the sample.
        Adds '<s>' at the start of the sentence.
        
        Args:
            text: Text sample to add special tokens into
        Returns:
            text with special tokens added
        """
        text = ' '.join(text.strip().split()) #clean whitespaces
        sentences = self.sentence_detector.tokenize(text)
        eos_added_text  = self.sep_token.join(sentences) 
        return self.bos_token +' '+ eos_added_text + ' ' + self.transformer_tokenizer.sep_token

In [33]:
xlmroberta_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
punkt_sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')

I0321 13:54:18.007902 4454915520 filelock.py:274] Lock 5198573240 acquired on /Users/cozek/.cache/torch/transformers/0c370616ddfc06067c0634160f749c2cf9d8da2c50e03a2617ce5841c8df3b1d.309f0c29486cffc28e1e40a2ab0ac8f500c203fe080b95f820aa9cb58e5b84ed.lock
I0321 13:54:18.013113 4454915520 file_utils.py:460] https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-sentencepiece.bpe.model not found in cache or force_download set to True, downloading to /Users/cozek/.cache/torch/transformers/tmpyojh5kf8


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…

I0321 13:55:51.341168 4454915520 file_utils.py:470] storing https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-sentencepiece.bpe.model in cache at /Users/cozek/.cache/torch/transformers/0c370616ddfc06067c0634160f749c2cf9d8da2c50e03a2617ce5841c8df3b1d.309f0c29486cffc28e1e40a2ab0ac8f500c203fe080b95f820aa9cb58e5b84ed
I0321 13:55:51.343251 4454915520 file_utils.py:473] creating metadata file for /Users/cozek/.cache/torch/transformers/0c370616ddfc06067c0634160f749c2cf9d8da2c50e03a2617ce5841c8df3b1d.309f0c29486cffc28e1e40a2ab0ac8f500c203fe080b95f820aa9cb58e5b84ed
I0321 13:55:51.345571 4454915520 filelock.py:318] Lock 5198573240 released on /Users/cozek/.cache/torch/transformers/0c370616ddfc06067c0634160f749c2cf9d8da2c50e03a2617ce5841c8df3b1d.309f0c29486cffc28e1e40a2ab0ac8f500c203fe080b95f820aa9cb58e5b84ed.lock
I0321 13:55:51.346597 4454915520 tokenization_utils.py:484] loading file https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-sentencepiece.bpe.model 




In [34]:
roberta_preproc = RobertaPreprocessor(xlmroberta_tokenizer, punkt_sentence_detector)

In [35]:
#apply the preprocessor on the exploded dataframe
exploded_df['text'] = exploded_df['text'].map(roberta_preproc.add_special_tokens)

In [39]:
exploded_df.loc[2].text #notice the addition of eos token

'<s> Section 375 hai kya??? </s> .... Ye to batate kam se kam </s>'

### Create the Vectorizer and the torch Dataset

In [43]:
class SimpleVectorizer():
    """Vectorizes Class to encode the samples into 
    their token ids and creates their respective attention masks
    """
    
    def __init__(self,tokenizer: Callable, max_seq_len: int):
        """
        Args:
            tokenizer (Callable): transformer tokenizer
            max_seq_len (int): Maximum sequence lenght 
        """
        self.tokenizer = tokenizer
        self._max_seq_len = max_seq_len

    def vectorize(self,text :str):
        """
        Args:
            text: Text sample to vectorize
        Returns:
            ids: Token ids of the 
            attn: Attention masks for ids 
        """
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=False, #already added by preprocessor
            max_length = self._max_seq_len,
            pad_to_max_length = True,
        )
        ids =  np.array(encoded['input_ids'], dtype=np.int64)
        attn = np.array(encoded['attention_mask'], dtype=np.int64)
        
        return ids, attn

In [44]:
class TracDataset(Dataset):
    """PyTorch dataset class"""
    def __init__(
        self,
        data_df: pd.DataFrame,
        tokenizer: Callable,
        max_seq_length:int = None,
    ):
        """
        Args:
            data_df (pandas.DataFrame): df containing the labels and text
            tokenizer (Callable): tokenizer for the transformer
            max_seq_length (int): Maximum sequece length to work with.
        """
        self.data_df = data_df
        self.tokenizer = tokenizer

        if max_seq_length is None:
            self._max_seq_length = self._get_max_len(data_df,tokenizer)
        else:
            self._max_seq_length = max_seq_length

        self.train_df = self.data_df[self.data_df.split == 'train']
        self.train_size = len(self.train_df)

        self.val_df = self.data_df[self.data_df.split == 'dev']
        self.val_size = len(self.val_df)

        self.test_df = self.data_df[self.data_df.split == 'test']
        self.test_size = len(self.test_df)
        
        self._simple_vectorizer = SimpleVectorizer(tokenizer, self._max_seq_length)
        
        self._lookup_dict = {
            'train': (self.train_df, self.train_size),
            'val': (self.val_df, self.val_size),
            'test': (self.test_df, self.test_size)
        }

        self.set_split('train')

    
    def _get_max_len(self,data_df: pd.DataFrame, tokenizer: Callable):
        """Get the maximum lenght found in the data
        Args:
            data_df (pandas.DataFrame): The pandas dataframe with the data
            tokenizer (Callable): The tokenizer of the transformer
        Returns:
            max_len (int): Maximum length
        """
        len_func = lambda x: len(self.tokenizer.encode_plus(x)['input_ids'])
        max_len = data_df.text.map(len_func).max() 
        return max_len

    
    def set_split(self, split="train"):
        """selects the splits in the dataset using a column in the dataframe """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
    
    
    def __len__(self):
        return self._target_size
    
    
    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
        
        Args:
            index (int): the index to the data point 
        Returns:
            a dictionary holding the data point's features (x_data) and label (y_target)
        """
        row = self._target_df.iloc[index]

        
        indices, attention_masks = self._simple_vectorizer.vectorize(row.text)


        label = row.label
        return {'x_data': indices,
                'x_attn_mask': attention_masks,
                'x_index': index,
                'y_target': label}
    
    
    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset
        
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size

In [45]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=False, device="cpu", pinned_memory = False, n_workers = 0): 
    """
    A generator function which wraps the PyTorch DataLoader. It will 
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last,
                            pin_memory= pinned_memory,
                            num_workers = n_workers,
                            )
    
    for data_dict in dataloader:
        out_data_dict = {}
        out_data_dict['x_data'] = data_dict['x_data'].to(
            device, non_blocking= (True if pinned_memory else False) 
        )
        out_data_dict['x_attn_mask'] = data_dict['x_attn_mask'].to(
            device, non_blocking= (True if pinned_memory else False) 
        )
        out_data_dict['x_index'] = data_dict['x_index']
        out_data_dict['y_target'] = data_dict['y_target'].to(
            device, non_blocking= (True if pinned_memory else False) 
        )
        yield out_data_dict

## Initialize the dataset

In [2]:
dataset = TracDataset(
    data_df = exploded_df,
    tokenizer = xlmroberta_tokenizer,
    max_seq_length = 403 #what we used
)

NameError: name 'TracDataset' is not defined

In [47]:
dataset._max_seq_length # make sure its safe enough for our model, i,e, < 512

365

# Creating the XLMRoberta + Attention model

In [34]:
class XLMRoBertAttention(nn.Module):
    """Implements Attention Head Classifier
    on Pretrained Roberta Transformer representations.
    Attention Head Implementation based on: https://www.aclweb.org/anthology/P16-2034/
    """
    
    def penalized_tanh(self,x):
        """
        http://aclweb.org/anthology/D18-1472
        """
        alpha = 0.25
        return torch.max(torch.tanh(x), alpha*torch.tanh(x))
    
    
    def __init__(self, model_name, num_labels):
        """
        Args:
            model_name: model name, eg, roberta-base'
            num_labels: number of classes to classify
        """
        super().__init__()
        self.w = nn.Linear(768,1, bias=False)
        self.bert = XLMRobertaModel.from_pretrained(model_name)
        self.prediction_layer = nn.Linear(768, num_labels)
        self.init_weights()
        
        
    def init_weights(self):
        """Initializes the weights of the Attention head classifier"""
        
        for name, param in self.prediction_layer.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_uniform_(param)
        for name, param in self.w.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_uniform_(param)
        
        
    def forward(self, input_ids,attention_mask):
        """
        Args:
            input_ids: sent encoded into indices
            attention_mask: their respective attention masks
        Returns:
            preds: Final layer output of the model
        """
        embeddings = self.bert(input_ids = input_ids,
                  attention_mask = attention_mask)
        H = embeddings[0] #final hidden layer outputs 
        M = self.penalized_tanh(H)
        alpha = torch.softmax(self.w(M), dim=1)
        r = torch.bmm(H.permute(0,2,1),alpha)
        h_star = self.penalized_tanh(r)
        preds = self.prediction_layer(h_star.permute(0,2,1))
        return preds
 

### Initializing the model

In [35]:
model = XLMRoBertAttention(
    model_name = 'xlm-roberta-base',
    num_labels = len(set(dataset.data_df.label)),
)
model.to(args.device) #send the model to the 'cpu' or 'gpu'

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json from cache at /home/kaushik.das/.cache/torch/transformers/762ddd751172e9d3229e5da17a459eee6c0dfdc237c718944d0b1a85f06c7e1e.9ba214636e460976b286b4ce15e95d778f32439e9fdd8ddae7e3784f3a7e24a2
INFO:transformers.configuration_utils:Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "do_sample": false,
  "eos_token_ids": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 514,
  "model_type": "xlm-rober

In [44]:
loss_func = nn.CrossEntropyLoss()
early_stopping = transformer_general_utils.EarlyStopping(patience=4)
base_optimizer = RAdam(model.parameters(), lr = args.learning_rate, weight_decay=1e-5)
optimizer = Lookahead(optimizer = base_optimizer, k = 6, alpha=0.5 )
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer.optimizer, factor =0.1 ,mode='max')

print(f'Using LR:{args.learning_rate}\n Early Stopping Patience: 4')

Using LR:0.0001


# Begin Training

In [45]:
train_state = general_utils.make_train_state() #dictionary for saving training routine information
train_state.keys()

dict_keys(['train_preds', 'train_indexes', 'train_targets', 'train_accuracies', 'train_f1s', 'train_losses', 'val_preds', 'val_indexes', 'val_targets', 'val_accuracies', 'val_f1s', 'val_losses', 'test_preds', 'test_indexes', 'test_targets', 'test_accuracies', 'test_f1s', 'test_losses', 'batch_preds', 'batch_targets', 'batch_indexes', 'epoch_index'])

In [48]:
epoch_bar = notebook.tqdm(
    desc = 'training_routine',
    total = args.num_epochs,
    position=0,
    leave = True,
)
dataset.set_split('train')
train_bar = notebook.tqdm(
    desc = 'split=train ',
    total=dataset.get_num_batches(args.batch_size),
    position=0,
    leave=True,
)
dataset.set_split('val')
eval_bar = notebook.tqdm(
    desc = 'split=eval',
    total=dataset.get_num_batches(args.batch_size),
    position=0,
    leave=True,
)

for epoch_index in range(args.num_epochs):
    train_state['epoch_in'] = epoch_index

    dataset.set_split('train')
    batch_generator = generate_batches(
        dataset= dataset, batch_size= args.batch_size, shuffle=True,
        device = args.device, drop_last=False,
        pinned_memory = True, n_workers = 3, 
    )

    running_loss = 0.0
    running_acc = 0.0
    running_f1 = 0.0
    model.train()

    train_bar.reset(
        total=dataset.get_num_batches(args.batch_size),
    )
    model.train()
    for batch_index, batch_dict in enumerate(batch_generator):
        optimizer.zero_grad()
        
        y_pred = model(
            input_ids = batch_dict['x_data'],
            attention_mask =  batch_dict['x_attn_mask'],
        )
        y_pred = y_pred.view(-1, len(set(dataset.data_df.label)))
                             
        loss = loss_func(y_pred, batch_dict['y_target'])
    
        loss.backward()
        optimizer.step()
                             
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)
                             
        y_pred = y_pred.detach().cpu()
        batch_dict['y_target'] = batch_dict['y_target'].cpu()
        
        acc_t = transformer_general_utils \
            .compute_accuracy(y_pred, batch_dict['y_target'])
        atom
        f1_t = transformer_general_utils \
            .compute_macro_f1(y_pred, batch_dict['y_target'], average='weighted')

        train_state['batch_preds'].append(y_pred)
        train_state['batch_targets'].append(batch_dict['y_target'])
        train_state['batch_indexes'].append(batch_dict['x_index'])

        running_acc += (acc_t - running_acc) / (batch_index + 1)
        running_f1 += (f1_t - running_f1) / (batch_index + 1)

        train_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,
                             epoch=epoch_index)

        train_bar.update()

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    train_state['train_accuracies'].append(running_acc)
    train_state['train_losses'].append(running_loss)
    
    train_state['train_preds'].append(
        torch.cat(train_state['batch_preds']).cpu()
    )
    train_state['train_targets'].append(
        torch.cat(train_state['batch_targets']).cpu()
    )
    train_state['train_indexes'].append(
        torch.cat(train_state['batch_indexes']).cpu()
    )
    train_f1 = transformer_general_utils \
                .compute_macro_f1(train_state['train_preds'][-1],
                                  train_state['train_targets'][-1],
                                  'weighted'
                                 )
                                 
    train_state['train_f1s'].append(train_f1)
    
    train_state['batch_preds'] = []
    train_state['batch_targets'] = []
    train_state['batch_indexes'] = []
    
    
    dataset.set_split('val')
    batch_generator = generate_batches(
        dataset= dataset, batch_size= args.batch_size, shuffle=True,
        device = args.device, drop_last=False,
        pinned_memory = False, n_workers = 2, 
    )
    eval_bar.reset(
        total=dataset.get_num_batches(args.batch_size),
    )
    running_loss = 0.0
    running_acc = 0.0
    running_f1 = 0.0
    
    model.eval()
    with torch.no_grad():
        optimizer._backup_and_load_cache()
        for batch_index, batch_dict in enumerate(batch_generator):
            y_pred = model(
                input_ids = batch_dict['x_data'],
                attention_mask =  batch_dict['x_attn_mask'],
            )
            y_pred = y_pred.view(-1, len(set(dataset.data_df.label)))

            loss = loss_func(y_pred, batch_dict['y_target'])
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            y_pred = y_pred.detach()
            
            acc_t = transformer_general_utils\
                .compute_accuracy(y_pred, batch_dict['y_target'])
            f1_t = transformer_general_utils \
                .compute_macro_f1(y_pred, batch_dict['y_target'],
                                 average='weighted')

            train_state['batch_preds'].append(y_pred.cpu())
            train_state['batch_targets'].append(batch_dict['y_target'])
            train_state['batch_indexes'].append(batch_dict['x_index'].cpu())

            running_acc += (acc_t - running_acc) / (batch_index + 1)
            running_f1 += (f1_t - running_f1) / (batch_index + 1)
            

            eval_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,
                                 epoch=epoch_index)
            eval_bar.update()
            
    train_state['val_accuracies'].append(running_acc)
    train_state['val_losses'].append(running_loss)
    
        
    train_state['val_preds'].append(
        torch.cat(train_state['batch_preds']).cpu()
    )

    train_state['val_targets'].append(
        torch.cat(train_state['batch_targets']).cpu()
    )
    train_state['val_indexes'].append(
        torch.cat(train_state['batch_indexes']).cpu()
    )
    val_f1 = transformer_general_utils \
                .compute_macro_f1(train_state['val_preds'][-1],
                                  train_state['val_targets'][-1],
                                  average='weighted',
                                 )
                                 
    train_state['val_f1s'].append(val_f1)
    
    train_state['batch_preds'] = []
    train_state['batch_targets'] = []
    train_state['batch_indexes'] = []
    
    torch.save(
        {
            'model':model.state_dict(),
        },
        args.directory + f'_epoc_{epoch_index}_' + args.model_name,
    )
    
    scheduler.step(val_f1)
    early_stopping(val_f1, model)
    optimizer._clear_and_load_backup()
    epoch_bar.set_postfix( best_f1 = early_stopping.best_score, current = val_f1)
    epoch_bar.update()    
    
    if early_stopping.early_stop:
            print("Early stopping")
            break
    epoch_bar.set_postfix( best_f1 = early_stopping.best_score, current = val_f1 )
    epoch_bar.update()

HBox(children=(FloatProgress(value=0.0, description='training_routine', max=20.0, style=ProgressStyle(descript…

HBox(children=(FloatProgress(value=0.0, description='split=train ', max=125.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='split=eval', max=31.0, style=ProgressStyle(description_wi…

EarlyStopping counter: 1 out of 4
EarlyStopping counter: 2 out of 4
EarlyStopping counter: 3 out of 4
EarlyStopping counter: 4 out of 4
Early stopping


In [49]:
print(train_state['train_f1s'])

[0.7556380769461755, 0.7782906714806137, 0.8591663457360611, 0.8934117790110065, 0.9192519291763898, 0.9378599232099268, 0.947936706951426, 0.9640521740272423]


In [50]:
print(train_state['val_f1s'])

[0.7766205229619864, 0.8162086486974713, 0.8892706455609094, 0.89078298750255, 0.8827026262600857, 0.8755408456365394, 0.8892586948691369, 0.8812245500770091]


In [51]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [52]:
best_run_index = train_state['val_f1s'].index(max(train_state['val_f1s']))
print(f'Best run at epoch {best_run_index}')
print('Train:',classification_report(
    y_pred=(torch.argmax(train_state['train_preds'][best_run_index],dim=1) ).cpu().long().numpy(),
    y_true= train_state['train_targets'][best_run_index].cpu().numpy(), 
    digits=4)
)
print('Dev:',classification_report(
    y_pred=(torch.argmax(train_state['val_preds'][best_run_index],dim=1) ).cpu().long().numpy(),
    y_true= train_state['val_targets'][best_run_index].cpu().numpy(), 
    digits=4)
)

Best run at epoch 3
Train:               precision    recall  f1-score   support

           0     0.9180    0.9658    0.9413      3338
           1     0.7692    0.5689    0.6540       668

    accuracy                         0.8997      4006
   macro avg     0.8436    0.7674    0.7977      4006
weighted avg     0.8932    0.8997    0.8934      4006

Dev:               precision    recall  f1-score   support

           0     0.9226    0.9574    0.9397       846
           1     0.7025    0.5556    0.6204       153

    accuracy                         0.8959       999
   macro avg     0.8125    0.7565    0.7801       999
weighted avg     0.8888    0.8959    0.8908       999



## Check if ensembling helps

In [53]:
def sort_preds(indexes, preds):
    """Sorts the predictions in order, to reverse the effects of shuffle
    done by dataloader"""
    indexes = indexes.cpu().numpy().reshape(-1,1)
    preds = preds.cpu().numpy()
    arr_concat = np.hstack((indexes,preds)) #concat the preds and their indexes
    sort_arr = arr_concat[ arr_concat[:,0].argsort()] #sort based on the indexes
    sorted_preds = np.delete(sort_arr,0,axis=1)
    return sorted_preds

def get_optimal_models(train_state, split, reverse=False ):
    """Naive Ensembling"""
    trgts= sort_preds(train_state[f'{split}_indexes'][-1],train_state[f'{split}_targets'][-1].reshape(-1,1))
    total_preds = len(train_state[f'{split}_indexes'])
    init = np.zeros(train_state[f'{split}_preds'][-1].shape)
    max_f1 = 0
    idxes = []
    rng = range(0,total_preds)
    if reverse:
        rng = reversed(rng)
    for i in rng:
        temp = sort_preds(train_state[f'{split}_indexes'][i],train_state[f'{split}_preds'][i])
        temp2 = init+temp
        f1 = f1_score(
            y_pred=temp2.argmax(axis=1),
            y_true= trgts, average ='weighted'
        )
        if f1 > max_f1:
            max_f1 = f1
            init = init+temp
            idxes.append(i)
    print(f'Taking preds from {idxes} | Dev f1:{f1}')
    return idxes

In [66]:
optimal_models= get_optimal_models(train_state,'val', reverse=True)


Taking preds from [7, 6, 5, 3, 2] | Dev f1:0.8969053566642802


In [60]:
!ls {args.directory}

_epoc_0_xlmroberta_attn_trac_hin_task_b.pt
_epoc_1_xlmroberta_attn_trac_hin_task_b.pt
_epoc_2_xlmroberta_attn_trac_hin_task_b.pt
_epoc_3_xlmroberta_attn_trac_hin_task_b.pt
_epoc_4_xlmroberta_attn_trac_hin_task_b.pt
_epoc_5_xlmroberta_attn_trac_hin_task_b.pt
_epoc_6_xlmroberta_attn_trac_hin_task_b.pt
_epoc_7_xlmroberta_attn_trac_hin_task_b.pt


In [64]:
all_models= [os.path.join(args.directory,i) for i in os.listdir(args.directory)]
all_models

['/home/kaushik.das/OffensEval2020/saved_models/Sun_15_Mar_2020/trac/task_b/xlmroberta_attn/_epoc_0_xlmroberta_attn_trac_hin_task_b.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Sun_15_Mar_2020/trac/task_b/xlmroberta_attn/_epoc_1_xlmroberta_attn_trac_hin_task_b.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Sun_15_Mar_2020/trac/task_b/xlmroberta_attn/_epoc_2_xlmroberta_attn_trac_hin_task_b.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Sun_15_Mar_2020/trac/task_b/xlmroberta_attn/_epoc_3_xlmroberta_attn_trac_hin_task_b.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Sun_15_Mar_2020/trac/task_b/xlmroberta_attn/_epoc_4_xlmroberta_attn_trac_hin_task_b.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Sun_15_Mar_2020/trac/task_b/xlmroberta_attn/_epoc_5_xlmroberta_attn_trac_hin_task_b.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Sun_15_Mar_2020/trac/task_b/xlmroberta_attn/_epoc_6_xlmroberta_attn_trac_hin_task_b.pt',
 '/home/kaushik.das/OffensEval2020/saved_

In [69]:
selected_models = [all_models[i] for i in optimal_models]
pprint.pprint(selected_models)

['/home/kaushik.das/OffensEval2020/saved_models/Sun_15_Mar_2020/trac/task_b/xlmroberta_attn/_epoc_7_xlmroberta_attn_trac_hin_task_b.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Sun_15_Mar_2020/trac/task_b/xlmroberta_attn/_epoc_6_xlmroberta_attn_trac_hin_task_b.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Sun_15_Mar_2020/trac/task_b/xlmroberta_attn/_epoc_5_xlmroberta_attn_trac_hin_task_b.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Sun_15_Mar_2020/trac/task_b/xlmroberta_attn/_epoc_3_xlmroberta_attn_trac_hin_task_b.pt',
 '/home/kaushik.das/OffensEval2020/saved_models/Sun_15_Mar_2020/trac/task_b/xlmroberta_attn/_epoc_2_xlmroberta_attn_trac_hin_task_b.pt']


## Loading test set


In [78]:
test_set_loc = '/home/kaushik.das/OffensEval2020/data/TRAC/test/trac2_hin_test.csv'

In [71]:
test_df = pd.read_csv(test_set_loc)

In [72]:
test_df['text'] = test_df['Text'].map(roberta_preproc.add_special_tokens)

In [73]:
test_df['split'] = 'test'  #dummy label
test_df['label'] = -1  #dummy label


In [74]:
test_df

Unnamed: 0,ID,Text,text,split,label
0,C52.17,ko,<s> ko </s>,test,-1
1,C52.39,ladkiyon video,<s> ladkiyon video </s>,test,-1
2,C52.73,ki video gahrep,<s> ki video gahrep </s>,test,-1
3,C60.3,o sadharon video bhai,<s> o sadharon video bhai </s>,test,-1
4,C60.43,ba bhai kyea bola tum moza aaa giea 😌😌😌😂😂😂,<s> ba bhai kyea bola tum moza aaa giea 😌😌😌😂😂😂...,test,-1
...,...,...,...,...,...
1195,C8.5029,aree bhenchod chup ho lodu aurt,<s> aree bhenchod chup ho lodu aurt </s>,test,-1
1196,C8.5037,abe saali bharwe itni gaand kyun fati hui he t...,<s> abe saali bharwe itni gaand kyun fati hui ...,test,-1
1197,C8.5046,chachi ji... usne ek mara lekin lerki ne jo is...,<s> chachi ji... usne ek mara lekin lerki ne j...,test,-1
1198,C8.5047,sun oye bhenkilodi itnaa maarunga saali tod du...,<s> sun oye bhenkilodi itnaa maarunga saali to...,test,-1


In [76]:
test_dataset = TracDataset(
    data_df = test_df,
    tokenizer = xlmroberta_tokenizer,
    max_seq_length = dataset._max_seq_length
)

In [77]:
test_dataset.set_split('test')


In [79]:
test_dataset._target_df.split.value_counts()

test    1200
Name: split, dtype: int64

In [82]:
test_state = general_utils.make_train_state() 
test_dataset.set_split('test')
eval_bar = notebook.tqdm(
    desc = 'split=train ',
    total=test_dataset.get_num_batches(args.batch_size),
    position=0,
    leave=True,
)
model.eval()
for m in notebook.tqdm(selected_models, total=len(selected_models)):
    eval_bar.reset(
        total=test_dataset.get_num_batches(args.batch_size),
    )
    model.load_state_dict(torch.load(m)['model'])
    batch_generator = generate_batches(
        dataset= test_dataset, batch_size= args.batch_size, shuffle=False,
        device = args.device, drop_last=False,
        pinned_memory = True, n_workers = 1, 
    )
    with torch.no_grad():
        for batch_index, batch_dict in enumerate(batch_generator):
            y_pred = model(
                input_ids = batch_dict['x_data'],
                attention_mask =  batch_dict['x_attn_mask'],
            )
            y_pred = y_pred.view(-1, len(set(dataset.data_df.label)))
            
            y_pred = y_pred.detach()
            
            batch_dict['y_target'] = batch_dict['y_target'].cpu()
            test_state['batch_preds'].append(y_pred.cpu())
            test_state['batch_targets'].append(batch_dict['y_target'].cpu())
            test_state['batch_indexes'].append(batch_dict['x_index'].cpu())
            eval_bar.update()

    test_state['val_preds'].append(
        torch.cat(test_state['batch_preds']).cpu()
    )
    test_state['val_targets'].append(
        torch.cat(test_state['batch_targets']).cpu()
    )
    test_state['val_indexes'].append(
        torch.cat(test_state['batch_indexes']).cpu()
    )
    
    test_state['batch_preds'] = []
    test_state['batch_targets'] = []
    test_state['batch_indexes'] = []


HBox(children=(FloatProgress(value=0.0, description='split=train ', max=37.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [86]:
assert len(test_state['val_preds']) == len(optimal_models)

In [88]:
[test_state['val_preds'][i].shape for i in range(len(optimal_models))]

[torch.Size([1200, 2]),
 torch.Size([1200, 2]),
 torch.Size([1200, 2]),
 torch.Size([1200, 2]),
 torch.Size([1200, 2])]

In [89]:
ensemble = torch.zeros_like(test_state['val_preds'][-1])
for i in test_state['val_preds']:
    ensemble += i

In [90]:
test_preds = torch.argmax(ensemble, dim=1).tolist()

In [91]:
collections.Counter(test_preds)

Counter({0: 664, 1: 536})

In [92]:
# task_b_label_dict = {'NGEN':0, 'GEN':1} #ref Reading TRAC2020 data... ipynb
int_to_label = {0:'NGEN', 1:'GEN'}
pred_labels = [int_to_label[i] for i in test_preds]
collections.Counter(pred_labels)

Counter({'NGEN': 664, 'GEN': 536})

In [93]:
pred_df = pd.DataFrame( data= {'id':test_df.ID, 'label':pred_labels})

In [94]:
pred_analysis_df = pd.DataFrame( data= {'id':test_df.ID, 'text':test_df.Text ,'label':pred_labels})

In [95]:
pred_df

Unnamed: 0,id,label
0,C52.17,NGEN
1,C52.39,NGEN
2,C52.73,NGEN
3,C60.3,NGEN
4,C60.43,NGEN
...,...,...
1195,C8.5029,GEN
1196,C8.5037,GEN
1197,C8.5046,GEN
1198,C8.5047,GEN


In [96]:
pred_analysis_df

Unnamed: 0,id,text,label
0,C52.17,ko,NGEN
1,C52.39,ladkiyon video,NGEN
2,C52.73,ki video gahrep,NGEN
3,C60.3,o sadharon video bhai,NGEN
4,C60.43,ba bhai kyea bola tum moza aaa giea 😌😌😌😂😂😂,NGEN
...,...,...,...
1195,C8.5029,aree bhenchod chup ho lodu aurt,GEN
1196,C8.5037,abe saali bharwe itni gaand kyun fati hui he t...,GEN
1197,C8.5046,chachi ji... usne ek mara lekin lerki ne jo is...,GEN
1198,C8.5047,sun oye bhenkilodi itnaa maarunga saali tod du...,GEN


#

In [98]:
!mkdir hindi_runs

In [100]:
pred_df.to_csv('./hindi_runs/hindi_task_b_preds.csv', index=False)

In [99]:
pred_analysis_df.to_csv('./hindi_runs/hindi_task_b_preds_analysis.csv', index=False)

In [1]:
!ls hindi_runs

hindi_task_b_preds.csv	hindi_task_b_preds_analysis.csv


In [103]:
!head  ./hindi_runs/hindi_task_a_preds.csv

id,label
C52.17,NGEN
C52.39,NGEN
C52.73,NGEN
C60.3,NGEN
C60.43,NGEN
C60.72,NGEN
C60.102,NGEN
C60.118,NGEN
C60.139,NGEN


In [121]:
with pd.option_context('display.max_colwidth', -1): 
    print(pred_analysis_df[['text','label']].sample(5))

                                                                                                                                                                                 text  \
157  bc first time kisi ko ladko k baare me sochta dekh rha hu, wrna sc st, girls\nchild aur girls se uper utha hi nhi kbhi kuch.  \nappreciate the work 👍                              
473  yeah dislikke vali vohi ldkia h jinka bhanda phoot gya😂                                                                                                                            
101  sabse jyda india me rape hota hai kyu?...kyunki sachai dabaai jati hai\n...sayad 100 mese 80 case me aap jante ho...fir bhi innocent aadmi ki izzat ki\ndachiya ud jati hai.....   
445  @silpisikha chetia it's not about some times dear wake up 76 % percent rape\ncase sirf badle k liye lagati hai larkiya                                                             
436  log dislike bhi krte hain ese contact waahhh                          