[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb)

# Import Libraries

At the time of our work, we used the following library versions:
- numpy 1.18.1
- pandas 1.0.1
- torch 1.2.0
- Cuda 10.0
- python 3.7.0
- sklearn 0.22.1
- tqdm 4.42.1
- nltk 3.4.5

In [0]:
!git clone https://github.com/cozek/trac2020_submission

In [0]:
!git clone https://github.com/huggingface/transformers
!pip install /content/transformers/

In [0]:
import sys
sys.path.append('/content/trac2020_submission/src/')
import collections
from typing import Callable
import numpy as np
np.random.seed(42)
import pandas as pd
from tqdm import notebook
import importlib
import pprint
import nltk
import datetime
import os
from argparse import Namespace
import re
from collections import Counter

In [0]:
import utils.general as general_utils
import utils.trac2020 as trac_utils
import utils.transformer.data as transformer_data_utils
import utils.transformer.general as transformer_general_utils
general_utils.set_seed_everywhere() #set the seed for reproducibility

In [0]:
import logging
logging.basicConfig(level=logging.INFO) 

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
torch.__version__

## Import Optimizer and XLM Models

In [0]:
# Import RAdam and Lookahead
from radam.radam import RAdam
from lookahead.optimizer import Lookahead

In [0]:
from transformers import XLMRobertaTokenizer, XLMRobertaModel

# Set up the argspace/important_variables
Please note that performance is susceptible to hyperparameters. We used an NVIDIA Tesla V100 32GB. If you lower the batch size or change any other parameters or modules to fit your machine, you might not get the same performance as reported in our paper.


In [0]:
args = Namespace(
        #use cuda by default
        device = 'cuda' if torch.cuda.is_available() else 'cpu',
    
        #set batch size and number of epochs
        batch_size = 32,
        num_epochs = 20,
    
        #set the learning rate
        learning_rate = 0.0001,

        #location of the train, dev and test csv
        train_csv = '/content/trac2020_submission/data/hin/trac2_hin_train.csv',
        dev_csv = '/content/trac2020_submission/data/hin/trac2_hin_dev.csv',
        test_csv = '/content/trac2020_submission/data/test/trac2_hin_test.csv',
    
        #directory to save our models at
        directory = './', 
        model_name = 'xlmrobeta_hin_b.pt',
)

## Load the data csv into DataFrames

In [0]:
raw_train_df =  pd.read_csv(args.train_csv)
raw_train_df['split'] = 'train'
print(raw_train_df.columns)
print(raw_train_df['Sub-task A'].value_counts())
print(raw_train_df['Sub-task B'].value_counts())
print(f"Size of 'train' split: {len(raw_train_df)}")

In [0]:
raw_dev_df =  pd.read_csv(args.dev_csv)
raw_dev_df['split'] = 'dev'
print(raw_dev_df.columns)
print(raw_dev_df['Sub-task A'].value_counts())
print(raw_dev_df['Sub-task B'].value_counts())
print(f"Size of 'dev' split: {len(raw_dev_df)}")

In [0]:
# Concatenate the dev and train DataFrames into a single DataFrame
data_df = pd.concat([raw_dev_df, raw_train_df], ignore_index= True)
data_df

### Samples given per label size

In [0]:
print(f'Total dev + train size = {len(data_df)}\n')
print(data_df['Sub-task A'].value_counts(),'\n')
print(data_df['Sub-task B'].value_counts(),'\n')

### Map the labels to integers

In [0]:
task_b_label_dict = {'NGEN':0, 'GEN':1}
print(task_b_label_dict)

### Renaming the columns for our torch dataset class

In [0]:
# Keep only the columns needed for Sub-task B and give them the names that the
# torch dataset class reads (`text` and `label`).
# NOTE: rename() is used instead of assigning into `columns.values[...]`;
# the latter mutates the Index's backing array in place, which is not a
# supported way to rename columns in pandas.
data_df_task_b = (
    data_df[['ID', 'Text', 'Sub-task B', 'split']]
    .rename(columns={'Text': 'text', 'Sub-task B': 'label'})
)
# Encode the string labels as integers using the mapping defined above.
data_df_task_b['label'] = data_df_task_b['label'].map(task_b_label_dict)
data_df_task_b

In [0]:
print("Num samples per class")
print(data_df_task_b.label.value_counts())

print("\nNum samples per split")
print(data_df_task_b.split.value_counts())

print("\nLabel counts in dev split")
print(data_df_task_b[data_df_task_b.split=='dev'].label.value_counts())

print("\nLabel counts in train split")
print(data_df_task_b[data_df_task_b.split=='train'].label.value_counts())

### We split long samples into multiple samples
Each chunk produced by splitting a sample keeps the label of the original sample.

In [0]:
#split long samples into smaller chunks; chunk_sent is called with 150 and 50 (see utils.trac2020.chunk_sent for what each argument means)
data_df_task_b['text'] = data_df_task_b['text'].map(lambda x: trac_utils.chunk_sent(x,150,50))
exploded_df = data_df_task_b.explode('text').reset_index()

#### Notice how a single sample is split into two samples in the exploded_df

In [0]:
data_df_task_b[data_df_task_b.ID=='C7.849']

In [0]:
exploded_df[exploded_df.ID == 'C7.849']

In [0]:
print("Samples before splitting")
print(data_df_task_b.split.value_counts())

print("\nSamples After splitting")
print(exploded_df.split.value_counts())

## Create the text preprocessor

In [0]:
class RobertaPreprocessor():
    """
    Inserts the transformer's special tokens into raw text samples.

    A sample is split into sentences with ``sentence_detector``; the
    tokenizer's separator token is placed between sentences and after the
    last one, and its beginning-of-sequence token is placed in front.
    NOTE: Doesn't work perfectly.
    """

    def __init__(self,transformer_tokenizer,sentence_detector):
        """
        Args:
            transformer_tokenizer: Tokenizer of the transformer model; its
                ``bos_token`` and ``sep_token`` attributes are read.
            sentence_detector: Object with a ``tokenize(text)`` method that
                returns the sentences found in ``text``.
        """
        self.sentence_detector = sentence_detector
        self.transformer_tokenizer = transformer_tokenizer
        self.bos_token = transformer_tokenizer.bos_token
        # Separator padded with spaces so it can be used directly as a joiner
        self.sep_token = ' ' + transformer_tokenizer.sep_token + ' '

    def add_special_tokens(self, text):
        """
        Surround a sample with the special tokens.

        The result has the form ``<bos> sent1 <sep> sent2 ... <sep>``.

        Args:
            text: Text sample to add special tokens into
        Returns:
            text with special tokens added
        """
        # Collapse every run of whitespace (including leading/trailing) to one space
        normalized = ' '.join(text.split())
        joined_sentences = self.sep_token.join(
            self.sentence_detector.tokenize(normalized)
        )
        return ' '.join(
            [self.bos_token, joined_sentences, self.transformer_tokenizer.sep_token]
        )

In [0]:
!python -c 'import nltk; nltk.download("punkt")'

In [0]:
xlmroberta_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
punkt_sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')

In [0]:
roberta_preproc = RobertaPreprocessor(xlmroberta_tokenizer, punkt_sentence_detector)

In [0]:
#apply the preprocessor on the exploded dataframe
exploded_df['text'] = exploded_df['text'].map(roberta_preproc.add_special_tokens)

In [0]:
exploded_df.loc[2].text #notice the addition of eos token

### Create the Vectorizer and the torch Dataset

In [0]:
class SimpleVectorizer():
    """Turns a text sample into token ids plus the matching attention mask."""

    def __init__(self,tokenizer: Callable, max_seq_len: int):
        """
        Args:
            tokenizer (Callable): transformer tokenizer; its ``encode_plus``
                method is used for encoding
            max_seq_len (int): Maximum sequence length passed to the tokenizer
        """
        self._max_seq_len = max_seq_len
        self.tokenizer = tokenizer

    def vectorize(self,text :str):
        """
        Args:
            text: Text sample to vectorize
        Returns:
            ids: int64 numpy array built from the tokenizer's ``input_ids``
            attn: int64 numpy array built from the tokenizer's ``attention_mask``
        """
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self._max_seq_len,
            pad_to_max_length=True,
            # special tokens were already inserted by the preprocessor
            add_special_tokens=False,
        )
        token_ids, attention = (
            np.array(encoding[key], dtype=np.int64)
            for key in ('input_ids', 'attention_mask')
        )
        return token_ids, attention

In [0]:
class TracDataset(Dataset):
    """Split-aware PyTorch dataset built on a dataframe of text samples.

    The dataframe is partitioned by its ``split`` column. ``set_split``
    selects the partition that ``__len__`` and ``__getitem__`` operate on;
    rows whose ``split`` value is ``'dev'`` are exposed under the name ``'val'``.
    """
    def __init__(
        self,
        data_df: pd.DataFrame,
        tokenizer: Callable,
        max_seq_length:int = None,
    ):
        """
        Args:
            data_df (pandas.DataFrame): df containing the labels and text
            tokenizer (Callable): tokenizer for the transformer
            max_seq_length (int): Maximum sequence length to work with. When
                None, the longest encoded sample in ``data_df`` is used.
        """
        self.data_df = data_df
        self.tokenizer = tokenizer

        self._max_seq_length = (
            self._get_max_len(data_df, tokenizer)
            if max_seq_length is None
            else max_seq_length
        )

        # One sub-frame (and its size) per value of the `split` column
        split_column = self.data_df.split
        self.train_df = self.data_df[split_column == 'train']
        self.train_size = len(self.train_df)
        self.val_df = self.data_df[split_column == 'dev']
        self.val_size = len(self.val_df)
        self.test_df = self.data_df[split_column == 'test']
        self.test_size = len(self.test_df)

        self._simple_vectorizer = SimpleVectorizer(tokenizer, self._max_seq_length)

        self._lookup_dict = {
            'train': (self.train_df, self.train_size),
            'val': (self.val_df, self.val_size),
            'test': (self.test_df, self.test_size),
        }

        # Start on the training partition
        self.set_split('train')

    def _get_max_len(self,data_df: pd.DataFrame, tokenizer: Callable):
        """Find the longest encoded sample in the data
        Args:
            data_df (pandas.DataFrame): The pandas dataframe with the data
            tokenizer (Callable): The tokenizer of the transformer (the
                instance's own ``self.tokenizer`` is the one actually used)
        Returns:
            max_len (int): Maximum number of ``input_ids`` over all samples
        """
        def encoded_length(sample):
            return len(self.tokenizer.encode_plus(sample)['input_ids'])

        return data_df.text.map(encoded_length).max()

    def set_split(self, split="train"):
        """Choose the active partition: one of 'train', 'val' or 'test'."""
        selected_df, selected_size = self._lookup_dict[split]
        self._target_split = split
        self._target_df = selected_df
        self._target_size = selected_size

    def __len__(self):
        """Number of rows in the active partition."""
        return self._target_size

    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets

        Args:
            index (int): positional index of the data point in the active split
        Returns:
            a dictionary holding the data point's token ids (x_data), attention
            mask (x_attn_mask), its index (x_index) and its label (y_target)
        """
        sample = self._target_df.iloc[index]
        token_ids, mask = self._simple_vectorizer.vectorize(sample.text)
        return {
            'x_data': token_ids,
            'x_attn_mask': mask,
            'x_index': index,
            'y_target': sample.label,
        }

    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of full batches in the active split

        Args:
            batch_size (int)
        Returns:
            number of full batches (a trailing partial batch is not counted)
        """
        full_batches, _ = divmod(len(self), batch_size)
        return full_batches

In [0]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=False, device="cpu", pinned_memory = False, n_workers = 0): 
    """
    Generator that wraps a PyTorch DataLoader and yields one dict per batch.

    The `x_data`, `x_attn_mask` and `y_target` tensors of every batch are
    moved to `device`; `x_index` is passed through as produced by the loader.
    Transfers are non-blocking only when `pinned_memory` is truthy.
    """
    non_blocking = bool(pinned_memory)
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        pin_memory=pinned_memory,
        num_workers=n_workers,
    )

    for batch in loader:
        yield {
            'x_data': batch['x_data'].to(device, non_blocking=non_blocking),
            'x_attn_mask': batch['x_attn_mask'].to(device, non_blocking=non_blocking),
            'x_index': batch['x_index'],
            'y_target': batch['y_target'].to(device, non_blocking=non_blocking),
        }

## Initialize the dataset

In [0]:
dataset = TracDataset(
    data_df = exploded_df,
    tokenizer = xlmroberta_tokenizer,
    max_seq_length = 403 #what we used
)

In [0]:
dataset._max_seq_length # make sure it's safe enough for our model, i.e., < 512

# Creating the XLMRoberta + Attention model

In [0]:
class XLMRoBertAttention(nn.Module):
    """Attention-head classifier on top of a pretrained XLM-RoBERTa encoder.

    Attention Head Implementation : https://www.aclweb.org/anthology/P16-2034/
    """

    def __init__(self, model_name, num_labels):
        """
        Args:
            model_name: name passed to ``XLMRobertaModel.from_pretrained``,
                eg, 'xlm-roberta-base'
            num_labels: number of classes to classify
        """
        super().__init__()
        # Both heads are sized for a 768-wide encoder output
        self.w = nn.Linear(768,1, bias=False)
        self.bert = XLMRobertaModel.from_pretrained(model_name)
        self.prediction_layer = nn.Linear(768, num_labels)
        self.init_weights()

    def penalized_tanh(self,x):
        """
        Penalized tanh activation: max(tanh(x), 0.25 * tanh(x)).

        http://aclweb.org/anthology/D18-1472
        """
        squashed = torch.tanh(x)
        return torch.max(squashed, 0.25 * squashed)

    def init_weights(self):
        """Re-initialise the classifier heads (not the encoder):
        Xavier-uniform for weights, zeros for biases."""
        for layer in (self.prediction_layer, self.w):
            for param_name, tensor in layer.named_parameters():
                if 'bias' in param_name:
                    nn.init.constant_(tensor, 0.0)
                elif 'weight' in param_name:
                    nn.init.xavier_uniform_(tensor)

    def forward(self, input_ids,attention_mask):
        """
        Args:
            input_ids: sent encoded into indices
            attention_mask: their respective attention masks
        Returns:
            preds: output of the prediction layer, with a singleton middle
                axis, i.e. shape (batch, 1, num_labels)
        """
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        hidden = encoder_outputs[0]  # final hidden layer outputs
        # One score per position, normalised over axis 1
        # (presumably the sequence axis -- confirm against the encoder's output layout)
        scores = self.w(self.penalized_tanh(hidden))
        weights = torch.softmax(scores, dim=1)
        # Weighted sum of the hidden states along axis 1
        pooled = torch.bmm(hidden.permute(0, 2, 1), weights)
        activated = self.penalized_tanh(pooled)
        return self.prediction_layer(activated.permute(0, 2, 1))
 

### Initializing the model

In [0]:
model = XLMRoBertAttention(
    model_name = 'xlm-roberta-base',
    num_labels = len(set(dataset.data_df.label)),
)
model.to(args.device) #send the model to the 'cpu' or 'gpu'

In [0]:
loss_func = nn.CrossEntropyLoss()
early_stopping = transformer_general_utils.EarlyStopping(patience=4)
base_optimizer = RAdam(model.parameters(), lr = args.learning_rate, weight_decay=1e-5)
optimizer = Lookahead(optimizer = base_optimizer, k = 6, alpha=0.5 )
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer.optimizer, factor =0.1 ,mode='max')

print(f'Using LR:{args.learning_rate}\n Early Stopping Patience: 4')

# Begin Training

In [0]:
train_state = general_utils.make_train_state() #dictionary for saving training routine information
train_state.keys()

In [0]:
!nvidia-smi

In [0]:
args.batch_size = 16 #based on your hardware. 1GB per batch.

In [0]:
# Training routine: for every epoch run one pass over the train split
# (with optimisation) and one pass over the val split (no gradients), record
# per-epoch predictions/targets/indexes and metrics in `train_state`, save a
# checkpoint, then step the LR scheduler and early stopping on the val F1.

# Number of output classes; computed once instead of on every batch.
num_labels = len(set(dataset.data_df.label))

epoch_bar = notebook.tqdm(
    desc = 'training_routine',
    total = args.num_epochs,
    position=0,
    leave = True,
)
dataset.set_split('train')
train_bar = notebook.tqdm(
    desc = 'split=train ',
    total=dataset.get_num_batches(args.batch_size),
    position=0,
    leave=True,
)
dataset.set_split('val')
eval_bar = notebook.tqdm(
    desc = 'split=eval',
    total=dataset.get_num_batches(args.batch_size),
    position=0,
    leave=True,
)

for epoch_index in range(args.num_epochs):
    train_state['epoch_in'] = epoch_index

    # ------------------------- training pass -------------------------
    dataset.set_split('train')
    batch_generator = generate_batches(
        dataset= dataset, batch_size= args.batch_size, shuffle=True,
        device = args.device, drop_last=False,
        pinned_memory = True, n_workers = 3, 
    )

    running_loss = 0.0
    running_acc = 0.0
    running_f1 = 0.0

    train_bar.reset(
        total=dataset.get_num_batches(args.batch_size),
    )
    model.train()
    for batch_index, batch_dict in enumerate(batch_generator):
        optimizer.zero_grad()
        
        y_pred = model(
            input_ids = batch_dict['x_data'],
            attention_mask =  batch_dict['x_attn_mask'],
        )
        # The model returns (batch, 1, num_labels); flatten to (batch, num_labels)
        y_pred = y_pred.view(-1, num_labels)
                             
        loss = loss_func(y_pred, batch_dict['y_target'])
    
        loss.backward()
        optimizer.step()
                             
        # Incremental (running) mean of the batch losses
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)
                             
        y_pred = y_pred.detach().cpu()
        batch_dict['y_target'] = batch_dict['y_target'].cpu()
        
        acc_t = transformer_general_utils \
            .compute_accuracy(y_pred, batch_dict['y_target'])

        f1_t = transformer_general_utils \
            .compute_macro_f1(y_pred, batch_dict['y_target'], average='weighted')

        train_state['batch_preds'].append(y_pred)
        train_state['batch_targets'].append(batch_dict['y_target'])
        train_state['batch_indexes'].append(batch_dict['x_index'])

        running_acc += (acc_t - running_acc) / (batch_index + 1)
        running_f1 += (f1_t - running_f1) / (batch_index + 1)

        train_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,
                             epoch=epoch_index)

        train_bar.update()

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    train_state['train_accuracies'].append(running_acc)
    train_state['train_losses'].append(running_loss)
    
    # Collapse the per-batch buffers into one tensor per epoch
    train_state['train_preds'].append(
        torch.cat(train_state['batch_preds']).cpu()
    )
    train_state['train_targets'].append(
        torch.cat(train_state['batch_targets']).cpu()
    )
    train_state['train_indexes'].append(
        torch.cat(train_state['batch_indexes']).cpu()
    )
    train_f1 = transformer_general_utils \
                .compute_macro_f1(train_state['train_preds'][-1],
                                  train_state['train_targets'][-1],
                                  'weighted'
                                 )
                                 
    train_state['train_f1s'].append(train_f1)
    
    train_state['batch_preds'] = []
    train_state['batch_targets'] = []
    train_state['batch_indexes'] = []
    
    
    # ------------------------ validation pass ------------------------
    dataset.set_split('val')
    # Batches are shuffled here too; the original order can be recovered
    # from the recorded `x_index` values.
    batch_generator = generate_batches(
        dataset= dataset, batch_size= args.batch_size, shuffle=True,
        device = args.device, drop_last=False,
        pinned_memory = False, n_workers = 2, 
    )
    eval_bar.reset(
        total=dataset.get_num_batches(args.batch_size),
    )
    running_loss = 0.0
    running_acc = 0.0
    running_f1 = 0.0
    
    model.eval()
    with torch.no_grad():
        # NOTE(review): presumably swaps Lookahead's cached weights into the
        # model for evaluation -- confirm against lookahead.optimizer.
        optimizer._backup_and_load_cache()
        for batch_index, batch_dict in enumerate(batch_generator):
            y_pred = model(
                input_ids = batch_dict['x_data'],
                attention_mask =  batch_dict['x_attn_mask'],
            )
            y_pred = y_pred.view(-1, num_labels)

            loss = loss_func(y_pred, batch_dict['y_target'])
            # Use THIS batch's loss; previously the stale `loss_t` left over
            # from the last training batch was averaged in here.
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            y_pred = y_pred.detach()
            
            acc_t = transformer_general_utils\
                .compute_accuracy(y_pred, batch_dict['y_target'])
            f1_t = transformer_general_utils \
                .compute_macro_f1(y_pred, batch_dict['y_target'],
                                 average='weighted')

            train_state['batch_preds'].append(y_pred.cpu())
            train_state['batch_targets'].append(batch_dict['y_target'])
            train_state['batch_indexes'].append(batch_dict['x_index'].cpu())

            running_acc += (acc_t - running_acc) / (batch_index + 1)
            running_f1 += (f1_t - running_f1) / (batch_index + 1)
            

            eval_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,
                                 epoch=epoch_index)
            eval_bar.update()
            
    train_state['val_accuracies'].append(running_acc)
    train_state['val_losses'].append(running_loss)
    
        
    train_state['val_preds'].append(
        torch.cat(train_state['batch_preds']).cpu()
    )

    train_state['val_targets'].append(
        torch.cat(train_state['batch_targets']).cpu()
    )
    train_state['val_indexes'].append(
        torch.cat(train_state['batch_indexes']).cpu()
    )
    val_f1 = transformer_general_utils \
                .compute_macro_f1(train_state['val_preds'][-1],
                                  train_state['val_targets'][-1],
                                  average='weighted',
                                 )
                                 
    train_state['val_f1s'].append(val_f1)
    
    train_state['batch_preds'] = []
    train_state['batch_targets'] = []
    train_state['batch_indexes'] = []
    
    # One checkpoint per epoch, written while the evaluation weights are
    # still loaded (i.e. before `_clear_and_load_backup` below).
    torch.save(
        {
            'model':model.state_dict(),
        },
        args.directory + f'_epoc_{epoch_index}_' + args.model_name,
    )
    
    scheduler.step(val_f1)
    early_stopping(val_f1, model)
    # NOTE(review): presumably restores the weights backed up before
    # evaluation -- confirm against lookahead.optimizer.
    optimizer._clear_and_load_backup()

    # Advance the epoch bar exactly once per epoch (it used to be updated
    # twice, so it ran ahead of the real epoch count).
    epoch_bar.set_postfix( best_f1 = early_stopping.best_score, current = val_f1)
    epoch_bar.update()    
    
    if early_stopping.early_stop:
        print("Early stopping")
        break

In [0]:
print(train_state['train_f1s'])

In [0]:
print(train_state['val_f1s'])

In [0]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [0]:
best_run_index = train_state['val_f1s'].index(max(train_state['val_f1s']))
print(f'Best run at epoch {best_run_index}')
print('Train:',classification_report(
    y_pred=(torch.argmax(train_state['train_preds'][best_run_index],dim=1) ).cpu().long().numpy(),
    y_true= train_state['train_targets'][best_run_index].cpu().numpy(), 
    digits=4)
)
print('Dev:',classification_report(
    y_pred=(torch.argmax(train_state['val_preds'][best_run_index],dim=1) ).cpu().long().numpy(),
    y_true= train_state['val_targets'][best_run_index].cpu().numpy(), 
    digits=4)
)

## Check if ensembling helps and pick models to use on test set

In [0]:
def sort_preds(indexes, preds):
    """Return the rows of `preds` ordered by ascending `indexes`.

    Undoes the shuffling done by the dataloader: `indexes[i]` is the dataset
    position of row `i` of `preds`. Both arguments are torch tensors; the
    result is a numpy array.
    """
    index_column = indexes.cpu().numpy().reshape(-1, 1)
    pred_rows = preds.cpu().numpy()
    # Put the index in column 0 so each row carries its own sort key
    table = np.concatenate((index_column, pred_rows), axis=1)
    order = table[:, 0].argsort()
    # Drop the key column again after reordering
    return table[order][:, 1:].copy()

def get_optimal_models(train_state, split, reverse=False ):
    """Naive greedy ensembling over the per-epoch outputs of one split.

    Epochs are visited in order (or in reverse order). An epoch's outputs are
    added to the running ensemble sum only if doing so strictly improves the
    weighted F1 of the ensemble's argmax predictions.

    Targets are taken from the LAST recorded epoch, so every epoch is assumed
    to cover the same samples of the split.

    Args:
        train_state: dict holding `{split}_indexes`, `{split}_preds` and
            `{split}_targets`, each a list with one tensor per epoch
        split: prefix of the keys to read, e.g. 'val'
        reverse: visit the epochs from last to first when True
    Returns:
        (idxes, max_f1): the selected epoch indexes, in visiting order, and
        the weighted F1 of the resulting ensemble (0 if nothing was selected)
    """
    # Targets restored to dataset order, flattened to the 1-D shape f1_score expects
    targets = sort_preds(
        train_state[f'{split}_indexes'][-1],
        train_state[f'{split}_targets'][-1].reshape(-1, 1),
    ).ravel()

    epoch_order = range(len(train_state[f'{split}_indexes']))
    if reverse:
        epoch_order = reversed(epoch_order)

    ensemble_sum = np.zeros(train_state[f'{split}_preds'][-1].shape)
    max_f1 = 0
    idxes = []
    for i in epoch_order:
        epoch_preds = sort_preds(
            train_state[f'{split}_indexes'][i],
            train_state[f'{split}_preds'][i],
        )
        candidate = ensemble_sum + epoch_preds
        f1 = f1_score(
            y_pred=candidate.argmax(axis=1),
            y_true=targets, average ='weighted'
        )
        if f1 > max_f1:
            max_f1 = f1
            ensemble_sum = candidate
            idxes.append(i)
    # Report the F1 of the selected ensemble, not of the last candidate tried
    print(f'Taking preds from {idxes} | Dev f1:{max_f1}')
    return (idxes,max_f1)

In [0]:
train_state['val_f1s']

In [0]:
best_model_f1_score = f1_score(
    y_pred=(torch.argmax(train_state['val_preds'][best_run_index],dim=1) ).cpu().long().numpy(),
    y_true= train_state['val_targets'][best_run_index].cpu().numpy(), 
    average='weighted'
)
_models= [get_optimal_models(train_state,'val', reverse=False),
                 get_optimal_models(train_state,'val', reverse=True),
                 ([best_run_index],best_model_f1_score),]
optimal_models = max(_models, key=lambda x:x[1]) #select ensembles or best model 
print(f'Optimal models chosen: {optimal_models}')

In [0]:
!ls {args.directory}

In [0]:
# Collect the checkpoints written by the training loop and order them by epoch.
# The epoch number is parsed from the `_epoc_<n>_` part of the file name, so
# multi-digit epochs (10, 11, ...) sort correctly and the result does not
# depend on the length of `args.directory`. Reading the single character at a
# fixed position would place epoch 10 next to epoch 1.
_epoch_pattern = re.compile(r'_epoc_(\d+)_')
all_models = [
    os.path.join(args.directory, file_name)
    for file_name in os.listdir(args.directory)
    if args.model_name in file_name and _epoch_pattern.search(file_name)
]
all_models = sorted(
    all_models,
    key=lambda path: int(_epoch_pattern.search(os.path.basename(path)).group(1)),
)  # sort by epoch num.
all_models

In [0]:
selected_models = [all_models[i] for i in optimal_models[0]]
pprint.pprint(selected_models)

## Loading test set


In [0]:
test_set_loc = '/content/trac2020_submission/data/test/trac2_hin_test.csv'

In [0]:
test_df = pd.read_csv(test_set_loc)

In [0]:
test_df['text'] = test_df['Text'].map(roberta_preproc.add_special_tokens)

In [0]:
test_df['split'] = 'test'  #dummy label
test_df['label'] = -1  #dummy label


In [0]:
test_df

In [0]:
test_dataset = TracDataset(
    data_df = test_df,
    tokenizer = xlmroberta_tokenizer,
    max_seq_length = dataset._max_seq_length
)

In [0]:
test_dataset.set_split('test')


In [0]:
test_dataset._target_df.split.value_counts()

In [0]:
# Run every selected checkpoint over the test split and store each
# checkpoint's raw outputs. Results go under the 'val_*' keys of `test_state`
# because the state dict is created by the same helper as in training.
test_state = general_utils.make_train_state() 
test_dataset.set_split('test')

# Number of output classes; computed once instead of on every batch.
num_labels = len(set(dataset.data_df.label))

eval_bar = notebook.tqdm(
    desc = 'split=test',  # was mislabelled 'split=train '
    total=test_dataset.get_num_batches(args.batch_size),
    position=0,
    leave=True,
)
model.eval()
for m in notebook.tqdm(selected_models, total=len(selected_models)):
    eval_bar.reset(
        total=test_dataset.get_num_batches(args.batch_size),
    )
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine
    model.load_state_dict(torch.load(m, map_location=args.device)['model'])
    # shuffle=False keeps predictions in the same order as the test dataframe
    batch_generator = generate_batches(
        dataset= test_dataset, batch_size= args.batch_size, shuffle=False,
        device = args.device, drop_last=False,
        pinned_memory = True, n_workers = 1, 
    )
    with torch.no_grad():
        for batch_index, batch_dict in enumerate(batch_generator):
            y_pred = model(
                input_ids = batch_dict['x_data'],
                attention_mask =  batch_dict['x_attn_mask'],
            )
            y_pred = y_pred.view(-1, num_labels)

            test_state['batch_preds'].append(y_pred.detach().cpu())
            test_state['batch_targets'].append(batch_dict['y_target'].cpu())
            test_state['batch_indexes'].append(batch_dict['x_index'].cpu())
            eval_bar.update()

    # Collapse the per-batch buffers into one tensor per checkpoint
    test_state['val_preds'].append(
        torch.cat(test_state['batch_preds']).cpu()
    )
    test_state['val_targets'].append(
        torch.cat(test_state['batch_targets']).cpu()
    )
    test_state['val_indexes'].append(
        torch.cat(test_state['batch_indexes']).cpu()
    )
    
    test_state['batch_preds'] = []
    test_state['batch_targets'] = []
    test_state['batch_indexes'] = []


In [0]:
assert len(test_state['val_preds']) == len(optimal_models[0])

### Add the last layer outputs and apply argmax 

In [0]:
ensemble = torch.zeros_like(test_state['val_preds'][-1])
for i in test_state['val_preds']:
    ensemble += i

In [0]:
test_preds = torch.argmax(ensemble, dim=1).tolist()

In [0]:
collections.Counter(test_preds)

In [0]:
# task_b_label_dict = {'NGEN':0, 'GEN':1} #ref Reading TRAC2020 data... ipynb
int_to_label = {0:'NGEN', 1:'GEN'}
pred_labels = [int_to_label[i] for i in test_preds]
collections.Counter(pred_labels)

In [0]:
pred_df = pd.DataFrame( data= {'id':test_df.ID, 'label':pred_labels})

In [0]:
pred_analysis_df = pd.DataFrame( data= {'id':test_df.ID, 'text':test_df.Text ,'label':pred_labels})

In [0]:
pred_df

In [0]:
pred_analysis_df