[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cozek/OffensEval2020-code/blob/master/notebooks/Eng%20Task%20A%20-%20Ensemble%20DistilGPT2.ipynb)

# Import Libraries

At the time of our work, we used the following library versions
- numpy 1.18.1
- pandas 1.0.1
- torch 1.2.0
- Cuda 10.0
- python 3.7.0
- sklearn 0.22.1
- tqdm 4.42.1
- nltk 3.4.5

In [1]:
!git clone https://github.com/cozek/OffensEval2020-code/

Cloning into 'OffensEval2020-code'...
remote: Enumerating objects: 52, done.[K
remote: Counting objects:   1% (1/52)[Kremote: Counting objects:   3% (2/52)[Kremote: Counting objects:   5% (3/52)[Kremote: Counting objects:   7% (4/52)[Kremote: Counting objects:   9% (5/52)[Kremote: Counting objects:  11% (6/52)[Kremote: Counting objects:  13% (7/52)[Kremote: Counting objects:  15% (8/52)[Kremote: Counting objects:  17% (9/52)[Kremote: Counting objects:  19% (10/52)[Kremote: Counting objects:  21% (11/52)[Kremote: Counting objects:  23% (12/52)[Kremote: Counting objects:  25% (13/52)[Kremote: Counting objects:  26% (14/52)[Kremote: Counting objects:  28% (15/52)[Kremote: Counting objects:  30% (16/52)[Kremote: Counting objects:  32% (17/52)[Kremote: Counting objects:  34% (18/52)[Kremote: Counting objects:  36% (19/52)[Kremote: Counting objects:  38% (20/52)[Kremote: Counting objects:  40% (21/52)[Kremote: Counting objects:  42% (22/52)[Kremot

In [2]:
!git clone https://github.com/huggingface/transformers
!pip install /content/transformers/

Cloning into 'transformers'...
remote: Enumerating objects: 39, done.[K
remote: Counting objects:   2% (1/39)[Kremote: Counting objects:   5% (2/39)[Kremote: Counting objects:   7% (3/39)[Kremote: Counting objects:  10% (4/39)[Kremote: Counting objects:  12% (5/39)[Kremote: Counting objects:  15% (6/39)[Kremote: Counting objects:  17% (7/39)[Kremote: Counting objects:  20% (8/39)[Kremote: Counting objects:  23% (9/39)[Kremote: Counting objects:  25% (10/39)[Kremote: Counting objects:  28% (11/39)[Kremote: Counting objects:  30% (12/39)[Kremote: Counting objects:  33% (13/39)[Kremote: Counting objects:  35% (14/39)[Kremote: Counting objects:  38% (15/39)[Kremote: Counting objects:  41% (16/39)[Kremote: Counting objects:  43% (17/39)[Kremote: Counting objects:  46% (18/39)[Kremote: Counting objects:  48% (19/39)[Kremote: Counting objects:  51% (20/39)[Kremote: Counting objects:  53% (21/39)[Kremote: Counting objects:  56% (22/39)[Kremote: Coun

In [0]:
import sys
sys.path.append('/content/OffensEval2020-code/src/')
import collections
from typing import Callable
import numpy as np
np.random.seed(42)
import pandas as pd
from tqdm import notebook
import importlib
import pprint
import nltk
import datetime
import os
from argparse import Namespace
import re
from collections import Counter

In [4]:
import utils.general as general_utils
import utils.trac2020 as trac_utils
import utils.transformer.data as transformer_data_utils
import utils.transformer.general as transformer_general_utils
general_utils.set_seed_everywhere() #set the seed for reproducibility

  import pandas.util.testing as tm


In [0]:
import logging
logging.basicConfig(level=logging.INFO) 

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
torch.__version__

'1.5.0+cu101'

## Import Optimizer and GPT-2 Models

In [0]:
# Import RAdam and Lookahead
from radam.radam import RAdam
from lookahead.optimizer import Lookahead

In [8]:
from transformers import GPT2Tokenizer, GPT2Model

INFO:transformers.file_utils:PyTorch version 1.5.0+cu101 available.
INFO:transformers.file_utils:TensorFlow version 2.2.0-rc3 available.


# Set up the argspace/important_variables
Please note that performance is susceptible to hyperparameters. We used an Nvidia Tesla V100 32GB. If you lower the batch size, or change any other parameters or modules to fit your machine, you might not get the same performance as reported in our paper.


In [0]:
# Single place for every tunable setting used by the cells below.
args = Namespace(
    # Run on the GPU when one is visible, otherwise fall back to the CPU.
    device=('cuda' if torch.cuda.is_available() else 'cpu'),

    # Mini-batch size and the maximum number of passes over the training split.
    batch_size=32,
    num_epochs=20,

    # Step size handed to the optimizer.
    learning_rate=0.0001,

    # CSV holding the train/val/test splits, and the TSV with the test tweets.
    train_val_csv='/content/OffensEval2020-code/data/eng/task_a_tiny.csv',
    test_csv='/content/OffensEval2020-code/data/test_data/test_a_tweets.tsv',

    # Checkpoints are written to `directory`, with `model_name` as the file-name suffix.
    directory='./',
    model_name='distilGPT2.pt',
)

## Loading a presplit subset of the full dataset into a DataFrame
Here, 0 : NOT, 1 : HOF

In [10]:
data_df =  pd.read_csv(args.train_val_csv)
data_df

Unnamed: 0,id,text,average,std,label,split
0,1159552003770650625,@USER dancing is a talent show it offffff like...,0.321070,0.235700,0,train
1,1159542583410024449,Cooking Tweeps! I am desirous of a new ridged...,0.153384,0.168776,0,train
2,1159576849976107009,@USER I can respect that I’ll see I kind of li...,0.161124,0.200444,0,train
3,1159661814160011264,That song used to bring tears to my eyes in 5t...,0.316731,0.168065,0,train
4,1159620953686495233,Nah. Ranked. 1. Abortion 2. Guns 3. Religiou...,0.428485,0.064754,0,train
...,...,...,...,...,...,...
90885,1159536832469774340,This bitch wants people to die in a Mass Murder!,0.860484,0.172114,1,test
90886,1159571431115698176,rly feel like the ugliest girl in the world im...,0.593529,0.200248,1,test
90887,1159547322545725440,@USER I have the ss and was defending her cuz ...,0.790050,0.136956,1,test
90888,1159662323260362752,I THOUGHT SHE WAS PREGNANT BUT THAT WAS HER AS...,0.696626,0.235508,1,test


In [11]:
#0 = NOT, 1 = HOF
data_df.label.value_counts()

0    76402
1    14488
Name: label, dtype: int64

In [12]:
# Print a few random tweets without truncating the text column.
# `None` means "no column-width limit"; the older `-1` spelling is deprecated
# as of pandas 1.0 and emits a FutureWarning.
with pd.option_context('display.max_colwidth', None):
    print(data_df[['text','label']].sample(5))

                                                                                                                           text  label
31725  @USER BRO I WENT TODAY AND LITERALLY RAN INTO HOT TOPIC TO FIND UR SHIRT ANd i LITERALLY ALMOST CRIED I WANTED IT SO BAD  0    
65508  Always I’ve existed in a state of love. I know of no other way. It’d be an alien tongue, even if I tried to claim it.     0    
42246  @USER @USER Thank you and Nancy but Trump doesn't read and will do nothing.                                               0    
14712  Blink 182 at the Forum ☺️                                                                                                 0    
90625  @USER Why lower the flag to begin with if he's such a racist? Utter stupidity                                             1    


In [13]:
data_df.split.value_counts()

train    63622
test     13635
val      13633
Name: split, dtype: int64

In [14]:
len(data_df)

90890

## Create the text preprocessor

In [0]:
class GPT2Preprocessor:
    """Marks sentence boundaries in raw text with the tokenizer's EOS token.

    Args:
        transformer_tokenizer: object exposing an ``eos_token`` string attribute.
        sentence_detector: object exposing ``tokenize(text)`` that returns the
            sentences of ``text`` as a list of strings.
    """

    def __init__(self, transformer_tokenizer, sentence_detector):
        self.sentence_detector = sentence_detector
        self.transformer_tokenizer = transformer_tokenizer

    def add_eos_tokens(self, text):
        """Return ``text`` with an EOS token after every detected sentence.

        Sentences are joined by ``" <eos> "`` and a final ``" <eos>"`` is
        appended, so the result always ends with the EOS token.
        """
        suffix = " " + self.transformer_tokenizer.eos_token
        separator = suffix + " "
        pieces = self.sentence_detector.tokenize(text)
        return separator.join(pieces) + suffix



In [16]:
!python -c 'import nltk; nltk.download("punkt")'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [17]:
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
punkt_sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')

INFO:filelock:Lock 140245049182248 acquired on /root/.cache/torch/transformers/71cc2431cf0b5bbe7a23601a808ed322c90251c8261b46f04970140a3c2c1cb4.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71.lock
INFO:transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmpaz5osfz5


HBox(children=(IntProgress(value=0, description='Downloading', max=1042301, style=ProgressStyle(description_wi…

INFO:transformers.file_utils:storing https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json in cache at /root/.cache/torch/transformers/71cc2431cf0b5bbe7a23601a808ed322c90251c8261b46f04970140a3c2c1cb4.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
INFO:transformers.file_utils:creating metadata file for /root/.cache/torch/transformers/71cc2431cf0b5bbe7a23601a808ed322c90251c8261b46f04970140a3c2c1cb4.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
INFO:filelock:Lock 140245049182248 released on /root/.cache/torch/transformers/71cc2431cf0b5bbe7a23601a808ed322c90251c8261b46f04970140a3c2c1cb4.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71.lock





INFO:filelock:Lock 140245049182248 acquired on /root/.cache/torch/transformers/4faf7afb02a1ea7d2944e9ba7a175c7b8de4957cdbae75cd5ddffc7c7643ebbc.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda.lock
INFO:transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmp_onaooj3


HBox(children=(IntProgress(value=0, description='Downloading', max=456318, style=ProgressStyle(description_wid…

INFO:transformers.file_utils:storing https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt in cache at /root/.cache/torch/transformers/4faf7afb02a1ea7d2944e9ba7a175c7b8de4957cdbae75cd5ddffc7c7643ebbc.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
INFO:transformers.file_utils:creating metadata file for /root/.cache/torch/transformers/4faf7afb02a1ea7d2944e9ba7a175c7b8de4957cdbae75cd5ddffc7c7643ebbc.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
INFO:filelock:Lock 140245049182248 released on /root/.cache/torch/transformers/4faf7afb02a1ea7d2944e9ba7a175c7b8de4957cdbae75cd5ddffc7c7643ebbc.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda.lock
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json from cache at /root/.cache/torch/transformers/71cc2431cf0b5bbe7a23601a808ed322c90251c8261b46f04970140a3c2c1cb4.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61




In [0]:
gpt2_preproc = GPT2Preprocessor(gpt2_tokenizer, punkt_sentence_detector)

In [20]:
data_df['text']

0        @USER dancing is a talent show it offffff like...
1        Cooking Tweeps!  I am desirous of a new ridged...
2        @USER I can respect that I’ll see I kind of li...
3        That song used to bring tears to my eyes in 5t...
4        Nah.  Ranked. 1. Abortion 2. Guns 3.  Religiou...
                               ...                        
90885     This bitch wants people to die in a Mass Murder!
90886    rly feel like the ugliest girl in the world im...
90887    @USER I have the ss and was defending her cuz ...
90888    I THOUGHT SHE WAS PREGNANT BUT THAT WAS HER AS...
90889    @USER Who the hell are you?Have you forgotten ...
Name: text, Length: 90890, dtype: object

In [0]:
#add the special tokens
data_df["text"] = data_df["text"].map(gpt2_preproc.add_eos_tokens)

### Create the torch Dataset

In [0]:
dataset = transformer_data_utils.HateDataset(
        data_df=data_df, tokenizer=gpt2_tokenizer
    )
assert dataset._max_seq_length <= 512

In [0]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=False, device="cpu", pinned_memory = False, n_workers = 0):
    """
    Generator that wraps a PyTorch DataLoader and places each batch's
    tensors on the right device.

    Args:
        dataset: map-style dataset whose items are dicts with the keys
            'x_data', 'x_attn_mask', 'x_index' and 'y_target'.
        batch_size: number of items per batch.
        shuffle: whether the DataLoader shuffles the items.
        drop_last: whether a trailing incomplete batch is dropped.
        device: target device for 'x_data', 'x_attn_mask' and 'y_target'.
        pinned_memory: forwarded to the DataLoader as ``pin_memory``; when
            truthy the device copies are issued with ``non_blocking=True``.
        n_workers: number of DataLoader worker processes.

    Yields:
        dict with the same four keys; 'x_index' is passed through as collated
        by the DataLoader (it is not moved to ``device``).
    """
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        pin_memory=pinned_memory,
        num_workers=n_workers,
    )

    for batch in loader:
        # Asynchronous host-to-device copies only make sense from pinned memory.
        async_copy = bool(pinned_memory)
        yield {
            'x_data': batch['x_data'].to(device, non_blocking=async_copy),
            'x_attn_mask': batch['x_attn_mask'].to(device, non_blocking=async_copy),
            'x_index': batch['x_index'],
            'y_target': batch['y_target'].to(device, non_blocking=async_copy),
        }

# Creating the Classifier 

In [0]:
class SimpleGPT2SequenceClassifier(nn.Module):
    """GPT-2 encoder followed by one linear layer over the flattened hidden states.

    The last hidden state of every position is flattened into a single vector
    per example, so inputs must always have the same sequence length and
    ``hidden_size`` must equal ``sequence_length * embedding_width``.

    Args:
        hidden_size: input width of the linear layer (flattened hidden states).
        num_classes: number of output logits.
        max_seq_len: sequence length the inputs are padded to; stored for
            reference only, it is not used to build any layer.
        gpt_model_name: name or path given to ``GPT2Model.from_pretrained``.
    """

    def __init__(
        self,
        hidden_size: int,
        num_classes:int ,
        max_seq_len:int,
        gpt_model_name:str,
    ):
        super(SimpleGPT2SequenceClassifier,self).__init__()
        self.max_seq_len = max_seq_len
        self.gpt2model = GPT2Model.from_pretrained(
            gpt_model_name
        )
        self.fc1 = nn.Linear(hidden_size, num_classes)

    def forward(self, x_in=None, attention_mask=None, input_ids=None):
        """Compute class logits for a batch of token ids.

        Args:
            x_in: token-id tensor (positional form, kept for existing callers).
            attention_mask: optional mask forwarded to GPT-2 so padded
                positions are not attended to.
            input_ids: keyword alias for ``x_in``; this is the name the
                training and inference cells use when calling the model.

        Returns:
            Tensor of shape (batch_size, num_classes) holding unnormalised logits.

        Raises:
            ValueError: if neither ``x_in`` nor ``input_ids`` is given.
        """
        if x_in is None:
            x_in = input_ids
        if x_in is None:
            raise ValueError("forward() needs token ids via `x_in` or `input_ids`")

        # The encoder returns a tuple; element 0 is the last hidden state.
        gpt_out = self.gpt2model(input_ids=x_in, attention_mask=attention_mask)[0]
        batch_size = gpt_out.shape[0]
        # Flatten all positions into one vector per example before the classifier.
        prediction_vector = self.fc1(gpt_out.reshape(batch_size, -1))  # (batch_size, num_classes)

        return prediction_vector


In [32]:
print("Loading Pretrained distilgpt2...")
num_classes = len(set(data_df.label))
# The classifier flattens every position's hidden state, so the linear layer's
# input width is (sequence length) * (embedding width).
# NOTE(review): 768 is assumed to be distilgpt2's embedding width — confirm
# against the loaded model's config if the checkpoint is ever changed.
hidden_size = dataset._max_seq_length * 768
model = SimpleGPT2SequenceClassifier(
    hidden_size=hidden_size,
    num_classes=num_classes,
    gpt_model_name="distilgpt2",
    max_seq_len=dataset._max_seq_length,
)
# generate_batches moves every batch to args.device, so the model has to live
# on the same device. It is moved here, before any optimizer is built from
# model.parameters().
model = model.to(args.device)
print("Finished")

Loading Pretrained distilgpt2...


INFO:filelock:Lock 140241796885528 acquired on /root/.cache/torch/transformers/eb0f77b3f095880586731f57e2fe19060d71d1036ef8daf727bd97a17fb66a43.a8b35e282ef6b386ae29500ff942def3dd5a8bf71de78a3d36221d6d90031bb5.lock
INFO:transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmp5u41klsm


HBox(children=(IntProgress(value=0, description='Downloading', max=651, style=ProgressStyle(description_width=…

INFO:transformers.file_utils:storing https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json in cache at /root/.cache/torch/transformers/eb0f77b3f095880586731f57e2fe19060d71d1036ef8daf727bd97a17fb66a43.a8b35e282ef6b386ae29500ff942def3dd5a8bf71de78a3d36221d6d90031bb5
INFO:transformers.file_utils:creating metadata file for /root/.cache/torch/transformers/eb0f77b3f095880586731f57e2fe19060d71d1036ef8daf727bd97a17fb66a43.a8b35e282ef6b386ae29500ff942def3dd5a8bf71de78a3d36221d6d90031bb5
INFO:filelock:Lock 140241796885528 released on /root/.cache/torch/transformers/eb0f77b3f095880586731f57e2fe19060d71d1036ef8daf727bd97a17fb66a43.a8b35e282ef6b386ae29500ff942def3dd5a8bf71de78a3d36221d6d90031bb5.lock
INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json from cache at /root/.cache/torch/transformers/eb0f77b3f095880586731f57e2fe19060d71d1036ef8daf727bd97a17fb66a43.a8b35e282ef6b386ae29500ff942def




INFO:filelock:Lock 140241796885136 acquired on /root/.cache/torch/transformers/cd250f30004d0dee11ff1af311bd3facb6f38739fd870b9c8aa9321333a550be.ffe4c53a2a410b15148cf4170cc408d2d2f98adeecdde146ef8e71843039ff3c.lock
INFO:transformers.file_utils:https://cdn.huggingface.co/distilgpt2-pytorch_model.bin not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmp0g9a_e4a


HBox(children=(IntProgress(value=0, description='Downloading', max=352833716, style=ProgressStyle(description_…

INFO:transformers.file_utils:storing https://cdn.huggingface.co/distilgpt2-pytorch_model.bin in cache at /root/.cache/torch/transformers/cd250f30004d0dee11ff1af311bd3facb6f38739fd870b9c8aa9321333a550be.ffe4c53a2a410b15148cf4170cc408d2d2f98adeecdde146ef8e71843039ff3c
INFO:transformers.file_utils:creating metadata file for /root/.cache/torch/transformers/cd250f30004d0dee11ff1af311bd3facb6f38739fd870b9c8aa9321333a550be.ffe4c53a2a410b15148cf4170cc408d2d2f98adeecdde146ef8e71843039ff3c
INFO:filelock:Lock 140241796885136 released on /root/.cache/torch/transformers/cd250f30004d0dee11ff1af311bd3facb6f38739fd870b9c8aa9321333a550be.ffe4c53a2a410b15148cf4170cc408d2d2f98adeecdde146ef8e71843039ff3c.lock
INFO:transformers.modeling_utils:loading weights file https://cdn.huggingface.co/distilgpt2-pytorch_model.bin from cache at /root/.cache/torch/transformers/cd250f30004d0dee11ff1af311bd3facb6f38739fd870b9c8aa9321333a550be.ffe4c53a2a410b15148cf4170cc408d2d2f98adeecdde146ef8e71843039ff3c





INFO:transformers.modeling_utils:Weights of GPT2Model not initialized from pretrained model: ['transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.4.attn.masked_bias', 'transformer.h.5.attn.masked_bias']


Finished


# Begin Training

In [33]:
loss_func = nn.CrossEntropyLoss()
base_optimizer = RAdam(model.parameters(), lr=args.learning_rate)
optimizer = Lookahead(optimizer=base_optimizer, k=5, alpha=0.5)
print(f"Using LR:{args.learning_rate}")

Using LR:0.0001


In [0]:
train_state = general_utils.make_train_state()
train_state["ckpt"] = 0
train_state['max_seq_len'] = dataset._max_seq_length

In [35]:
!nvidia-smi

Mon May  4 04:47:56 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   27C    P8     6W /  75W |     10MiB /  7611MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
args.batch_size = 16 #based on your hardware. 1GB per batch.

In [0]:
# Train / validate for up to args.num_epochs epochs, saving one checkpoint per
# epoch and recording per-epoch predictions, targets and indexes in train_state.


class _EarlyStopping:
    """Signals a stop once the monitored score (higher is better) stops improving.

    Attributes:
        best_score: best score seen so far, or None before the first call.
        early_stop: becomes True after `patience` consecutive calls without
            an improvement.
    """

    def __init__(self, patience):
        self.patience = patience
        self.best_score = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, score, model=None):
        # `model` is accepted only so the call site can pass it; checkpoints
        # are written by the epoch loop itself.
        if self.best_score is None or score > self.best_score:
            self.best_score = score
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True


# NOTE(review): the notebook never defined `scheduler` or `early_stopping`;
# the patience values below are placeholders — confirm against the settings
# used for the reported results.
EARLY_STOPPING_PATIENCE = 5
LR_PATIENCE = 2
# The scheduler is attached to the inner RAdam optimizer.
# NOTE(review): presumably the Lookahead wrapper shares its param_groups with
# `base_optimizer`, so LR changes reach it — verify.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    base_optimizer, mode='max', patience=LR_PATIENCE,
)
early_stopping = _EarlyStopping(patience=EARLY_STOPPING_PATIENCE)

# Loop invariants, computed once instead of once per batch.
n_classes = len(set(dataset.data_df.label))
# Pinned host memory is only requested when batches are copied to a GPU.
use_pinned_memory = (args.device == 'cuda')

epoch_bar = notebook.tqdm(
    desc = 'training_routine',
    total = args.num_epochs,
    position=0,
    leave = True,
)
dataset.set_split('train')
train_bar = notebook.tqdm(
    desc = 'split=train ',
    total=dataset.get_num_batches(args.batch_size),
    position=0,
    leave=True,
)
dataset.set_split('val')
eval_bar = notebook.tqdm(
    desc = 'split=eval',
    total=dataset.get_num_batches(args.batch_size),
    position=0,
    leave=True,
)

for epoch_index in range(args.num_epochs):
    train_state['epoch_in'] = epoch_index

    # ------------------------------ training pass ------------------------------
    dataset.set_split('train')
    batch_generator = generate_batches(
        dataset= dataset, batch_size= args.batch_size, shuffle=True,
        device = args.device, drop_last=False,
        pinned_memory = use_pinned_memory, n_workers = 3,
    )

    running_loss = 0.0
    running_acc = 0.0
    running_f1 = 0.0

    train_bar.reset(
        total=dataset.get_num_batches(args.batch_size),
    )
    model.train()
    for batch_index, batch_dict in enumerate(batch_generator):
        optimizer.zero_grad()

        y_pred = model(
            input_ids = batch_dict['x_data'],
            attention_mask =  batch_dict['x_attn_mask'],
        )
        y_pred = y_pred.view(-1, n_classes)

        loss = loss_func(y_pred, batch_dict['y_target'])

        loss.backward()
        optimizer.step()

        # Incremental mean of the per-batch loss.
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # Metrics and bookkeeping are done on CPU copies.
        y_pred = y_pred.detach().cpu()
        y_target = batch_dict['y_target'].cpu()

        acc_t = transformer_general_utils \
            .compute_accuracy(y_pred, y_target)

        f1_t = transformer_general_utils \
            .compute_macro_f1(y_pred, y_target, average='weighted')

        train_state['batch_preds'].append(y_pred)
        train_state['batch_targets'].append(y_target)
        train_state['batch_indexes'].append(batch_dict['x_index'])

        running_acc += (acc_t - running_acc) / (batch_index + 1)
        running_f1 += (f1_t - running_f1) / (batch_index + 1)

        train_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,
                             epoch=epoch_index)

        train_bar.update()

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    train_state['train_accuracies'].append(running_acc)
    train_state['train_losses'].append(running_loss)

    train_state['train_preds'].append(
        torch.cat(train_state['batch_preds']).cpu()
    )
    train_state['train_targets'].append(
        torch.cat(train_state['batch_targets']).cpu()
    )
    train_state['train_indexes'].append(
        torch.cat(train_state['batch_indexes']).cpu()
    )
    # Epoch-level F1 over all training predictions (not the mean of batch F1s).
    train_f1 = transformer_general_utils \
                .compute_macro_f1(train_state['train_preds'][-1],
                                  train_state['train_targets'][-1],
                                  average='weighted',
                                 )

    train_state['train_f1s'].append(train_f1)

    train_state['batch_preds'] = []
    train_state['batch_targets'] = []
    train_state['batch_indexes'] = []

    # ----------------------------- validation pass -----------------------------
    dataset.set_split('val')
    batch_generator = generate_batches(
        dataset= dataset, batch_size= args.batch_size, shuffle=True,
        device = args.device, drop_last=False,
        pinned_memory = False, n_workers = 2,
    )
    eval_bar.reset(
        total=dataset.get_num_batches(args.batch_size),
    )
    running_loss = 0.0
    running_acc = 0.0
    running_f1 = 0.0

    model.eval()
    with torch.no_grad():
        # Evaluate (and later checkpoint) with the Lookahead cached weights;
        # the fast weights are restored after the checkpoint is written.
        optimizer._backup_and_load_cache()
        for batch_index, batch_dict in enumerate(batch_generator):
            y_pred = model(
                input_ids = batch_dict['x_data'],
                attention_mask =  batch_dict['x_attn_mask'],
            )
            y_pred = y_pred.view(-1, n_classes)

            loss = loss_func(y_pred, batch_dict['y_target'])
            # Use this batch's loss; without this line the running mean would
            # keep averaging the last *training* loss.
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            y_pred = y_pred.detach().cpu()
            y_target = batch_dict['y_target'].cpu()

            acc_t = transformer_general_utils\
                .compute_accuracy(y_pred, y_target)
            f1_t = transformer_general_utils \
                .compute_macro_f1(y_pred, y_target,
                                 average='weighted')

            train_state['batch_preds'].append(y_pred)
            train_state['batch_targets'].append(y_target)
            train_state['batch_indexes'].append(batch_dict['x_index'].cpu())

            running_acc += (acc_t - running_acc) / (batch_index + 1)
            running_f1 += (f1_t - running_f1) / (batch_index + 1)

            eval_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,
                                 epoch=epoch_index)
            eval_bar.update()

    train_state['val_accuracies'].append(running_acc)
    train_state['val_losses'].append(running_loss)

    train_state['val_preds'].append(
        torch.cat(train_state['batch_preds']).cpu()
    )
    train_state['val_targets'].append(
        torch.cat(train_state['batch_targets']).cpu()
    )
    train_state['val_indexes'].append(
        torch.cat(train_state['batch_indexes']).cpu()
    )
    val_f1 = transformer_general_utils \
                .compute_macro_f1(train_state['val_preds'][-1],
                                  train_state['val_targets'][-1],
                                  average='weighted',
                                 )

    train_state['val_f1s'].append(val_f1)

    train_state['batch_preds'] = []
    train_state['batch_targets'] = []
    train_state['batch_indexes'] = []

    # One checkpoint per epoch; the file name encodes the epoch index.
    torch.save(
        {
            'model':model.state_dict(),
        },
        args.directory + f'_epoc_{epoch_index}_' + args.model_name,
    )

    scheduler.step(val_f1)
    early_stopping(val_f1, model)
    optimizer._clear_and_load_backup()

    # Advance the epoch bar exactly once per epoch.
    epoch_bar.set_postfix( best_f1 = early_stopping.best_score, current = val_f1)
    epoch_bar.update()

    if early_stopping.early_stop:
        print("Early stopping")
        break

In [0]:
print(train_state['train_f1s'])

In [0]:
print(train_state['val_f1s'])

In [0]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [0]:
# Pick the epoch with the highest validation F1 and print per-class reports
# for that epoch on both the training and validation splits.
best_run_index = train_state['val_f1s'].index(max(train_state['val_f1s']))
print(f'Best run at epoch {best_run_index}')
for report_tag, split_name in (('Train:', 'train'), ('Dev:', 'val')):
    # Stored predictions are per-class scores; argmax turns them into label ids.
    predicted_labels = torch.argmax(
        train_state[f'{split_name}_preds'][best_run_index], dim=1
    ).cpu().long().numpy()
    true_labels = train_state[f'{split_name}_targets'][best_run_index].cpu().numpy()
    print(report_tag, classification_report(
        y_true=true_labels, y_pred=predicted_labels, digits=4,
    ))

## Check if ensembling helps and pick models to use on test set

In [0]:
def sort_preds(indexes, preds):
    """Return the rows of ``preds`` ordered by ascending ``indexes``.

    Used to undo the shuffling done by the dataloader so that predictions
    from different runs line up row by row.

    Args:
        indexes: 1-D tensor with one sample index per row of ``preds``.
        preds: 2-D tensor whose rows correspond to ``indexes``.

    Returns:
        2-D numpy array with the rows of ``preds`` in index order. Its dtype
        is the common dtype of ``indexes`` and ``preds``, because both are
        stacked into a single array before sorting.
    """
    index_column = indexes.cpu().numpy().reshape(-1, 1)
    pred_columns = preds.cpu().numpy()
    # Column 0 carries the index, the remaining columns carry the predictions.
    stacked = np.hstack((index_column, pred_columns))
    row_order = stacked[:, 0].argsort()
    # Drop the index column again after reordering.
    return stacked[row_order][:, 1:]

def get_optimal_models(train_state, split, reverse=False ):
    """Naive (greedy) ensembling over the per-epoch predictions of one split.

    Epochs are visited in order (or in reverse order). An epoch's predictions
    are added to the running sum only if doing so raises the weighted F1 of
    the summed predictions above the best value seen so far.

    Args:
        train_state: dict holding the lists ``f'{split}_indexes'``,
            ``f'{split}_preds'`` and ``f'{split}_targets'``, one entry per epoch.
        split: split prefix used to build those keys (e.g. ``'val'``).
        reverse: visit the epochs from last to first when True.

    Returns:
        Tuple ``(idxes, max_f1)``: the selected epoch indexes in visiting order
        and the weighted F1 of their summed predictions.
    """
    index_runs = train_state[f'{split}_indexes']
    pred_runs = train_state[f'{split}_preds']

    # Targets are the same for every epoch; take the last epoch's copy and put
    # it in index order. Flattened to 1-D, the shape sklearn expects for labels.
    trgts = sort_preds(
        index_runs[-1], train_state[f'{split}_targets'][-1].reshape(-1,1)
    ).ravel()

    init = np.zeros(pred_runs[-1].shape)
    max_f1 = 0
    idxes = []
    rng = range(0, len(index_runs))
    if reverse:
        rng = reversed(rng)
    for i in rng:
        candidate = init + sort_preds(index_runs[i], pred_runs[i])
        f1 = f1_score(
            y_pred=candidate.argmax(axis=1),
            y_true=trgts, average ='weighted'
        )
        if f1 > max_f1:
            max_f1 = f1
            init = candidate
            idxes.append(i)
    # Report the F1 of the selected ensemble, not of the last candidate tried.
    print(f'Taking preds from {idxes} | Dev f1:{max_f1}')
    return (idxes,max_f1)

In [0]:
train_state['val_f1s']

In [0]:
# Weighted F1 of the single best epoch on the validation split.
best_val_labels = torch.argmax(
    train_state['val_preds'][best_run_index], dim=1
).cpu().long().numpy()
best_model_f1_score = f1_score(
    y_true=train_state['val_targets'][best_run_index].cpu().numpy(),
    y_pred=best_val_labels,
    average='weighted',
)

# Three candidates: greedy ensemble built forwards, greedy ensemble built
# backwards, and the single best epoch. Each is an (epoch_indexes, f1) pair.
_models = [
    get_optimal_models(train_state, 'val', reverse=False),
    get_optimal_models(train_state, 'val', reverse=True),
    ([best_run_index], best_model_f1_score),
]
# Keep whichever candidate scores highest; ties go to the earliest entry.
optimal_models = max(_models, key=lambda candidate: candidate[1])
print(f'Optimal models chosen: {optimal_models}')

In [0]:
!ls {args.directory}

In [0]:
# Collect the per-epoch checkpoints written by the training loop
# ("_epoc_<N>_<model_name>") and order them by epoch number.
# The epoch is parsed with a regex rather than read from a fixed character
# position, so multi-digit epochs (10, 11, ...) sort correctly and the result
# does not depend on the length of args.directory.
_ckpt_pattern = re.compile(r'_epoc_(\d+)_' + re.escape(args.model_name) + r'$')
all_models = sorted(
    (
        os.path.join(args.directory, file_name)
        for file_name in os.listdir(args.directory)
        if _ckpt_pattern.search(file_name)
    ),
    key=lambda path: int(_ckpt_pattern.search(path).group(1)),
)
all_models

In [0]:
selected_models = [all_models[i] for i in optimal_models[0]]
pprint.pprint(selected_models)

## Loading test set


In [0]:
# Use the test file configured in `args` instead of a hard-coded path into an
# unrelated repository that this notebook never clones.
test_set_loc = args.test_csv

In [0]:
# Pick the delimiter from the file extension so both .tsv and .csv inputs load.
test_df = pd.read_csv(
    test_set_loc,
    sep='\t' if str(test_set_loc).endswith('.tsv') else ',',
)

In [0]:
# Apply the same EOS-token preprocessing that was applied to the training text.
# NOTE(review): the raw-text column name depends on the test file; 'Text' is
# kept when present, otherwise 'tweet' is assumed — confirm against the file.
_raw_text_column = 'Text' if 'Text' in test_df.columns else 'tweet'
test_df['text'] = test_df[_raw_text_column].map(gpt2_preproc.add_eos_tokens)

In [0]:
test_df['split'] = 'test'  #dummy label
test_df['label'] = -1  #dummy label


In [0]:
test_df

In [0]:
# Build the test dataset with the same dataset class and tokenizer that were
# used for training. The sequence length must match the training one because
# the classifier's linear layer is sized from it.
# NOTE(review): assumes HateDataset accepts a `max_seq_length` keyword —
# confirm against utils.transformer.data.
test_dataset = transformer_data_utils.HateDataset(
    data_df = test_df,
    tokenizer = gpt2_tokenizer,
    max_seq_length = dataset._max_seq_length
)

In [0]:
test_dataset.set_split('test')


In [0]:
test_dataset._target_df.split.value_counts()

In [0]:
# Run every selected checkpoint over the test split and store its raw outputs
# (one entry per checkpoint) under the 'val_*' keys of test_state.
test_state = general_utils.make_train_state()
test_dataset.set_split('test')

# Loop invariants.
n_classes = len(set(dataset.data_df.label))
# Pinned host memory is only requested when batches are copied to a GPU.
use_pinned_memory = (args.device == 'cuda')

eval_bar = notebook.tqdm(
    desc = 'split=test',
    total=test_dataset.get_num_batches(args.batch_size),
    position=0,
    leave=True,
)
# Batches are moved to args.device, so make sure the model is there as well.
model.to(args.device)
model.eval()
for checkpoint_path in notebook.tqdm(selected_models, total=len(selected_models)):
    eval_bar.reset(
        total=test_dataset.get_num_batches(args.batch_size),
    )
    # map_location lets checkpoints saved on one device load on another.
    model.load_state_dict(
        torch.load(checkpoint_path, map_location=args.device)['model']
    )
    # shuffle=False keeps the predictions in the order of the test rows.
    batch_generator = generate_batches(
        dataset= test_dataset, batch_size= args.batch_size, shuffle=False,
        device = args.device, drop_last=False,
        pinned_memory = use_pinned_memory, n_workers = 1,
    )
    with torch.no_grad():
        for batch_dict in batch_generator:
            y_pred = model(
                input_ids = batch_dict['x_data'],
                attention_mask =  batch_dict['x_attn_mask'],
            )
            y_pred = y_pred.view(-1, n_classes).detach().cpu()

            test_state['batch_preds'].append(y_pred)
            test_state['batch_targets'].append(batch_dict['y_target'].cpu())
            test_state['batch_indexes'].append(batch_dict['x_index'].cpu())
            eval_bar.update()

    test_state['val_preds'].append(
        torch.cat(test_state['batch_preds']).cpu()
    )
    test_state['val_targets'].append(
        torch.cat(test_state['batch_targets']).cpu()
    )
    test_state['val_indexes'].append(
        torch.cat(test_state['batch_indexes']).cpu()
    )

    # Start the next checkpoint with empty per-batch buffers.
    test_state['batch_preds'] = []
    test_state['batch_targets'] = []
    test_state['batch_indexes'] = []


In [0]:
assert len(test_state['val_preds']) == len(optimal_models[0])

### Add the last layer outputs and apply argmax 

In [0]:
ensemble = torch.zeros_like(test_state['val_preds'][-1])
for i in test_state['val_preds']:
    ensemble += i

In [0]:
test_preds = torch.argmax(ensemble, dim=1).tolist()

In [0]:
collections.Counter(test_preds)

In [0]:
# Map class ids back to label strings. The notebook's data section uses
# 0 = not offensive and 1 = offensive.
# NOTE(review): 'NOT'/'OFF' are the OffensEval Task A label names; the earlier
# cells call class 1 "HOF" — confirm the spelling the submission format expects.
int_to_label = {0:'NOT', 1:'OFF'}
pred_labels = [int_to_label[i] for i in test_preds]
collections.Counter(pred_labels)

In [0]:
pred_df = pd.DataFrame( data= {'id':test_df.ID, 'label':pred_labels})

In [0]:
pred_analysis_df = pd.DataFrame( data= {'id':test_df.ID, 'text':test_df.Text ,'label':pred_labels})

In [0]:
pred_df

In [0]:
pred_analysis_df