[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cozek/OffensEval2020-code/blob/master/notebooks/Eng%20Task%20C%20-%20Ensemble%20DistilRoberta%20AttnMask%20Dropout.ipynb)

# Import Libraries

At the time of our work, we used the following library versions:
- numpy 1.18.1
- pandas 1.0.1
- torch 1.2.0
- Cuda 10.0
- python 3.7.0
- sklearn 0.22.1
- tqdm 4.42.1
- nltk 3.4.5

In [0]:
!git clone https://github.com/cozek/OffensEval2020-code/

In [0]:
!git clone https://github.com/huggingface/transformers
!pip install /content/transformers/

In [0]:
import sys
sys.path.append('/content/OffensEval2020-code/src/')
import collections
from typing import Callable
import numpy as np
np.random.seed(42)
import pandas as pd
from tqdm import notebook
import importlib
import pprint
import nltk
import datetime
import os
from argparse import Namespace

from collections import Counter

In [0]:
import utils.general as general_utils
import utils.transformer.data as transformer_data_utils
import utils.transformer.general as transformer_general_utils
general_utils.set_seed_everywhere()

In [0]:
import logging
logging.basicConfig(level=logging.INFO) 

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
torch.__version__ # we used version 1.2.0


In [0]:
# Import RAdam and Lookahead
from radam.radam import RAdam
from lookahead.optimizer import Lookahead


In [0]:
from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification

In [0]:
 args = Namespace(
        #use cuda by default
        device = 'cuda' if torch.cuda.is_available() else 'cpu',
    
        #set batch size and number of epochs
        batch_size = 32,
        num_epochs = 20,
    
        #set the learning rate
        learning_rate = 0.0001,

        #location of the train, dev and test csv
        train_val_csv = '/content/OffensEval2020-code/data/eng/task_c_tiny.zip',
        test_csv = '/content/OffensEval2020-code/data/test_data/test_a_tweets.tsv',
    
        #directory to save our models at
        directory = './models/', 
        model_name = 'roberta_attn_trac_task_a.pt',
     
        date = datetime.datetime.now().strftime("%a_%d_%b_%Y/"),
)

## Model save location

In [0]:
# Build the date-stamped checkpoint directory and make sure it exists.
directory = args.directory + args.date
# exist_ok=True replaces the check-then-create pattern, so there is no window
# in which the directory can appear between the check and the creation.
os.makedirs(directory, exist_ok=True)
# From here on args.directory points at the date-stamped directory.
args.directory = directory
print(args.directory)

## Load presplit dataset portion
```
Labelled as

IND = 0
GRP = 1
OTH = 2
```

In [0]:
# Read the zipped csv holding the pre-split data, then show how many rows
# fall under each label and under each split.
data_df_task_c = pd.read_csv(args.train_val_csv, compression='zip')
for summary_column in ('label', 'split'):
    print(data_df_task_c[summary_column].value_counts())

In [0]:
data_df_task_c.columns

In [0]:
# Print a few random rows without truncating the text column.
# `None` means "no limit"; the former `-1` spelling is deprecated since
# pandas 1.0 and rejected by later releases.
with pd.option_context('display.max_colwidth', None):
    print(data_df_task_c[['text','label']].sample(5))

## Importing the Roberta Tokenizer and Punkt sentence tokenizer

In [0]:
class RobertaPreprocessor():
    """Adds the tokenizer's special tokens to raw text, sentence by sentence.

    The text is split into sentences by ``sentence_detector``; the sentences
    are joined with the separator token, prefixed with the beginning-of-
    sequence token and terminated with one more separator token.
    """
    def __init__(self,transformer_tokenizer,sentence_detector):
        """
        Args:
            transformer_tokenizer: object exposing ``bos_token`` and
                ``sep_token`` strings
            sentence_detector: object exposing ``tokenize(text)`` that
                returns the sentences of ``text``
        """
        self.transformer_tokenizer = transformer_tokenizer
        self.sentence_detector = sentence_detector
        self.bos_token = transformer_tokenizer.bos_token
        separator = transformer_tokenizer.sep_token
        # Padded with spaces so it can be used directly as a join string.
        self.sep_token = ' ' + separator + ' '

    def add_special_tokens(self, text):
        """Return ``text`` wrapped as ``<bos> s1 <sep> s2 ... <sep>``."""
        body = self.sep_token.join(self.sentence_detector.tokenize(text))
        closing = self.transformer_tokenizer.sep_token
        return ' '.join((self.bos_token, body, closing))

In [0]:
!python -c 'import nltk; nltk.download("punkt")'

In [0]:
roberta_tokenizer = tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
punkt_sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')

In [0]:
roberta_preproc = RobertaPreprocessor(roberta_tokenizer, punkt_sentence_detector)

In [0]:
#apply the preprocessor on the exploded dataframe
data_df_task_c['text'] = data_df_task_c['text'].map(roberta_preproc.add_special_tokens)


In [0]:
# Print a few random rows after preprocessing, without truncating the text.
# `None` means "no limit"; the former `-1` spelling is deprecated since
# pandas 1.0 and rejected by later releases.
with pd.option_context('display.max_colwidth', None):
    print(data_df_task_c[['text','label']].sample(5))

### Implement Attention Mask Dropout in the vectorizer

In [0]:
class SimpleVectorizer():
    """Plain vectorizer: token ids and attention mask, no augmentation."""
    def __init__(self,tokenizer: Callable, max_seq_len: int):
        """
        Args:
            tokenizer (Callable): transformer tokenizer
            max_seq_len (int): Maximum sequence length
        """
        self.tokenizer = tokenizer
        self._max_seq_len = max_seq_len

    def vectorize(self,text :str):
        """Encode ``text`` and return ``(ids, attn)`` as int64 numpy arrays.

        Special tokens are not added here because the preprocessing step has
        already inserted them into the text.
        """
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=False,
            max_length=self._max_seq_len,
            pad_to_max_length=True,
        )
        token_ids = np.array(encoding['input_ids'], dtype=np.int64)
        attention = np.array(encoding['attention_mask'], dtype=np.int64)
        return token_ids, attention

class Vectorizer():
    """Vectorizer with Attention Mask Dropout"""
    def __init__(self,tokenizer: Callable, max_seq_len: int ):
        """
        Args:
            tokenizer (Callable): transformer tokenizer
            max_seq_len (int): Maximum sequence length
        """
        self.tokenizer = tokenizer
        self._max_seq_len = max_seq_len

    def vectorize(self,text :str, mask_prob: float = 0.50, mask_amount:float=0.30):
        """Implements Attention Mask Dropout

        With probability ``mask_prob`` a number of random positions inside the
        non-padded part of the sequence are replaced by the mask token and
        excluded from attention. Positions are drawn with replacement, so
        fewer than the requested number of distinct tokens may be masked.

        Args:
            text (str): The string to vectorize
            mask_prob (float): Probability of the attention mask
                dropout being applied
            mask_amount (float): Fraction of the non-padded tokens to mask
                (at least one token is masked when dropout is applied)

        Returns:
            ids (np.array)  : Array of token ids of the text
            attn (np.array) : 0-1 Array of attention masks
        """

        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=False, #already added by preproc
            max_length = self._max_seq_len,
            pad_to_max_length = True,
        )
        ids =  np.array(encoded['input_ids'], dtype=np.int64)
        attn = np.array(encoded['attention_mask'], dtype=np.int64)
        prob = np.random.rand(1)[0]
        if  prob <= mask_prob:
            # Use the tokenizer this instance was built with; the previous
            # code read a module-level `tokenizer` that may not exist.
            pad_positions = np.flatnonzero(ids == self.tokenizer.pad_token_id)
            # The first pad position marks the sentence length. A sequence
            # that fills max_seq_len has no padding at all, so fall back to
            # the full length instead of failing on an empty index array.
            len_of_sent = int(pad_positions[0]) if pad_positions.size else len(ids)
            if len_of_sent > 0:  # nothing to mask in an all-padding sequence
                amount_to_mask = max(int(len_of_sent * mask_amount ) , 1)
                ids_to_not_attend = [np.random.randint(low=0, high=len_of_sent )
                 for _ in range(amount_to_mask)]
                attn[ids_to_not_attend]=0
                ids[ids_to_not_attend] = self.tokenizer.mask_token_id
        return ids, attn

Attention Mask Dropout Example

In [0]:
v = Vectorizer(roberta_tokenizer, 15) #attention maskdropout vectorizer
sv = SimpleVectorizer(roberta_tokenizer, 15) #simple vectorizer

In [0]:
sent = "I am alright bro, dont worry about me"
_, attn_masks_dropped = v.vectorize(sent)
attn_masks_dropped

In [0]:
_, attn_masks = sv.vectorize(sent)
attn_masks

###  Create the dataset class

In [0]:
class HateDataset(Dataset):
    """PyTorch dataset over a dataframe with ``text``, ``label`` and ``split``
    columns.

    One split ('train', 'val' or 'test') is active at a time; ``__len__`` and
    ``__getitem__`` operate on the active split. Training rows are vectorized
    with attention mask dropout unless ``simple_vectorize`` is switched on;
    all other splits always use the plain vectorizer.
    """
    def __init__(
        self,
        data_df: pd.DataFrame,
        tokenizer: Callable,
        max_seq_length:int = None,
    ):
        """
        Args:
            data_df (pandas.DataFrame): df containing the labels and text
            tokenizer (tokenizer module for the transformer)
            max_seq_length (int): fixed sequence length; when None it is
                derived from the longest text in ``data_df``
        """
        self.data_df = data_df
        self.tokenizer = tokenizer

        if max_seq_length is None:
            self._max_seq_length = self._get_max_len(data_df,tokenizer)
        else:
            self._max_seq_length = max_seq_length

        self.train_df = self.data_df[self.data_df.split == 'train']
        self.train_size = len(self.train_df)

        self.val_df = self.data_df[self.data_df.split == 'val']
        self.val_size = len(self.val_df)

        self.test_df = self.data_df[self.data_df.split == 'test']
        self.test_size = len(self.test_df)

        # Plain bool. A stray trailing comma used to turn this into the
        # tuple `(False,)`, which is truthy and broke flip_simple_vectorizer.
        self.simple_vectorize = False
        self._simple_vectorizer = SimpleVectorizer(tokenizer, self._max_seq_length)
        self._vectorizer = Vectorizer(tokenizer, self._max_seq_length)

        self._lookup_dict = {
            'train': (self.train_df, self.train_size),
            'val': (self.val_df, self.val_size),
            'test': (self.test_df, self.test_size)
        }

        self.set_split('train')

        class_counts = data_df.label.value_counts().to_dict()
        # Counts ordered by class label (0, 1, 2, ...).
        cts = sorted(class_counts.items(), key=lambda x: x[0])
        freq = [ x[1] for x in cts ]
        # Inverse frequency of every label, computed over the whole dataframe.
        self.class_weights = 1.0/ torch.tensor(freq, dtype=torch.float32)

    def flip_simple_vectorizer(self) :
        """Toggle between the dropout vectorizer and the plain vectorizer
        for the training split."""
        self.simple_vectorize = not self.simple_vectorize

    def _get_max_len(self,data_df: pd.DataFrame, tokenizer: Callable):
        """Return the largest number of input ids produced for any text in
        ``data_df`` (encode_plus is called with its default settings).

        The ``tokenizer`` argument is unused; ``self.tokenizer`` is read.
        """
        len_func = lambda x: len(self.tokenizer.encode_plus(x)['input_ids'])
        max_len = data_df.text.map(len_func).max()
        return max_len

    def set_split(self, split="train"):
        """ selects the splits in the dataset using a column in the dataframe """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets

        Args:
            index (int): the index to the data point
        Returns:
            a dictionary holding the data point's features (x_data),
            attention mask (x_attn_mask), position in the active split
            (x_index) and label (y_target)
        """
        row = self._target_df.iloc[index]

        # Attention mask dropout is a training-time augmentation only, and it
        # can be switched off through `simple_vectorize` (off by default, so
        # the default behaviour is unchanged).
        if self._target_split == 'train' and not self.simple_vectorize:
            indices, attention_masks = self._vectorizer.vectorize(row.text)
        else:
            indices, attention_masks = self._simple_vectorizer.vectorize(row.text)

        label = row.label
        return {'x_data': indices,
                'x_attn_mask': attention_masks,
                'x_index': index,
                'y_target': label}

    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of full batches in the
        active split (a trailing partial batch is not counted)

        Args:
            batch_size (int)
        Returns:
            number of full batches in the dataset
        """
        return len(self) // batch_size

In [0]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=False, device="cpu", pinned_memory = False, n_workers = 0):
    """
    Generator wrapping a PyTorch DataLoader: yields one dict per batch with
    the features, attention mask and target moved to `device`. The
    `x_index` entry is passed through exactly as the DataLoader produced it.
    """
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        pin_memory=pinned_memory,
        num_workers=n_workers,
    )
    # Non-blocking copies are only requested when pinned memory is in use.
    async_copy = bool(pinned_memory)

    for batch in loader:
        yield {
            'x_data': batch['x_data'].to(device, non_blocking=async_copy),
            'x_attn_mask': batch['x_attn_mask'].to(device, non_blocking=async_copy),
            'x_index': batch['x_index'],
            'y_target': batch['y_target'].to(device, non_blocking=async_copy),
        }

In [0]:
dataset = HateDataset(
    data_df = data_df_task_c,
    tokenizer = roberta_tokenizer
)

In [0]:
assert dataset._max_seq_length <= 512

# Initialize the Roberta model




In [0]:
model = RobertaForSequenceClassification.from_pretrained(
    'distilroberta-base',
    num_labels=len(set(data_df_task_c.label)),
)

In [0]:
model.to(args.device)

In [0]:
early_stopping = transformer_general_utils.EarlyStopping(patience=4)

In [0]:
!nvidia-smi

In [0]:
args.num_epochs = 20
args.batch_size = 70

In [0]:
# Optimisation set-up: RAdam wrapped in Lookahead, plus a plateau scheduler
# attached to the wrapped (inner) optimizer. mode='max' because the scheduler
# is meant to be stepped with a score that should increase.
loss_func = nn.CrossEntropyLoss()

print(f'Using LR:{args.learning_rate}')
base_optimizer = RAdam(
    model.parameters(),
    lr=args.learning_rate,
)
optimizer = Lookahead(
    optimizer=base_optimizer,
    k=5,
    alpha=0.5,
)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer.optimizer,
    mode='max',
    factor=0.1,
)

# Begin Training

In [0]:
train_state = general_utils.make_train_state()
train_state.keys()


In [0]:
# Training routine. Every epoch:
#   1. trains on the 'train' split,
#   2. evaluates on the 'val' split,
#   3. saves a checkpoint of the model,
#   4. steps the LR scheduler and the early-stopping helper on the val macro-F1.
# Per-epoch predictions, targets and row indexes are accumulated in
# `train_state` so they can be inspected after training.

# Progress bars: one over epochs, one per split over batches.
epoch_bar = notebook.tqdm(
    desc = 'training_routine',
    total = args.num_epochs,
    position=0,
    leave = True,
)
dataset.set_split('train')
train_bar = notebook.tqdm(
    desc = 'split=train ',
    total=dataset.get_num_batches(args.batch_size),
    position=0,
    leave=True,
)
dataset.set_split('val')
eval_bar = notebook.tqdm(
    desc = 'split=eval',
    total=dataset.get_num_batches(args.batch_size),
    position=0,
    leave=True,
)

# NOTE(review): these three names are not read again within this cell.
old_val_acc = 0
old_f1 = 0
model_state = None
for epoch_index in range(args.num_epochs):
    train_state['epoch_in'] = epoch_index

    # ---------------- training phase ----------------
    dataset.set_split('train')

    batch_generator = generate_batches(
        dataset= dataset, batch_size= args.batch_size, shuffle=True,
        device = args.device, drop_last=False,
        pinned_memory = True, n_workers = 3, 
    )

    # Running means over the batches of this epoch.
    running_loss = 0.0
    running_acc = 0.0
    running_f1 = 0.0
    model.train()

    train_bar.reset(
        total=dataset.get_num_batches(args.batch_size),
    )

    for batch_index, batch_dict in enumerate(batch_generator):
        optimizer.zero_grad()
        
        # Labels are passed to the model, which returns the loss first and
        # the per-class outputs second.
        loss,y_pred = model(
            input_ids = batch_dict['x_data'],
            attention_mask =  batch_dict['x_attn_mask'],
            labels= batch_dict['y_target'].unsqueeze(1),
        )[:2]
        
        # Reshape to (n_rows, n_distinct_labels).
        y_pred = y_pred.view(-1, len(set(dataset.data_df.label)))
                             
#         scheduler.step()
        loss.backward()
        optimizer.step()
                             
        # Incremental mean: mean_k = mean_{k-1} + (x_k - mean_{k-1}) / k
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)
                             
        y_pred = y_pred.detach().cpu()
        batch_dict['y_target'] = batch_dict['y_target'].cpu()
        
        acc_t = transformer_general_utils \
            .compute_accuracy(y_pred, batch_dict['y_target'])
        
        f1_t = transformer_general_utils \
            .compute_macro_f1(y_pred, batch_dict['y_target'])

        # Keep the row indexes next to the predictions: batches are shuffled,
        # so the indexes are what ties a prediction back to its row.
        train_state['batch_preds'].append(y_pred)
        train_state['batch_targets'].append(batch_dict['y_target'])
        train_state['batch_indexes'].append(batch_dict['x_index'])

        running_acc += (acc_t - running_acc) / (batch_index + 1)
        running_f1 += (f1_t - running_f1) / (batch_index + 1)

        train_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,
                             epoch=epoch_index)

        train_bar.update()

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    train_state['train_accuracies'].append(running_acc)
    train_state['train_losses'].append(running_loss)
    
    # Collapse the per-batch buffers into one tensor per epoch.
    train_state['train_preds'].append(
        torch.cat(train_state['batch_preds']).cpu()
    )
    train_state['train_targets'].append(
        torch.cat(train_state['batch_targets']).cpu()
    )
    train_state['train_indexes'].append(
        torch.cat(train_state['batch_indexes']).cpu()
    )
    # Epoch-level macro-F1 over all training predictions (as opposed to the
    # running mean of per-batch F1 shown in the progress bar).
    train_f1 = transformer_general_utils \
                .compute_macro_f1(train_state['train_preds'][-1],
                                  train_state['train_targets'][-1],
                                 )
                                 
    train_state['train_f1s'].append(train_f1)
    
    # Empty the per-batch buffers before they are reused for validation.
    train_state['batch_preds'] = []
    train_state['batch_targets'] = []
    train_state['batch_indexes'] = []
    
    
    # ---------------- validation phase ----------------
    dataset.set_split('val')
    batch_generator = generate_batches(
        dataset= dataset, batch_size= args.batch_size, shuffle=True,
        device = args.device, drop_last=False,
        pinned_memory = True, n_workers = 2, 
    )
    eval_bar.reset(
        total=dataset.get_num_batches(args.batch_size),
    )
    running_loss = 0.0
    running_acc = 0.0
    running_f1 = 0.0
    
    model.eval()
    with torch.no_grad():
        # NOTE(review): presumably swaps Lookahead's cached weights into the
        # model for evaluation; they are swapped back by
        # `_clear_and_load_backup()` below. Confirm against the Lookahead
        # implementation in use.
        optimizer._backup_and_load_cache()
        for batch_index, batch_dict in enumerate(batch_generator):
            loss, y_pred = model(
                input_ids = batch_dict['x_data'],
                attention_mask =  batch_dict['x_attn_mask'],
                labels= batch_dict['y_target'].unsqueeze(1),
            )[:2]
            y_pred = y_pred.view(-1, len(set(dataset.data_df.label)))
            
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            y_pred = y_pred.detach()
            batch_dict['y_target'] = batch_dict['y_target'].cpu()
            
            acc_t = transformer_general_utils\
                .compute_accuracy(y_pred, batch_dict['y_target'])
            f1_t = transformer_general_utils \
                .compute_macro_f1(y_pred, batch_dict['y_target'])

            train_state['batch_preds'].append(y_pred.cpu())
            train_state['batch_targets'].append(batch_dict['y_target'].cpu())
            train_state['batch_indexes'].append(batch_dict['x_index'].cpu())

            running_acc += (acc_t - running_acc) / (batch_index + 1)
            running_f1 += (f1_t - running_f1) / (batch_index + 1)
            

            eval_bar.set_postfix(loss = running_loss, f1 = running_f1, acc=running_acc,
                                 epoch=epoch_index)
            eval_bar.update()
            
    train_state['val_accuracies'].append(running_acc)
    train_state['val_losses'].append(running_loss)
    
        
    train_state['val_preds'].append(
        torch.cat(train_state['batch_preds']).cpu()
    )

    train_state['val_targets'].append(
        torch.cat(train_state['batch_targets']).cpu()
    )
    train_state['val_indexes'].append(
        torch.cat(train_state['batch_indexes']).cpu()
    )
    # Epoch-level macro-F1 over all validation predictions; this is the score
    # that drives the scheduler and early stopping.
    val_f1 = transformer_general_utils \
                .compute_macro_f1(train_state['val_preds'][-1],
                                  train_state['val_targets'][-1],
                                 )
                                 
    train_state['val_f1s'].append(val_f1)
    
    train_state['batch_preds'] = []
    train_state['batch_targets'] = []
    train_state['batch_indexes'] = []
    
    # One checkpoint per epoch, written while the weights loaded for
    # evaluation are still in place. The file name is
    # '<args.directory>_epoc_<epoch>_<args.model_name>'.
    torch.save(
        {
            'model':model.state_dict(),
        },
        args.directory + f'_epoc_{epoch_index}_' + args.model_name,
    )
    
    scheduler.step(val_f1)
    early_stopping(val_f1, model)
    # Counterpart of `_backup_and_load_cache()` above.
    optimizer._clear_and_load_backup()
    epoch_bar.set_postfix( best_f1 = early_stopping.best_score, current = val_f1)
    epoch_bar.update()    
    
    if early_stopping.early_stop:
        print("Early stopping")
        break


In [0]:
epoch_index

In [0]:
print(train_state['val_f1s'])

In [0]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [0]:

# Classification reports for the last epoch, on the train and the dev split.
for report_title, report_split in (('Train:', 'train'), ('Dev:', 'val')):
    last_scores = train_state[f'{report_split}_preds'][-1]
    last_targets = train_state[f'{report_split}_targets'][-1]
    print(report_title, classification_report(
        y_true=last_targets.cpu().numpy(),
        y_pred=torch.argmax(last_scores, dim=1).cpu().long().numpy(),
        digits=4,
    ))


In [0]:
# Classification reports for the epoch with the highest validation macro-F1
# (the first such epoch when several tie).
best_val_f1 = max(train_state['val_f1s'])
best_run_index = train_state['val_f1s'].index(best_val_f1)
for report_title, report_split in (('Train:', 'train'), ('Dev:', 'val')):
    best_scores = train_state[f'{report_split}_preds'][best_run_index]
    best_targets = train_state[f'{report_split}_targets'][best_run_index]
    print(report_title, classification_report(
        y_true=best_targets.cpu().numpy(),
        y_pred=torch.argmax(best_scores, dim=1).cpu().long().numpy(),
        digits=4,
    ))

In [0]:
def sort_preds(indexes, preds):
    """Return the rows of `preds` ordered by ascending `indexes`.

    This undoes the shuffling done by the dataloader: row i of `preds`
    belongs to the sample whose position is `indexes[i]`.

    Args:
        indexes: tensor of sample positions, one per row of `preds`
        preds: 2-D tensor with one row per sample
    Returns:
        numpy array with the rows of `preds` in index order
    """
    index_column = indexes.cpu().numpy().reshape(-1, 1)
    pred_matrix = preds.cpu().numpy()
    # Put the indexes in front of the predictions so both move together.
    stacked = np.concatenate((index_column, pred_matrix), axis=1)
    order = np.argsort(stacked[:, 0])
    # Drop the index column again after reordering.
    return stacked[order, 1:]

In [0]:
def get_optimal_models_v2(train_state, split):
    """Greedily pick the epochs whose summed outputs maximise macro-F1.

    Epochs are visited from highest to lowest recorded F1 on `split`. An
    epoch is kept only when adding its outputs to the running sum strictly
    improves the macro-F1 of the ensemble.

    Args:
        train_state: dict holding the per-epoch lists `{split}_f1s`,
            `{split}_preds`, `{split}_targets` and `{split}_indexes`
        split: name of the split to optimise on, e.g. 'val'
    Returns:
        list of the selected epoch indexes, in the order they were added
    """
    epoch_f1s = train_state[f'{split}_f1s']
    # Best epoch first; ties keep their original (chronological) order.
    model_idxes = sorted(
        range(len(epoch_f1s)), key=lambda epoch: epoch_f1s[epoch], reverse=True,
    )

    # Targets in dataset order, flattened to 1-D for the scorer.
    trgts = sort_preds(
        train_state[f'{split}_indexes'][-1],
        train_state[f'{split}_targets'][-1].reshape(-1,1),
    ).ravel()
    init = np.zeros(train_state[f'{split}_preds'][-1].shape)
    max_f1 = 0
    idxes = []
    for i in model_idxes:
        # Undo the dataloader shuffle so every epoch's rows line up.
        temp = sort_preds(train_state[f'{split}_indexes'][i],train_state[f'{split}_preds'][i])
        candidate = init + temp
        f1 = f1_score(
            y_pred=candidate.argmax(axis=1),
            y_true= trgts, average ='macro'
        )
        if f1 > max_f1:
            max_f1 = f1
            init = candidate
            idxes.append(i)
    # Report the F1 of the selected ensemble, not that of the last candidate.
    print(f'Taking preds from {idxes} | Dev f1:{max_f1}')
    return idxes

In [0]:
final_optimal_models = get_optimal_models_v2(train_state, 'val')
final_optimal_models

# Making preds on the given test set

In [0]:
# Keep only the rows of the 'test' split, in dataframe order. Predictions are
# produced for this split alone, so aliasing the whole dataframe made the
# length check and the prediction tables further down disagree.
test_df = data_df_task_c[data_df_task_c.split == 'test']

In [0]:
test_dataset = dataset
test_dataset.set_split('test')

In [0]:
test_dataset._target_df.sample(5)

In [0]:
print(len(test_df))
print(test_dataset._target_df.split.value_counts())

In [0]:
def evaluate_testset(model, state, dataset, split,args):
    """Runs the model over one split and stores its raw outputs.

    Batches are read in dataset order (no shuffling). The outputs of all
    batches are concatenated and appended to `state['{split}_preds']`; the
    matching row indexes go to `state['{split}_indexes']`.

    Args:
        model: A pytorch transformers model
        state: dict to store outputs; must already hold the list keys
            'batch_preds', 'batch_indexes', '{split}_preds' and
            '{split}_indexes'
        dataset: A pytorch Dataset
        split: The split on which to evaluate the model on
        args: Arguments from namespace (reads `batch_size` and `device`)
    Returns:
        state: the same dict, with one more entry in the '{split}_*' lists
    """
    # Select the split first so the progress-bar total is computed for it.
    dataset.set_split(split)
    eval_bar = notebook.tqdm(
        desc = 'evaluation progress: ',
        total=dataset.get_num_batches(args.batch_size),
        position=0,
        leave=False,
    )
    batch_generator = generate_batches(
        dataset= dataset, batch_size= args.batch_size, shuffle=False,
        device = args.device, drop_last=False,
        pinned_memory = True, n_workers = 2,
    )
    model.eval()
    with torch.no_grad():
        for batch_dict in batch_generator:
            y_pred = model(
                input_ids = batch_dict['x_data'],
                attention_mask =  batch_dict['x_attn_mask'],
            )[0]
            # Flatten to (n_rows, n_classes). The class count is read from
            # the output itself instead of being hard-coded to 3.
            y_pred = y_pred.view(-1, y_pred.size(-1)).detach()

            state['batch_preds'].append(y_pred.cpu())
            state['batch_indexes'].append(batch_dict['x_index'].cpu())

            eval_bar.update()

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    state[f'{split}_preds'].append(
        torch.cat(state['batch_preds']).cpu()
    )
    state[f'{split}_indexes'].append(
        torch.cat(state['batch_indexes']).cpu()
    )

    # Leave the per-batch buffers empty for the next call.
    state['batch_preds'] = []
    state['batch_indexes'] = []

    eval_bar.close()
    return state

In [0]:
# `all_model_paths` was never defined in the cells shown. Rebuild it from the
# checkpoint naming scheme used by `torch.save` in the training loop: one
# file per completed epoch, indexed by epoch number.
all_model_paths = [
    args.directory + f'_epoc_{epoch}_' + args.model_name
    for epoch in range(len(train_state['val_f1s']))
]
chosen_models = [all_model_paths[i] for i in final_optimal_models]

In [0]:
# Run every selected checkpoint over the test split; each pass appends one
# entry to test_state['test_preds'].
test_state = general_utils.make_train_state()
for model_path in notebook.tqdm(chosen_models, total=len(chosen_models)):
    # map_location lets a checkpoint saved on a GPU be restored on whatever
    # device this session runs on.
    checkpoint = torch.load(model_path, map_location=args.device)
    model.load_state_dict(checkpoint['model'])
    test_state = evaluate_testset(model, test_state, test_dataset, 'test',args)

In [0]:
test_state['test_preds'][-1].shape

In [0]:
[test_state['test_preds'][i].size() for i in range(len(test_state['test_preds']))]

In [0]:
len(test_dataset._target_df)

In [0]:
torch.zeros_like(test_state['test_preds'][0]).size()

In [0]:
# Ensemble by summing the raw outputs of every evaluated checkpoint
# element-wise; the result has the same shape as a single prediction tensor.
ensemble_pred = torch.zeros_like(test_state['test_preds'][0])
for i in test_state['test_preds']:
    ensemble_pred += i

In [0]:
# label_dict["IND"] = 0
# label_dict["GRP"] = 1
# label_dict["OTH"] = 2
#ref utils/offeval2020.py

In [0]:
int_to_label = { 0: 'IND', 1:'GRP', 2:'OTH'}

In [0]:
# Highest-scoring class per row of the ensemble, translated to its label name.
t = [
    int_to_label[class_id]
    for class_id in torch.argmax(ensemble_pred, dim=1).tolist()
]

# Distribution of the predicted labels.
collections.Counter(t)

In [0]:
assert len(t) == len(test_df)

In [0]:
# Table for manual inspection: tweet id, tweet text and predicted label.
# NOTE(review): this reads `id` and `tweet` columns from test_df, while the
# cells above only use `text`, `label` and `split` — confirm these columns
# exist in the loaded csv.
offeval_task_c_pred_analysis_df = pd.DataFrame(
    data={
        'id':test_df.id,
        'text':test_df.tweet,
        'label':t,
    }
)

In [0]:
offeval_task_c_pred_label_df = pd.DataFrame(
    data={
        'id':test_df.id,
        'label':t,
    }
)

In [0]:
offeval_task_c_pred_analysis_df.to_csv(
    'offeval_task_c_pred_analysis.csv',index=False,
)

In [0]:
offeval_task_c_pred_label_df.to_csv(
    'offeval_task_c_pred_label.csv', index=False, header=False,
)

In [0]:
offeval_task_c_pred_label_df.label.value_counts()


In [0]:
offeval_task_c_pred_label_df.label.value_counts()