In [138]:
from vocab_mismatch_utils import *
from data_formatter_utils import *
from datasets import DatasetDict
from datasets import Dataset
from datasets import load_dataset
import transformers
import pandas as pd
from collections import OrderedDict
import operator

from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler, SequentialSampler
from torch.nn import CrossEntropyLoss

# Load modules, mainly huggingface basic model handlers.
# Make sure you install huggingface and other packages properly.
from collections import Counter
import json

from nltk.tokenize import TweetTokenizer
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import matthews_corrcoef
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math

import logging
# Module-level logger for this notebook; the handler/format is configured
# by the logging.basicConfig(...) call in a later cell.
logger = logging.getLogger(__name__)

import os
# Keep Hugging Face downloads in a project-local cache directory so that a
# shared machine's common cache directory is not overloaded.
# NOTE(review): `transformers` is already imported earlier in this cell; if
# the library reads this variable at import time, this assignment comes too
# late -- confirm, or move it ahead of the first `transformers` import.
os.environ["TRANSFORMERS_CACHE"] = "../huggingface_cache/"

import random
import sys
from dataclasses import dataclass, field
from typing import Optional
import torch
import argparse
import numpy as np
import pandas as pd
from datasets import load_dataset, load_metric
from datasets import Dataset
from datasets import DatasetDict
from tqdm import tqdm, trange

import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
    EarlyStoppingCallback
)
from transformers.trainer_utils import is_main_process, EvaluationStrategy

import matplotlib.pyplot as plt
# Global matplotlib typography: every figure uses Times New Roman, size 30.
font = dict(family="Times New Roman", size=30)
plt.rcParams["font.family"] = "Times New Roman"
plt.rc("font", **font)

#### Setups

In [6]:
def get_dataset(inoculation_data_path, eval_data_path=None, test_data_path=None,
                inoculation_step_sample_size=1.0,
                eval_sample_limit=-1, seed=42):
    """
    Load the train / validation (/ test) splits used by this notebook.

    The input layout is chosen from ``inoculation_data_path``:

    * a path ending in ``.tsv``: the three splits are read with pandas
      (tab separated) from ``inoculation_data_path``, ``eval_data_path``
      and ``test_data_path`` and converted to huggingface ``Dataset``s;
    * any other path that contains a ``.`` (this includes relative paths
      starting with ``../``): loaded with ``DatasetDict.load_from_disk``.
      ``eval_data_path`` / ``test_data_path`` are not needed in this case;
    * a bare name without any ``.``: not supported.

    Args:
        inoculation_data_path: train ``.tsv`` file or saved dataset directory.
        eval_data_path: validation ``.tsv`` file (tsv layout only).
        test_data_path: test ``.tsv`` file (tsv layout only).
        inoculation_step_sample_size: fraction of the train split to keep;
            the kept rows are drawn without replacement using ``seed``.
        eval_sample_limit: for the saved-dataset layout, maximum number of
            validation examples to keep (seeded shuffle, then truncate).
            ``-1`` keeps the whole validation split in its stored order.
        seed: seed for the train sub-sampling and the validation shuffle.

    Returns:
        A plain dict of huggingface datasets with keys ``"train"`` and
        ``"validation"``; the tsv layout additionally has ``"test"``.

    Raises:
        ValueError: tsv layout requested without eval/test paths.
        NotImplementedError: ``inoculation_data_path`` is a bare dataset name.
    """
    if inoculation_data_path.split(".")[-1] == "tsv":
        if eval_data_path is None or test_data_path is None:
            raise ValueError(
                "eval_data_path and test_data_path are required when "
                "inoculation_data_path is a .tsv file."
            )
        train_df = pd.read_csv(inoculation_data_path, delimiter="\t")
        eval_df = pd.read_csv(eval_data_path, delimiter="\t")
        test_df = pd.read_csv(test_data_path, delimiter="\t")
        inoculation_step_sample_size = int(len(train_df) * inoculation_step_sample_size)
        logger.info("***** Inoculation Sample Count: %s *****", inoculation_step_sample_size)
        # The same seed always selects the same subset of training rows.
        inoculation_train_df = train_df.sample(n=inoculation_step_sample_size,
                                               replace=False,
                                               random_state=seed)
        return {
            "train": Dataset.from_pandas(inoculation_train_df),
            "validation": Dataset.from_pandas(eval_df),
            "test": Dataset.from_pandas(test_df),
        }

    if len(inoculation_data_path.split(".")) <= 1:
        # A bare name such as "sst3" would mean "download from the hub",
        # which this helper does not implement.
        raise NotImplementedError(
            f"Loading downloaded huggingface datasets is not supported: {inoculation_data_path}"
        )

    logger.info("***** Loading pre-loaded datasets from the disk directly! *****")
    loaded = DatasetDict.load_from_disk(inoculation_data_path)
    inoculation_step_sample_size = int(len(loaded["train"]) * inoculation_step_sample_size)
    logger.info("***** Inoculation Sample Count: %s *****", inoculation_step_sample_size)
    inoculation_train = loaded["train"].shuffle(seed=seed).select(
        range(inoculation_step_sample_size))

    validation = loaded["validation"]
    if eval_sample_limit != -1:
        # Apply the limit to the split that is actually returned; never ask
        # for more rows than the split contains.
        keep = min(eval_sample_limit, len(validation))
        validation = validation.shuffle(seed=seed).select(range(keep))

    return {"train": inoculation_train, "validation": validation}

In [7]:
# Text column(s) per task: (first sentence column, second sentence column or
# None for single-sentence tasks).
TASK_CONFIG = {
    "sst3": ("text", None),
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "snli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence")
}
# The BERT tokenizer is not needed for this baseline; a basic tokenizer
# from the project utilities is used instead.
modified_basic_tokenizer = ModifiedBasicTokenizer()
max_length = 128  # tokens kept per example when building features
per_device_train_batch_size = 128
per_device_eval_batch_size = 128
no_cuda = True
device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
# Used as a batch-size multiplier, so it must never be 0: fall back to 1
# (plain CPU run) when CUDA is disabled or no GPU is present.
n_gpu = max(1, torch.cuda.device_count()) if not no_cuda else 1
seed = 42
lr = 1e-3
num_train_epochs = 10
task_name = "sst3"
sentence1_key, sentence2_key = TASK_CONFIG[task_name]

# Seed every RNG involved so runs are repeatable.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if device.type == "cuda":
    torch.cuda.manual_seed_all(seed)

In [8]:
# Configure logging so the INFO messages emitted below are displayed.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)

# The "sst3" task reads its files from the "sst-tenary" directory; every
# other task uses its own name.
data_file_name = "sst-tenary" if task_name == "sst3" else task_name
datasets = get_dataset(
    f"../data-files/{data_file_name}/{data_file_name}-train.tsv",
    f"../data-files/{data_file_name}/{data_file_name}-dev.tsv",
    f"../data-files/{data_file_name}/{data_file_name}-test.tsv",
)

# Report the size of each split as a sanity check.
logger.info("***** Train Sample Count (Verify): %s *****", len(datasets["train"]))
logger.info("***** Valid Sample Count (Verify): %s *****", len(datasets["validation"]))
logger.info("***** Test Sample Count (Verify): %s *****", len(datasets["test"]))

03/17/2021 15:41:03 - INFO - __main__ - ***** Inoculation Sample Count: 159274 *****
03/17/2021 15:41:03 - INFO - __main__ - ***** Train Sample Count (Verify): 159274 *****
03/17/2021 15:41:03 - INFO - __main__ - ***** Valid Sample Count (Verify): 1100 *****
03/17/2021 15:41:03 - INFO - __main__ - ***** Test Sample Count (Verify): 2210 *****


#### BoW preprocessor

In [270]:
# Build the token -> feature-column vocabulary for the bag-of-words features.
# Tokens are numbered in order of first appearance, scanning the train split
# first and then (unless train_data_only is set) validation and test.
train_data_only = False
vocab_split_names = ["train"] if train_data_only else ["train", "validation", "test"]

original_vocab = OrderedDict()
for split_name in vocab_split_names:
    if split_name not in datasets:
        continue
    for (ex_index, example) in enumerate(tqdm(datasets[split_name])):
        if sentence2_key is None:
            sentence_combined = example[sentence1_key]
        else:
            sentence_combined = example[sentence1_key] + " [SEP] " + example[sentence2_key]
        sentence_tokens = modified_basic_tokenizer.tokenize(sentence_combined)
        for token in sentence_tokens:
            # setdefault only inserts unseen tokens, so the next free index
            # is always the current vocabulary size.
            original_vocab.setdefault(token, len(original_vocab))

# Next free index; equals the vocabulary size.
vocab_index = len(original_vocab)

100%|██████████| 159274/159274 [00:26<00:00, 5916.54it/s]
100%|██████████| 1100/1100 [00:00<00:00, 2903.59it/s]
100%|██████████| 2210/2210 [00:00<00:00, 2851.08it/s]


In [283]:
# BoW feature vectors for train split.
# The full (examples x vocab) count matrix is allocated once and filled row
# by row; building one tensor per example and stacking/copying afterwards
# would hold several copies of this very large matrix in memory.
train_input_features = torch.zeros(
    len(datasets["train"]), len(original_vocab), dtype=torch.float)
train_label_ids = []
for (ex_index, example) in enumerate(tqdm(datasets["train"])):
    if sentence2_key is None:
        sentence_combined = example[sentence1_key]
    else:
        sentence_combined = example[sentence1_key] + " [SEP] " + example[sentence2_key]
    sentence_tokens = modified_basic_tokenizer.tokenize(sentence_combined)
    sentence_tokens = sentence_tokens[:max_length]
    if ex_index % 50000 == 0:
        print("Example sentence: " + sentence_combined)
    # Row view into the preallocated matrix: in-place updates land in it.
    bow_feature = train_input_features[ex_index]
    for t in sentence_tokens:
        bow_feature[original_vocab[t]] += 1
    train_label_ids.append(example["label"])

train_label_ids = torch.tensor(train_label_ids, dtype=torch.long)
train_data = TensorDataset(train_input_features, train_label_ids)

  0%|          | 572/159274 [00:00<00:55, 2851.42it/s]

Example sentence: Surprisingly, considering that Baird is a former film editor, the movie is rather choppy.


 32%|███▏      | 50479/159274 [00:15<00:26, 4127.77it/s]

Example sentence: achronological


 63%|██████▎   | 100487/159274 [00:27<00:14, 4011.50it/s]

Example sentence: Show


 95%|█████████▍| 150776/159274 [00:40<00:02, 4030.62it/s]

Example sentence: picked me up ,


100%|██████████| 159274/159274 [00:42<00:00, 3767.60it/s]


In [284]:
# BoW feature vectors for validation split.
# Allocate the (examples x vocab) count matrix once and fill it row by row
# instead of stacking per-example tensors and copying the result again.
validation_input_features = torch.zeros(
    len(datasets["validation"]), len(original_vocab), dtype=torch.float)
validation_label_ids = []
for (ex_index, example) in enumerate(tqdm(datasets["validation"])):
    if sentence2_key is None:
        sentence_combined = example[sentence1_key]
    else:
        sentence_combined = example[sentence1_key] + " [SEP] " + example[sentence2_key]
    sentence_tokens = modified_basic_tokenizer.tokenize(sentence_combined)
    sentence_tokens = sentence_tokens[:max_length]
    # Row view into the preallocated matrix: in-place updates land in it.
    bow_feature = validation_input_features[ex_index]
    for t in sentence_tokens:
        # Tokens missing from the vocabulary contribute nothing.
        if t in original_vocab:
            bow_feature[original_vocab[t]] += 1
    validation_label_ids.append(example["label"])

validation_label_ids = torch.tensor(validation_label_ids, dtype=torch.long)
validation_data = TensorDataset(validation_input_features, validation_label_ids)

100%|██████████| 1100/1100 [00:00<00:00, 1953.40it/s]


In [285]:
# Data loaders: training batches are drawn through a random sampler, while
# validation batches keep the dataset order. Batch sizes scale with n_gpu.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=per_device_train_batch_size * n_gpu,
    sampler=train_sampler,
)
validation_dataloader = DataLoader(
    validation_data,
    shuffle=False,
    batch_size=per_device_eval_batch_size * n_gpu,
)

#### BoW Classifier

In [290]:
class BOWClassifier(nn.Module):
    """Single linear layer mapping a vocabulary-sized input to label logits."""

    def __init__(self, num_labels, vocab_size):
        super().__init__()
        self.classifier = nn.Linear(in_features=vocab_size,
                                    out_features=num_labels,
                                    bias=True)

    def forward(self, x, labels=None):
        """
        Return the logits for ``x``; when ``labels`` is given, return the
        tuple ``(cross-entropy loss, logits)`` instead.
        """
        logits = self.classifier(x)
        if labels is None:
            return logits
        loss = CrossEntropyLoss()(logits, labels)
        return loss, logits

In [287]:
class MockBERTBOWClassifier(nn.Module):
    """
    Two-layer bag-of-words classifier: a bias-free linear projection to a
    32-dimensional hidden vector, a tanh, then a bias-free linear layer
    producing the label logits.
    """

    def __init__(self, num_labels, vocab_size):
        super().__init__()
        hidden_dim = 32
        self.mock_bert = nn.Linear(in_features=vocab_size, out_features=hidden_dim, bias=False)
        self.mock_activation = nn.Tanh()
        self.classifier = nn.Linear(in_features=hidden_dim, out_features=num_labels, bias=False)

    def forward(self, x, labels=None):
        """
        Return the logits for ``x``; when ``labels`` is given, return the
        tuple ``(cross-entropy loss, logits)`` instead.
        """
        hidden = self.mock_bert(x)
        logits = self.classifier(self.mock_activation(hidden))
        if labels is None:
            return logits
        loss = CrossEntropyLoss()(logits, labels)
        return loss, logits

In [291]:
# Hyper-parameter overrides for this run, then model / optimizer creation.
lr = 1e-3
num_train_epochs = 10
# Size the output layer from the largest label id seen in either split, so a
# class that happens to be absent from validation is still representable
# (CrossEntropyLoss expects class indices in [0, num_labels)).
num_labels = int(torch.cat([train_label_ids, validation_label_ids]).max().item()) + 1
model = BOWClassifier(num_labels, len(original_vocab))
# Put the parameters on the same device the training loop sends batches to.
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
if n_gpu > 0 and device.type == "cuda":
    model = torch.nn.DataParallel(model)

#### Main training loop

In [292]:
def evaluate_bow_model(model, dataloader):
    """
    Run ``model`` over ``dataloader`` without gradients.

    The model is switched to eval mode; the caller is responsible for
    switching it back to train mode if training continues.

    Returns:
        (label_ids, predictions): two 1-D numpy arrays holding the gold
        labels and the argmax class predictions, in dataloader order.
    """
    model.eval()
    gold_batches, predicted_batches = [], []
    with torch.no_grad():
        for eval_features, eval_label_ids in dataloader:
            eval_logits = model(eval_features.to(device))
            # argmax over raw logits picks the same class as after softmax
            predicted_batches.append(np.argmax(eval_logits.cpu().numpy(), axis=1))
            gold_batches.append(eval_label_ids.numpy())
    return np.concatenate(gold_batches, axis=0), np.concatenate(predicted_batches, axis=0)


eval_every_n_steps = 500  # evaluate on the validation split every N optimizer steps
global_step = 0
for epoch in range(int(num_train_epochs)):

    model.train()
    for input_features, label_ids in train_dataloader:
        input_features = input_features.to(device)
        label_ids = label_ids.to(device)

        optimizer.zero_grad()
        loss, _ = model(input_features, labels=label_ids)
        if n_gpu > 1:
            loss = loss.mean() # mean() to average on multi-gpu.
        loss.backward()
        optimizer.step()

        if global_step % eval_every_n_steps == 0:
            logger.info("***** Evaluation Interval Hit *****")
            # all_logits holds predicted class ids (name kept from earlier cells).
            all_label_ids, all_logits = evaluate_bow_model(model, validation_dataloader)
            result_to_save = classification_report(all_label_ids, all_logits, digits=5, output_dict=True)
            print(classification_report(all_label_ids, all_logits, digits=5))
            print("Macro-F1: ", result_to_save["macro avg"]["f1-score"])
            # Evaluation left the model in eval mode; resume training mode
            # for the remaining steps of this epoch.
            model.train()

        global_step += 1

03/18/2021 02:46:40 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.37189   0.48832   0.42222       428
           1    0.51042   0.11036   0.18148       444
           2    0.20362   0.39474   0.26866       228

    accuracy                        0.31636      1100
   macro avg    0.36197   0.33114   0.29079      1100
weighted avg    0.39293   0.31636   0.29322      1100

Macro-F1:  0.2907868067072047


03/18/2021 02:46:42 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.67913   0.50935   0.58211       428
           1    0.63620   0.78378   0.70232       444
           2    0.33621   0.34211   0.33913       228

    accuracy                        0.58545      1100
   macro avg    0.55051   0.54508   0.54119      1100
weighted avg    0.59072   0.58545   0.58027      1100

Macro-F1:  0.541186934026759


03/18/2021 02:46:45 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65316   0.60280   0.62697       428
           1    0.66224   0.78604   0.71885       444
           2    0.35393   0.27632   0.31034       228

    accuracy                        0.60909      1100
   macro avg    0.55645   0.55505   0.55206      1100
weighted avg    0.59480   0.60909   0.59843      1100

Macro-F1:  0.5520552870437704


03/18/2021 02:46:47 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.66114   0.65187   0.65647       428
           1    0.69231   0.79054   0.73817       444
           2    0.36257   0.27193   0.31078       228

    accuracy                        0.62909      1100
   macro avg    0.57201   0.57145   0.56847      1100
weighted avg    0.61183   0.62909   0.61779      1100

Macro-F1:  0.5684726258647795


03/18/2021 02:46:49 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.67143   0.65888   0.66509       428
           1    0.69583   0.78829   0.73918       444
           2    0.36158   0.28070   0.31605       228

    accuracy                        0.63273      1100
   macro avg    0.57628   0.57596   0.57344      1100
weighted avg    0.61705   0.63273   0.62265      1100

Macro-F1:  0.5734400228985358


03/18/2021 02:46:52 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.67734   0.64252   0.65947       428
           1    0.68750   0.79279   0.73640       444
           2    0.36264   0.28947   0.32195       228

    accuracy                        0.63000      1100
   macro avg    0.57583   0.57493   0.57261      1100
weighted avg    0.61621   0.63000   0.62056      1100

Macro-F1:  0.5726084384049042


03/18/2021 02:46:54 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.67619   0.66355   0.66981       428
           1    0.69643   0.79054   0.74051       444
           2    0.35227   0.27193   0.30693       228

    accuracy                        0.63364      1100
   macro avg    0.57496   0.57534   0.57242      1100
weighted avg    0.61722   0.63364   0.62313      1100

Macro-F1:  0.5724161143126493


03/18/2021 02:46:56 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.68780   0.65888   0.67303       428
           1    0.69685   0.79730   0.74370       444
           2    0.37363   0.29825   0.33171       228

    accuracy                        0.64000      1100
   macro avg    0.58609   0.58481   0.58281      1100
weighted avg    0.62634   0.64000   0.63081      1100

Macro-F1:  0.5828119407725835


03/18/2021 02:46:58 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.68841   0.66589   0.67696       428
           1    0.69591   0.80405   0.74608       444
           2    0.39306   0.29825   0.33915       228

    accuracy                        0.64545      1100
   macro avg    0.59246   0.58940   0.58740      1100
weighted avg    0.63022   0.64545   0.63484      1100

Macro-F1:  0.5873977481184788


03/18/2021 02:47:01 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.68720   0.67757   0.68235       428
           1    0.70217   0.80180   0.74869       444
           2    0.40351   0.30263   0.34586       228

    accuracy                        0.65000      1100
   macro avg    0.59763   0.59400   0.59230      1100
weighted avg    0.63444   0.65000   0.63938      1100

Macro-F1:  0.5923010656473559


03/18/2021 02:47:03 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.70250   0.65654   0.67874       428
           1    0.69844   0.80856   0.74948       444
           2    0.39785   0.32456   0.35749       228

    accuracy                        0.64909      1100
   macro avg    0.59960   0.59655   0.59524      1100
weighted avg    0.63772   0.64909   0.64071      1100

Macro-F1:  0.5952366544633042


03/18/2021 02:47:06 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.70647   0.66355   0.68434       428
           1    0.70276   0.80405   0.75000       444
           2    0.41053   0.34211   0.37321       228

    accuracy                        0.65364      1100
   macro avg    0.60658   0.60324   0.60251      1100
weighted avg    0.64363   0.65364   0.64635      1100

Macro-F1:  0.6025143636747948


03/18/2021 02:47:08 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.71287   0.67290   0.69231       428
           1    0.70238   0.79730   0.74684       444
           2    0.40104   0.33772   0.36667       228

    accuracy                        0.65364      1100
   macro avg    0.60543   0.60264   0.60194      1100
weighted avg    0.64400   0.65364   0.64682      1100

Macro-F1:  0.6019366006707779


03/18/2021 02:47:10 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.71679   0.66822   0.69166       428
           1    0.69960   0.79730   0.74526       444
           2    0.39487   0.33772   0.36407       228

    accuracy                        0.65182      1100
   macro avg    0.60376   0.60108   0.60033      1100
weighted avg    0.64313   0.65182   0.64539      1100

Macro-F1:  0.6003286472776027


03/18/2021 02:47:13 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.71429   0.64252   0.67651       428
           1    0.69396   0.80180   0.74399       444
           2    0.39109   0.34649   0.36744       228

    accuracy                        0.64545      1100
   macro avg    0.59978   0.59694   0.59598      1100
weighted avg    0.63909   0.64545   0.63969      1100

Macro-F1:  0.595980088692044


03/18/2021 02:47:15 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.70951   0.64486   0.67564       428
           1    0.70400   0.79279   0.74576       444
           2    0.37441   0.34649   0.35991       228

    accuracy                        0.64273      1100
   macro avg    0.59597   0.59471   0.59377      1100
weighted avg    0.63783   0.64273   0.63850      1100

Macro-F1:  0.5937713968501757


03/18/2021 02:47:17 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.71538   0.65187   0.68215       428
           1    0.69201   0.79955   0.74190       444
           2    0.38579   0.33333   0.35765       228

    accuracy                        0.64545      1100
   macro avg    0.59773   0.59492   0.59390      1100
weighted avg    0.63763   0.64545   0.63901      1100

Macro-F1:  0.5939001414833728


03/18/2021 02:47:20 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.71354   0.64019   0.67488       428
           1    0.69841   0.79279   0.74262       444
           2    0.37736   0.35088   0.36364       228

    accuracy                        0.64182      1100
   macro avg    0.59644   0.59462   0.59371      1100
weighted avg    0.63775   0.64182   0.63771      1100

Macro-F1:  0.5937097482274262


03/18/2021 02:47:22 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.70025   0.66589   0.68263       428
           1    0.70200   0.79054   0.74364       444
           2    0.38342   0.32456   0.35154       228

    accuracy                        0.64545      1100
   macro avg    0.59522   0.59366   0.59261      1100
weighted avg    0.63528   0.64545   0.63863      1100

Macro-F1:  0.5926075804428022


03/18/2021 02:47:25 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.70558   0.64953   0.67640       428
           1    0.70281   0.78829   0.74310       444
           2    0.37019   0.33772   0.35321       228

    accuracy                        0.64091      1100
   macro avg    0.59286   0.59185   0.59090      1100
weighted avg    0.63495   0.64091   0.63633      1100

Macro-F1:  0.590903274541359


03/18/2021 02:47:27 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.70927   0.66121   0.68440       428
           1    0.69739   0.78378   0.73807       444
           2    0.38119   0.33772   0.35814       228

    accuracy                        0.64364      1100
   macro avg    0.59595   0.59424   0.59354      1100
weighted avg    0.63648   0.64364   0.63844      1100

Macro-F1:  0.5935369917690262


03/18/2021 02:47:30 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.71684   0.65654   0.68537       428
           1    0.70200   0.79054   0.74364       444
           2    0.37981   0.34649   0.36239       228

    accuracy                        0.64636      1100
   macro avg    0.59955   0.59786   0.59713      1100
weighted avg    0.64099   0.64636   0.64194      1100

Macro-F1:  0.597131747518688


03/18/2021 02:47:32 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.72280   0.65187   0.68550       428
           1    0.70120   0.79279   0.74419       444
           2    0.37736   0.35088   0.36364       228

    accuracy                        0.64636      1100
   macro avg    0.60045   0.59851   0.59778      1100
weighted avg    0.64248   0.64636   0.64248      1100

Macro-F1:  0.5977753652172257


03/18/2021 02:47:34 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.72487   0.64019   0.67990       428
           1    0.70868   0.77252   0.73922       444
           2    0.36975   0.38596   0.37768       228

    accuracy                        0.64091      1100
   macro avg    0.60110   0.59956   0.59894      1100
weighted avg    0.64473   0.64091   0.64120      1100

Macro-F1:  0.5989357619271282


03/18/2021 02:47:37 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.72040   0.66822   0.69333       428
           1    0.70363   0.78604   0.74255       444
           2    0.38164   0.34649   0.36322       228

    accuracy                        0.64909      1100
   macro avg    0.60189   0.60025   0.59970      1100
weighted avg    0.64342   0.64909   0.64478      1100

Macro-F1:  0.599701638542431


#### Evaluations with frequency-matched scrambling

In [282]:
# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
corrupt_method = "S2"
data_file_name = task_name if task_name != "sst3" else "sst-tenary"
corrupt_datasets = get_dataset(f"../data-files/{data_file_name}-corrupted-{corrupt_method}")
# Report the sizes of the corrupted splits that were just loaded.
logger.info("***** Train Sample Count (Verify): %s *****", len(corrupt_datasets["train"]))
logger.info("***** Valid Sample Count (Verify): %s *****", len(corrupt_datasets["validation"]))

# BoW features for the corrupted validation split, built the same way as the
# clean validation features (truncate to max_length, skip unknown tokens).
corrupt_validation_input_features = torch.zeros(
    len(corrupt_datasets["validation"]), len(original_vocab), dtype=torch.float)
corrupt_validation_label_ids = []
num_oov_tokens = 0
for (ex_index, example) in enumerate(tqdm(corrupt_datasets["validation"])):
    if sentence2_key is None:
        sentence_combined = example[sentence1_key]
    else:
        sentence_combined = example[sentence1_key] + " [SEP] " + example[sentence2_key]
    sentence_tokens = modified_basic_tokenizer.tokenize(sentence_combined)
    sentence_tokens = sentence_tokens[:max_length]
    # Row view into the preallocated matrix: in-place updates land in it.
    bow_feature = corrupt_validation_input_features[ex_index]
    for t in sentence_tokens:
        if t in original_vocab:
            bow_feature[original_vocab[t]] += 1
        else:
            num_oov_tokens += 1
    corrupt_validation_label_ids.append(example["label"])
if num_oov_tokens > 0:
    logger.warning("Skipped %s corrupted tokens that are not in the vocabulary.", num_oov_tokens)

corrupt_validation_label_ids = torch.tensor(corrupt_validation_label_ids, dtype=torch.long)
corrupt_validation_data = TensorDataset(corrupt_validation_input_features, corrupt_validation_label_ids)
corrupt_validation_dataloader = DataLoader(corrupt_validation_data, batch_size=per_device_eval_batch_size*n_gpu, shuffle=False)

logger.info("***** Evaluation With Corrupt Data *****")
model.eval()
all_logits = []  # predicted class ids per batch
all_label_ids = []
with torch.no_grad():
    for input_features, label_ids in corrupt_validation_dataloader:
        logits = model(input_features.to(device))
        # argmax over raw logits picks the same class as after softmax
        all_logits.append(np.argmax(logits.cpu().numpy(), axis=1))
        all_label_ids.append(label_ids.numpy())

all_logits = np.concatenate(all_logits, axis=0)
all_label_ids = np.concatenate(all_label_ids, axis=0)
result_to_save = classification_report(all_label_ids, all_logits, digits=5, output_dict=True)
print(classification_report(all_label_ids, all_logits, digits=5))
print("Macro-F1: ", result_to_save["macro avg"]["f1-score"])

03/18/2021 02:43:20 - INFO - __main__ - ***** Loading pre-loaded datasets from the disk directly! *****
03/18/2021 02:43:20 - INFO - __main__ - ***** Inoculation Sample Count: 159274 *****
Loading cached shuffled indices for dataset at ../data-files/sst-tenary-corrupted-S2/train/cache-3726aee8a4df65db.arrow
Loading cached shuffled indices for dataset at ../data-files/sst-tenary-corrupted-S2/validation/cache-a383bac85d9577d4.arrow
03/18/2021 02:43:20 - INFO - __main__ - ***** Train Sample Count (Verify): 159274 *****
03/18/2021 02:43:20 - INFO - __main__ - ***** Valid Sample Count (Verify): 1100 *****
100%|██████████| 1100/1100 [00:00<00:00, 1359.52it/s]
03/18/2021 02:43:21 - INFO - __main__ - ***** Evaluation With Corrupt Data *****


              precision    recall  f1-score   support

           0    0.33585   0.20794   0.25685       428
           1    0.37057   0.30631   0.33539       444
           2    0.19872   0.40789   0.26724       228

    accuracy                        0.28909      1100
   macro avg    0.30171   0.30738   0.28649      1100
weighted avg    0.32144   0.28909   0.29071      1100

Macro-F1:  0.28649468184524945


#### Evaluations with frequency-unmatched scrambling

In [281]:
# Evaluate the current `model` on the S3-corrupted validation split using
# plain bag-of-words features indexed by `original_vocab`.
# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
corrupt_method = "S3"
data_file_name = task_name if task_name != "sst3" else "sst-tenary"
corrupt_datasets = get_dataset(f"../data-files/{data_file_name}-corrupted-{corrupt_method}")
# Report the sizes of the splits that were just loaded. The previous version
# logged `datasets` (a different, earlier variable), so it could not actually
# verify the corrupted data.
logger.info("***** Train Sample Count (Verify): %s *****", len(corrupt_datasets["train"]))
logger.info("***** Valid Sample Count (Verify): %s *****", len(corrupt_datasets["validation"]))

corrupt_validation_input_features = []
corrupt_validation_label_ids = []
for (ex_index, example) in enumerate(tqdm(corrupt_datasets["validation"])):
    # One count per vocabulary entry; torch.zeros is float32 by default.
    bow_feature = torch.zeros(len(original_vocab))
    if sentence2_key is None:
        sentence_combined = example[sentence1_key]
    else:
        sentence_combined = example[sentence1_key] + " [SEP] " + example[sentence2_key]
    sentence_tokens = modified_basic_tokenizer.tokenize(sentence_combined)
    for t in sentence_tokens:
        # NOTE(review): raises KeyError for tokens missing from original_vocab;
        # presumably the vocab covers the corrupted data -- confirm.
        bow_feature[original_vocab[t]] += 1
    corrupt_validation_input_features.append(bow_feature)
    corrupt_validation_label_ids.append(example["label"])

# torch.stack already yields a tensor; cast with .to() instead of re-wrapping
# it in torch.tensor(), which copies the matrix and emits a UserWarning.
corrupt_validation_input_features = torch.stack(corrupt_validation_input_features, dim=0).to(torch.float)
corrupt_validation_label_ids = torch.tensor(corrupt_validation_label_ids, dtype=torch.long)
corrupt_validation_data = TensorDataset(corrupt_validation_input_features, corrupt_validation_label_ids)
# max(1, n_gpu) keeps the batch size positive when running without GPUs.
corrupt_validation_dataloader = DataLoader(
    corrupt_validation_data,
    batch_size=per_device_eval_batch_size * max(1, n_gpu),
    shuffle=False,
)

logger.info("***** Evaluation With Corrupt Data *****")
model.eval()
use_cuda = torch.cuda.is_available() and not no_cuda
all_logits = []     # despite the name, collects predicted class ids (argmax)
all_label_ids = []
with torch.no_grad():
    for input_features, label_ids in corrupt_validation_dataloader:
        if use_cuda:
            torch.cuda.empty_cache()
            input_features = input_features.to(device)
            label_ids = label_ids.to(device)

        eval_loss, logits = model(input_features, labels=label_ids)
        probabilities = F.softmax(logits, dim=-1).detach().cpu().numpy()
        all_logits.append(np.argmax(probabilities, axis=1))
        all_label_ids.append(label_ids.to('cpu').numpy())

all_logits = np.concatenate(all_logits, axis=0)
all_label_ids = np.concatenate(all_label_ids, axis=0)
result_to_save = classification_report(all_label_ids, all_logits, digits=5, output_dict=True)
print(classification_report(all_label_ids, all_logits, digits=5))
print("Macro-F1: ", result_to_save["macro avg"]["f1-score"])

03/18/2021 02:43:14 - INFO - __main__ - ***** Loading pre-loaded datasets from the disk directly! *****
03/18/2021 02:43:14 - INFO - __main__ - ***** Inoculation Sample Count: 159274 *****
Loading cached shuffled indices for dataset at ../data-files/sst-tenary-corrupted-S3/train/cache-cd54032ebd6257b5.arrow
Loading cached shuffled indices for dataset at ../data-files/sst-tenary-corrupted-S3/validation/cache-4581ae0bb7454f12.arrow
03/18/2021 02:43:15 - INFO - __main__ - ***** Train Sample Count (Verify): 159274 *****
03/18/2021 02:43:15 - INFO - __main__ - ***** Valid Sample Count (Verify): 1100 *****
100%|██████████| 1100/1100 [00:01<00:00, 1088.40it/s]
03/18/2021 02:43:16 - INFO - __main__ - ***** Evaluation With Corrupt Data *****


              precision    recall  f1-score   support

           0    0.56250   0.02103   0.04054       428
           1    0.30435   0.03153   0.05714       444
           2    0.20520   0.93421   0.33649       228

    accuracy                        0.21455      1100
   macro avg    0.35735   0.32892   0.14473      1100
weighted avg    0.38424   0.21455   0.10858      1100

Macro-F1:  0.14472542955955278


#### Random guessing baseline
If we randomly guess the labels, what is the performance now?

In [111]:
# Random-guessing baseline: average macro-F1 of a stratified dummy classifier
# (predictions drawn from the empirical label distribution) over several runs.
import numpy as np
from sklearn.dummy import DummyClassifier

mf1s = []
runs = 100
for i in range(runs):
    # Seed each run with its index: the runs still differ from one another,
    # but the reported average is reproducible across kernel restarts.
    dummy_clf = DummyClassifier(strategy="stratified", random_state=i)
    dummy_clf.fit(validation_input_features, validation_label_ids)
    dummy_labels = dummy_clf.predict(validation_input_features)

    # dummy performance
    result_to_save = classification_report(validation_label_ids, dummy_labels, digits=5, output_dict=True)
    mf1s.append(result_to_save["macro avg"]["f1-score"])
# Full report for the last run only, followed by the average over all runs.
print(classification_report(validation_label_ids, dummy_labels, digits=5))
print(f"AVG over {runs} runs mF1: {round(sum(mf1s)/len(mf1s), 6)}.")

              precision    recall  f1-score   support

           0    0.38519   0.36449   0.37455       428
           1    0.40088   0.40991   0.40535       444
           2    0.20747   0.21930   0.21322       228

    accuracy                        0.35273      1100
   macro avg    0.33118   0.33123   0.33104      1100
weighted avg    0.35468   0.35273   0.35354      1100

AVG over 100 runs mF1: 0.331816.


#### FrequencyBoW classifiers

In [130]:
# task setups
task_name = "sst3"
num_labels = 3
FILENAME_CONFIG = {
    "sst3" : "sst-tenary"
}

# let us corrupt SST3 in the same way as before
# Every split lives at <external_output_dirname>/<stem>/<stem>-<split>.tsv and is
# read as a tab-separated file, then wrapped in a HuggingFace `Dataset`.
# NOTE: despite the `_df` suffix, the three names end up holding `Dataset` objects.
task_file_stem = FILENAME_CONFIG[task_name]
task_data_dir = os.path.join(external_output_dirname, task_file_stem)
train_df, eval_df, test_df = [
    Dataset.from_pandas(
        pd.read_csv(os.path.join(task_data_dir, f"{task_file_stem}-{split_name}.tsv"), delimiter="\t")
    )
    for split_name in ("train", "dev", "test")
]

In [131]:
# Count token frequencies over the train, dev and test splits together, and
# collect, per label, the list of every token seen under that label.
modified_basic_tokenizer = ModifiedBasicTokenizer()
label_vocab_map = {}      # label -> list of tokens (with repeats)
token_frequency_map = {}  # token -> corpus count; overwrite this everytime for a new dataset
# The three splits are processed identically, so loop over them instead of
# repeating the same body three times. The progress counter restarts per split.
for split in (train_df, eval_df, test_df):
    for i, example in enumerate(split):
        if i % 10000 == 0 and i != 0:
            print(f"processing #{i} example...")
        original_sentence = example['text']
        label = example['label']
        if len(original_sentence.strip()) == 0:
            continue  # skip blank sentences entirely
        tokens = modified_basic_tokenizer.tokenize(original_sentence)
        label_vocab_map.setdefault(label, []).extend(tokens)
        for t in tokens:
            token_frequency_map[t] = token_frequency_map.get(t, 0) + 1
# Same counts, ordered from most to least frequent (ties keep first-seen order
# because sorted() is stable).
task_token_frequency_map = OrderedDict(
    sorted(token_frequency_map.items(), key=operator.itemgetter(1), reverse=True)
)

processing #10000 example...
processing #20000 example...
processing #30000 example...
processing #40000 example...
processing #50000 example...
processing #60000 example...
processing #70000 example...
processing #80000 example...
processing #90000 example...
processing #100000 example...
processing #110000 example...
processing #120000 example...
processing #130000 example...
processing #140000 example...
processing #150000 example...


training BoW with 1st order frequency bins

In [293]:
# freq and bucket mappings
# Sorted list of the distinct token-frequency values observed in the corpus.
freq_set = sorted(set(task_token_frequency_map.values()))
bucket_count = 256
# Log-spaced bucket edges between the smallest and largest frequency; the last
# edge is dropped and the rest are rounded up to integers.
log10_min_freq = math.log(freq_set[0], 10)
log10_max_freq = math.log(freq_set[-1], 10)
bucket_edges = np.logspace(log10_min_freq, log10_max_freq, bucket_count, endpoint=True)
freq_bucket = [math.ceil(edge) for edge in bucket_edges[:-1]]
# the helper defined next maps a frequency to a bucket number given such edges
def find_bucket_number(freq, freq_bucket):
    """Return the 1-based position of the first edge in `freq_bucket` that `freq` does not exceed.

    If `freq` exceeds every edge (or `freq_bucket` is empty), `len(freq_bucket)`
    is returned, so such values share the number of the last bucket.
    """
    return next(
        (position + 1 for position, edge in enumerate(freq_bucket) if not freq > edge),
        len(freq_bucket),
    )

# Give every distinct frequency value its own bucket, numbered 0..N-1 in
# ascending frequency order (find_bucket_number / freq_bucket are not used here).
freq_bucket_map = {freq_value: bucket_idx for bucket_idx, freq_value in enumerate(freq_set)}
new_bucket_idx = len(freq_bucket_map)

bucket_length = new_bucket_idx # len(freq_bucket)

In [294]:
# FBoW feature vectors for train split
# Each example becomes a vector of length `bucket_length`; entry b counts how
# many of the example's tokens fall into frequency bucket b.
train_input_features = []
train_label_ids = []
for (ex_index, example) in enumerate(tqdm(train_df)):
    bow_feature = torch.zeros(bucket_length)
    if sentence2_key is None:
        sentence_combined = example[sentence1_key]
    else:
        sentence_combined = example[sentence1_key] + " [SEP] " + example[sentence2_key]
    sentence_tokens = modified_basic_tokenizer.tokenize(sentence_combined)
    sentence_tokens = sentence_tokens[:max_length]
    for t in sentence_tokens:
        # token -> corpus frequency -> bucket index
        bow_feature[freq_bucket_map[token_frequency_map[t]]] += 1 # bucket count
    if ex_index % 50000 == 0:
        # periodic sanity print of one example and its feature vector
        print("Example sentence: " + sentence_combined)
        print(bow_feature)
    train_input_features.append(bow_feature)
    train_label_ids.append(example["label"])

# torch.stack already returns a tensor; cast with .to() rather than wrapping it
# in torch.tensor(), which copies the whole matrix and emits a UserWarning.
train_input_features = torch.stack(train_input_features, dim=0).to(torch.float)
train_label_ids = torch.tensor(train_label_ids, dtype=torch.long)
train_data = TensorDataset(train_input_features, train_label_ids)

  0%|          | 611/159274 [00:00<00:53, 2963.90it/s]

Example sentence: This is one of the year's best films.
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

 32%|███▏      | 50484/159274 [00:16<00:35, 3065.35it/s]

Example sentence: there is plenty of room for editing, and a much shorter cut surely would have resulted in a smoother, more focused narrative without sacrificing any of the cultural intrigue
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        

 63%|██████▎   | 100566/159274 [00:31<00:13, 4436.31it/s]

Example sentence: his name was, uh, Michael Zaidan, was supposed to have like written the screenplay or something
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
      

 95%|█████████▍| 150612/159274 [00:43<00:02, 4325.13it/s]

Example sentence: They just don't work in concert.
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0

100%|██████████| 159274/159274 [00:45<00:00, 3508.71it/s]


In [295]:
# FBoW feature vectors for validation split
# Same featurisation as the train split: per-example counts of tokens in each
# frequency bucket.
validation_input_features = []
validation_label_ids = []
for (ex_index, example) in enumerate(tqdm(eval_df)):
    bow_feature = torch.zeros(bucket_length)
    if sentence2_key is None:
        sentence_combined = example[sentence1_key]
    else:
        sentence_combined = example[sentence1_key] + " [SEP] " + example[sentence2_key]
    sentence_tokens = modified_basic_tokenizer.tokenize(sentence_combined)
    sentence_tokens = sentence_tokens[:max_length]
    for t in sentence_tokens:
        # token -> corpus frequency -> bucket index
        bow_feature[freq_bucket_map[token_frequency_map[t]]] += 1 # bucket count
    validation_input_features.append(bow_feature)
    validation_label_ids.append(example["label"])

# torch.stack already returns a tensor; cast with .to() rather than wrapping it
# in torch.tensor(), which copies the data and emits a UserWarning.
validation_input_features = torch.stack(validation_input_features, dim=0).to(torch.float)
validation_label_ids = torch.tensor(validation_label_ids, dtype=torch.long)
validation_data = TensorDataset(validation_input_features, validation_label_ids)

100%|██████████| 1100/1100 [00:00<00:00, 2071.74it/s]


In [296]:
# data loader
# Effective batch size is the per-device size times the number of GPUs;
# max(1, n_gpu) keeps it positive on CPU-only runs (n_gpu == 0), where the
# plain product would be 0 and DataLoader would reject it.
device_multiplier = max(1, n_gpu)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=per_device_train_batch_size*device_multiplier)
validation_dataloader = DataLoader(validation_data, batch_size=per_device_eval_batch_size*device_multiplier, shuffle=False)

In [297]:
# some overriding fun stuffs!
# Hyper-parameters for the frequency-BoW classifier (these overwrite any values
# set earlier in the notebook).
lr = 1e-3
num_train_epochs = 20
# Number of classes is taken from the distinct labels present in the validation split.
observed_label_count = len(validation_label_ids.unique())
model = BOWClassifier(observed_label_count, bucket_length)
optimizer = optim.Adam(model.parameters(), lr=lr)
# Wrap for multi-GPU execution only when GPUs are present and CUDA is not disabled.
wrap_in_data_parallel = n_gpu > 0 and not no_cuda
if wrap_in_data_parallel:
    model = torch.nn.DataParallel(model)

In [298]:
# Train for `num_train_epochs` epochs; every 500 optimisation steps evaluate on
# the validation set and keep track of the best macro-F1 seen so far.
global_step = 0
max_score = -1
use_cuda = torch.cuda.is_available() and not no_cuda
for _ in range(int(num_train_epochs)):

    model.train()
    for step, batch in enumerate(train_dataloader):
        if use_cuda:
            torch.cuda.empty_cache()

        input_features, label_ids = batch

        if use_cuda:
            input_features = input_features.to(device)
            label_ids = label_ids.to(device)

        loss, _ = model(input_features, labels=label_ids)

        if n_gpu > 1:
            loss = loss.mean() # mean() to average on multi-gpu.
        loss.backward()

        optimizer.step()
        model.zero_grad()

        if global_step % 500 == 0:
            # Periodic validation. Separate variable names are used so the
            # training loop's `step`, `batch` and `loss` are not shadowed.
            model.eval()
            all_logits = []     # despite the name, collects predicted class ids (argmax)
            all_label_ids = []
            with torch.no_grad():
                for eval_features, eval_label_ids in validation_dataloader:
                    if use_cuda:
                        torch.cuda.empty_cache()
                        eval_features = eval_features.to(device)
                        eval_label_ids = eval_label_ids.to(device)

                    eval_loss, eval_logits = model(eval_features, labels=eval_label_ids)
                    eval_probs = F.softmax(eval_logits, dim=-1).detach().cpu().numpy()
                    all_logits.append(np.argmax(eval_probs, axis=1))
                    all_label_ids.append(eval_label_ids.to('cpu').numpy())

            all_logits = np.concatenate(all_logits, axis=0)
            all_label_ids = np.concatenate(all_label_ids, axis=0)
            result_to_save = classification_report(all_label_ids, all_logits, digits=5, output_dict=True)
            print("Macro-F1: ", result_to_save["macro avg"]["f1-score"])
            if result_to_save["macro avg"]["f1-score"] > max_score:
                max_score = result_to_save["macro avg"]["f1-score"]

            # Switch back to training mode; without this the remaining steps of
            # the epoch would run with the model still in eval mode.
            model.train()

        global_step += 1
print("Best Macro-F1: ", max_score)

Macro-F1:  0.29362903858075845
Macro-F1:  0.4815528782310475
Macro-F1:  0.505620890184787
Macro-F1:  0.5055478521616278
Macro-F1:  0.5158001378977433
Macro-F1:  0.5153792389952998
Macro-F1:  0.5238301864251756
Macro-F1:  0.5163669868353168
Macro-F1:  0.5132942135306541
Macro-F1:  0.5161602720484956
Macro-F1:  0.5219554069668625
Macro-F1:  0.5283716932135551
Macro-F1:  0.5217954766846714
Macro-F1:  0.5182836617860059
Macro-F1:  0.5201747288124605
Macro-F1:  0.5246612808098992
Macro-F1:  0.5194165512632037
Macro-F1:  0.5180882931234276
Macro-F1:  0.5197036593727948
Macro-F1:  0.5228242647642586
Macro-F1:  0.5251983411497888
Macro-F1:  0.516990662176519
Macro-F1:  0.5166227877916107
Macro-F1:  0.5219430344564765
Macro-F1:  0.5201008589034574
Macro-F1:  0.5265580483728647
Macro-F1:  0.5194328847335722
Macro-F1:  0.5227367621249202
Macro-F1:  0.515849535539506
Macro-F1:  0.5192073275478656
Macro-F1:  0.5241944157222207
Macro-F1:  0.5199166068415538
Macro-F1:  0.5201548005406526
Macro-F1:  0

training BoW with 1st and 2nd order frequency bins

In [259]:
# repartition the first order information
# Sorted list of the distinct token-frequency values observed in the corpus.
second_order_freq_set = sorted(set(task_token_frequency_map.values()))
temp_bucket_count = 24
# temp_bucket_count + 1 log-spaced edges between the smallest and largest
# frequency; the last edge is dropped and the rest are rounded up to integers.
log10_min_freq = math.log(second_order_freq_set[0], 10)
log10_max_freq = math.log(second_order_freq_set[-1], 10)
second_order_bucket_edges = np.logspace(log10_min_freq, log10_max_freq, temp_bucket_count+1,
                                        endpoint=True)
second_order_freq_bucket = [math.ceil(edge) for edge in second_order_bucket_edges[:-1]]
# the helper defined next maps a frequency to a bucket number given such edges
def find_bucket_number(freq, freq_bucket):
    """Map `freq` to a 1-based bucket number using the edges in `freq_bucket`.

    The result is the position (counting from 1) of the first edge that is not
    smaller than `freq`. When no edge qualifies, `len(freq_bucket)` is returned.
    """
    bucket_number = len(freq_bucket)  # fallback when freq is above every edge
    for position, edge in enumerate(freq_bucket, start=1):
        if not freq > edge:
            bucket_number = position
            break
    return bucket_number

# Lookup table: distinct token frequency -> coarse bucket number
# (as returned by find_bucket_number for the edges computed above).
second_order_freq_bucket_map = {
    freq_value: find_bucket_number(freq_value, second_order_freq_bucket)
    for freq_value in second_order_freq_set
}

In [260]:
# Count, over the train, dev and test splits together, how often each unordered
# pair of coarse frequency buckets co-occurs within a sentence.
modified_basic_tokenizer = ModifiedBasicTokenizer()
token_freq_freq_map = {}  # (smaller_bucket, larger_bucket) -> pair count; overwrite this everytime for a new dataset
# The three splits are processed identically, so loop over them instead of
# repeating the same body three times. The progress counter restarts per split.
for split in (train_df, eval_df, test_df):
    for ex_index, example in enumerate(split):
        if ex_index % 10000 == 0 and ex_index != 0:
            print(f"processing #{ex_index} example...")
        original_sentence = example['text']
        if len(original_sentence.strip()) == 0:
            continue  # skip blank sentences entirely
        tokens = modified_basic_tokenizer.tokenize(original_sentence)
        # Resolve each token's bucket once instead of once per pair.
        # Assumes every token was counted in token_frequency_map (built from
        # the same splits) -- otherwise this raises KeyError.
        token_buckets = [second_order_freq_bucket_map[token_frequency_map[t]] for t in tokens]
        # Every unordered pair of token positions (left < right) in the sentence.
        for left in range(len(token_buckets) - 1):
            for right in range(left + 1, len(token_buckets)):
                index_tuple = tuple(sorted((token_buckets[left], token_buckets[right])))
                token_freq_freq_map[index_tuple] = token_freq_freq_map.get(index_tuple, 0) + 1

# Same counts, ordered from most to least frequent pair (ties keep first-seen
# order because sorted() is stable).
task_token_freq_freq_map = OrderedDict(
    sorted(token_freq_freq_map.items(), key=operator.itemgetter(1), reverse=True)
)

processing #10000 example...
processing #20000 example...
processing #30000 example...
processing #40000 example...
processing #50000 example...
processing #60000 example...
processing #70000 example...
processing #80000 example...
processing #90000 example...
processing #100000 example...
processing #110000 example...
processing #120000 example...
processing #130000 example...
processing #140000 example...
processing #150000 example...


In [261]:
# repartition the first order information
# Sorted list of the distinct bucket-pair counts observed in the corpus.
second_order_freq_freq_set = sorted(set(task_token_freq_freq_map.values()))
# (An earlier, disabled variant grouped these counts into 48 log-spaced
# buckets; the active code gives every distinct count its own bucket.)
second_order_freq_freq_bucket_map = {
    pair_count: bucket_idx for bucket_idx, pair_count in enumerate(second_order_freq_freq_set)
}
new_bucket_idx = len(second_order_freq_freq_set)
# NOTE: this clears the first-order freq_bucket_map and overwrites bucket_length,
# so first-order feature cells must not be re-run after this one without
# rebuilding those two names.
freq_bucket_map = {}

bucket_length = new_bucket_idx # len(freq_bucket)
# the code above create second order buckets, now we can create second order BoW vectors!

In [262]:
# FBoW feature vectors for train split (2nd order)
# Each example becomes a vector of length `bucket_length`; for every unordered
# pair of tokens in the sentence, the entry selected by the corpus count of the
# pair's (bucket, bucket) combination is incremented.
# NOTE: first-order features are currently disabled here, so the vector holds
# second-order counts only (no concatenation with first-order bins).
train_input_features = []
train_label_ids = []
for (ex_index, example) in enumerate(tqdm(train_df)):
    bow_feature = torch.zeros(bucket_length) # up-to 2nd feature map
    if sentence2_key is None:
        sentence_combined = example[sentence1_key]
    else:
        sentence_combined = example[sentence1_key] + " [SEP] " + example[sentence2_key]
    sentence_tokens = modified_basic_tokenizer.tokenize(sentence_combined)
    sentence_tokens = sentence_tokens[:max_length]
    # awesome :) second order here!
    # Resolve each token's coarse bucket once instead of once per pair.
    token_buckets = [second_order_freq_bucket_map[token_frequency_map[t]] for t in sentence_tokens]
    for left in range(len(token_buckets) - 1):
        for right in range(left + 1, len(token_buckets)):
            index_tuple = tuple(sorted((token_buckets[left], token_buckets[right])))
            # bucket pair -> corpus count of that pair -> feature index
            second_order_bucket = second_order_freq_freq_bucket_map[task_token_freq_freq_map[index_tuple]]
            bow_feature[second_order_bucket] += 1 # bucket count

    if ex_index % 50000 == 0:
        # periodic sanity print of one example and its feature vector
        print("Example sentence: " + sentence_combined)
        print(bow_feature)
    train_input_features.append(bow_feature)
    train_label_ids.append(example["label"])

# torch.stack already returns a tensor; cast with .to() rather than wrapping it
# in torch.tensor(), which copies the whole matrix and emits a UserWarning.
train_input_features = torch.stack(train_input_features, dim=0).to(torch.float)
train_label_ids = torch.tensor(train_label_ids, dtype=torch.long)
train_data = TensorDataset(train_input_features, train_label_ids)

  0%|          | 223/159274 [00:00<02:19, 1144.25it/s]

Example sentence: This is one of the year's best films.
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 

 32%|███▏      | 50226/159274 [00:45<01:29, 1215.69it/s]

Example sentence: there is plenty of room for editing, and a much shorter cut surely would have resulted in a smoother, more focused narrative without sacrificing any of the cultural intrigue
tensor([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,
         1.,  0.,  1.,  1.,  1.,  0.,  1.,  0.,  2.,  6.,  1.,  0.,  2.,  2.,
         2.,  1.,  1.,  1., 

 63%|██████▎   | 100247/159274 [01:27<00:47, 1247.10it/s]

Example sentence: his name was, uh, Michael Zaidan, was supposed to have like written the screenplay or something
tensor([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  2.,  0.,
         0.,  0.,  0.,  0.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  1.,  0., 

 94%|█████████▍| 150178/159274 [02:12<00:07, 1167.21it/s]

Example sentence: They just don't work in concert.
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0

100%|██████████| 159274/159274 [02:22<00:00, 1119.18it/s]


In [263]:
# FBoW feature vectors for validation split
# For every example in eval_df a vector of length `bucket_length` is built:
# each unordered pair of tokens (after truncation to `max_length` tokens)
# increments the second-order bucket that the pair maps to.
validation_input_features = []
validation_label_ids = []
for example in tqdm(eval_df):
    bow_feature = torch.zeros(bucket_length) # up-to 2nd feature map
    if sentence2_key is None:
        sentence_combined = example[sentence1_key]
    else:
        sentence_combined = example[sentence1_key] + " [SEP] " + example[sentence2_key]
    sentence_tokens = modified_basic_tokenizer.tokenize(sentence_combined)
    sentence_tokens = sentence_tokens[:max_length]
    # first order here!
#     for t in sentence_tokens:
#         bow_feature[freq_bucket_map[token_frequency_map[t]]] += 1 # bucket count
    # awesome :) second order here!
    num_tokens = len(sentence_tokens)
    for i in range(num_tokens - 1):
        # The first token's bucket is the same for every partner j, so look it
        # up once per i instead of once per pair.
        first_bucket = second_order_freq_bucket_map[token_frequency_map[sentence_tokens[i]]]
        for j in range(i + 1, num_tokens):
            second_bucket = second_order_freq_bucket_map[token_frequency_map[sentence_tokens[j]]]
            # Pairs are unordered: sort so (a, b) and (b, a) hit the same key.
            index_tuple = tuple(sorted((first_bucket, second_bucket)))
            second_order_bucket = second_order_freq_freq_bucket_map[task_token_freq_freq_map[index_tuple]]
            bow_feature[second_order_bucket] += 1 # bucket count

    validation_input_features.append(bow_feature)
    validation_label_ids.append(example["label"])

# torch.stack already yields a tensor; convert dtype with .to() rather than
# re-wrapping it in torch.tensor(), which copy-constructs and emits a UserWarning.
validation_input_features = torch.stack(validation_input_features, dim=0).to(torch.float)
validation_label_ids = torch.tensor(validation_label_ids, dtype=torch.long)
validation_data = TensorDataset(validation_input_features, validation_label_ids)

100%|██████████| 1100/1100 [00:03<00:00, 275.74it/s]


In [264]:
# data loader
# Effective batch size = per-device batch size * number of GPUs.
# n_gpu is defined outside this cell; if it is 0 (CPU-only run) the original
# product would be batch_size=0, which DataLoader rejects, so count at least
# one device.
train_sampler = RandomSampler(train_data)  # reshuffles the training set each epoch
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=per_device_train_batch_size*max(1, n_gpu))
# Validation order is kept fixed (shuffle=False).
validation_dataloader = DataLoader(validation_data, batch_size=per_device_eval_batch_size*max(1, n_gpu), shuffle=False)

In [265]:
# restart the model
# Builds a fresh BOWClassifier and a fresh Adam optimizer so nothing carries
# over from a previous run. The classifier receives the number of distinct
# validation labels and the feature width (bucket_length).
lr = 1e-3
model = BOWClassifier(
    validation_label_ids.unique().numel(),
    bucket_length,
)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# NOTE(review): the model is not explicitly moved to `device` here; confirm
# that BOWClassifier / DataParallel place it correctly for multi-GPU runs.
if not no_cuda and n_gpu > 0:
    model = nn.DataParallel(model)

In [266]:
# Train the classifier for `num_train_epochs` epochs. Every `eval_interval`
# optimizer steps (including the very first one) the validation split is
# scored and the best macro-F1 seen so far is kept in `max_score`.
global_step = 0
num_train_epochs = 20
eval_interval = 500  # number of training steps between validation passes
max_score = -1
for epoch in range(int(num_train_epochs)):

    model.train()
    # pbar = tqdm(train_dataloader, desc="Iteration")
    for step, batch in enumerate(train_dataloader):
        if torch.cuda.is_available() and not no_cuda:
            torch.cuda.empty_cache()

        input_features, label_ids = batch

        if torch.cuda.is_available() and not no_cuda:
            input_features = input_features.to(device)
            label_ids = label_ids.to(device)

        loss, _ = model(input_features, labels=label_ids)

        if n_gpu > 1:
            loss = loss.mean() # mean() to average on multi-gpu.
        loss.backward()

        optimizer.step()
        model.zero_grad()
        # pbar.set_postfix({'train_loss': loss.tolist()})

        if global_step % eval_interval == 0:
            # logger.info("***** Evaluation Interval Hit *****")
            model.eval()
            all_logits = []      # predicted class ids, one array per validation batch
            all_label_ids = []   # gold label ids, one array per validation batch
            with torch.no_grad():
                # Distinct names are used here so the validation pass does not
                # overwrite the training loop's `step` / `batch` / `loss`.
                for eval_batch in validation_dataloader:
                    if torch.cuda.is_available() and not no_cuda:
                        torch.cuda.empty_cache()

                    eval_features, eval_label_ids = eval_batch

                    if torch.cuda.is_available() and not no_cuda:
                        eval_features = eval_features.to(device)
                        eval_label_ids = eval_label_ids.to(device)

                    _, eval_logits = model(eval_features, labels=eval_label_ids)
                    # argmax over raw logits: softmax is monotonic, so applying
                    # it first would not change the predicted class.
                    predictions = np.argmax(eval_logits.detach().cpu().numpy(), axis=1)
                    all_logits.append(predictions)
                    all_label_ids.append(eval_label_ids.to('cpu').numpy())

            all_logits = np.concatenate(all_logits, axis=0)
            all_label_ids = np.concatenate(all_label_ids, axis=0)
            result_to_save = classification_report(all_label_ids, all_logits, digits=5, output_dict=True)
            # print(classification_report(all_label_ids, all_logits, digits=5))
            macro_f1 = result_to_save["macro avg"]["f1-score"]
            print("Macro-F1: ", macro_f1)
            if macro_f1 > max_score:
                max_score = macro_f1

            # Switch back to training mode. Without this, every step until the
            # next epoch would run in eval mode, which matters for any layer
            # that behaves differently in train vs. eval (e.g. dropout).
            model.train()

        global_step += 1
print("Best Macro-F1: ", max_score)

Macro-F1:  0.25873279749620853
Macro-F1:  0.3433746682801147
Macro-F1:  0.31520884150069767
Macro-F1:  0.3291332569683085
Macro-F1:  0.3463714061424099
Macro-F1:  0.32128340114559434
Macro-F1:  0.3415247552395213
Macro-F1:  0.34479962453282703
Macro-F1:  0.36502200120066014
Macro-F1:  0.34001388113453407
Macro-F1:  0.3493049053985164
Macro-F1:  0.3487897051540782
Macro-F1:  0.3479800452794131
Macro-F1:  0.3324020343123328
Macro-F1:  0.3597146385637638
Macro-F1:  0.33780756363589975
Macro-F1:  0.3486081302794075
Macro-F1:  0.3541380630972311
Macro-F1:  0.3556082327706669
Macro-F1:  0.36584699421534056
Macro-F1:  0.33734529846593353
Macro-F1:  0.3325437764001331
Macro-F1:  0.3524552669686695
Macro-F1:  0.366254387697229
Macro-F1:  0.33063860088555175
Macro-F1:  0.3578676157061473
Macro-F1:  0.3384012497650528
Macro-F1:  0.35435357464465356
Macro-F1:  0.3376253936250881
Macro-F1:  0.34471741509248205
Macro-F1:  0.33771097987796383
Macro-F1:  0.3350617336387885
Macro-F1:  0.363558165491051