In [7]:
# The huggingface cache location has to be exported *before* anything imports
# `transformers` (directly, or indirectly through the star imports below):
# NOTE(review): the transformers releases this notebook targets appear to resolve
# their default cache directory at import time, so setting it later is too late.
import os
os.environ["TRANSFORMERS_CACHE"] = "../huggingface_cache/" # Not overload common dir 
                                                           # if run in shared resources.

from vocab_mismatch_utils import *
from data_formatter_utils import *
from datasets import DatasetDict
from datasets import Dataset
from datasets import load_dataset
import transformers
import pandas as pd
from collections import OrderedDict
import operator

from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler, SequentialSampler
from torch.nn import CrossEntropyLoss

# Load modules, mainly huggingface basic model handlers.
# Make sure you install huggingface and other packages properly.
from collections import Counter
import json

from nltk.tokenize import TweetTokenizer
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import matthews_corrcoef
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
import statistics

import logging
logger = logging.getLogger(__name__)

import random
import sys
from dataclasses import dataclass, field
from typing import Optional
import torch
import argparse
import numpy as np
import pandas as pd
from datasets import load_dataset, load_metric
from datasets import Dataset
from datasets import DatasetDict
from tqdm import tqdm, trange

import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
    EarlyStoppingCallback
)
from transformers.trainer_utils import is_main_process, EvaluationStrategy

# Global matplotlib font settings used by every figure in this notebook.
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = "Times New Roman"
font = {'family' : 'Times New Roman',
        'size'   : 30}
plt.rc('font', **font)

#### Setup

In [8]:
# Name of the task to run. It is used further down as the key into TASK_CONFIG
# (to pick the sentence column names) and to build the data file paths.
task_name = "snli"

In [9]:
def get_dataset(inoculation_data_path, eval_data_path=None, test_data_path=None,
                inoculation_step_sample_size=1.0, 
                eval_sample_limit=-1, seed=42):
    """
    Load the train / validation (/ test) splits for one inoculation step.

    The storage format is inferred from ``inoculation_data_path``:

    * a path whose last "."-separated part is ``tsv`` is read with pandas
      (tab separated) and each split is converted to a huggingface ``Dataset``;
    * any other path that contains a "." is treated as a ``DatasetDict``
      previously saved to disk and is loaded directly;
    * a path without any "." is not supported and raises.

    Args:
        inoculation_data_path: training data location (see above).
        eval_data_path: validation ``.tsv`` file. Required for the tsv format,
            ignored for a saved-to-disk dataset.
        test_data_path: optional test ``.tsv`` file. Ignored for a
            saved-to-disk dataset.
        inoculation_step_sample_size: fraction of the training split to keep;
            the kept examples are drawn at random using ``seed``.
        eval_sample_limit: for saved-to-disk datasets, maximum number of
            (shuffled) validation examples to keep; ``-1`` keeps all of them.
        seed: random seed for sampling / shuffling.

    Returns:
        A plain ``dict`` with ``"train"`` and ``"validation"`` entries, plus
        ``"test"`` when a test split is available.

    Raises:
        ValueError: tsv format requested without ``eval_data_path``.
        NotImplementedError: the path matches neither supported format.
    """
    path_parts = inoculation_data_path.split(".")

    if path_parts[-1] == "tsv":
        if eval_data_path is None:
            raise ValueError("eval_data_path is required when loading from .tsv files.")
        train_df = pd.read_csv(inoculation_data_path, delimiter="\t")
        eval_df = pd.read_csv(eval_data_path, delimiter="\t")
        inoculation_step_sample_size = int(len(train_df) * inoculation_step_sample_size)
        logger.info(f"***** Inoculation Sample Count: %s *****"%(inoculation_step_sample_size))
        # this may not always start for zero inoculation
        inoculation_train_df = train_df.sample(n=inoculation_step_sample_size,
                                               replace=False,
                                               random_state=seed)
        datasets = {
            "train": Dataset.from_pandas(inoculation_train_df),
            "validation": Dataset.from_pandas(eval_df),
        }
        # The test file is optional; only expose the split when it was given.
        if test_data_path is not None:
            test_df = pd.read_csv(test_data_path, delimiter="\t")
            datasets["test"] = Dataset.from_pandas(test_df)
        return datasets

    if len(path_parts) > 1:
        logger.info(f"***** Loading pre-loaded datasets from the disk directly! *****")
        loaded = DatasetDict.load_from_disk(inoculation_data_path)
        inoculation_step_sample_size = int(len(loaded["train"]) * inoculation_step_sample_size)
        logger.info(f"***** Inoculation Sample Count: %s *****"%(inoculation_step_sample_size))
        # this may not always start for zero inoculation
        train_split = loaded["train"].shuffle(seed=seed).select(range(inoculation_step_sample_size))
        # Shuffle and truncate first, *then* hand the result back, so that
        # eval_sample_limit actually applies to the returned validation split.
        validation_split = loaded["validation"].shuffle(seed=seed)
        if eval_sample_limit != -1:
            keep = min(eval_sample_limit, len(validation_split))
            validation_split = validation_split.select(range(keep))
        datasets = {"train": train_split, "validation": validation_split}
        # Pass a stored test split through instead of silently dropping it.
        if "test" in loaded:
            datasets["test"] = loaded["test"]
        return datasets

    logger.info(f"***** Loading downloaded huggingface datasets: {inoculation_data_path}! *****")
    raise NotImplementedError(
        "Loading datasets by hub name is not supported; pass a .tsv file "
        "or a dataset saved to disk."
    )

In [10]:
# Per-task names of the text columns: (first sentence, second sentence).
# The second entry is None for single-sentence tasks.
TASK_CONFIG = {
    "sst3": ("text", None),
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "snli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence")
}
# WARNING: you dont need BERT tokenizer
# original_vocab = load_bert_vocab("../data-files/bert_vocab.txt")
# original_tokenizer = transformers.BertTokenizer(
#     vocab_file="../data-files/bert_vocab.txt")
# Just use some basic white space tokenizor here!
modified_basic_tokenizer = ModifiedBasicTokenizer()
max_length = 128
per_device_train_batch_size = 128
per_device_eval_batch_size = 128
no_cuda = True
device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
# n_gpu multiplies the per-device batch sizes, so it must never be 0: fall back
# to 1 (plain CPU execution) when CUDA is disabled or no GPU is present.
n_gpu = max(1, torch.cuda.device_count()) if not no_cuda else 1 # 1 means just on cpu
seed = 42
lr = 1e-3
num_train_epochs = 10
sentence1_key, sentence2_key = TASK_CONFIG[task_name]

# Seed every random number generator in use so runs are repeatable.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if n_gpu > 0 and not no_cuda:
    # Use the notebook-level `seed`; there is no `args` object in this notebook.
    torch.cuda.manual_seed_all(seed)

In [11]:
# Setup logging
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)

# The sst3 files are stored under a different name than the task itself;
# every other task uses its own name.
if task_name == "sst3":
    data_file_name = "sst-tenary"
else:
    data_file_name = task_name

# Read the train / dev / test tsv files of the selected task.
datasets = get_dataset(
    inoculation_data_path=f"../data-files/{data_file_name}/{data_file_name}-train.tsv",
    eval_data_path=f"../data-files/{data_file_name}/{data_file_name}-dev.tsv",
    test_data_path=f"../data-files/{data_file_name}/{data_file_name}-test.tsv",
)

# Report the size of each split as a quick sanity check.
logger.info(f"***** Train Sample Count (Verify): %s *****" % len(datasets["train"]))
logger.info(f"***** Valid Sample Count (Verify): %s *****" % len(datasets["validation"]))
logger.info(f"***** Test Sample Count (Verify): %s *****" % len(datasets["test"]))

03/29/2021 11:25:36 - INFO - __main__ - ***** Inoculation Sample Count: 550152 *****
03/29/2021 11:25:37 - INFO - __main__ - ***** Train Sample Count (Verify): 550152 *****
03/29/2021 11:25:37 - INFO - __main__ - ***** Valid Sample Count (Verify): 10000 *****
03/29/2021 11:25:37 - INFO - __main__ - ***** Test Sample Count (Verify): 10000 *****


#### BoW preprocessor

In [12]:
def sanity_check_non_empty(sentece):
    """Return True when ``sentece`` holds usable text.

    A value counts as empty when it is not a string at all (``None``, or e.g. a
    NaN float coming from a missing cell), when it is blank after stripping
    whitespace, or when it is the literal text ``"None"``.

    Args:
        sentece: the raw field value of one example.

    Returns:
        bool: True for a non-blank string other than "None", False otherwise.
    """
    # Non-string values have no .strip(); treat them as missing instead of crashing.
    if not isinstance(sentece, str):
        return False
    stripped = sentece.strip()
    return stripped != "" and stripped != "None"

# create the vocab file
# Every distinct token gets the next free integer id, in order of first
# appearance while scanning the splits listed below.
train_data_only = False
if train_data_only:
    vocab_split_names = ["train"]
else:
    vocab_split_names = ["train", "validation", "test"]

original_vocab = OrderedDict()
for split_name in vocab_split_names:
    if split_name not in datasets:
        continue
    for example in tqdm(datasets[split_name]):
        # Missing / blank sentences contribute an empty string. Resetting here
        # on every example keeps text from a previous example from leaking in.
        s1 = ""
        if sanity_check_non_empty(example[sentence1_key]):
            s1 = example[sentence1_key]
        if sentence2_key is None:
            sentence_combined = s1
        else:
            s2 = ""
            if sanity_check_non_empty(example[sentence2_key]):
                s2 = example[sentence2_key]
            sentence_combined = s1 + " [SEP] " + s2
        for token in modified_basic_tokenizer.tokenize(sentence_combined):
            if token not in original_vocab:
                original_vocab[token] = len(original_vocab)

# Next unused id; equals the vocabulary size.
vocab_index = len(original_vocab)

100%|██████████| 550152/550152 [02:39<00:00, 3447.79it/s]
100%|██████████| 10000/10000 [00:02<00:00, 3387.15it/s]
100%|██████████| 10000/10000 [00:02<00:00, 3402.54it/s]


In [13]:
# BoW feature vectors for train split
# Each example becomes a vector of token counts over `original_vocab`. For
# sentence-pair tasks the two sentences are counted separately and their
# vectors are concatenated, so the feature size is twice the vocabulary size.
train_input_features = []
train_label_ids = []
for (ex_index, example) in enumerate(tqdm(datasets["train"])):
    if sentence2_key is None:
        bow_feature = torch.zeros(len(original_vocab))
        # Reset per example so a blank sentence yields an all-zero vector
        # rather than re-counting the previous example's text.
        sentence_combined = ""
        if sanity_check_non_empty(example[sentence1_key]):
            sentence_combined = example[sentence1_key]
        sentence_tokens = modified_basic_tokenizer.tokenize(sentence_combined)
        if ex_index % 50000 == 0:
            print("Example sentence: " + sentence_combined)
        for t in sentence_tokens:
            bow_feature[original_vocab[t]] += 1
    else:
        bow_feature_1 = torch.zeros(len(original_vocab))
        bow_feature_2 = torch.zeros(len(original_vocab))
        s1 = ""
        s2 = ""
        if sanity_check_non_empty(example[sentence1_key]):
            s1 = example[sentence1_key]
        if sanity_check_non_empty(example[sentence2_key]):
            s2 = example[sentence2_key]
        s1_tokens = modified_basic_tokenizer.tokenize(s1)
        s2_tokens = modified_basic_tokenizer.tokenize(s2)
        if ex_index % 50000 == 0:
            print("Example sentence 1: " + s1)
            print("Example sentence 2: " + s2)
        for t in s1_tokens:
            bow_feature_1[original_vocab[t]] += 1
        for t in s2_tokens:
            bow_feature_2[original_vocab[t]] += 1
        bow_feature = torch.cat([bow_feature_1, bow_feature_2], dim=-1)
    train_input_features.append(bow_feature)
    train_label_ids.append(example["label"])

# `.to(torch.float)` is a no-op when the stacked tensor is already float32, so
# the (large) feature matrix is not copied a second time.
train_input_features = torch.stack(train_input_features, dim=0).to(torch.float)
train_label_ids = torch.tensor(train_label_ids, dtype=torch.long)
train_data = TensorDataset(train_input_features, train_label_ids)

  0%|          | 207/550152 [00:00<09:01, 1014.84it/s]

Example sentence 1: People in the Oregon Subway are wearing hats& jackets.
Example sentence 2: the people are naked


  9%|▉         | 50185/550152 [00:54<08:15, 1008.32it/s]

Example sentence 1: A man in a black shirt and a cast smokes a cigarette.
Example sentence 2: a man smokes a cigar


 18%|█▊        | 100120/550152 [01:47<08:04, 929.17it/s]

Example sentence 1: A man in a black t-shirt is standing next to a parking meter.
Example sentence 2: A man is outside standing next to a parking meter


 27%|██▋       | 150114/550152 [04:42<07:37, 874.98it/s]  

Example sentence 1: Women in black dress taking picture of children in front of flowers.
Example sentence 2: The flowers are roses.


 36%|███▋      | 200105/550152 [05:52<07:07, 818.40it/s] 

Example sentence 1: This clown is proud to be entertaining the crowd.
Example sentence 2: a clown entertains a crowd


 45%|████▌     | 250104/550152 [06:46<04:54, 1020.03it/s]

Example sentence 1: A couple holding hands as they look out over the water towards the sky while the trees reflect in the lake and a boat rests comfortably on the shore.
Example sentence 2: Two people look at the sky above a lake as they hold hands and a boat is on the shore.


 55%|█████▍    | 300195/550152 [07:39<04:24, 946.49it/s] 

Example sentence 1: A guy and girl on rocks fishing in a body of water.
Example sentence 2: Two people are fishing.


 64%|██████▎   | 350132/550152 [08:32<03:15, 1022.32it/s]

Example sentence 1: A brown dog walks in the under brush.
Example sentence 2: A brown dog walks in the under brush next to the stream.


 73%|███████▎  | 400162/550152 [09:24<02:35, 965.08it/s] 

Example sentence 1: Two women dressed as geishas.
Example sentence 2: Two ladies in costume.


 82%|████████▏ | 450194/550152 [10:19<01:38, 1013.64it/s]

Example sentence 1: A young girl is playing with a soccer ball in the grass.
Example sentence 2: A person is engaged in an activity outside.


 91%|█████████ | 500128/550152 [11:13<01:02, 801.90it/s] 

Example sentence 1: A group of kids on a merry-go-round, playing at the park.
Example sentence 2: A group of kids are playing on the merry-go-round and enjoying themselves


100%|██████████| 550152/550152 [12:07<00:00, 756.73it/s] 

Example sentence 1: A man in a baseball cap and sunglasses in standing inside a candy kiosk.
Example sentence 2: a couple was there





In [14]:
# BoW feature vectors for validation split
# Same encoding as for the train split: token counts over `original_vocab`,
# with the two sentence vectors concatenated for sentence-pair tasks.
validation_input_features = []
validation_label_ids = []
for (ex_index, example) in enumerate(tqdm(datasets["validation"])):
    if sentence2_key is None:
        bow_feature = torch.zeros(len(original_vocab))
        # Reset per example so a blank sentence yields an all-zero vector
        # rather than re-counting the previous example's text.
        sentence_combined = ""
        if sanity_check_non_empty(example[sentence1_key]):
            sentence_combined = example[sentence1_key]
        sentence_tokens = modified_basic_tokenizer.tokenize(sentence_combined)
        if ex_index % 50000 == 0:
            print("Example sentence: " + sentence_combined)
        for t in sentence_tokens:
            # Tokens absent from the vocabulary (possible when it was built
            # with train_data_only = True) are skipped instead of raising.
            token_id = original_vocab.get(t)
            if token_id is not None:
                bow_feature[token_id] += 1
    else:
        bow_feature_1 = torch.zeros(len(original_vocab))
        bow_feature_2 = torch.zeros(len(original_vocab))
        s1 = ""
        s2 = ""
        if sanity_check_non_empty(example[sentence1_key]):
            s1 = example[sentence1_key]
        if sanity_check_non_empty(example[sentence2_key]):
            s2 = example[sentence2_key]
        s1_tokens = modified_basic_tokenizer.tokenize(s1)
        s2_tokens = modified_basic_tokenizer.tokenize(s2)
        if ex_index % 50000 == 0:
            print("Example sentence 1: " + s1)
            print("Example sentence 2: " + s2)
        # Tokens absent from the vocabulary (possible when it was built with
        # train_data_only = True) are skipped instead of raising.
        for t in s1_tokens:
            token_id = original_vocab.get(t)
            if token_id is not None:
                bow_feature_1[token_id] += 1
        for t in s2_tokens:
            token_id = original_vocab.get(t)
            if token_id is not None:
                bow_feature_2[token_id] += 1
        bow_feature = torch.cat([bow_feature_1, bow_feature_2], dim=-1)
    validation_input_features.append(bow_feature)
    validation_label_ids.append(example["label"])

# `.to(torch.float)` is a no-op when the stacked tensor is already float32, so
# the feature matrix is not copied a second time.
validation_input_features = torch.stack(validation_input_features, dim=0).to(torch.float)
validation_label_ids = torch.tensor(validation_label_ids, dtype=torch.long)
validation_data = TensorDataset(validation_input_features, validation_label_ids)

  0%|          | 0/10000 [00:00<?, ?it/s]

Example sentence 1: An old woman drinking water in a yellow room
Example sentence 2: A n old woman is waitng for someone in a yellow room.


100%|██████████| 10000/10000 [00:16<00:00, 599.95it/s]


In [15]:
# data loader
# Training batches are drawn in random order; validation keeps dataset order.
# The per-device batch sizes are scaled by the number of devices.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=per_device_train_batch_size * n_gpu,
    sampler=train_sampler,
)
validation_dataloader = DataLoader(
    validation_data,
    shuffle=False,
    batch_size=per_device_eval_batch_size * n_gpu,
)

#### BoW Classifier

In [16]:
class BOWClassifier(nn.Module):
    """Bag-of-words classifier made of a single linear layer (with bias)."""

    def __init__(self, num_labels, vocab_size):
        """Build the layer mapping ``vocab_size`` input features to ``num_labels`` logits."""
        super().__init__()
        self.classifier = nn.Linear(
            in_features=vocab_size,
            out_features=num_labels,
            bias=True,
        )

    def forward(self, x, labels=None):
        """Score ``x``.

        Returns the logits alone when ``labels`` is None; otherwise returns
        the pair ``(cross-entropy loss, logits)``.
        """
        logits = self.classifier(x)
        if labels is None:
            return logits
        criterion = CrossEntropyLoss()
        return criterion(logits, labels), logits

In [17]:
class MockBERTBOWClassifier(nn.Module):
    """Bag-of-words classifier with one hidden layer.

    The input is projected to a 32-dimensional hidden vector, passed through
    tanh, and then projected to the label logits. Neither projection has a bias.
    """

    def __init__(self, num_labels, vocab_size):
        """Create the two bias-free projections and the tanh activation."""
        super().__init__()
        hidden_dim = 32
        self.mock_bert = nn.Linear(in_features=vocab_size, out_features=hidden_dim, bias=False)
        self.mock_activation = nn.Tanh()
        self.classifier = nn.Linear(in_features=hidden_dim, out_features=num_labels, bias=False)

    def forward(self, x, labels=None):
        """Score ``x``.

        Returns the logits alone when ``labels`` is None; otherwise returns
        the pair ``(cross-entropy loss, logits)``.
        """
        hidden = self.mock_bert(x)
        cls = self.mock_activation(hidden)
        logits = self.classifier(cls)
        if labels is None:
            return logits
        criterion = CrossEntropyLoss()
        return criterion(logits, labels), logits

In [18]:
# some overriding fun stuffs!
# These two values replace the ones set in the configuration cell above.
lr = 1e-3
num_train_epochs = 20
# Sentence-pair tasks concatenate two BoW vectors, doubling the input size.
if sentence2_key is None:
    in_dim = len(original_vocab)
else:
    in_dim = len(original_vocab) * 2
# NOTE(review): the number of classes is taken from the labels present in the
# validation split; this assumes every class occurs there -- confirm.
model = BOWClassifier(len(validation_label_ids.unique()), in_dim)
# Put the parameters on the same device the training loop moves its batches
# to, and do so before the optimizer is created. No-op when running on CPU.
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
if n_gpu > 0 and not no_cuda:
    model = torch.nn.DataParallel(model)

#### Main training loop

In [None]:
def _predict_on(eval_model, dataloader):
    """Run ``eval_model`` over ``dataloader`` without tracking gradients.

    Returns two 1-D numpy arrays of equal length: the predicted class ids
    (argmax over the logits) and the gold label ids.
    """
    predicted = []
    gold = []
    with torch.no_grad():
        for eval_features, eval_label_ids in dataloader:
            if torch.cuda.is_available() and not no_cuda:
                torch.cuda.empty_cache()
            eval_features = eval_features.to(device)
            # Without labels the model returns only the logits. Softmax is
            # monotonic, so taking the argmax of the raw logits is enough.
            eval_logits = eval_model(eval_features)
            predicted.append(np.argmax(eval_logits.detach().cpu().numpy(), axis=1))
            gold.append(eval_label_ids.cpu().numpy())
    return np.concatenate(predicted, axis=0), np.concatenate(gold, axis=0)


global_step = 0
best_f1 = -1
best_mcc = -1
for epoch in range(int(num_train_epochs)):

    model.train()
    for step, batch in enumerate(train_dataloader):
        if torch.cuda.is_available() and not no_cuda:
            torch.cuda.empty_cache()

        input_features, label_ids = batch
        # `device` is the CPU unless CUDA is enabled, in which case this is a no-op move.
        input_features = input_features.to(device)
        label_ids = label_ids.to(device)

        loss, _ = model(input_features, labels=label_ids)

        if n_gpu > 1:
            loss = loss.mean() # mean() to average on multi-gpu.
        loss.backward()

        optimizer.step()
        model.zero_grad()

        # Evaluate on the validation split every 500 optimizer steps
        # (including right after the very first one).
        if global_step % 500 == 0:
            logger.info("***** Evaluation Interval Hit *****")
            model.eval()
            all_predictions, all_label_ids = _predict_on(model, validation_dataloader)
            result_to_save = classification_report(all_label_ids, all_predictions, digits=5, output_dict=True)
            print(classification_report(all_label_ids, all_predictions, digits=5))
            f1 = result_to_save["macro avg"]["f1-score"]
            print("Macro-F1: ", f1)
            best_f1 = max(best_f1, f1)
            mcc = matthews_corrcoef(all_label_ids, all_predictions)
            best_mcc = max(best_mcc, mcc)
            print("MCC: ", mcc)
            # Switch back to training mode for the remaining steps of the epoch.
            model.train()

        global_step += 1
print("Best Macro-F1: ", best_f1)
print("Best MCC: ", best_mcc)

03/29/2021 11:43:06 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.32723   0.38530   0.35390      3387
           1    0.33766   0.19084   0.24385      3275
           2    0.33357   0.41582   0.37018      3338

    accuracy                        0.33180     10000
   macro avg    0.33282   0.33065   0.32265     10000
weighted avg    0.33276   0.33180   0.32329     10000

Macro-F1:  0.32264528456632635
MCC:  -0.004290070354811539


03/29/2021 11:43:45 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.55177   0.66401   0.60271      3387
           1    0.58408   0.56000   0.57178      3275
           2    0.59662   0.49760   0.54263      3338

    accuracy                        0.57440     10000
   macro avg    0.57749   0.57387   0.57237     10000
weighted avg    0.57732   0.57440   0.57253     10000

Macro-F1:  0.5723748973126153
MCC:  0.3633237562892885


03/29/2021 11:44:49 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.57590   0.67316   0.62075      3387
           1    0.63606   0.52885   0.57753      3275
           2    0.58620   0.58268   0.58444      3338

    accuracy                        0.59570     10000
   macro avg    0.59939   0.59490   0.59424     10000
weighted avg    0.59904   0.59570   0.59447     10000

Macro-F1:  0.5942356407700852
MCC:  0.3948012587752743


03/29/2021 11:45:54 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.60636   0.63626   0.62095      3387
           1    0.62749   0.57710   0.60124      3275
           2    0.58998   0.60695   0.59835      3338

    accuracy                        0.60710     10000
   macro avg    0.60794   0.60677   0.60684     10000
weighted avg    0.60781   0.60710   0.60695     10000

Macro-F1:  0.6068449255506061
MCC:  0.41070310967150603


03/29/2021 11:47:00 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.60473   0.66490   0.63338      3387
           1    0.64748   0.55634   0.59846      3275
           2    0.60023   0.62253   0.61118      3338

    accuracy                        0.61520     10000
   macro avg    0.61748   0.61459   0.61434     10000
weighted avg    0.61723   0.61520   0.61453     10000

Macro-F1:  0.6143391999259153
MCC:  0.42343814570694605


03/29/2021 11:48:05 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.61585   0.66312   0.63861      3387
           1    0.63810   0.60031   0.61863      3275
           2    0.61767   0.60545   0.61150      3338

    accuracy                        0.62330     10000
   macro avg    0.62387   0.62296   0.62291     10000
weighted avg    0.62374   0.62330   0.62302     10000

Macro-F1:  0.6229127508247225
MCC:  0.43504292605624006


03/29/2021 11:49:09 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.62234   0.67434   0.64730      3387
           1    0.62485   0.64031   0.63248      3275
           2    0.65098   0.57999   0.61343      3338

    accuracy                        0.63170     10000
   macro avg    0.63272   0.63155   0.63107     10000
weighted avg    0.63272   0.63170   0.63114     10000

Macro-F1:  0.631073022804388
MCC:  0.44816257331405024


03/29/2021 11:50:14 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.61607   0.69265   0.65212      3387
           1    0.64846   0.59817   0.62230      3275
           2    0.63450   0.60276   0.61822      3338

    accuracy                        0.63170     10000
   macro avg    0.63301   0.63119   0.63088     10000
weighted avg    0.63283   0.63170   0.63104     10000

Macro-F1:  0.6308801084193557
MCC:  0.4481022617936845


03/29/2021 11:51:18 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.62635   0.66962   0.64726      3387
           1    0.64731   0.61252   0.62943      3275
           2    0.62896   0.61803   0.62345      3338

    accuracy                        0.63370     10000
   macro avg    0.63421   0.63339   0.63338     10000
weighted avg    0.63408   0.63370   0.63347     10000

Macro-F1:  0.633381178487351
MCC:  0.450613519056377


03/29/2021 11:52:01 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.62061   0.70416   0.65975      3387
           1    0.66266   0.60641   0.63329      3275
           2    0.64525   0.61084   0.62758      3338

    accuracy                        0.64100     10000
   macro avg    0.64284   0.64047   0.64021     10000
weighted avg    0.64261   0.64100   0.64035     10000

Macro-F1:  0.6402065232971014
MCC:  0.4622282916411569


03/29/2021 11:52:14 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.62487   0.70328   0.66176      3387
           1    0.66288   0.60641   0.63339      3275
           2    0.64380   0.61564   0.62940      3338

    accuracy                        0.64230     10000
   macro avg    0.64385   0.64178   0.64152     10000
weighted avg    0.64364   0.64230   0.64167     10000

Macro-F1:  0.6415177125429158
MCC:  0.4640660468428511


03/29/2021 11:52:27 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.63612   0.69885   0.66601      3387
           1    0.66421   0.60641   0.63400      3275
           2    0.63576   0.62642   0.63105      3338

    accuracy                        0.64440     10000
   macro avg    0.64536   0.64389   0.64369     10000
weighted avg    0.64520   0.64440   0.64386     10000

Macro-F1:  0.643687769726379
MCC:  0.46695965511927817


03/29/2021 11:52:41 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64937   0.68350   0.66600      3387
           1    0.65025   0.62901   0.63945      3275
           2    0.64126   0.62762   0.63437      3338

    accuracy                        0.64700     10000
   macro avg    0.64696   0.64671   0.64661     10000
weighted avg    0.64695   0.64700   0.64675     10000

Macro-F1:  0.6466056569206943
MCC:  0.4704971852256345


03/29/2021 11:52:54 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.63113   0.71833   0.67191      3387
           1    0.67836   0.60214   0.63798      3275
           2    0.64577   0.62642   0.63595      3338

    accuracy                        0.64960     10000
   macro avg    0.65175   0.64897   0.64861     10000
weighted avg    0.65148   0.64960   0.64880     10000

Macro-F1:  0.6486146589708038
MCC:  0.47528934940311607


03/29/2021 11:53:07 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64127   0.70830   0.67312      3387
           1    0.66513   0.61496   0.63906      3275
           2    0.64717   0.62642   0.63663      3338

    accuracy                        0.65040     10000
   macro avg    0.65119   0.64989   0.64960     10000
weighted avg    0.65105   0.65040   0.64978     10000

Macro-F1:  0.6496024779633355
MCC:  0.4759987790252849


03/29/2021 11:53:20 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64022   0.71243   0.67440      3387
           1    0.66459   0.61954   0.64128      3275
           2    0.65639   0.62493   0.64027      3338

    accuracy                        0.65280     10000
   macro avg    0.65373   0.65230   0.65198     10000
weighted avg    0.65360   0.65280   0.65216     10000

Macro-F1:  0.6519820249107687
MCC:  0.4796910564854792


03/29/2021 11:53:33 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65563   0.68970   0.67223      3387
           1    0.65500   0.62840   0.64142      3275
           2    0.64917   0.64080   0.64496      3338

    accuracy                        0.65330     10000
   macro avg    0.65326   0.65297   0.65287     10000
weighted avg    0.65326   0.65330   0.65304     10000

Macro-F1:  0.6528694912385027
MCC:  0.47995011075647626


03/29/2021 11:53:47 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64486   0.71302   0.67723      3387
           1    0.66483   0.62748   0.64562      3275
           2    0.65708   0.62283   0.63950      3338

    accuracy                        0.65490     10000
   macro avg    0.65559   0.65444   0.65411     10000
weighted avg    0.65548   0.65490   0.65428     10000

Macro-F1:  0.6541140902115808
MCC:  0.482772169851777


03/29/2021 11:53:57 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64202   0.71007   0.67433      3387
           1    0.66821   0.61802   0.64213      3275
           2    0.65364   0.63152   0.64239      3338

    accuracy                        0.65370     10000
   macro avg    0.65462   0.65320   0.65295     10000
weighted avg    0.65448   0.65370   0.65312     10000

Macro-F1:  0.6529505690644463
MCC:  0.4809735545698473


03/29/2021 11:54:06 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64791   0.71391   0.67931      3387
           1    0.66397   0.62687   0.64489      3275
           2    0.65901   0.62702   0.64262      3338

    accuracy                        0.65640     10000
   macro avg    0.65696   0.65593   0.65560     10000
weighted avg    0.65687   0.65640   0.65579     10000

Macro-F1:  0.6556041660397135
MCC:  0.48498260234282004


03/29/2021 11:54:16 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.63817   0.72955   0.68081      3387
           1    0.67533   0.61099   0.64155      3275
           2    0.65624   0.62223   0.63878      3338

    accuracy                        0.65490     10000
   macro avg    0.65658   0.65426   0.65371     10000
weighted avg    0.65637   0.65490   0.65392     10000

Macro-F1:  0.6537146363050175
MCC:  0.483281254820079


03/29/2021 11:54:25 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65363   0.70534   0.67850      3387
           1    0.66667   0.62473   0.64502      3275
           2    0.64835   0.63631   0.64227      3338

    accuracy                        0.65590     10000
   macro avg    0.65621   0.65546   0.65526     10000
weighted avg    0.65614   0.65590   0.65544     10000

Macro-F1:  0.6552644352813896
MCC:  0.4840282622438338


03/29/2021 11:54:36 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.63488   0.73516   0.68135      3387
           1    0.67739   0.61099   0.64248      3275
           2    0.66101   0.61863   0.63912      3338

    accuracy                        0.65560     10000
   macro avg    0.65776   0.65493   0.65432     10000
weighted avg    0.65752   0.65560   0.65452     10000

Macro-F1:  0.654317167266627
MCC:  0.4845772009113724


03/29/2021 11:54:46 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.63655   0.73428   0.68193      3387
           1    0.68695   0.59969   0.64037      3275
           2    0.65213   0.63182   0.64181      3338

    accuracy                        0.65600     10000
   macro avg    0.65855   0.65526   0.65470     10000
weighted avg    0.65826   0.65600   0.65493     10000

Macro-F1:  0.6547030955796941
MCC:  0.48520957932855285


03/29/2021 11:54:57 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64230   0.73162   0.68406      3387
           1    0.66205   0.64244   0.65210      3275
           2    0.67780   0.60186   0.63758      3338

    accuracy                        0.65910     10000
   macro avg    0.66072   0.65864   0.65791     10000
weighted avg    0.66062   0.65910   0.65808     10000

Macro-F1:  0.6579110474850997
MCC:  0.4897440546035548


03/29/2021 11:55:07 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64321   0.73664   0.68676      3387
           1    0.68186   0.61649   0.64753      3275
           2    0.66361   0.62822   0.64543      3338

    accuracy                        0.66110     10000
   macro avg    0.66289   0.66045   0.65991     10000
weighted avg    0.66268   0.66110   0.66012     10000

Macro-F1:  0.6599066947880019
MCC:  0.4926467027756119


03/29/2021 11:55:31 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65008   0.71745   0.68211      3387
           1    0.66571   0.63481   0.64989      3275
           2    0.66359   0.62403   0.64320      3338

    accuracy                        0.65920     10000
   macro avg    0.65979   0.65876   0.65840     10000
weighted avg    0.65971   0.65920   0.65857     10000

Macro-F1:  0.658398288618581
MCC:  0.4892294788954749


03/29/2021 11:56:04 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.63694   0.74432   0.68645      3387
           1    0.67661   0.62351   0.64898      3275
           2    0.67493   0.61144   0.64162      3338

    accuracy                        0.66040     10000
   macro avg    0.66283   0.65976   0.65902     10000
weighted avg    0.66261   0.66040   0.65921     10000

Macro-F1:  0.6590168508963711
MCC:  0.4920308114039479


03/29/2021 11:56:14 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.63529   0.74520   0.68587      3387
           1    0.68487   0.60519   0.64257      3275
           2    0.66486   0.62403   0.64380      3338

    accuracy                        0.65890     10000
   macro avg    0.66167   0.65814   0.65741     10000
weighted avg    0.66140   0.65890   0.65764     10000

Macro-F1:  0.6574108790503516
MCC:  0.4898532596009267


03/29/2021 11:56:59 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.66175   0.68911   0.67515      3387
           1    0.67133   0.63053   0.65029      3275
           2    0.64292   0.65428   0.64855      3338

    accuracy                        0.65830     10000
   macro avg    0.65867   0.65797   0.65800     10000
weighted avg    0.65860   0.65830   0.65813     10000

Macro-F1:  0.6579984990064845
MCC:  0.4874817251656666


03/29/2021 11:57:46 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65584   0.71509   0.68418      3387
           1    0.68186   0.62565   0.65255      3275
           2    0.65294   0.64590   0.64940      3338

    accuracy                        0.66270     10000
   macro avg    0.66355   0.66221   0.66204     10000
weighted avg    0.66339   0.66270   0.66221     10000

Macro-F1:  0.6620420506741782
MCC:  0.49438250479040863


03/29/2021 11:57:59 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64847   0.70859   0.67720      3387
           1    0.68356   0.60947   0.64439      3275
           2    0.64131   0.64919   0.64523      3338

    accuracy                        0.65630     10000
   macro avg    0.65778   0.65575   0.65561     10000
weighted avg    0.65757   0.65630   0.65578     10000

Macro-F1:  0.6556066883948298
MCC:  0.4849339329412591


03/29/2021 11:58:09 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65260   0.71213   0.68107      3387
           1    0.66794   0.64244   0.65494      3275
           2    0.66455   0.62792   0.64572      3338

    accuracy                        0.66120     10000
   macro avg    0.66170   0.66083   0.66058     10000
weighted avg    0.66161   0.66120   0.66071     10000

Macro-F1:  0.6605755950393255
MCC:  0.4921121632376836


03/29/2021 11:58:19 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65093   0.72070   0.68404      3387
           1    0.68932   0.61313   0.64900      3275
           2    0.64999   0.64979   0.64989      3338

    accuracy                        0.66180     10000
   macro avg    0.66341   0.66121   0.66098     10000
weighted avg    0.66319   0.66180   0.66116     10000

Macro-F1:  0.6609755382537718
MCC:  0.4933175937020019


03/29/2021 11:58:29 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64214   0.73162   0.68396      3387
           1    0.68701   0.61527   0.64916      3275
           2    0.65773   0.63212   0.64467      3338

    accuracy                        0.66030     10000
   macro avg    0.66229   0.65967   0.65926     10000
weighted avg    0.66204   0.66030   0.65945     10000

Macro-F1:  0.659264812362231
MCC:  0.49137819341913136


03/29/2021 11:58:39 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65482   0.71243   0.68241      3387
           1    0.66278   0.64275   0.65261      3275
           2    0.66422   0.62463   0.64382      3338

    accuracy                        0.66030     10000
   macro avg    0.66061   0.65993   0.65961     10000
weighted avg    0.66057   0.66030   0.65977     10000

Macro-F1:  0.6596126940510257
MCC:  0.49075179062741936


03/29/2021 11:58:48 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65985   0.70505   0.68170      3387
           1    0.66784   0.63664   0.65187      3275
           2    0.65480   0.63930   0.64696      3338

    accuracy                        0.66070     10000
   macro avg    0.66083   0.66033   0.66018     10000
weighted avg    0.66078   0.66070   0.66033     10000

Macro-F1:  0.6601767338952707
MCC:  0.49114924075432037


03/29/2021 11:58:58 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65000   0.72926   0.68735      3387
           1    0.68789   0.61038   0.64682      3275
           2    0.65695   0.64829   0.65259      3338

    accuracy                        0.66330     10000
   macro avg    0.66495   0.66264   0.66226     10000
weighted avg    0.66473   0.66330   0.66248     10000

Macro-F1:  0.6622555175077274
MCC:  0.4957231766452863


03/29/2021 11:59:08 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65199   0.72129   0.68489      3387
           1    0.67061   0.62351   0.64620      3275
           2    0.66272   0.63691   0.64956      3338

    accuracy                        0.66110     10000
   macro avg    0.66177   0.66057   0.66022     10000
weighted avg    0.66167   0.66110   0.66043     10000

Macro-F1:  0.6602162585646547
MCC:  0.49209407287932316


03/29/2021 11:59:18 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64365   0.73487   0.68624      3387
           1    0.67597   0.62870   0.65148      3275
           2    0.67250   0.62193   0.64623      3338

    accuracy                        0.66240     10000
   macro avg    0.66404   0.66183   0.66132     10000
weighted avg    0.66386   0.66240   0.66150     10000

Macro-F1:  0.6613156502007982
MCC:  0.49454974731274043


03/29/2021 11:59:27 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65280   0.72276   0.68600      3387
           1    0.66438   0.65160   0.65793      3275
           2    0.67874   0.61774   0.64680      3338

    accuracy                        0.66440     10000
   macro avg    0.66531   0.66403   0.66358     10000
weighted avg    0.66525   0.66440   0.66372     10000

Macro-F1:  0.6635780903532524
MCC:  0.4972312254350886


03/29/2021 11:59:38 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64268   0.74609   0.69053      3387
           1    0.68930   0.61374   0.64933      3275
           2    0.66720   0.63002   0.64807      3338

    accuracy                        0.66400     10000
   macro avg    0.66639   0.66328   0.66265     10000
weighted avg    0.66613   0.66400   0.66287     10000

Macro-F1:  0.6626450418054821
MCC:  0.49730250689655464


03/29/2021 11:59:47 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64451   0.74727   0.69210      3387
           1    0.68159   0.61832   0.64841      3275
           2    0.67408   0.62642   0.64938      3338

    accuracy                        0.66470     10000
   macro avg    0.66673   0.66400   0.66330     10000
weighted avg    0.66652   0.66470   0.66353     10000

Macro-F1:  0.6632970717103316
MCC:  0.4983066086199807


03/29/2021 11:59:58 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65714   0.71243   0.68367      3387
           1    0.66114   0.64580   0.65338      3275
           2    0.66699   0.62522   0.64543      3338

    accuracy                        0.66150     10000
   macro avg    0.66176   0.66115   0.66083     10000
weighted avg    0.66174   0.66150   0.66099     10000

Macro-F1:  0.6608265504661784
MCC:  0.4925424050383119


03/29/2021 12:00:08 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64760   0.73192   0.68718      3387
           1    0.67851   0.61802   0.64685      3275
           2    0.66071   0.63122   0.64563      3338

    accuracy                        0.66100     10000
   macro avg    0.66227   0.66038   0.65989     10000
weighted avg    0.66210   0.66100   0.66010     10000

Macro-F1:  0.6598857927955543
MCC:  0.4922641084813974


03/29/2021 12:00:17 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65551   0.72306   0.68763      3387
           1    0.66263   0.63511   0.64858      3275
           2    0.66816   0.62552   0.64614      3338

    accuracy                        0.66170     10000
   macro avg    0.66210   0.66123   0.66078     10000
weighted avg    0.66207   0.66170   0.66099     10000

Macro-F1:  0.6607841360416072
MCC:  0.49299410956920975


03/29/2021 12:00:27 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64946   0.72808   0.68653      3387
           1    0.67442   0.62870   0.65076      3275
           2    0.66603   0.62852   0.64673      3338

    accuracy                        0.66230     10000
   macro avg    0.66330   0.66177   0.66134     10000
weighted avg    0.66317   0.66230   0.66153     10000

Macro-F1:  0.6613388583579778
MCC:  0.4940815062223823


03/29/2021 12:00:37 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64123   0.74934   0.69108      3387
           1    0.68529   0.61435   0.64788      3275
           2    0.67096   0.62433   0.64680      3338

    accuracy                        0.66340     10000
   macro avg    0.66583   0.66267   0.66192     10000
weighted avg    0.66558   0.66340   0.66215     10000

Macro-F1:  0.6619227951227197
MCC:  0.4965272981061995


03/29/2021 12:00:47 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65730   0.70446   0.68006      3387
           1    0.67443   0.61863   0.64533      3275
           2    0.64706   0.65249   0.64976      3338

    accuracy                        0.65900     10000
   macro avg    0.65960   0.65852   0.65838     10000
weighted avg    0.65949   0.65900   0.65857     10000

Macro-F1:  0.6583832433930298
MCC:  0.48871839834350655


03/29/2021 12:00:57 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65056   0.72335   0.68503      3387
           1    0.67752   0.61649   0.64556      3275
           2    0.65734   0.64080   0.64897      3338

    accuracy                        0.66080     10000
   macro avg    0.66181   0.66022   0.65985     10000
weighted avg    0.66165   0.66080   0.66007     10000

Macro-F1:  0.6598530856838609
MCC:  0.4917467571843642


03/29/2021 12:01:06 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65154   0.73310   0.68991      3387
           1    0.68588   0.62137   0.65203      3275
           2    0.66294   0.63990   0.65122      3338

    accuracy                        0.66540     10000
   macro avg    0.66679   0.66479   0.66439     10000
weighted avg    0.66659   0.66540   0.66459     10000

Macro-F1:  0.664389327150369
MCC:  0.4988302921055858


03/29/2021 12:01:16 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64474   0.73782   0.68815      3387
           1    0.69404   0.60122   0.64431      3275
           2    0.65500   0.64500   0.64996      3338

    accuracy                        0.66210     10000
   macro avg    0.66459   0.66135   0.66080     10000
weighted avg    0.66431   0.66210   0.66104     10000

Macro-F1:  0.6608046471128511
MCC:  0.4943225568655907


03/29/2021 12:01:26 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65703   0.71833   0.68632      3387
           1    0.67961   0.63084   0.65432      3275
           2    0.65766   0.64170   0.64958      3338

    accuracy                        0.66410     10000
   macro avg    0.66477   0.66363   0.66341     10000
weighted avg    0.66464   0.66410   0.66358     10000

Macro-F1:  0.6634056329921197
MCC:  0.49647618954807765


03/29/2021 12:01:36 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64798   0.73694   0.68960      3387
           1    0.67433   0.62718   0.64990      3275
           2    0.67118   0.62373   0.64658      3338

    accuracy                        0.66320     10000
   macro avg    0.66449   0.66261   0.66203     10000
weighted avg    0.66435   0.66320   0.66224     10000

Macro-F1:  0.6620263432920798
MCC:  0.4956788737188715


03/29/2021 12:01:45 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64788   0.73989   0.69083      3387
           1    0.68930   0.60763   0.64589      3275
           2    0.65917   0.64080   0.64986      3338

    accuracy                        0.66350     10000
   macro avg    0.66545   0.66277   0.66219     10000
weighted avg    0.66521   0.66350   0.66244     10000

Macro-F1:  0.6621945955811749
MCC:  0.4962966471523401


03/29/2021 12:01:55 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64801   0.73162   0.68728      3387
           1    0.68521   0.61679   0.64920      3275
           2    0.65830   0.63661   0.64727      3338

    accuracy                        0.66230     10000
   macro avg    0.66384   0.66167   0.66125     10000
weighted avg    0.66363   0.66230   0.66146     10000

Macro-F1:  0.6612539052588983
MCC:  0.4942346843684296


03/29/2021 12:02:05 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64727   0.73900   0.69010      3387
           1    0.67389   0.63603   0.65441      3275
           2    0.67883   0.61863   0.64734      3338

    accuracy                        0.66510     10000
   macro avg    0.66666   0.66456   0.66395     10000
weighted avg    0.66652   0.66510   0.66414     10000

Macro-F1:  0.6639505035507924
MCC:  0.4986570234026075


03/29/2021 12:02:15 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65023   0.73605   0.69049      3387
           1    0.68798   0.62412   0.65450      3275
           2    0.66854   0.63990   0.65391      3338

    accuracy                        0.66730     10000
   macro avg    0.66892   0.66669   0.66630     10000
weighted avg    0.66871   0.66730   0.66649     10000

Macro-F1:  0.6662986251216498
MCC:  0.5017749595289251


03/29/2021 12:02:25 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65892   0.71125   0.68408      3387
           1    0.67831   0.63420   0.65552      3275
           2    0.65631   0.64530   0.65076      3338

    accuracy                        0.66400     10000
   macro avg    0.66451   0.66358   0.66345     10000
weighted avg    0.66440   0.66400   0.66360     10000

Macro-F1:  0.6634513340617684
MCC:  0.49620156788358893


03/29/2021 12:02:35 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65375   0.72749   0.68865      3387
           1    0.68735   0.61893   0.65135      3275
           2    0.65783   0.64679   0.65227      3338

    accuracy                        0.66500     10000
   macro avg    0.66631   0.66440   0.66409     10000
weighted avg    0.66612   0.66500   0.66429     10000

Macro-F1:  0.6640894513754211
MCC:  0.4981124658568643


03/29/2021 12:02:46 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.66143   0.70830   0.68406      3387
           1    0.67867   0.62748   0.65207      3275
           2    0.64963   0.65099   0.65031      3338

    accuracy                        0.66270     10000
   macro avg    0.66324   0.66226   0.66215     10000
weighted avg    0.66313   0.66270   0.66232     10000

Macro-F1:  0.662145880546711
MCC:  0.494235155012506


03/29/2021 12:02:55 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65741   0.71332   0.68423      3387
           1    0.68084   0.63053   0.65472      3275
           2    0.65492   0.64590   0.65038      3338

    accuracy                        0.66370     10000
   macro avg    0.66439   0.66325   0.66311     10000
weighted avg    0.66426   0.66370   0.66327     10000

Macro-F1:  0.6631088885308506
MCC:  0.49581494503189133


03/29/2021 12:03:05 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64920   0.73871   0.69106      3387
           1    0.67624   0.63969   0.65746      3275
           2    0.67552   0.61684   0.64485      3338

    accuracy                        0.66560     10000
   macro avg    0.66699   0.66508   0.66446     10000
weighted avg    0.66684   0.66560   0.66463     10000

Macro-F1:  0.6644580131921367
MCC:  0.4993484003137682


03/29/2021 12:03:14 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.64667   0.74786   0.69359      3387
           1    0.68099   0.62901   0.65397      3275
           2    0.67528   0.61863   0.64572      3338

    accuracy                        0.66580     10000
   macro avg    0.66765   0.66517   0.66443     10000
weighted avg    0.66746   0.66580   0.66463     10000

Macro-F1:  0.6644256261799607
MCC:  0.4999248763961842


03/29/2021 12:03:23 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65111   0.73723   0.69150      3387
           1    0.68327   0.62840   0.65468      3275
           2    0.66698   0.63002   0.64797      3338

    accuracy                        0.66580     10000
   macro avg    0.66712   0.66522   0.66472     10000
weighted avg    0.66694   0.66580   0.66491     10000

Macro-F1:  0.6647188623603116
MCC:  0.49950475934736216


03/29/2021 12:03:33 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.66061   0.70800   0.68348      3387
           1    0.68509   0.62840   0.65552      3275
           2    0.64706   0.65249   0.64976      3338

    accuracy                        0.66340     10000
   macro avg    0.66425   0.66296   0.66292     10000
weighted avg    0.66410   0.66340   0.66307     10000

Macro-F1:  0.6629209003648374
MCC:  0.4953285108579747


03/29/2021 12:03:43 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65213   0.73280   0.69012      3387
           1    0.68710   0.62290   0.65343      3275
           2    0.66078   0.63841   0.64940      3338

    accuracy                        0.66530     10000
   macro avg    0.66667   0.66470   0.66431     10000
weighted avg    0.66647   0.66530   0.66451     10000

Macro-F1:  0.6643136070850221
MCC:  0.4986595970344367


03/29/2021 12:03:53 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65806   0.72276   0.68890      3387
           1    0.67822   0.63908   0.65807      3275
           2    0.66343   0.63481   0.64881      3338

    accuracy                        0.66600     10000
   macro avg    0.66657   0.66555   0.66526     10000
weighted avg    0.66646   0.66600   0.66542     10000

Macro-F1:  0.6652589260507734
MCC:  0.49936816690680474


03/29/2021 12:04:03 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65109   0.73221   0.68927      3387
           1    0.67221   0.64183   0.65667      3275
           2    0.67657   0.62103   0.64761      3338

    accuracy                        0.66550     10000
   macro avg    0.66662   0.66502   0.66452     10000
weighted avg    0.66651   0.66550   0.66469     10000

Macro-F1:  0.6645172434012742
MCC:  0.4990081835542608


03/29/2021 12:04:13 - INFO - __main__ - ***** Evaluation Interval Hit *****


              precision    recall  f1-score   support

           0    0.65040   0.74373   0.69394      3387
           1    0.68909   0.61924   0.65230      3275
           2    0.66740   0.63661   0.65164      3338

    accuracy                        0.66720     10000
   macro avg    0.66896   0.66652   0.66596     10000
weighted avg    0.66875   0.66720   0.66618     10000

Macro-F1:  0.6659599232764122
MCC:  0.5018146685066798


#### Evaluations with frequency-matched scrambling

In [None]:
# Evaluate the trained bag-of-words classifier on the validation split that was
# scrambled with the "matched" (frequency-matched) corruption method.

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
corrupt_method = "matched"
data_file_name = task_name if task_name != "sst3" else "sst-tenary"
corrupt_datasets = get_dataset(f"../data-files/{data_file_name}-corrupted-{corrupt_method}")
# Report the sizes of the corrupted splits that were just loaded (rather than the
# original `datasets`), so the "Verify" lines describe the data evaluated below.
logger.info("***** Train Sample Count (Verify): %s *****", len(corrupt_datasets["train"]))
logger.info("***** Valid Sample Count (Verify): %s *****", len(corrupt_datasets["validation"]))

# The vocabulary size does not change inside the loop, so look it up once.
vocab_size = len(original_vocab)

corrupt_validation_input_features = []
corrupt_validation_label_ids = []
for (ex_index, example) in enumerate(tqdm(corrupt_datasets["validation"])):
    if sentence2_key is None:
        # Single-sentence task: one token-count vector per example.
        bow_feature = torch.zeros(vocab_size)
        # Reset for every example. Without this, an example rejected by the sanity
        # check would silently reuse whatever text was assigned last.
        sentence_combined = ""
        if sanity_check_non_empty(example[sentence1_key]):
            sentence_combined = example[sentence1_key]
        sentence_tokens = modified_basic_tokenizer.tokenize(sentence_combined)
        if ex_index % 50000 == 0:
            print("Example sentence: " + sentence_combined)
        for t in sentence_tokens:
            bow_feature[original_vocab[t]] += 1
    else:
        # Sentence-pair task: count each sentence separately, then concatenate.
        bow_feature_1 = torch.zeros(vocab_size)
        bow_feature_2 = torch.zeros(vocab_size)
        s1 = ""
        s2 = ""
        if sanity_check_non_empty(example[sentence1_key]):
            s1 = example[sentence1_key]
        if sanity_check_non_empty(example[sentence2_key]):
            s2 = example[sentence2_key]
        s1_tokens = modified_basic_tokenizer.tokenize(s1)
        s2_tokens = modified_basic_tokenizer.tokenize(s2)
        if ex_index % 50000 == 0:
            print("Example sentence 1: " + s1)
            print("Example sentence 2: " + s2)
        for t in s1_tokens:
            bow_feature_1[original_vocab[t]] += 1
        for t in s2_tokens:
            bow_feature_2[original_vocab[t]] += 1
        bow_feature = torch.cat([bow_feature_1, bow_feature_2], dim=-1)
    corrupt_validation_input_features.append(bow_feature)
    corrupt_validation_label_ids.append(example["label"])

# `.float()` instead of `torch.tensor(existing_tensor, ...)`: the latter
# copy-constructs from a tensor and emits a PyTorch UserWarning.
corrupt_validation_input_features = torch.stack(corrupt_validation_input_features, dim=0).float()
corrupt_validation_label_ids = torch.tensor(corrupt_validation_label_ids, dtype=torch.long)
corrupt_validation_data = TensorDataset(corrupt_validation_input_features, corrupt_validation_label_ids)
corrupt_validation_dataloader = DataLoader(corrupt_validation_data, batch_size=per_device_eval_batch_size*n_gpu, shuffle=False)

logger.info("***** Evaluation With Corrupt Data *****")
model.eval()
all_logits = []     # despite the name, collects predicted class ids per batch
all_label_ids = []  # gold label ids per batch
with torch.no_grad():
    for step, batch in enumerate(corrupt_validation_dataloader):
        if torch.cuda.is_available() and not no_cuda:
            torch.cuda.empty_cache()

        input_features, label_ids = batch

        if torch.cuda.is_available() and not no_cuda:
            input_features = input_features.to(device)
            label_ids = label_ids.to(device)

        loss, logits = model(input_features, labels=label_ids)
        # Softmax is monotonic, so the argmax of the raw logits is already the
        # predicted class; no need to normalise first.
        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        outputs = np.argmax(logits, axis=1)
        all_logits.append(outputs)
        all_label_ids.append(label_ids)

all_logits = np.concatenate(all_logits, axis=0)
all_label_ids = np.concatenate(all_label_ids, axis=0)
result_to_save = classification_report(all_label_ids, all_logits, digits=5, output_dict=True)
print(classification_report(all_label_ids, all_logits, digits=5))
print("Macro-F1: ", result_to_save["macro avg"]["f1-score"])

#### Evaluations with frequency-unmatched scrambling

In [None]:
# Evaluate the trained bag-of-words classifier on the validation split that was
# scrambled with the "mismatched" (frequency-unmatched) corruption method.

# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
corrupt_method = "mismatched"
data_file_name = task_name if task_name != "sst3" else "sst-tenary"
corrupt_datasets = get_dataset(f"../data-files/{data_file_name}-corrupted-{corrupt_method}")
# Report the sizes of the corrupted splits that were just loaded (rather than the
# original `datasets`), so the "Verify" lines describe the data evaluated below.
logger.info("***** Train Sample Count (Verify): %s *****", len(corrupt_datasets["train"]))
logger.info("***** Valid Sample Count (Verify): %s *****", len(corrupt_datasets["validation"]))

# The vocabulary size does not change inside the loop, so look it up once.
vocab_size = len(original_vocab)

corrupt_validation_input_features = []
corrupt_validation_label_ids = []
for (ex_index, example) in enumerate(tqdm(corrupt_datasets["validation"])):
    if sentence2_key is None:
        # Single-sentence task: one token-count vector per example.
        bow_feature = torch.zeros(vocab_size)
        # Reset for every example. Without this, an example rejected by the sanity
        # check would silently reuse whatever text was assigned last.
        sentence_combined = ""
        if sanity_check_non_empty(example[sentence1_key]):
            sentence_combined = example[sentence1_key]
        sentence_tokens = modified_basic_tokenizer.tokenize(sentence_combined)
        if ex_index % 50000 == 0:
            print("Example sentence: " + sentence_combined)
        for t in sentence_tokens:
            bow_feature[original_vocab[t]] += 1
    else:
        # Sentence-pair task: count each sentence separately, then concatenate.
        bow_feature_1 = torch.zeros(vocab_size)
        bow_feature_2 = torch.zeros(vocab_size)
        s1 = ""
        s2 = ""
        if sanity_check_non_empty(example[sentence1_key]):
            s1 = example[sentence1_key]
        if sanity_check_non_empty(example[sentence2_key]):
            s2 = example[sentence2_key]
        s1_tokens = modified_basic_tokenizer.tokenize(s1)
        s2_tokens = modified_basic_tokenizer.tokenize(s2)
        if ex_index % 50000 == 0:
            print("Example sentence 1: " + s1)
            print("Example sentence 2: " + s2)
        for t in s1_tokens:
            bow_feature_1[original_vocab[t]] += 1
        for t in s2_tokens:
            bow_feature_2[original_vocab[t]] += 1
        bow_feature = torch.cat([bow_feature_1, bow_feature_2], dim=-1)
    corrupt_validation_input_features.append(bow_feature)
    corrupt_validation_label_ids.append(example["label"])

# `.float()` instead of `torch.tensor(existing_tensor, ...)`: the latter
# copy-constructs from a tensor and emits a PyTorch UserWarning.
corrupt_validation_input_features = torch.stack(corrupt_validation_input_features, dim=0).float()
corrupt_validation_label_ids = torch.tensor(corrupt_validation_label_ids, dtype=torch.long)
corrupt_validation_data = TensorDataset(corrupt_validation_input_features, corrupt_validation_label_ids)
corrupt_validation_dataloader = DataLoader(corrupt_validation_data, batch_size=per_device_eval_batch_size*n_gpu, shuffle=False)

logger.info("***** Evaluation With Corrupt Data *****")
model.eval()
all_logits = []     # despite the name, collects predicted class ids per batch
all_label_ids = []  # gold label ids per batch
with torch.no_grad():
    for step, batch in enumerate(corrupt_validation_dataloader):
        if torch.cuda.is_available() and not no_cuda:
            torch.cuda.empty_cache()

        input_features, label_ids = batch

        if torch.cuda.is_available() and not no_cuda:
            input_features = input_features.to(device)
            label_ids = label_ids.to(device)

        loss, logits = model(input_features, labels=label_ids)
        # Softmax is monotonic, so the argmax of the raw logits is already the
        # predicted class; no need to normalise first.
        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        outputs = np.argmax(logits, axis=1)
        all_logits.append(outputs)
        all_label_ids.append(label_ids)

all_logits = np.concatenate(all_logits, axis=0)
all_label_ids = np.concatenate(all_label_ids, axis=0)
result_to_save = classification_report(all_label_ids, all_logits, digits=5, output_dict=True)
print(classification_report(all_label_ids, all_logits, digits=5))
print("Macro-F1: ", result_to_save["macro avg"]["f1-score"])

#### Random guessing baseline
If we guess the labels at random, what performance do we get?

In [None]:
# getting avg mF1 on the dataset with a dummy classifier
import numpy as np
from sklearn.dummy import DummyClassifier

# Score a "stratified" DummyClassifier on the validation split `runs` times
# and aggregate macro-F1 and MCC over the runs.
runs = 100
mf1s, mccs = [], []
for run_idx in range(runs):
    dummy_clf = DummyClassifier(strategy="stratified")
    dummy_clf.fit(validation_input_features, validation_label_ids)
    dummy_labels = dummy_clf.predict(validation_input_features)

    # Per-run scores of the dummy predictions against the gold labels.
    result_to_save = classification_report(validation_label_ids, dummy_labels, digits=5, output_dict=True)
    mf1s.append(result_to_save["macro avg"]["f1-score"])
    mcc = matthews_corrcoef(validation_label_ids, dummy_labels)
    mccs.append(mcc)

# The full report shown here is for the last run only; the averages cover all runs.
print(classification_report(validation_label_ids, dummy_labels, digits=5))
print(f"AVG over {runs} runs mF1: {round(sum(mf1s)/len(mf1s), 6)}.")
print("Standard Deviation of sample is % s " % (statistics.stdev(mf1s)))
print(f"AVG over {runs} runs MCC: {round(sum(mccs)/len(mccs), 6)}.")


#### FrequencyBoW classifiers

In [None]:
# Task configuration: task name, label count, and the on-disk file stem per task.
task_name = "sst3"
num_labels = 3
FILENAME_CONFIG = {
    "sst3" : "sst-tenary"
}

# Read the train/dev/test TSV files of the task first ...
train_df, eval_df, test_df = [
    pd.read_csv(
        os.path.join(
            external_output_dirname,
            FILENAME_CONFIG[task_name],
            f"{FILENAME_CONFIG[task_name]}-{split_name}.tsv",
        ),
        delimiter="\t",
    )
    for split_name in ("train", "dev", "test")
]

# ... then wrap each frame as a `datasets.Dataset` (the `*_df` names are kept
# because later cells iterate over them).
train_df, eval_df, test_df = (
    Dataset.from_pandas(frame) for frame in (train_df, eval_df, test_df)
)

In [None]:
modified_basic_tokenizer = ModifiedBasicTokenizer()
label_vocab_map = {}
token_frequency_map = {} # overwrite this every time for a new dataset


def _accumulate_token_stats(dataset, tokenizer, label_vocab, token_freq):
    """Add the tokens of every non-blank example in `dataset` to the running statistics.

    Args:
        dataset: iterable of examples exposing 'text' and 'label' entries.
        tokenizer: object whose `tokenize(str)` returns a list of tokens.
        label_vocab: dict updated in place, label -> list of all tokens seen
            under that label (with repetitions).
        token_freq: dict updated in place, token -> occurrence count.
    """
    for example_idx, example in enumerate(dataset):
        if example_idx % 10000 == 0 and example_idx != 0:
            print(f"processing #{example_idx} example...")
        sentence = example['text']
        if len(sentence.strip()) == 0:
            # Blank sentences contribute nothing.
            continue
        tokens = tokenizer.tokenize(sentence)
        label_vocab.setdefault(example['label'], []).extend(tokens)
        for token in tokens:
            token_freq[token] = token_freq.get(token, 0) + 1


# Statistics are gathered over all three splits, in train -> dev -> test order.
for split_dataset in (train_df, eval_df, test_df):
    _accumulate_token_stats(split_dataset, modified_basic_tokenizer, label_vocab_map, token_frequency_map)

# Token -> count, ordered from most to least frequent (ties keep first-seen order).
task_token_frequency_map = sorted(token_frequency_map.items(), key=operator.itemgetter(1), reverse=True)
task_token_frequency_map = OrderedDict(task_token_frequency_map)

training BoW with 1st order frequency bins

In [None]:
# Distinct token frequencies (ascending) and log-spaced bucket edges over their range.
freq_set = sorted(set(task_token_frequency_map.values()))
bucket_count = 256
freq_bucket = np.logspace(
    math.log(freq_set[0], 10),
    math.log(freq_set[-1], 10),
    bucket_count,
    endpoint=True,
)
# Drop the top edge and round the remaining edges up to integers.
freq_bucket = [math.ceil(edge) for edge in freq_bucket[:-1]]
# finally the bucket is a map between freq and bucket number
def find_bucket_number(freq, freq_bucket):
    """Return the 1-based position of the first edge in `freq_bucket` that `freq` does not exceed.

    When `freq` is greater than every edge, `len(freq_bucket)` is returned
    (which is 0 for an empty edge list).
    """
    return next(
        (position for position, edge in enumerate(freq_bucket, start=1) if not freq > edge),
        len(freq_bucket),
    )

# Every distinct frequency gets its own bucket, numbered in ascending
# frequency order (the log-spaced `find_bucket_number` lookup is not used here).
freq_bucket_map = {freq: bucket_idx for bucket_idx, freq in enumerate(freq_set)}
new_bucket_idx = len(freq_set)

bucket_length = new_bucket_idx # len(freq_bucket)

In [None]:
# Prepare a randomly ordered vocabulary so words can be dealt into random buckets.
# NOTE(review): no seed is set in this cell; the shuffle is only reproducible
# if `random` was seeded earlier in the notebook - confirm.
vocab = list(task_token_frequency_map.keys())
# freq_count: frequency value -> number of distinct tokens having that frequency.
freq_count = {}
for frequency in task_token_frequency_map.values():
    freq_count[frequency] = freq_count.get(frequency, 0) + 1
random.shuffle(vocab)
bucket_length = 600
def split(a, n):
    """Lazily cut sequence `a` into `n` consecutive slices of near-equal length.

    The first `len(a) % n` slices are one element longer than the rest.
    Returns a generator over the slices.
    """
    base_size, remainder = divmod(len(a), n)

    def _chunks():
        start = 0
        for chunk_idx in range(n):
            stop = start + base_size + (1 if chunk_idx < remainder else 0)
            yield a[start:stop]
            start = stop

    return _chunks()
# Deal the shuffled vocabulary into `bucket_length` slices and record, for
# every word, the index of the slice it landed in.
bucket_vocab_random = split(vocab, bucket_length)
random_bucket_vocab_map = {}
bucket_id = 0
for bucket_words in bucket_vocab_random:
    random_bucket_vocab_map.update(dict.fromkeys(bucket_words, bucket_id))
    bucket_id += 1

In [None]:
# BoW feature vectors for the train split.
# The active code indexes features by each token's random bucket
# (`random_bucket_vocab_map`); the frequency-bucket variant is disabled.
train_input_features = []
train_label_ids = []
for (ex_index, example) in enumerate(tqdm(train_df)):
    bow_feature = torch.zeros(bucket_length)
    if sentence2_key is None:
        sentence_combined = example[sentence1_key]
    else:
        sentence_combined = example[sentence1_key] + " [SEP] " + example[sentence2_key]
    sentence_tokens = modified_basic_tokenizer.tokenize(sentence_combined)
    sentence_tokens = sentence_tokens[:max_length]
    for t in sentence_tokens:
        # Disabled alternative: bow_feature[freq_bucket_map[token_frequency_map[t]]] = 1
        # (not a bucket count; aggregated info contains word identity).
        # Plain assignment: the slot is a presence indicator, not a running count.
        # NOTE(review): if a count was intended this should be `+= 1` - confirm.
        bow_feature[random_bucket_vocab_map[t]] = 1
    if ex_index % 50000 == 0:
        print("Example sentence: " + sentence_combined)
        print(bow_feature)
    train_input_features.append(bow_feature)
    train_label_ids.append(example["label"])

# `.to(dtype=...)` replaces `torch.tensor(existing_tensor)`, which emits a
# copy-construct UserWarning and copies the stacked tensor a second time.
train_input_features = torch.stack(train_input_features, dim=0).to(dtype=torch.float)
train_label_ids = torch.tensor(train_label_ids, dtype=torch.long)
train_data = TensorDataset(train_input_features, train_label_ids)

In [None]:
# BoW feature vectors for the validation split.
# The active code indexes features by each token's random bucket
# (`random_bucket_vocab_map`); the frequency-bucket variant is disabled.
validation_input_features = []
validation_label_ids = []
for (ex_index, example) in enumerate(tqdm(eval_df)):
    bow_feature = torch.zeros(bucket_length)
    if sentence2_key is None:
        sentence_combined = example[sentence1_key]
    else:
        sentence_combined = example[sentence1_key] + " [SEP] " + example[sentence2_key]
    sentence_tokens = modified_basic_tokenizer.tokenize(sentence_combined)
    sentence_tokens = sentence_tokens[:max_length]
    for t in sentence_tokens:
        # Disabled alternative: bow_feature[freq_bucket_map[token_frequency_map[t]]] = 1
        # Plain assignment: the slot is a presence indicator, not a running count.
        # NOTE(review): if a count was intended this should be `+= 1` - confirm.
        bow_feature[random_bucket_vocab_map[t]] = 1
    validation_input_features.append(bow_feature)
    validation_label_ids.append(example["label"])

# `.to(dtype=...)` replaces `torch.tensor(existing_tensor)`, which emits a
# copy-construct UserWarning and copies the stacked tensor a second time.
validation_input_features = torch.stack(validation_input_features, dim=0).to(dtype=torch.float)
validation_label_ids = torch.tensor(validation_label_ids, dtype=torch.long)
validation_data = TensorDataset(validation_input_features, validation_label_ids)

In [None]:
# Data loaders: random order for training, fixed order for validation.
# max(1, n_gpu) keeps the batch size positive when n_gpu == 0 (CPU-only run);
# DataLoader rejects a batch size of 0.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=per_device_train_batch_size * max(1, n_gpu),
)
validation_dataloader = DataLoader(
    validation_data,
    batch_size=per_device_eval_batch_size * max(1, n_gpu),
    shuffle=False,
)

In [None]:
# Hyper-parameter overrides plus a fresh classifier and optimizer for this run.
lr = 1e-3
num_train_epochs = 20
model = BOWClassifier(
    len(validation_label_ids.unique()),  # one output per distinct validation label
    bucket_length,
)
optimizer = optim.Adam(model.parameters(), lr=lr)
# NOTE(review): the model is wrapped but never explicitly moved to `device`
# in this cell - confirm that DataParallel placement is sufficient on multi-GPU.
if not no_cuda and n_gpu > 0:
    model = torch.nn.DataParallel(model)

In [None]:
# Train the classifier and score it on the validation split every 500
# optimizer steps (including step 0), remembering the best macro-F1 seen.
global_step = 0
max_score = -1
for _ in range(int(num_train_epochs)):

    model.train()
    for step, batch in enumerate(train_dataloader):
        if torch.cuda.is_available() and not no_cuda:
            torch.cuda.empty_cache()

        input_features, label_ids = batch

        if torch.cuda.is_available() and not no_cuda:
            input_features = input_features.to(device)
            label_ids = label_ids.to(device)

        # The model is called with labels and returns a (loss, logits) pair.
        loss, _ = model(input_features, labels=label_ids)

        if n_gpu > 1:
            loss = loss.mean() # mean() to average on multi-gpu.
        loss.backward()

        optimizer.step()
        model.zero_grad()

        if global_step % 500 == 0:
            model.eval()
            # Despite the name, `all_logits` collects predicted class ids.
            all_logits = []
            all_label_ids = []
            with torch.no_grad():
                # Distinct names so the evaluation pass does not clobber the
                # training loop's step / batch / label variables.
                for eval_features, eval_label_ids in validation_dataloader:
                    if torch.cuda.is_available() and not no_cuda:
                        torch.cuda.empty_cache()
                        eval_features = eval_features.to(device)
                        eval_label_ids = eval_label_ids.to(device)

                    eval_loss, logits = model(eval_features, labels=eval_label_ids)
                    logits = F.softmax(logits, dim=-1)
                    logits = logits.detach().cpu().numpy()
                    outputs = np.argmax(logits, axis=1)
                    all_logits.append(outputs)
                    all_label_ids.append(eval_label_ids.to('cpu').numpy())

            all_logits = np.concatenate(all_logits, axis=0)
            all_label_ids = np.concatenate(all_label_ids, axis=0)
            result_to_save = classification_report(all_label_ids, all_logits, digits=5, output_dict=True)
            print("Macro-F1: ", result_to_save["macro avg"]["f1-score"])
            if result_to_save["macro avg"]["f1-score"] > max_score:
                max_score = result_to_save["macro avg"]["f1-score"]

            # Restore training mode; without this the remainder of the epoch
            # would run with the model still in eval mode.
            model.train()

        global_step += 1
print("Best Macro-F1: ", max_score)

training BoW with 1st and 2nd order frequency bins

In [None]:
# Coarser log-spaced partition of the first-order token frequencies,
# used as the per-token bucket when pairing tokens.
second_order_freq_set = sorted(set(task_token_frequency_map.values()))
temp_bucket_count = 24
second_order_freq_bucket = np.logspace(
    math.log(second_order_freq_set[0], 10),
    math.log(second_order_freq_set[-1], 10),
    temp_bucket_count+1,
    endpoint=True,
)
# Drop the top edge and round the remaining edges up to integers.
second_order_freq_bucket = [math.ceil(edge) for edge in second_order_freq_bucket[:-1]]
# finally the bucket is a map between freq and bucket number
def find_bucket_number(freq, freq_bucket):
    """Map `freq` to a 1-based bucket number given bucket edges `freq_bucket`.

    The result is the position of the first edge that is not smaller than
    `freq`; if `freq` exceeds all edges, the number of edges is returned.
    """
    edge_total = len(freq_bucket)
    for bucket_number, edge in zip(range(1, edge_total + 1), freq_bucket):
        if not freq > edge:
            return bucket_number
    return edge_total

# Frequency value -> coarse bucket number, for every distinct token frequency.
second_order_freq_bucket_map = {
    freq: find_bucket_number(freq, second_order_freq_bucket)
    for freq in second_order_freq_set
}

In [None]:
modified_basic_tokenizer = ModifiedBasicTokenizer()
token_freq_freq_map = {} # overwrite this every time for a new dataset


def _count_bucket_pairs(dataset, tokenizer, pair_counts):
    """Count, for every token pair within a sentence, the unordered pair of their frequency buckets.

    Args:
        dataset: iterable of examples exposing a 'text' entry.
        tokenizer: object whose `tokenize(str)` returns a list of tokens.
        pair_counts: dict updated in place, (smaller_bucket, larger_bucket)
            -> number of token pairs seen with that bucket combination.
    """
    for example_idx, example in enumerate(dataset):
        if example_idx % 10000 == 0 and example_idx != 0:
            print(f"processing #{example_idx} example...")
        sentence = example['text']
        if len(sentence.strip()) == 0:
            # Blank sentences contribute nothing.
            continue
        tokens = tokenizer.tokenize(sentence)
        # Resolve each token's coarse frequency bucket once, not once per pair.
        token_buckets = [second_order_freq_bucket_map[token_frequency_map[token]] for token in tokens]
        for left in range(len(token_buckets) - 1):
            for right in range(left + 1, len(token_buckets)):
                # Sort so (a, b) and (b, a) are counted under the same key.
                bucket_pair = tuple(sorted((token_buckets[left], token_buckets[right])))
                pair_counts[bucket_pair] = pair_counts.get(bucket_pair, 0) + 1


# Pair statistics are gathered over all three splits, in train -> dev -> test order.
for split_dataset in (train_df, eval_df, test_df):
    _count_bucket_pairs(split_dataset, modified_basic_tokenizer, token_freq_freq_map)

# Bucket pair -> count, ordered from most to least frequent (ties keep first-seen order).
task_token_freq_freq_map = sorted(token_freq_freq_map.items(), key=operator.itemgetter(1), reverse=True)
task_token_freq_freq_map = OrderedDict(task_token_freq_freq_map)

In [None]:
# Partition the second-order (bucket-pair) counts: every distinct pair count
# gets its own bucket index, assigned in ascending count order.
# (A log-spaced bucketing of these counts was tried previously and is disabled.)
second_order_freq_freq_set = sorted(set(task_token_freq_freq_map.values()))
second_order_freq_freq_bucket_map = {
    pair_count: bucket_idx
    for bucket_idx, pair_count in enumerate(second_order_freq_freq_set)
}
new_bucket_idx = len(second_order_freq_freq_set)
freq_bucket_map = {}  # the first-order map is cleared here

bucket_length = new_bucket_idx # len(freq_bucket)
# the code above creates second order buckets, now we can create second order BoW vectors!
# the code above create second order buckets, now we can create second order BoW vectors!

In [None]:
# FBoW feature vectors for the train split.
# Only the second-order (token-pair) buckets are filled; the first-order part
# is disabled, so nothing is concatenated.
train_input_features = []
train_label_ids = []
for (ex_index, example) in enumerate(tqdm(train_df)):
    bow_feature = torch.zeros(bucket_length)
    if sentence2_key is None:
        sentence_combined = example[sentence1_key]
    else:
        sentence_combined = example[sentence1_key] + " [SEP] " + example[sentence2_key]
    sentence_tokens = modified_basic_tokenizer.tokenize(sentence_combined)
    sentence_tokens = sentence_tokens[:max_length]
    # Disabled first-order variant:
    #     for t in sentence_tokens:
    #         bow_feature[freq_bucket_map[token_frequency_map[t]]-1] += 1 # bucket count
    # Second order: for every token pair, look up the (sorted) pair of coarse
    # frequency buckets, then the bucket of that pair's corpus count.
    for i in range(len(sentence_tokens)-1):
        for j in range(i+1, len(sentence_tokens)):
            t1 = sentence_tokens[i]
            t2 = sentence_tokens[j]
            index_tuple = tuple(sorted((
                second_order_freq_bucket_map[token_frequency_map[t1]],
                second_order_freq_bucket_map[token_frequency_map[t2]],
            )))
            second_order_bucket = second_order_freq_freq_bucket_map[task_token_freq_freq_map[index_tuple]]
            bow_feature[second_order_bucket] += 1 # bucket count

    if ex_index % 50000 == 0:
        print("Example sentence: " + sentence_combined)
        print(bow_feature)
    train_input_features.append(bow_feature)
    train_label_ids.append(example["label"])

# `.to(dtype=...)` replaces `torch.tensor(existing_tensor)`, which emits a
# copy-construct UserWarning and copies the stacked tensor a second time.
train_input_features = torch.stack(train_input_features, dim=0).to(dtype=torch.float)
train_label_ids = torch.tensor(train_label_ids, dtype=torch.long)
train_data = TensorDataset(train_input_features, train_label_ids)

In [None]:
# FBoW feature vectors for the validation split.
# Only the second-order (token-pair) buckets are filled; the first-order part
# is disabled.
validation_input_features = []
validation_label_ids = []
for (ex_index, example) in enumerate(tqdm(eval_df)):
    bow_feature = torch.zeros(bucket_length)
    if sentence2_key is None:
        sentence_combined = example[sentence1_key]
    else:
        sentence_combined = example[sentence1_key] + " [SEP] " + example[sentence2_key]
    sentence_tokens = modified_basic_tokenizer.tokenize(sentence_combined)
    sentence_tokens = sentence_tokens[:max_length]
    # Disabled first-order variant:
    #     for t in sentence_tokens:
    #         bow_feature[freq_bucket_map[token_frequency_map[t]]] += 1 # bucket count
    # Second order: for every token pair, look up the (sorted) pair of coarse
    # frequency buckets, then the bucket of that pair's corpus count.
    for i in range(len(sentence_tokens)-1):
        for j in range(i+1, len(sentence_tokens)):
            t1 = sentence_tokens[i]
            t2 = sentence_tokens[j]
            index_tuple = tuple(sorted((
                second_order_freq_bucket_map[token_frequency_map[t1]],
                second_order_freq_bucket_map[token_frequency_map[t2]],
            )))
            second_order_bucket = second_order_freq_freq_bucket_map[task_token_freq_freq_map[index_tuple]]
            bow_feature[second_order_bucket] += 1 # bucket count

    validation_input_features.append(bow_feature)
    validation_label_ids.append(example["label"])

# `.to(dtype=...)` replaces `torch.tensor(existing_tensor)`, which emits a
# copy-construct UserWarning and copies the stacked tensor a second time.
validation_input_features = torch.stack(validation_input_features, dim=0).to(dtype=torch.float)
validation_label_ids = torch.tensor(validation_label_ids, dtype=torch.long)
validation_data = TensorDataset(validation_input_features, validation_label_ids)

In [None]:
# Data loaders: random order for training, fixed order for validation.
# max(1, n_gpu) keeps the batch size positive when n_gpu == 0 (CPU-only run);
# DataLoader rejects a batch size of 0.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=per_device_train_batch_size * max(1, n_gpu),
)
validation_dataloader = DataLoader(
    validation_data,
    batch_size=per_device_eval_batch_size * max(1, n_gpu),
    shuffle=False,
)

In [None]:
# Start over with a fresh classifier and optimizer sized for the new feature width.
lr = 1e-3
model = BOWClassifier(
    len(validation_label_ids.unique()),  # one output per distinct validation label
    bucket_length,
)
optimizer = optim.Adam(model.parameters(), lr=lr)
# NOTE(review): the model is wrapped but never explicitly moved to `device`
# in this cell - confirm that DataParallel placement is sufficient on multi-GPU.
if not no_cuda and n_gpu > 0:
    model = torch.nn.DataParallel(model)

In [None]:
# Train the classifier and score it on the validation split every 500
# optimizer steps (including step 0), remembering the best macro-F1 seen.
global_step = 0
num_train_epochs = 20
max_score = -1
for _ in range(int(num_train_epochs)):

    model.train()
    for step, batch in enumerate(train_dataloader):
        if torch.cuda.is_available() and not no_cuda:
            torch.cuda.empty_cache()

        input_features, label_ids = batch

        if torch.cuda.is_available() and not no_cuda:
            input_features = input_features.to(device)
            label_ids = label_ids.to(device)

        # The model is called with labels and returns a (loss, logits) pair.
        loss, _ = model(input_features, labels=label_ids)

        if n_gpu > 1:
            loss = loss.mean() # mean() to average on multi-gpu.
        loss.backward()

        optimizer.step()
        model.zero_grad()

        if global_step % 500 == 0:
            model.eval()
            # Despite the name, `all_logits` collects predicted class ids.
            all_logits = []
            all_label_ids = []
            with torch.no_grad():
                # Distinct names so the evaluation pass does not clobber the
                # training loop's step / batch / label variables.
                for eval_features, eval_label_ids in validation_dataloader:
                    if torch.cuda.is_available() and not no_cuda:
                        torch.cuda.empty_cache()
                        eval_features = eval_features.to(device)
                        eval_label_ids = eval_label_ids.to(device)

                    eval_loss, logits = model(eval_features, labels=eval_label_ids)
                    logits = F.softmax(logits, dim=-1)
                    logits = logits.detach().cpu().numpy()
                    outputs = np.argmax(logits, axis=1)
                    all_logits.append(outputs)
                    all_label_ids.append(eval_label_ids.to('cpu').numpy())

            all_logits = np.concatenate(all_logits, axis=0)
            all_label_ids = np.concatenate(all_label_ids, axis=0)
            result_to_save = classification_report(all_label_ids, all_logits, digits=5, output_dict=True)
            print("Macro-F1: ", result_to_save["macro avg"]["f1-score"])
            if result_to_save["macro avg"]["f1-score"] > max_score:
                max_score = result_to_save["macro avg"]["f1-score"]

            # Restore training mode; without this the remainder of the epoch
            # would run with the model still in eval mode.
            model.train()

        global_step += 1
print("Best Macro-F1: ", max_score)