In [1]:
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, DataLoader
from torch import optim
from torch.optim import lr_scheduler
import torchmetrics
from sklearn.metrics import mean_squared_error, accuracy_score
import numpy as np

import collections
import pandas as pd
import json
from tqdm.auto import tqdm, trange

from datasets import load_metric
import datasets
from transformers import AutoConfig, AutoTokenizer, BertModel, RobertaModel
from transformers import BertForSequenceClassification
from transformers import TrainingArguments, Trainer

import matplotlib.pyplot as plt


In [2]:
import sys
import os
# Make the SentEval checkout located under the current working directory importable.
sys.path.append(f'{os.getcwd()}/SentEval')
# Data directory inside that same SentEval checkout.
PATH_TO_DATA = f'{os.getcwd()}/SentEval/data'

# Import SentEval (placed after the sys.path tweak so the local checkout can be found)
import senteval

In [3]:
# Set the Hugging Face tokenizers parallelism flag explicitly; background in
# https://github.com/huggingface/transformers/issues/5486
# Use the value "false" instead to turn tokenizer parallelism off.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [5]:
# Task registry read from disk: maps each task name to its number of labels
# (the training code looks up `tasks[task]` to size the classification head).
with open('tasks.json', 'r') as tasks_file:
    tasks = json.load(tasks_file)
tasks

{'CrowdFlower': 13,
 'DailyDialog': 7,
 'EmoBank_Valence': 1,
 'EmoBank_Arousal': 1,
 'EmoBank_Dominance': 1,
 'HateOffensive': 3,
 'PASTEL_age': 8,
 'PASTEL_country': 2,
 'PASTEL_education': 10,
 'PASTEL_ethnic': 10,
 'PASTEL_gender': 3,
 'PASTEL_politics': 3,
 'PASTEL_tod': 5,
 'SARC': 2,
 'SarcasmGhosh': 2,
 'SentiTreeBank': 1,
 'ShortHumor': 2,
 'ShortJokeKaggle': 2,
 'ShortRomance': 2,
 'StanfordPoliteness': 1,
 'TroFi': 2,
 'VUA': 2}

In [6]:
# Checkpoint name used to load the tokenizer below.
base_model = "bert-base-uncased"
# Notebook-level tokenizer; MyDataset reads this name when encoding texts.
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [7]:
class MyDataset(Dataset):
    """Map-style dataset over a TSV file with `text` and `label` columns.

    Rows containing any missing value are dropped, then every remaining text
    is tokenized once up front (truncation and padding enabled). Items are
    served as dicts of tensors with the label stored under "labels".

    Note: this is a map-style dataset; whether an iterable-style dataset
    would be a better fit is still an open question.

    Args:
        tsv_file: Path of the tab-separated file to load.
        text_tokenizer: Callable used to encode the list of texts. When left
            as None the notebook-level `tokenizer` is used, so existing
            `MyDataset(path)` calls behave exactly as before.
        max_length: Truncation length handed to the tokenizer (default 128).
    """

    def __init__(self, tsv_file, text_tokenizer=None, max_length=128):
        if text_tokenizer is None:
            # Fall back to the tokenizer defined at notebook level.
            text_tokenizer = tokenizer

        frame = pd.read_csv(tsv_file, sep='\t').dropna().reset_index(drop=True)
        # Regression targets are parsed as float64; store them as float32.
        if frame['label'].dtype == 'float64':
            frame['label'] = frame['label'].astype('float32')

        self.df = frame
        self.encodings = text_tokenizer(
            frame['text'].tolist(),
            truncation=True,
            padding=True,
            max_length=max_length,
        )
        self.labels = frame['label'].tolist()

    def __len__(self):
        """Number of rows left after dropping incomplete ones."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the encoded fields and the label of row `idx` as tensors."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        # The label is wrapped in a list, so the resulting tensor has shape (1,).
        item["labels"] = torch.tensor([self.labels[idx]])
        return item


In [8]:
# Correlation metrics loaded through `datasets.load_metric`.
# NOTE(review): neither object is referenced by the visible training code — confirm they are used further down.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in favour of the `evaluate` package — confirm against the installed version.
pearsonr = load_metric("pearsonr")
spearmanr = load_metric("spearmanr")

In [9]:
# Per-device batch size, used for both training and evaluation.
# Observed GPU memory (MB) by batch size:
#   32 -> 6617-6680
#   48 -> 7894
#   64 -> out of memory
batch_size = 32

In [None]:
def _build_compute_metrics(num_labels):
    """Return the `compute_metrics` callback that matches the task type.

    Args:
        num_labels: Size of the model head. 1 selects the regression metric
            (RMSE), 2 selects binary precision/recall/F1, anything else
            selects macro-averaged precision/recall/F1.

    Returns:
        A callable that takes the Trainer's evaluation prediction object and
        returns a dict of metric name -> value.
    """
    # Imported locally (as in the original cell) because the notebook's import
    # cell does not bring this name into scope.
    from sklearn.metrics import precision_recall_fscore_support

    if num_labels == 1:
        def compute_metrics(pred):
            predictions, labels = pred
            # sqrt(MSE) instead of `squared=False`, which newer scikit-learn
            # releases deprecate/remove; the value is the same.
            rmse = float(np.sqrt(mean_squared_error(labels, predictions)))
            return {"rmse": rmse}
        return compute_metrics

    average = 'binary' if num_labels == 2 else 'macro'

    def compute_metrics(pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        # zero_division=0 yields the same numbers as the default but without
        # the undefined-metric warning emitted on every evaluation.
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, preds, average=average, zero_division=0
        )
        acc = accuracy_score(labels, preds)
        return {
            'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }
    return compute_metrics


def single_train(task):
    """Fine-tune a fresh single-task BERT baseline on `task`.

    Reads `./processed/train/{task}.tsv` and `./processed/dev/{task}.tsv`,
    trains for 5 epochs, evaluates on the dev split and saves a checkpoint
    under `./results/baselines/{task}` at the end of every epoch.

    Args:
        task: Key of the notebook-level `tasks` dict; its value is used as the
            number of labels of the classification head.

    Returns:
        The `Trainer` after `train()` has completed, so callers can run
        further evaluation or prediction (e.g. on the test split). Callers
        that ignore the return value are unaffected.
    """
    # Release cached GPU blocks from earlier runs before allocating a new model.
    torch.cuda.empty_cache()

    num_labels = tasks[task]
    # The test split is not needed for training, so it is not tokenized here.
    train_dataset = MyDataset(f'./processed/train/{task}.tsv')
    valid_dataset = MyDataset(f'./processed/dev/{task}.tsv')

    # Load the same checkpoint as the notebook-level tokenizer so the model's
    # vocabulary always matches the token ids produced by MyDataset.
    singletaskbert = BertForSequenceClassification.from_pretrained(base_model, num_labels=num_labels)
    training_args = TrainingArguments(
        output_dir=f'./results/baselines/{task}',        # checkpoints are written here
        num_train_epochs=5,                              # total number of training epochs
        per_device_train_batch_size=batch_size,          # batch size per device during training
        per_device_eval_batch_size=batch_size,           # batch size per device during evaluation
        warmup_steps=500,                                # warmup steps for the learning rate scheduler
        weight_decay=0.01,                               # strength of weight decay
        logging_dir=f'./results/baselines/{task}/logs',  # directory for storing logs
        load_best_model_at_end=False,                    # do not reload the best checkpoint after training
        logging_first_step=True,
        evaluation_strategy="epoch",                     # evaluate at the end of every epoch
        save_strategy='epoch'                            # save a checkpoint at the end of every epoch
    )

    trainer = Trainer(
        model=singletaskbert,                            # the instantiated Transformers model to be trained
        args=training_args,                              # training arguments, defined above
        train_dataset=train_dataset,                     # training dataset
        eval_dataset=valid_dataset,                      # evaluation dataset
        compute_metrics=_build_compute_metrics(num_labels),  # metrics callback for this task type
    )
    trainer.train()
    return trainer

In [10]:
# Train one baseline per task by delegating to `single_train`, instead of
# repeating its body here. Keeping the model and trainer local to the function
# also drops the references to the previous task's model once the call returns,
# so the `torch.cuda.empty_cache()` at the start of the next call can reclaim it.
for task in tasks:
    # An existing output directory marks the task as done. The check only looks
    # at the directory, not at whether training finished, so remove the
    # directory to re-run a task.
    if os.path.isdir(f'./results/baselines/{task}'):
        print(f'skip {task}')
        continue
    single_train(task)

skip CrowdFlower


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3942,0.274151,0.897385,0.346008,0.387485,0.334119
2,0.3188,0.267936,0.900483,0.393077,0.477817,0.369968
3,0.2476,0.305294,0.88933,0.419818,0.548605,0.415887
4,0.178,0.355949,0.88871,0.422777,0.545004,0.415109
5,0.1261,0.417322,0.891436,0.408869,0.473858,0.391538


***** Running Evaluation *****
  Num examples = 8069
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/baselines/DailyDialog/checkpoint-2725
Configuration saved in ./results/baselines/DailyDialog/checkpoint-2725/config.json
Model weights saved in ./results/baselines/DailyDialog/checkpoint-2725/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8069
  Batch size = 32
Saving model checkpoint to ./results/baselines/DailyDialog/checkpoint-5450
Configuration saved in ./results/baselines/DailyDialog/checkpoint-5450/config.json
Model weights saved in ./results/baselines/DailyDialog/checkpoint-5450/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8069
  Batch size = 32
Saving model checkpoint to ./results/baselines/DailyDialog/checkpoint-8175
Configuration saved in ./results/baselines/DailyDialog/checkpoint-8175/config.json
Model weights saved in ./results/baselines/DailyDialog/checkpoint-8175/pytorch_m

Epoch,Training Loss,Validation Loss,Rmse
1,0.537,0.006395,0.079971
2,0.0375,0.00923,0.096074
3,0.0375,0.005136,0.071664
4,0.004,0.005653,0.075187
5,0.004,0.005192,0.072059


***** Running Evaluation *****
  Num examples = 498
  Batch size = 32
Saving model checkpoint to ./results/baselines/EmoBank_Valence/checkpoint-276
Configuration saved in ./results/baselines/EmoBank_Valence/checkpoint-276/config.json
Model weights saved in ./results/baselines/EmoBank_Valence/checkpoint-276/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 498
  Batch size = 32
Saving model checkpoint to ./results/baselines/EmoBank_Valence/checkpoint-552
Configuration saved in ./results/baselines/EmoBank_Valence/checkpoint-552/config.json
Model weights saved in ./results/baselines/EmoBank_Valence/checkpoint-552/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 498
  Batch size = 32
Saving model checkpoint to ./results/baselines/EmoBank_Valence/checkpoint-828
Configuration saved in ./results/baselines/EmoBank_Valence/checkpoint-828/config.json
Model weights saved in ./results/baselines/EmoBank_Valence/checkpoint-828/pytorch_model.bin
***** Running Evaluati

Epoch,Training Loss,Validation Loss,Rmse
1,0.4965,0.007932,0.089059
2,0.0367,0.009329,0.096588
3,0.0367,0.008979,0.09476
4,0.006,0.00737,0.085847
5,0.006,0.007538,0.086821


***** Running Evaluation *****
  Num examples = 498
  Batch size = 32
Saving model checkpoint to ./results/baselines/EmoBank_Arousal/checkpoint-276
Configuration saved in ./results/baselines/EmoBank_Arousal/checkpoint-276/config.json
Model weights saved in ./results/baselines/EmoBank_Arousal/checkpoint-276/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 498
  Batch size = 32
Saving model checkpoint to ./results/baselines/EmoBank_Arousal/checkpoint-552
Configuration saved in ./results/baselines/EmoBank_Arousal/checkpoint-552/config.json
Model weights saved in ./results/baselines/EmoBank_Arousal/checkpoint-552/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 498
  Batch size = 32
Saving model checkpoint to ./results/baselines/EmoBank_Arousal/checkpoint-828
Configuration saved in ./results/baselines/EmoBank_Arousal/checkpoint-828/config.json
Model weights saved in ./results/baselines/EmoBank_Arousal/checkpoint-828/pytorch_model.bin
***** Running Evaluati

Epoch,Training Loss,Validation Loss,Rmse
1,0.5385,0.007563,0.086963
2,0.0407,0.007075,0.084112
3,0.0407,0.006574,0.081078
4,0.0061,0.006338,0.079614
5,0.0061,0.00679,0.082399


***** Running Evaluation *****
  Num examples = 498
  Batch size = 32
Saving model checkpoint to ./results/baselines/EmoBank_Dominance/checkpoint-276
Configuration saved in ./results/baselines/EmoBank_Dominance/checkpoint-276/config.json
Model weights saved in ./results/baselines/EmoBank_Dominance/checkpoint-276/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 498
  Batch size = 32
Saving model checkpoint to ./results/baselines/EmoBank_Dominance/checkpoint-552
Configuration saved in ./results/baselines/EmoBank_Dominance/checkpoint-552/config.json
Model weights saved in ./results/baselines/EmoBank_Dominance/checkpoint-552/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 498
  Batch size = 32
Saving model checkpoint to ./results/baselines/EmoBank_Dominance/checkpoint-828
Configuration saved in ./results/baselines/EmoBank_Dominance/checkpoint-828/config.json
Model weights saved in ./results/baselines/EmoBank_Dominance/checkpoint-828/pytorch_model.bin
****

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3987,0.210191,0.929656,0.839019,0.827381,0.853288
2,0.2104,0.134268,0.963229,0.904348,0.931965,0.883646
3,0.0661,0.137341,0.973621,0.936561,0.958109,0.918334
4,0.0274,0.127036,0.976819,0.943459,0.960929,0.928807
5,0.017,0.140868,0.97522,0.941495,0.960122,0.925647


***** Running Evaluation *****
  Num examples = 1251
  Batch size = 32
Saving model checkpoint to ./results/baselines/HateOffensive/checkpoint-694
Configuration saved in ./results/baselines/HateOffensive/checkpoint-694/config.json
Model weights saved in ./results/baselines/HateOffensive/checkpoint-694/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1251
  Batch size = 32
Saving model checkpoint to ./results/baselines/HateOffensive/checkpoint-1388
Configuration saved in ./results/baselines/HateOffensive/checkpoint-1388/config.json
Model weights saved in ./results/baselines/HateOffensive/checkpoint-1388/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1251
  Batch size = 32
Saving model checkpoint to ./results/baselines/HateOffensive/checkpoint-2082
Configuration saved in ./results/baselines/HateOffensive/checkpoint-2082/config.json
Model weights saved in ./results/baselines/HateOffensive/checkpoint-2082/pytorch_model.bin
***** Running Evaluation *****


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.3429,1.31776,0.45343,0.174347,0.265279,0.179136
2,1.2266,1.298108,0.476534,0.204737,0.275368,0.200825
3,0.9148,1.441295,0.478219,0.220259,0.25403,0.215277
4,0.529,1.768355,0.467148,0.245341,0.363404,0.235721
5,0.3095,2.183343,0.459928,0.24597,0.298616,0.239049


***** Running Evaluation *****
  Num examples = 4155
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/baselines/PASTEL_age/checkpoint-1039
Configuration saved in ./results/baselines/PASTEL_age/checkpoint-1039/config.json
Model weights saved in ./results/baselines/PASTEL_age/checkpoint-1039/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4155
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/baselines/PASTEL_age/checkpoint-2078
Configuration saved in ./results/baselines/PASTEL_age/checkpoint-2078/config.json
Model weights saved in ./results/baselines/PASTEL_age/checkpoint-2078/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4155
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/baselines/PASTEL_age/checkpoint-3117
Configuration saved in ./results/baselines/PASTEL_age/checkp

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0994,0.100069,0.976403,0.98806,0.976403,1.0
2,0.0787,0.108778,0.976643,0.988181,0.976638,1.0
3,0.05,0.113277,0.974717,0.987184,0.977284,0.997287
4,0.0206,0.164836,0.974476,0.987051,0.977971,0.996301
5,0.0104,0.182554,0.974476,0.987048,0.978203,0.996054


***** Running Evaluation *****
  Num examples = 4153
  Batch size = 32
Saving model checkpoint to ./results/baselines/PASTEL_country/checkpoint-1039
Configuration saved in ./results/baselines/PASTEL_country/checkpoint-1039/config.json
Model weights saved in ./results/baselines/PASTEL_country/checkpoint-1039/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4153
  Batch size = 32
Saving model checkpoint to ./results/baselines/PASTEL_country/checkpoint-2078
Configuration saved in ./results/baselines/PASTEL_country/checkpoint-2078/config.json
Model weights saved in ./results/baselines/PASTEL_country/checkpoint-2078/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4153
  Batch size = 32
Saving model checkpoint to ./results/baselines/PASTEL_country/checkpoint-3117
Configuration saved in ./results/baselines/PASTEL_country/checkpoint-3117/config.json
Model weights saved in ./results/baselines/PASTEL_country/checkpoint-3117/pytorch_model.bin
***** Running Evalu

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.5241,1.48133,0.436101,0.156322,0.286494,0.178683
2,1.4065,1.454466,0.454392,0.242987,0.340456,0.231095
3,1.1238,1.546608,0.440193,0.257852,0.292032,0.250045
4,0.7169,1.866608,0.428881,0.252176,0.285587,0.245995
5,0.4337,2.177496,0.422383,0.26309,0.287689,0.255597


***** Running Evaluation *****
  Num examples = 4155
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/baselines/PASTEL_education/checkpoint-1039
Configuration saved in ./results/baselines/PASTEL_education/checkpoint-1039/config.json
Model weights saved in ./results/baselines/PASTEL_education/checkpoint-1039/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4155
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/baselines/PASTEL_education/checkpoint-2078
Configuration saved in ./results/baselines/PASTEL_education/checkpoint-2078/config.json
Model weights saved in ./results/baselines/PASTEL_education/checkpoint-2078/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4155
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/baselines/PASTEL_education/checkpoint-3117
Configuration save

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6833,0.687885,0.834416,0.20546,0.25393,0.205142
2,0.607,0.676828,0.838026,0.244631,0.356683,0.227845
3,0.4328,0.776087,0.833694,0.251103,0.347318,0.234171
4,0.2389,0.952446,0.804091,0.258753,0.290963,0.245538
5,0.1272,1.15559,0.803851,0.259601,0.289519,0.245106


***** Running Evaluation *****
  Num examples = 4155
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/baselines/PASTEL_ethnic/checkpoint-1039
Configuration saved in ./results/baselines/PASTEL_ethnic/checkpoint-1039/config.json
Model weights saved in ./results/baselines/PASTEL_ethnic/checkpoint-1039/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4155
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/baselines/PASTEL_ethnic/checkpoint-2078
Configuration saved in ./results/baselines/PASTEL_ethnic/checkpoint-2078/config.json
Model weights saved in ./results/baselines/PASTEL_ethnic/checkpoint-2078/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4155
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/baselines/PASTEL_ethnic/checkpoint-3117
Configuration saved in ./results/baseli

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5756,0.559227,0.748496,0.465729,0.526825,0.462577
2,0.5238,0.583513,0.748977,0.46923,0.519745,0.46544
3,0.3722,0.672592,0.737906,0.473761,0.48846,0.470388
4,0.2124,0.927811,0.735259,0.495636,0.54049,0.484149
5,0.1281,1.232693,0.732611,0.490561,0.505484,0.483999


***** Running Evaluation *****
  Num examples = 4155
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/baselines/PASTEL_gender/checkpoint-1039
Configuration saved in ./results/baselines/PASTEL_gender/checkpoint-1039/config.json
Model weights saved in ./results/baselines/PASTEL_gender/checkpoint-1039/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4155
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/baselines/PASTEL_gender/checkpoint-2078
Configuration saved in ./results/baselines/PASTEL_gender/checkpoint-2078/config.json
Model weights saved in ./results/baselines/PASTEL_gender/checkpoint-2078/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4155
  Batch size = 32
Saving model checkpoint to ./results/baselines/PASTEL_gender/checkpoint-3117
Configuration saved in ./results/baselines/PASTEL_gender/checkpoint-3117/config.json
Model wei

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.9532,0.937625,0.50373,0.362738,0.33644,0.398192
2,0.8875,0.949823,0.506619,0.428365,0.493617,0.434389
3,0.6466,1.080967,0.523706,0.470997,0.4844,0.466896
4,0.3609,1.445178,0.511913,0.463791,0.465752,0.462475
5,0.2135,1.95275,0.516245,0.469561,0.476377,0.465928


***** Running Evaluation *****
  Num examples = 4155
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/baselines/PASTEL_politics/checkpoint-1039
Configuration saved in ./results/baselines/PASTEL_politics/checkpoint-1039/config.json
Model weights saved in ./results/baselines/PASTEL_politics/checkpoint-1039/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4155
  Batch size = 32
Saving model checkpoint to ./results/baselines/PASTEL_politics/checkpoint-2078
Configuration saved in ./results/baselines/PASTEL_politics/checkpoint-2078/config.json
Model weights saved in ./results/baselines/PASTEL_politics/checkpoint-2078/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4155
  Batch size = 32
Saving model checkpoint to ./results/baselines/PASTEL_politics/checkpoint-3117
Configuration saved in ./results/baselines/PASTEL_politics/checkpoint-3117/config.json
Model weights saved in ./results/baselines/PASTE

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.2896,1.299677,0.468111,0.26657,0.400161,0.292322
2,1.2461,1.280031,0.472202,0.272989,0.41688,0.296377
3,1.0514,1.370571,0.45367,0.293912,0.35582,0.304157
4,0.7007,1.667322,0.422383,0.315304,0.330849,0.315826
5,0.4421,2.045131,0.435138,0.32034,0.335627,0.319113


***** Running Evaluation *****
  Num examples = 4155
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/baselines/PASTEL_tod/checkpoint-1039
Configuration saved in ./results/baselines/PASTEL_tod/checkpoint-1039/config.json
Model weights saved in ./results/baselines/PASTEL_tod/checkpoint-1039/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4155
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./results/baselines/PASTEL_tod/checkpoint-2078
Configuration saved in ./results/baselines/PASTEL_tod/checkpoint-2078/config.json
Model weights saved in ./results/baselines/PASTEL_tod/checkpoint-2078/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4155
  Batch size = 32
Saving model checkpoint to ./results/baselines/PASTEL_tod/checkpoint-3117
Configuration saved in ./results/baselines/PASTEL_tod/checkpoint-3117/config.json
Model weights saved in ./results/

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5585,0.556077,0.710873,0.72343,0.692357,0.757422
2,0.4978,0.57084,0.719296,0.723252,0.712149,0.734707
3,0.3438,0.683438,0.708286,0.719824,0.691468,0.750604
4,0.2088,0.90029,0.702354,0.713553,0.686712,0.742578
5,0.128,1.238347,0.703112,0.711916,0.690427,0.734785


***** Running Evaluation *****
  Num examples = 51410
  Batch size = 32
Saving model checkpoint to ./results/baselines/SARC/checkpoint-6427
Configuration saved in ./results/baselines/SARC/checkpoint-6427/config.json
Model weights saved in ./results/baselines/SARC/checkpoint-6427/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 51410
  Batch size = 32
Saving model checkpoint to ./results/baselines/SARC/checkpoint-12854
Configuration saved in ./results/baselines/SARC/checkpoint-12854/config.json
Model weights saved in ./results/baselines/SARC/checkpoint-12854/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 51410
  Batch size = 32
Saving model checkpoint to ./results/baselines/SARC/checkpoint-19281
Configuration saved in ./results/baselines/SARC/checkpoint-19281/config.json
Model weights saved in ./results/baselines/SARC/checkpoint-19281/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 51410
  Batch size = 32
Saving model checkpoint to .

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0158,0.184673,0.977085,0.64,0.761905,0.551724
2,0.0111,0.18884,0.976448,0.633663,0.744186,0.551724
3,0.0057,0.210745,0.977085,0.64,0.761905,0.551724
4,0.0036,0.194174,0.977721,0.646465,0.780488,0.551724
5,0.0022,0.207722,0.977721,0.646465,0.780488,0.551724


***** Running Evaluation *****
  Num examples = 1571
  Batch size = 32
Saving model checkpoint to ./results/baselines/SarcasmGhosh/checkpoint-1244
Configuration saved in ./results/baselines/SarcasmGhosh/checkpoint-1244/config.json
Model weights saved in ./results/baselines/SarcasmGhosh/checkpoint-1244/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1571
  Batch size = 32
Saving model checkpoint to ./results/baselines/SarcasmGhosh/checkpoint-2488
Configuration saved in ./results/baselines/SarcasmGhosh/checkpoint-2488/config.json
Model weights saved in ./results/baselines/SarcasmGhosh/checkpoint-2488/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1571
  Batch size = 32
Saving model checkpoint to ./results/baselines/SarcasmGhosh/checkpoint-3732
Configuration saved in ./results/baselines/SarcasmGhosh/checkpoint-3732/config.json
Model weights saved in ./results/baselines/SarcasmGhosh/checkpoint-3732/pytorch_model.bin
***** Running Evaluation *****
  Num 

Epoch,Training Loss,Validation Loss,Rmse
1,0.0094,0.01982,0.140785
2,0.0073,0.015958,0.126324
3,0.0062,0.014956,0.122293
4,0.0053,0.013766,0.117327
5,0.0042,0.014728,0.12136


***** Running Evaluation *****
  Num examples = 1044
  Batch size = 32
Saving model checkpoint to ./results/baselines/SentiTreeBank/checkpoint-7378
Configuration saved in ./results/baselines/SentiTreeBank/checkpoint-7378/config.json
Model weights saved in ./results/baselines/SentiTreeBank/checkpoint-7378/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1044
  Batch size = 32
Saving model checkpoint to ./results/baselines/SentiTreeBank/checkpoint-14756
Configuration saved in ./results/baselines/SentiTreeBank/checkpoint-14756/config.json
Model weights saved in ./results/baselines/SentiTreeBank/checkpoint-14756/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1044
  Batch size = 32
Saving model checkpoint to ./results/baselines/SentiTreeBank/checkpoint-22134
Configuration saved in ./results/baselines/SentiTreeBank/checkpoint-22134/config.json
Model weights saved in ./results/baselines/SentiTreeBank/checkpoint-22134/pytorch_model.bin
***** Running Evaluati

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1656,0.143976,0.948563,0.948847,0.940358,0.95749
2,0.07,0.170564,0.958144,0.958562,0.945813,0.97166
3,0.0229,0.201493,0.963691,0.964,0.952569,0.975709
4,0.0071,0.254436,0.965204,0.965483,0.9545,0.976721
5,0.0016,0.241933,0.968734,0.969,0.95751,0.980769


***** Running Evaluation *****
  Num examples = 1983
  Batch size = 32
Saving model checkpoint to ./results/baselines/ShortHumor/checkpoint-1182
Configuration saved in ./results/baselines/ShortHumor/checkpoint-1182/config.json
Model weights saved in ./results/baselines/ShortHumor/checkpoint-1182/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1983
  Batch size = 32
Saving model checkpoint to ./results/baselines/ShortHumor/checkpoint-2364
Configuration saved in ./results/baselines/ShortHumor/checkpoint-2364/config.json
Model weights saved in ./results/baselines/ShortHumor/checkpoint-2364/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1983
  Batch size = 32
Saving model checkpoint to ./results/baselines/ShortHumor/checkpoint-3546
Configuration saved in ./results/baselines/ShortHumor/checkpoint-3546/config.json
Model weights saved in ./results/baselines/ShortHumor/checkpoint-3546/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1983
  

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0953,0.083775,0.97126,0.971138,0.973701,0.968589
2,0.059,0.090684,0.97659,0.976504,0.978554,0.974462
3,0.0269,0.099773,0.980544,0.980446,0.983784,0.977131
4,0.0136,0.096889,0.981166,0.980976,0.989321,0.972771
5,0.0067,0.11026,0.98201,0.981812,0.991114,0.972682


***** Running Evaluation *****
  Num examples = 22512
  Batch size = 32
Saving model checkpoint to ./results/baselines/ShortJokeKaggle/checkpoint-12709
Configuration saved in ./results/baselines/ShortJokeKaggle/checkpoint-12709/config.json
Model weights saved in ./results/baselines/ShortJokeKaggle/checkpoint-12709/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 22512
  Batch size = 32
Saving model checkpoint to ./results/baselines/ShortJokeKaggle/checkpoint-25418
Configuration saved in ./results/baselines/ShortJokeKaggle/checkpoint-25418/config.json
Model weights saved in ./results/baselines/ShortJokeKaggle/checkpoint-25418/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 22512
  Batch size = 32
Saving model checkpoint to ./results/baselines/ShortJokeKaggle/checkpoint-38127
Configuration saved in ./results/baselines/ShortJokeKaggle/checkpoint-38127/config.json
Model weights saved in ./results/baselines/ShortJokeKaggle/checkpoint-38127/pytorch_model.bi

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6868,0.426754,0.924528,0.918367,1.0,0.849057
2,0.6868,0.041883,0.990566,0.990476,1.0,0.981132
3,0.6868,0.001855,1.0,1.0,1.0,1.0
4,0.6868,0.006017,1.0,1.0,1.0,1.0
5,0.6868,0.005312,1.0,1.0,1.0,1.0


***** Running Evaluation *****
  Num examples = 106
  Batch size = 32
Saving model checkpoint to ./results/baselines/ShortRomance/checkpoint-60
Configuration saved in ./results/baselines/ShortRomance/checkpoint-60/config.json
Model weights saved in ./results/baselines/ShortRomance/checkpoint-60/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 106
  Batch size = 32
Saving model checkpoint to ./results/baselines/ShortRomance/checkpoint-120
Configuration saved in ./results/baselines/ShortRomance/checkpoint-120/config.json
Model weights saved in ./results/baselines/ShortRomance/checkpoint-120/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 106
  Batch size = 32
Saving model checkpoint to ./results/baselines/ShortRomance/checkpoint-180
Configuration saved in ./results/baselines/ShortRomance/checkpoint-180/config.json
Model weights saved in ./results/baselines/ShortRomance/checkpoint-180/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 106


Epoch,Training Loss,Validation Loss,Rmse
1,0.6871,0.008698,0.093265
2,0.0454,0.010219,0.101089
3,0.0454,0.008931,0.094507
4,0.0074,0.008753,0.093558
5,0.0045,0.008664,0.09308


***** Running Evaluation *****
  Num examples = 530
  Batch size = 32
Saving model checkpoint to ./results/baselines/StanfordPoliteness/checkpoint-309
Configuration saved in ./results/baselines/StanfordPoliteness/checkpoint-309/config.json
Model weights saved in ./results/baselines/StanfordPoliteness/checkpoint-309/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 530
  Batch size = 32
Saving model checkpoint to ./results/baselines/StanfordPoliteness/checkpoint-618
Configuration saved in ./results/baselines/StanfordPoliteness/checkpoint-618/config.json
Model weights saved in ./results/baselines/StanfordPoliteness/checkpoint-618/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 530
  Batch size = 32
Saving model checkpoint to ./results/baselines/StanfordPoliteness/checkpoint-927
Configuration saved in ./results/baselines/StanfordPoliteness/checkpoint-927/config.json
Model weights saved in ./results/baselines/StanfordPoliteness/checkpoint-927/pytorch_model

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7168,0.6663,0.548571,0.048193,1.0,0.024691
2,0.7168,0.618478,0.645714,0.630952,0.609195,0.654321
3,0.7168,0.576512,0.714286,0.671053,0.71831,0.62963
4,0.7168,0.819517,0.651429,0.590604,0.647059,0.54321
5,0.4782,0.777154,0.742857,0.676259,0.810345,0.580247


***** Running Evaluation *****
  Num examples = 175
  Batch size = 32
Saving model checkpoint to ./results/baselines/TroFi/checkpoint-105
Configuration saved in ./results/baselines/TroFi/checkpoint-105/config.json
Model weights saved in ./results/baselines/TroFi/checkpoint-105/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 175
  Batch size = 32
Saving model checkpoint to ./results/baselines/TroFi/checkpoint-210
Configuration saved in ./results/baselines/TroFi/checkpoint-210/config.json
Model weights saved in ./results/baselines/TroFi/checkpoint-210/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 175
  Batch size = 32
Saving model checkpoint to ./results/baselines/TroFi/checkpoint-315
Configuration saved in ./results/baselines/TroFi/checkpoint-315/config.json
Model weights saved in ./results/baselines/TroFi/checkpoint-315/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 175
  Batch size = 32
Saving model checkpoint to ./results/basel

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7017,0.40146,0.822955,0.650602,0.697674,0.609481
2,0.5053,0.349731,0.851038,0.720183,0.731935,0.708804
3,0.3348,0.445463,0.863858,0.721598,0.807263,0.65237
4,0.1703,0.629261,0.860806,0.726619,0.774936,0.683973
5,0.073,0.754388,0.862637,0.736225,0.765854,0.708804


***** Running Evaluation *****
  Num examples = 1638
  Batch size = 32
Saving model checkpoint to ./results/baselines/VUA/checkpoint-474
Configuration saved in ./results/baselines/VUA/checkpoint-474/config.json
Model weights saved in ./results/baselines/VUA/checkpoint-474/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1638
  Batch size = 32
Saving model checkpoint to ./results/baselines/VUA/checkpoint-948
Configuration saved in ./results/baselines/VUA/checkpoint-948/config.json
Model weights saved in ./results/baselines/VUA/checkpoint-948/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1638
  Batch size = 32
Saving model checkpoint to ./results/baselines/VUA/checkpoint-1422
Configuration saved in ./results/baselines/VUA/checkpoint-1422/config.json
Model weights saved in ./results/baselines/VUA/checkpoint-1422/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1638
  Batch size = 32
Saving model checkpoint to ./results/baselines/VUA/ch