#### Set-ups and Imports

In [3]:
import pickle
import re
import os

import random
import numpy as np
import torch
from random import shuffle
import argparse
import pickle

import collections

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import sys
sys.path.append("..")

from model.BERT import *

from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler, SequentialSampler
from tqdm import tqdm, trange

from util.optimization import BERTAdam
from util.processor import *

from util.tokenization import *

from util.evaluation import *

import logging
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

from sklearn.metrics import classification_report

# this imports most of the helpers needed to eval the model
from run_classifier import *

In [4]:
# Note that this notebook only supports single-GPU evaluation,
# which is sufficient for most tasks when a lower batch size is used.
IS_CUDA = False          # set to True to request GPU evaluation
CUDA_DEVICE = "cuda:5"   # only used when IS_CUDA is True and CUDA is usable

# Only take the GPU path when CUDA is really usable; otherwise a CUDA device
# object would be created here and the failure would surface later, when
# tensors are moved to it.
use_cuda = IS_CUDA and torch.cuda.is_available()
if IS_CUDA and not use_cuda:
    logger.warning("IS_CUDA is set but CUDA is not available, falling back to CPU")

if use_cuda:
    device = torch.device(CUDA_DEVICE)
    n_gpu = torch.cuda.device_count()
    logger.info("device %s in total n_gpu %d distributed training", device, n_gpu)
else:
    # bad luck, we are on CPU now!
    logger.info("gpu is out of the picture, let us use CPU")
    device = torch.device("cpu")
    n_gpu = 0  # keep n_gpu bound on both paths so later cells can read it

10/30/2020 19:33:11 - INFO - run_classifier -   gpu is out of the picture, let us use CPU


#### Eval

In [10]:
# Name of the task to evaluate; it selects both the checkpoint and the processor.
TASK_NAME = "AdvSA"

# Directory of the default BERT-base pretrain (vocab and config are read from it).
BERT_PATH = "../../data/uncased_L-12_H-768_A-12/"
# Checkpoint of the model for the selected task.
MODEL_PATH = "../../results/{}/checkpoint.bin".format(TASK_NAME)
# You can tune this down depending on the GPU you have.
EVAL_BATCH_SIZE = 24

# Registry mapping each supported task name to its processor class.
processors = dict(
    IMDb=IMDb_Processor,
    SemEval=SemEval_Processor,
    SST5=SST5_Processor,
    SST2=SST2_Processor,
    SST3=SST3_Processor,
    Yelp5=Yelp5_Processor,
    Yelp2=Yelp2_Processor,
    AdvSA=AdvSA_Processor,
    R0Train=R0Train_Processor,
)

# Instantiate the processor for the chosen task and read its label set.
processor = processors[TASK_NAME]()
label_list = processor.get_labels()

#### Get specific models, optimizer (not needed), and tokenizer

In [6]:
# Keyword arguments for the project helper that returns the model together
# with an optimizer and a tokenizer.
model_kwargs = dict(
    model_type="BERTPretrain",
    vocab_file=BERT_PATH + "vocab.txt",
    embed_file=None,
    bert_config_file=BERT_PATH + "bert_config.json",
    init_checkpoint=MODEL_PATH,
    label_list=label_list,
    do_lower_case=True,
    # below is not required for eval
    num_train_steps=20,
    learning_rate=2e-5,
    base_learning_rate=2e-5,
    warmup_proportion=0.1,
    init_lrp=True,
)
model, optimizer, tokenizer = getModelOptimizerTokenizer(**model_kwargs)

# send the model to the selected device
model = model.to(device)

10/30/2020 19:33:50 - INFO - run_classifier -   model = BERTPretrain


init_weight = True
init_lrp = True


#### Trained on the Yelp dataset only, and evaluated on Yelp only

In [7]:
# The dev split found under the AdvSA data directory is what gets evaluated here.
DATA_DIR = "../../data/dataset/AdvSA/"
test_examples = processor.get_dev_examples(DATA_DIR)
# 512 is the third positional argument of the helper
# (presumably the maximum sequence length; confirm against its signature).
test_features = convert_examples_to_features(test_examples, label_list, 512, tokenizer)

# One long tensor per feature field, built in the order the dataset expects.
all_input_ids, all_input_mask, all_segment_ids, all_label_ids = (
    torch.tensor([getattr(feat, field) for feat in test_features], dtype=torch.long)
    for field in ("input_ids", "input_mask", "segment_ids", "label_id")
)
# Each seq_len is wrapped in its own list, which adds a trailing dimension.
all_seq_len = torch.tensor([[feat.seq_len] for feat in test_features],
                           dtype=torch.long)

test_data = TensorDataset(
    all_input_ids,
    all_input_mask,
    all_segment_ids,
    all_label_ids,
    all_seq_len,
)
# No shuffling: predictions stay in the order of the examples.
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=EVAL_BATCH_SIZE)

  0%|          | 0/4500 [00:00<?, ?it/s]

0
guid= dev-0
text_a= Too expensive?
text_b= None
label= 0
1000
guid= dev-1000
text_a= I'm not sure where to start.
text_b= None
label= 2
2000
guid= dev-2000
text_a= I have a 16 year old dog who is having seizures and basically transitioning to the next stage of life.
text_b= None
label= 0
3000
guid= dev-3000
text_a= Probably your best plan of action.
text_b= None
label= 1
4000
guid= dev-4000
text_a= Shoot, they should have 3 way everything, 3 way dogs, 3 way salad, 3 way sliders and 3 way pizza.
text_b= None
label= 1


100%|██████████| 4500/4500 [00:01<00:00, 2426.07it/s]


####  Actual evaluation loop

In [8]:
# Gradients are deliberately NOT disabled here (no torch.no_grad()): the
# forward pass is kept differentiable for attribution methods.
model.eval() # this line will deactivate dropouts
test_loss, test_accuracy = 0, 0
nb_test_steps, nb_test_examples = 0, 0
pred_logits = []  # per-batch softmax outputs (name kept for the cells that read it)
actual = []       # per-batch gold label ids
for step, batch in enumerate(tqdm(test_dataloader, desc="Iteration")):
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    input_ids, input_mask, segment_ids, label_ids, seq_lens = batch
    # truncate to the longest sequence of this batch to save space and compute;
    # a single tensor reduction converted to a plain int replaces the
    # Python-level max() over tensor rows
    max_seq_lens = int(seq_lens.max())
    input_ids = input_ids[:,:max_seq_lens]
    input_mask = input_mask[:,:max_seq_lens]
    segment_ids = segment_ids[:,:max_seq_lens]

    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)
    seq_lens = seq_lens.to(device)

    # intentionally with gradient
    tmp_test_loss, logits = \
        model(input_ids, segment_ids, input_mask, seq_lens,
                device=device, labels=label_ids)

    # from here on `logits` holds softmax outputs, detached and moved to numpy
    logits = F.softmax(logits, dim=-1)
    logits = logits.detach().cpu().numpy()
    pred_logits.append(logits)
    label_ids = label_ids.to('cpu').numpy()
    actual.append(label_ids)
    outputs = np.argmax(logits, axis=1)
    # number of correct predictions in this batch
    tmp_test_accuracy=np.sum(outputs == label_ids)

    test_loss += tmp_test_loss.mean().item()
    test_accuracy += tmp_test_accuracy

    nb_test_examples += input_ids.size(0)
    nb_test_steps += 1

# loss is averaged over batches, accuracy over examples
test_loss = test_loss / nb_test_steps
test_accuracy = test_accuracy / nb_test_examples

# (the former `collections.OrderedDict()` assignment was dead code: it was
# overwritten immediately by this dict literal)
result = {'test_loss': test_loss,
            str(len(label_list))+ '-class test_accuracy': test_accuracy}
logger.info("***** Eval results *****")
for key, value in result.items():
    logger.info("  %s = %s\n", key, str(value))
# get predictions needed for evaluation
pred_logits = np.concatenate(pred_logits, axis=0)
actual = np.concatenate(actual, axis=0)
pred_label = np.argmax(pred_logits, axis=-1)

Iteration: 100%|██████████| 188/188 [00:50<00:00,  3.71it/s]
10/30/2020 19:35:12 - INFO - run_classifier -   ***** Eval results *****
10/30/2020 19:35:12 - INFO - run_classifier -     test_loss = 1.5029023293643555

10/30/2020 19:35:12 - INFO - run_classifier -     3-class test_accuracy = 0.6364444444444445



In [47]:
# Per-class precision / recall / F1 of the argmax predictions (`pred_label`)
# against the gold labels (`actual`) gathered by the evaluation loop above.
# NOTE(review): the saved output of this cell reports accuracy 0.57 while the
# evaluation cell logged 0.636, and the execution counts are out of order;
# re-run the notebook top to bottom to confirm the numbers.
print(classification_report(actual, pred_label))

              precision    recall  f1-score   support

           0       0.68      0.38      0.49      1500
           1       0.53      0.63      0.57      1500
           2       0.57      0.71      0.63      1500

    accuracy                           0.57      4500
   macro avg       0.59      0.57      0.57      4500
weighted avg       0.59      0.57      0.57      4500



#### Evaluation with SST ternary on phrase level

In [11]:
# Switch to the R0Train task: this rebinds `processor` and `label_list`.
DATA_DIR = "../../data/dataset/R0Train/"
processor = processors["R0Train"]()
label_list = processor.get_labels()

# This time the test split is read.
test_examples = processor.get_test_examples(DATA_DIR)
# 512 is the third positional argument of the helper
# (presumably the maximum sequence length; confirm against its signature).
test_features = convert_examples_to_features(test_examples, label_list, 512, tokenizer)

# One long tensor per feature field, built in the order the dataset expects.
all_input_ids, all_input_mask, all_segment_ids, all_label_ids = (
    torch.tensor([getattr(feat, field) for feat in test_features], dtype=torch.long)
    for field in ("input_ids", "input_mask", "segment_ids", "label_id")
)
# Each seq_len is wrapped in its own list, which adds a trailing dimension.
all_seq_len = torch.tensor([[feat.seq_len] for feat in test_features],
                           dtype=torch.long)

test_data = TensorDataset(
    all_input_ids,
    all_input_mask,
    all_segment_ids,
    all_label_ids,
    all_seq_len,
)
# No shuffling: predictions stay in the order of the examples.
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=EVAL_BATCH_SIZE)

0
guid= test-0
text_a= It 's a lovely film with lovely performances by Buy and Accorsi .
text_b= None
label= 1


100%|██████████| 24772/24772 [00:05<00:00, 4663.00it/s]


In [12]:
# Gradients are deliberately NOT disabled here (no torch.no_grad()): the
# forward pass is kept differentiable for attribution methods.
model.eval() # this line will deactivate dropouts
test_loss, test_accuracy = 0, 0
nb_test_steps, nb_test_examples = 0, 0
pred_logits = []  # per-batch softmax outputs (name kept for the cells that read it)
actual = []       # per-batch gold label ids
for step, batch in enumerate(tqdm(test_dataloader, desc="Iteration")):
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    input_ids, input_mask, segment_ids, label_ids, seq_lens = batch
    # truncate to the longest sequence of this batch to save space and compute;
    # a single tensor reduction converted to a plain int replaces the
    # Python-level max() over tensor rows
    max_seq_lens = int(seq_lens.max())
    input_ids = input_ids[:,:max_seq_lens]
    input_mask = input_mask[:,:max_seq_lens]
    segment_ids = segment_ids[:,:max_seq_lens]

    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)
    seq_lens = seq_lens.to(device)

    # intentionally with gradient
    tmp_test_loss, logits = \
        model(input_ids, segment_ids, input_mask, seq_lens,
                device=device, labels=label_ids)

    # from here on `logits` holds softmax outputs, detached and moved to numpy
    logits = F.softmax(logits, dim=-1)
    logits = logits.detach().cpu().numpy()
    pred_logits.append(logits)
    label_ids = label_ids.to('cpu').numpy()
    actual.append(label_ids)
    outputs = np.argmax(logits, axis=1)
    # number of correct predictions in this batch
    tmp_test_accuracy=np.sum(outputs == label_ids)

    test_loss += tmp_test_loss.mean().item()
    test_accuracy += tmp_test_accuracy

    nb_test_examples += input_ids.size(0)
    nb_test_steps += 1

# loss is averaged over batches, accuracy over examples
test_loss = test_loss / nb_test_steps
test_accuracy = test_accuracy / nb_test_examples

# (the former `collections.OrderedDict()` assignment was dead code: it was
# overwritten immediately by this dict literal)
result = {'test_loss': test_loss,
            str(len(label_list))+ '-class test_accuracy': test_accuracy}
logger.info("***** Eval results *****")
for key, value in result.items():
    logger.info("  %s = %s\n", key, str(value))
# get predictions needed for evaluation
pred_logits = np.concatenate(pred_logits, axis=0)
actual = np.concatenate(actual, axis=0)
pred_label = np.argmax(pred_logits, axis=-1)

Iteration: 100%|██████████| 1033/1033 [02:50<00:00,  6.07it/s]
10/30/2020 19:41:54 - INFO - run_classifier -   ***** Eval results *****
10/30/2020 19:41:54 - INFO - run_classifier -     test_loss = 1.3414304166633748

10/30/2020 19:41:54 - INFO - run_classifier -     3-class test_accuracy = 0.6829888583885031



In [13]:
# Per-class precision / recall / F1 of the argmax predictions (`pred_label`)
# against the gold labels (`actual`) gathered by the evaluation loop above.
print(classification_report(actual, pred_label))

              precision    recall  f1-score   support

           0       0.63      0.46      0.53      5105
           1       0.58      0.72      0.64      6290
           2       0.76      0.75      0.75     13377

    accuracy                           0.68     24772
   macro avg       0.66      0.64      0.64     24772
weighted avg       0.69      0.68      0.68     24772



#### What if we look at 2-way accuracy by ignoring the third class?

In [21]:
def two_way_metrics(pred_logits, actual):
    """Build a classification report restricted to classes 0 and 1.

    Examples whose gold label is 2 (the neutral cases) are dropped. For the
    remaining examples the prediction is recomputed as the argmax over the
    first two class scores only, so every kept example is assigned class 0
    or class 1.

    Args:
        pred_logits: per-example class scores, indexable as
            ``pred_logits[i][c]``; only the first two columns are used.
        actual: gold label ids, one per row of ``pred_logits``.

    Returns:
        Whatever ``classification_report`` returns for the kept examples.

    Raises:
        ValueError: if the inputs differ in length, or if no example is left
            after dropping the neutral cases.
    """
    scores = np.asarray(pred_logits)
    gold = np.asarray(actual).reshape(-1)
    if len(scores) != len(gold):
        # The row-by-row loop used to ignore surplus labels silently.
        raise ValueError(
            "pred_logits and actual must have the same length, got %d and %d"
            % (len(scores), len(gold)))

    # ignore neut cases
    keep = gold != 2
    if not keep.any():
        raise ValueError("no example left after dropping label 2")

    # A boolean mask replaces the per-row Python loop and list appends.
    pred_label_two_way = np.argmax(scores[keep][:, :2], axis=-1)
    return classification_report(gold[keep], pred_label_two_way)

In [22]:
# Two-way report for the most recent evaluation run: examples with gold
# label 2 are dropped and only the first two class scores are compared.
print(two_way_metrics(pred_logits, actual))

              precision    recall  f1-score   support

           0       0.90      0.56      0.69      5105
           1       0.73      0.95      0.82      6290

    accuracy                           0.78     11395
   macro avg       0.81      0.76      0.76     11395
weighted avg       0.80      0.78      0.77     11395

