#### Run relevance backout here

In [1]:
import pickle
import re
import os

import random
import numpy as np
import torch
from random import shuffle
import argparse
import pickle

import collections

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import sys
sys.path.append("..")

from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler, SequentialSampler
from tqdm import tqdm, trange

from util.optimization import BERTAdam
from util.processor import *


from util.tokenization import *

from util.evaluation import *

from util.train_helper import *

import logging
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

from sklearn.metrics import classification_report

# this imports most of the helpers needed to eval the model
from run_classifier import *

sys.path.append("..")
import operator

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# NOTE(review): RETRAIN is not read anywhere in the visible part of this
# notebook — confirm it is still needed.
RETRAIN = False
# Path to the BERT vocabulary file. It is passed to load_attribution_meta in the
# experiment cells below, where it is used to map token ids back to tokens.
vocab_data_dir = "../../models/BERT-Google/vocab.txt"

#### Setup

In [2]:
# Note that this notebook only supports single-GPU evaluation,
# which is sufficient for most tasks when a lower batch size is used.
IS_CUDA = False
if not IS_CUDA:
    # No GPU requested: everything below runs on the CPU.
    logger.info("gpu is out of the picture, let us use CPU")
    device = torch.device("cpu")
else:
    # Pin the run to the first CUDA device and report how many are visible.
    CUDA_DEVICE = "cuda:0"
    device = torch.device(CUDA_DEVICE)
    n_gpu = torch.cuda.device_count()
    logger.info("device %s in total n_gpu %d distributed training", device, n_gpu)
    
def inverse_mapping(vocab_dict):
    """Return a new dict that maps each value of ``vocab_dict`` back to its key.

    If several keys share the same value, the key that comes last in
    iteration order wins.
    """
    return {index: token for token, index in vocab_dict.items()}

def translate(token_ids, vocab):
    """Look up every id of ``token_ids`` in ``vocab`` and return the results as a list.

    ``token_ids`` must provide ``.tolist()`` (e.g. a tensor or array);
    ``vocab`` is indexed with each resulting id.
    """
    return [vocab[token_id] for token_id in token_ids.tolist()]

def heatmap_viz(token_grad, vmin=0, vmax=1, cmap="Blues"):
    """Show a single-row heatmap of per-token scores.

    ``token_grad`` is a sequence of ``(token, score)`` pairs. The scores form
    the one heatmap row and the tokens are used as the x tick labels.
    ``vmin``, ``vmax`` and ``cmap`` are forwarded to ``sns.heatmap``.
    """
    values = [pair[1] for pair in token_grad]
    labels = [pair[0] for pair in token_grad]

    # A wide, flat figure: one row of cells, one cell per token.
    _, axis = plt.subplots(figsize=(10, 1))
    axis = sns.heatmap(
        [values],
        cmap=cmap,
        xticklabels=labels,
        yticklabels=False,
        cbar_kws={"shrink": 1, "aspect": 4},
        linewidths=0.8,
        vmin=vmin,
        vmax=vmax,
    )
    axis.set_xticklabels(labels, size=18)

    # Enlarge the tick labels of the colour bar as well.
    colorbar = axis.collections[0].colorbar
    colorbar.ax.tick_params(labelsize=20)
    plt.show()
    
def evaluate_with_hooks(test_dataloader, model, device, label_list):
    """Run the model over ``test_dataloader`` and collect four kinds of attribution scores.

    For every batch the model is run forward once (with gradients enabled) and
    then asked for gradient-sensitivity (GS), gradient*input (GI), LRP and
    attention-based (LAT) scores through its ``backward_gradient``,
    ``backward_gradient_input``, ``backward_lrp`` and ``backward_lat`` methods.
    Scores are always taken with respect to the last label in ``label_list``.
    Average loss and accuracy are logged.

    Args:
        test_dataloader: yields ``(input_ids, input_mask, segment_ids,
            label_ids, seq_lens)`` batches.
        model: callable as ``model(input_ids, segment_ids, input_mask,
            seq_lens, device=..., labels=...)`` returning a 4-tuple whose first
            three items are loss, logits and the encoder attention scores.
        device: torch device the batch tensors are moved to.
        label_list: list of labels; only its length is used.

    Returns:
        dict with keys ``inputs_ids``, ``seqs_lens``, ``gs_scores``,
        ``gi_scores``, ``lrp_scores`` and ``lat_scores``. Each value is a list
        with one CPU tensor per batch.
    """
    # Gradients are intentionally left enabled: the attribution methods need them.
    model.eval()  # this line will deactivate dropouts
    test_loss, test_accuracy = 0, 0
    nb_test_steps, nb_test_examples = 0, 0

    gs_scores, gi_scores, lrp_scores, lat_scores = [], [], [], []
    inputs_ids, seqs_lens = [], []

    # Loop-invariant: the class whose score is attributed back to the tokens.
    sensitivity_class = len(label_list) - 1

    for batch in tqdm(test_dataloader, desc="Iteration"):
        input_ids, input_mask, segment_ids, label_ids, seq_lens = batch
        # Truncate to the longest sequence in the batch to save memory and compute.
        max_seq_lens = int(seq_lens.max())
        input_ids = input_ids[:, :max_seq_lens].to(device)
        input_mask = input_mask[:, :max_seq_lens].to(device)
        segment_ids = segment_ids[:, :max_seq_lens].to(device)
        label_ids = label_ids.to(device)
        seq_lens = seq_lens.to(device)

        # Forward pass, intentionally with gradients.
        tmp_test_loss, logits, all_encoder_attention_scores, _ = \
            model(input_ids, segment_ids, input_mask, seq_lens,
                  device=device, labels=label_ids)
        probs = F.softmax(logits, dim=-1)

        predictions = np.argmax(probs.detach().cpu().numpy(), axis=1)
        tmp_test_accuracy = np.sum(predictions == label_ids.to('cpu').numpy())

        # GS: squared L2 norm of the back-propagated gradient per token.
        # The seed is created on `device`, like the LRP mask below.
        gs_seed = torch.zeros(probs.shape, device=device)
        gs_seed[:, sensitivity_class] = 1.0
        gs_norm = torch.norm(model.backward_gradient(gs_seed), dim=-1)
        # Stored detached and on CPU, mirroring how the LRP scores are stored.
        gs_scores.append((gs_norm * gs_norm).detach().cpu())

        # GI: same reduction on the gradient*input signal.
        gi_seed = torch.zeros(probs.shape, device=device)
        gi_seed[:, sensitivity_class] = 1.0
        gi_norm = torch.norm(model.backward_gradient_input(gi_seed), dim=-1)
        gi_scores.append((gi_norm * gi_norm).detach().cpu())

        # LRP: start from the predicted probability of the target class only.
        Rout_mask = torch.zeros((input_ids.shape[0], len(label_list))).to(device)
        Rout_mask[:, sensitivity_class] = 1.0
        lrp_score = model.backward_lrp(probs * Rout_mask)
        lrp_score = lrp_score.cpu().detach().data
        lrp_scores.append(torch.abs(lrp_score).sum(dim=-1))

        # LAT: attention-based scores, summed over the last dimension.
        attention_scores = model.backward_lat(input_ids, all_encoder_attention_scores)
        lat_scores.append(attention_scores.sum(dim=-1).detach().cpu())

        # Meta-data needed later to map scores back to tokens.
        input_ids = input_ids.cpu().data
        inputs_ids.append(input_ids)
        seqs_lens.append(seq_lens.cpu().data)

        test_loss += tmp_test_loss.mean().item()
        test_accuracy += tmp_test_accuracy

        nb_test_examples += input_ids.size(0)
        nb_test_steps += 1

    # An empty dataloader would otherwise raise ZeroDivisionError here.
    test_loss = test_loss / nb_test_steps if nb_test_steps else float("nan")
    test_accuracy = test_accuracy / nb_test_examples if nb_test_examples else float("nan")

    result = {'test_loss': test_loss,
              str(len(label_list)) + '-class test_accuracy': test_accuracy}
    logger.info("***** Eval results *****")
    for key in result.keys():
        logger.info("  %s = %s\n", key, str(result[key]))

    attribution_scores_state_dict = {
        "inputs_ids": inputs_ids,
        "seqs_lens": seqs_lens,
        "gs_scores": gs_scores,
        "gi_scores": gi_scores,
        "lrp_scores": lrp_scores,
        "lat_scores": lat_scores,
    }

    logger.info("***** Finish Attribution Backouts *****")
    return attribution_scores_state_dict

def analysis_task(task_name, device, sentence_limit=5000,
                  max_seq_length=128, eval_batch_size=24):
    """Load the fine-tuned model for ``task_name`` and compute attribution scores on its test set.

    We need to set a sentence limit, otherwise it takes too long!

    Args:
        task_name: one of "SST5", "SemEval", "IMDb", "Yelp5"; any other value
            raises ``KeyError``.
        device: torch device the model and batches are moved to.
        sentence_limit: forwarded to the processor's ``get_test_examples``.
        max_seq_length: maximum sequence length passed to
            ``convert_examples_to_features`` (previously hard-coded to 128).
        eval_batch_size: evaluation batch size (previously hard-coded to 24);
            tune it down depending on the memory you have.

    Returns:
        The attribution score dict produced by ``evaluate_with_hooks``.
    """
    data_dir = "../../datasets/" + task_name + "/"

    # "../../data/uncased_L-12_H-768_A-12/" is for the default BERT-base pretrain
    bert_path = "../../models/BERT-Google/"
    model_path = "../../results/" + task_name + "/best_checkpoint.bin"

    # This loads the task processor for you.
    processors = {
        "SST5": SST5_Processor,
        "SemEval": SemEval_Processor,
        "IMDb": IMDb_Processor,
        "Yelp5": Yelp5_Processor,
    }

    processor = processors[task_name]()
    label_list = processor.get_labels()

    # The optimizer returned by the loader is not needed for evaluation.
    model, tokenizer, _ = \
        load_model_setups(vocab_file=bert_path + "vocab.txt",
                          bert_config_file=bert_path + "bert_config.json",
                          init_checkpoint=model_path,
                          label_list=label_list,
                          num_train_steps=20,
                          do_lower_case=True,
                          # below is not required for eval
                          learning_rate=2e-5,
                          warmup_proportion=0.1,
                          init_lrp=True)
    model = model.to(device)  # send the model to device

    test_examples = processor.get_test_examples(data_dir, sentence_limit=sentence_limit)
    test_features = convert_examples_to_features(
        test_examples,
        label_list,
        max_seq_length,
        tokenizer)

    all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)
    # Each length is wrapped in a list so the tensor has one column per example.
    all_seq_len = torch.tensor([[f.seq_len] for f in test_features], dtype=torch.long)

    test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_label_ids, all_seq_len)

    # No shuffling: batches are evaluated in dataset order.
    test_dataloader = DataLoader(test_data, batch_size=eval_batch_size, shuffle=False)

    return evaluate_with_hooks(test_dataloader, model, device, label_list)

def find_common_vocab(dict_list):
    """Return the set of keys that occur in every dict of ``dict_list``.

    ``dict_list`` must be non-empty (checked with an ``assert``).
    """
    assert len(dict_list) > 0
    shared = set(dict_list[0].keys())
    # Narrow the candidate set with each remaining dict in turn.
    for other in dict_list[1:]:
        shared = shared.intersection(set(other.keys()))
    return shared

def subset_score(dict_list):
    """Build a score matrix restricted to the words shared by all dicts.

    Returns an array with one row per dict in ``dict_list`` and one column per
    common word (in the iteration order of the common-word set).
    """
    shared_words = find_common_vocab(dict_list)
    # One row per word, one column per dict; transposed on return.
    scores_by_word = [[scores[word] for scores in dict_list] for word in shared_words]
    return np.transpose(np.array(scores_by_word))

def load_attribution_scores(vocab_data_dir, inputs_ids, seqs_lens, raw_attribution_scores, min_freq=1, 
                            consider_speicial_tokens=False, normalized=True, min_length=0):
    """Turn per-batch attribution tensors into token-level rankings.

    Args:
        vocab_data_dir: vocabulary file handed to ``load_vocab``.
        inputs_ids: list of per-batch token id tensors.
        seqs_lens: list of per-batch sequence length tensors (one column).
        raw_attribution_scores: list of per-batch score tensors, aligned with
            ``inputs_ids``.
        min_freq: a token is kept in the averaged ranking only if it occurs
            strictly more than ``min_freq`` times.
        consider_speicial_tokens: if False, the first and last position of each
            sequence are dropped (presumably [CLS]/[SEP] — TODO confirm).
        normalized: if True, scores are replaced by a softmax over their
            absolute values within each sentence.
        min_length: sentences with fewer tokens than this are skipped.

    Returns:
        A 3-tuple ``(filter_word_lrp, word_lrp_list, sentence_lrp)``:
        averaged ``(token, score)`` pairs sorted by descending score, every
        individual ``(token, score)`` occurrence sorted the same way, and the
        per-sentence lists of ``(token, score)`` pairs.
    """
    vocab = inverse_mapping(load_vocab(vocab_data_dir, pretrain=False))
    word_lrp = {}
    word_lrp_list = []
    sentence_lrp = []

    for batch_idx, batch_ids in enumerate(inputs_ids):
        for seq_idx in range(batch_ids.shape[0]):
            seq_len = seqs_lens[batch_idx][seq_idx].tolist()[0]
            # Cut away padding first, then optionally the two boundary tokens.
            tokens = translate(batch_ids[seq_idx], vocab)[:seq_len]
            attribution_scores = raw_attribution_scores[batch_idx][seq_idx][:seq_len]
            if not consider_speicial_tokens:
                tokens = tokens[1:-1]
                attribution_scores = attribution_scores[1:-1]

            if normalized:
                sentence_scores = F.softmax(torch.abs(attribution_scores), dim=-1).tolist()
            else:
                sentence_scores = attribution_scores.tolist()

            if len(tokens) < min_length:
                continue

            assert(len(tokens) == len(sentence_scores))
            pairs = list(zip(tokens, sentence_scores))
            sentence_lrp.append(pairs)
            for token, score in pairs:
                word_lrp_list.append((token, score))
                word_lrp.setdefault(token, []).append(score)

    # Average the scores of every sufficiently frequent token.
    filter_word_lrp = [
        (token, sum(scores)*1.0/len(scores))
        for token, scores in word_lrp.items()
        if len(scores) > min_freq
    ]
    filter_word_lrp.sort(key=lambda pair: pair[1], reverse=True)
    word_lrp_list.sort(key=lambda pair: pair[1], reverse=True)
    return filter_word_lrp, word_lrp_list, sentence_lrp

def load_attribution_meta(vocab_data_dir, dataset_dict):
    """Run ``load_attribution_scores`` once per attribution method.

    ``dataset_dict`` must contain ``inputs_ids``, ``seqs_lens`` and one entry
    per method (``gs_scores``, ``gi_scores``, ``lrp_scores``, ``lat_scores``).

    Returns a dict keyed by method name; each value is a dict holding the three
    results of ``load_attribution_scores`` under ``filtered_word_rank``,
    ``raw_word_rank`` and ``sentence_revelance_score``.
    """
    result_keys = ("filtered_word_rank", "raw_word_rank", "sentence_revelance_score")
    attribution_meta = {}
    for method in ("gs_scores", "gi_scores", "lrp_scores", "lat_scores"):
        filtered, raw, per_sentence = load_attribution_scores(
            vocab_data_dir,
            dataset_dict["inputs_ids"],
            dataset_dict["seqs_lens"],
            dataset_dict[method],
        )
        attribution_meta[method] = dict(zip(result_keys, (filtered, raw, per_sentence)))
    return attribution_meta

def print_topk_words(attribution_meta, k=30, filtered=True):
    """
    Print the top-k and bottom-k words of a dataset for every attribution method.

    Two tables are printed, separated by a ``***`` line: the first lists the k
    highest-ranked words per method, the second the k lowest-ranked ones
    (lowest first). Each cell is a ``(word, score)`` pair with the score
    truncated to its first four characters.

    Args:
        attribution_meta: dict as returned by ``load_attribution_meta``.
        k: number of rows per table. If any ranking has fewer than ``k``
            entries, ``k`` is reduced to the shortest ranking instead of
            raising ``IndexError``.
        filtered: use ``filtered_word_rank`` when True, ``raw_word_rank``
            otherwise.
    """
    from tabulate import tabulate

    methods = ["gs_scores", "gi_scores", "lrp_scores", "lat_scores"]
    rank_key = "filtered_word_rank" if filtered else "raw_word_rank"
    rankings = [attribution_meta[method][rank_key] for method in methods]

    # Never index past the end of the shortest ranking.
    k = min([k] + [len(ranking) for ranking in rankings])

    def _cell(entry):
        # (word, score shortened to four characters)
        return (entry[0], str(entry[1])[:4])

    words = [[_cell(ranking[i]) for ranking in rankings] for i in range(k)]
    # Reversed ranking: walk each list from its end.
    words_neg = [[_cell(ranking[-(i + 1)]) for ranking in rankings] for i in range(k)]

    print(tabulate(words, headers=methods))
    print("***")
    print(tabulate(words_neg, headers=methods))

12/23/2020 02:26:22 - INFO - run_classifier -   gpu is out of the picture, let us use CPU


### Experiment 3.2.1 SST-5 Word Rank

In [None]:
# Load the SST-5 model and compute all four attribution scores on its test set
# (sentence_limit=2000 keeps the run time manageable).
sst5_dict = analysis_task("SST5", device, sentence_limit=2000)

In [None]:
# Convert the raw per-batch scores into per-method word rankings.
sst5_attribution_meta = load_attribution_meta(vocab_data_dir, sst5_dict)

In [None]:
# Show the 30 highest- and lowest-ranked words for each attribution method.
print_topk_words(sst5_attribution_meta)

In [None]:
# heatmap_viz(sentence_lrps[i], vmin=0, vmax=1)

### Experiment 3.2.2 Word Deletion

In [68]:
# Scratch check: indexing a tensor with an empty list yields a result with
# zero rows, so the expression below evaluates to 0.
a = torch.rand(3,3)
a[[]].shape[0]

0

In [89]:
from random import randrange

def random_drop(input_ids, seq_lens, k=1):
    """Zero out ``k`` randomly chosen token ids in every row of ``input_ids``.

    For row ``b`` the candidate positions are ``1 .. seq_lens[b][0] - 1``;
    position 0 is never sampled. If ``k`` is larger than the number of
    candidate positions, the whole row is set to 0.

    Note: ``input_ids`` is modified in place and also returned.

    Args:
        input_ids: 2-D integer tensor, one row per sequence.
        seq_lens: per-row sequence lengths, indexed as ``seq_lens[b][0]``.
        k: number of positions to zero out per row.

    Returns:
        The same ``input_ids`` tensor, after modification.
    """
    for b in range(input_ids.shape[0]):
        seq_len = int(seq_lens[b][0])
        candidates = range(1, seq_len)
        # Compare against the real population size: the original test
        # (k > seq_len) let k == seq_len through, which made random.sample
        # raise "Sample larger than population".
        if k > len(candidates):
            input_ids[b] = 0  # zero out all of them
        else:
            for idx in random.sample(candidates, k):
                input_ids[b][idx] = 0
    return input_ids

def topk_drop(input_ids, scores, k=1):
    """Zero out, in every row, the ``k`` token ids with the highest scores.

    The first and last column are never candidates: the top-k search runs on
    ``scores[:, 1:-1]``. If ``k`` exceeds the number of candidate columns, all
    ids are set to 0.

    Note: ``input_ids`` is modified in place and also returned.

    Args:
        input_ids: 2-D integer tensor, one row per sequence.
        scores: 2-D tensor of per-token scores aligned with ``input_ids``.
        k: number of positions to zero out per row.

    Returns:
        The same ``input_ids`` tensor, after modification. It is always a
        tensor; the original returned the float ``0.`` when ``k`` was too
        large, which broke the following model call.
    """
    if k > input_ids.shape[1] - 2:
        input_ids[:] = 0  # zero out all of them, keeping shape and dtype
        return input_ids

    _, idx = torch.topk(scores[:, 1:-1], k, dim=-1)
    # Shift back by one to undo the [1:-1] slice; keep the index on the same
    # device as the tensor it indexes.
    idx = (idx + 1).to(input_ids.device)
    for b in range(input_ids.shape[0]):
        input_ids[b, idx[b]] = 0
    return input_ids

def evaluate_with_word_deletion(test_dataloader, model, device, label_list,
                                k=0, del_type="gi",
                                original_correct=True):
    """Measure loss and accuracy after zeroing out ``k`` token ids per sentence.

    Every batch is first run unchanged. Depending on ``original_correct``,
    only the examples the model got right (or wrong) are kept. For those,
    ``k`` token ids are zeroed out, either at random or at the positions with
    the highest attribution score, and the model is run again. Loss and
    accuracy of that second run are averaged and logged; nothing is returned.

    Args:
        test_dataloader: yields ``(input_ids, input_mask, segment_ids,
            label_ids, seq_lens)`` batches.
        model: callable as ``model(input_ids, segment_ids, input_mask,
            seq_lens, device=..., labels=...)`` returning a 4-tuple whose first
            three items are loss, logits and the encoder attention scores; it
            must also provide the ``backward_*`` attribution methods.
        device: torch device the batch tensors are moved to.
        label_list: list of labels; only its length is used.
        k: number of token ids to zero out per sentence; 0 disables deletion.
        del_type: "random", "gs", "gi", "lrp" or "lat". Ignored when ``k`` is 0.
        original_correct: keep the originally correct examples when True,
            the originally wrong ones otherwise.

    Raises:
        ValueError: if ``k`` is non-zero and ``del_type`` is not supported
            (this used to surface as an unrelated error after the first batch).
    """
    supported = ("random", "gs", "gi", "lrp", "lat")
    if k != 0 and del_type not in supported:
        raise ValueError("del_type must be one of %s, got %r"
                         % (", ".join(supported), del_type))

    # Gradients are intentionally left enabled: the attribution methods need them.
    model.eval()  # this line will deactivate dropouts
    test_loss, test_accuracy = 0, 0
    nb_test_steps, nb_test_examples = 0, 0

    # Loop-invariant: the class whose score is attributed back to the tokens.
    sensitivity_class = len(label_list) - 1

    for batch in tqdm(test_dataloader, desc="Iteration"):
        input_ids, input_mask, segment_ids, label_ids, seq_lens = batch
        # Truncate to the longest sequence in the batch to save memory and compute.
        max_seq_lens = int(seq_lens.max())
        input_ids = input_ids[:, :max_seq_lens].to(device)
        input_mask = input_mask[:, :max_seq_lens].to(device)
        segment_ids = segment_ids[:, :max_seq_lens].to(device)
        label_ids = label_ids.to(device)
        seq_lens = seq_lens.to(device)

        # First pass on the untouched batch: find out which examples to keep.
        _, logits, all_encoder_attention_scores, _ = \
            model(input_ids, segment_ids, input_mask, seq_lens,
                  device=device, labels=label_ids)
        probs = F.softmax(logits, dim=-1)
        predictions = np.argmax(probs.detach().cpu().numpy(), axis=1)
        is_correct = predictions == label_ids.to('cpu').numpy()
        keep_idx = (is_correct if original_correct else ~is_correct).nonzero()[0]

        new_input_ids = input_ids[keep_idx]
        new_segment_ids = segment_ids[keep_idx]
        new_input_mask = input_mask[keep_idx]
        new_seq_lens = seq_lens[keep_idx]
        new_label_ids = label_ids[keep_idx]

        # Corner case: nothing selected in this batch, so skip it.
        if new_input_ids.shape[0] == 0:
            continue

        if k != 0:
            if del_type == "random":
                new_input_ids = random_drop(new_input_ids, new_seq_lens, k=k)
            else:
                # Token scores are computed for the full batch (they refer to
                # the first forward pass) and then restricted to the kept rows.
                if del_type in ("gs", "gi"):
                    # Seed created on `device`, like the LRP mask below.
                    seed = torch.zeros(probs.shape, device=device)
                    seed[:, sensitivity_class] = 1.0
                    if del_type == "gs":
                        grad = model.backward_gradient(seed)
                    else:
                        grad = model.backward_gradient_input(seed)
                    grad_norm = torch.norm(grad, dim=-1)
                    token_scores = grad_norm * grad_norm
                elif del_type == "lrp":
                    Rout_mask = torch.zeros((input_ids.shape[0], len(label_list))).to(device)
                    Rout_mask[:, sensitivity_class] = 1.0
                    lrp_score = model.backward_lrp(probs * Rout_mask)
                    lrp_score = lrp_score.cpu().detach().data
                    token_scores = torch.abs(lrp_score).sum(dim=-1)
                else:  # "lat"
                    attention_scores = model.backward_lat(input_ids, all_encoder_attention_scores)
                    token_scores = attention_scores.sum(dim=-1)
                new_input_ids = topk_drop(new_input_ids, token_scores[keep_idx], k=k)

        # Second pass on the selected (and possibly modified) examples.
        tmp_test_loss, logits, _, _ = \
            model(new_input_ids, new_segment_ids, new_input_mask, new_seq_lens,
                  device=device, labels=new_label_ids)

        predictions = np.argmax(F.softmax(logits, dim=-1).detach().cpu().numpy(), axis=1)
        tmp_test_accuracy = np.sum(predictions == new_label_ids.to('cpu').numpy())

        test_loss += tmp_test_loss.mean().item()
        test_accuracy += tmp_test_accuracy

        nb_test_examples += new_input_ids.size(0)
        nb_test_steps += 1

    # If every batch was skipped (or the loader was empty) there is nothing to
    # average; report NaN instead of raising ZeroDivisionError.
    test_loss = test_loss / nb_test_steps if nb_test_steps else float("nan")
    test_accuracy = test_accuracy / nb_test_examples if nb_test_examples else float("nan")

    result = {'test_loss': test_loss,
              str(len(label_list)) + '-class test_accuracy': test_accuracy}
    logger.info("***** Eval results *****")
    for key in result.keys():
        logger.info("  %s = %s\n", key, str(result[key]))

def word_deletion_task(task_name, device, sentence_limit=2000,
                       k=0, del_type="random",
                       original_correct=True,
                       max_seq_length=128, eval_batch_size=24):
    """Load the fine-tuned model for ``task_name`` and run the word deletion evaluation.

    We need to set a sentence limit, otherwise it takes too long!

    Args:
        task_name: one of "SST5", "SemEval", "IMDb", "Yelp5"; any other value
            raises ``KeyError``.
        device: torch device the model and batches are moved to.
        sentence_limit: forwarded to the processor's ``get_test_examples``.
        k: number of token ids to zero out per sentence.
        del_type: deletion strategy, forwarded to
            ``evaluate_with_word_deletion``.
        original_correct: forwarded to ``evaluate_with_word_deletion``.
        max_seq_length: maximum sequence length passed to
            ``convert_examples_to_features`` (previously hard-coded to 128).
        eval_batch_size: evaluation batch size (previously hard-coded to 24);
            tune it down depending on the memory you have.

    The results are only logged; nothing is returned.
    """
    data_dir = "../../datasets/" + task_name + "/"

    # "../../data/uncased_L-12_H-768_A-12/" is for the default BERT-base pretrain
    bert_path = "../../models/BERT-Google/"
    model_path = "../../results/" + task_name + "/best_checkpoint.bin"

    # This loads the task processor for you.
    processors = {
        "SST5": SST5_Processor,
        "SemEval": SemEval_Processor,
        "IMDb": IMDb_Processor,
        "Yelp5": Yelp5_Processor,
    }

    processor = processors[task_name]()
    label_list = processor.get_labels()

    # The optimizer returned by the loader is not needed for evaluation.
    model, tokenizer, _ = \
        load_model_setups(vocab_file=bert_path + "vocab.txt",
                          bert_config_file=bert_path + "bert_config.json",
                          init_checkpoint=model_path,
                          label_list=label_list,
                          num_train_steps=20,
                          do_lower_case=True,
                          # below is not required for eval
                          learning_rate=2e-5,
                          warmup_proportion=0.1,
                          init_lrp=True)
    model = model.to(device)  # send the model to device

    test_examples = processor.get_test_examples(data_dir, sentence_limit=sentence_limit)
    test_features = convert_examples_to_features(
        test_examples,
        label_list,
        max_seq_length,
        tokenizer)

    all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)
    # Each length is wrapped in a list so the tensor has one column per example.
    all_seq_len = torch.tensor([[f.seq_len] for f in test_features], dtype=torch.long)

    test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_label_ids, all_seq_len)

    # No shuffling: batches are evaluated in dataset order.
    test_dataloader = DataLoader(test_data, batch_size=eval_batch_size, shuffle=False)

    evaluate_with_word_deletion(test_dataloader, model, device, label_list,
                                k=k, del_type=del_type,
                                original_correct=original_correct)

In [70]:
# processors = {
#     "SST5": SST5_Processor,
#     "SemEval" : SemEval_Processor,
#     "IMDb" : IMDb_Processor,
#     "Yelp5" : Yelp5_Processor
# }
# Random-deletion baseline on SST-5: for each K, zero out K randomly chosen
# token ids in the test sentences the model originally classified correctly,
# then log the resulting loss and accuracy.
# Every call to word_deletion_task reloads the model and rebuilds the test set.
# NOTE(review): the saved output below shows K=0..6, which does not match
# range(1, 6) — the cell was probably edited after it was last run.
for i in range(1, 6):
    print("===== Word Deletion with K=%s ====="%(i))
    word_deletion_task("SST5", device, sentence_limit=2000, 
                       k=i, del_type="random", 
                       original_correct=True)

12/23/2020 11:55:02 - INFO - util.train_helper -   model = BERT
12/23/2020 11:55:02 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 11:55:02 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=0 =====
init_weight = True
init_lrp = True


 21%|██▏       | 430/2001 [00:00<00:00, 4290.73it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 4170.91it/s]
Iteration: 100%|██████████| 84/84 [00:30<00:00,  2.72it/s]
12/23/2020 11:55:35 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 11:55:35 - INFO - run_classifier -     test_loss = 0.3908445890105906

12/23/2020 11:55:35 - INFO - run_classifier -     5-class test_accuracy = 1.0

12/23/2020 11:55:35 - INFO - util.train_helper -   model = BERT
12/23/2020 11:55:35 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 11:55:35 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=1 =====
init_weight = True
init_lrp = True


 23%|██▎       | 458/2001 [00:00<00:00, 4579.45it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 3682.43it/s]
Iteration: 100%|██████████| 84/84 [00:31<00:00,  2.64it/s]
12/23/2020 11:56:08 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 11:56:08 - INFO - run_classifier -     test_loss = 0.4975708861436163

12/23/2020 11:56:08 - INFO - run_classifier -     5-class test_accuracy = 0.8979057591623036

12/23/2020 11:56:08 - INFO - util.train_helper -   model = BERT
12/23/2020 11:56:08 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 11:56:08 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=2 =====
init_weight = True
init_lrp = True


 23%|██▎       | 451/2001 [00:00<00:00, 4503.64it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 4096.77it/s]
Iteration: 100%|██████████| 84/84 [00:30<00:00,  2.73it/s]
12/23/2020 11:56:41 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 11:56:41 - INFO - run_classifier -     test_loss = 0.6247674389964059

12/23/2020 11:56:41 - INFO - run_classifier -     5-class test_accuracy = 0.8298429319371727

12/23/2020 11:56:41 - INFO - util.train_helper -   model = BERT
12/23/2020 11:56:41 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 11:56:41 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=3 =====
init_weight = True
init_lrp = True


 21%|██        | 422/2001 [00:00<00:00, 4213.24it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 4139.57it/s]
Iteration: 100%|██████████| 84/84 [00:30<00:00,  2.73it/s]
12/23/2020 11:57:14 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 11:57:14 - INFO - run_classifier -     test_loss = 0.7756362706422806

12/23/2020 11:57:14 - INFO - run_classifier -     5-class test_accuracy = 0.7609075043630017

12/23/2020 11:57:14 - INFO - util.train_helper -   model = BERT
12/23/2020 11:57:14 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 11:57:14 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=4 =====
init_weight = True
init_lrp = True


 20%|█▉        | 400/2001 [00:00<00:00, 3999.44it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 3723.45it/s]
Iteration: 100%|██████████| 84/84 [00:31<00:00,  2.70it/s]
12/23/2020 11:57:47 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 11:57:47 - INFO - run_classifier -     test_loss = 0.9476450390758968

12/23/2020 11:57:47 - INFO - run_classifier -     5-class test_accuracy = 0.6535776614310645

12/23/2020 11:57:47 - INFO - util.train_helper -   model = BERT
12/23/2020 11:57:47 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 11:57:47 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=5 =====
init_weight = True
init_lrp = True


 22%|██▏       | 448/2001 [00:00<00:00, 4477.63it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 4319.54it/s]
Iteration: 100%|██████████| 84/84 [00:30<00:00,  2.73it/s]
12/23/2020 11:58:20 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 11:58:20 - INFO - run_classifier -     test_loss = 1.2040626619543349

12/23/2020 11:58:20 - INFO - run_classifier -     5-class test_accuracy = 0.5959860383944153

12/23/2020 11:58:20 - INFO - util.train_helper -   model = BERT
12/23/2020 11:58:20 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 11:58:20 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=6 =====
init_weight = True
init_lrp = True


 19%|█▉        | 387/2001 [00:00<00:00, 3862.19it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 4009.68it/s]
Iteration: 100%|██████████| 84/84 [00:30<00:00,  2.74it/s]
12/23/2020 11:58:53 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 11:58:53 - INFO - run_classifier -     test_loss = 1.34529063247499

12/23/2020 11:58:53 - INFO - run_classifier -     5-class test_accuracy = 0.5680628272251309

12/23/2020 11:58:53 - INFO - util.train_helper -   model = BERT
12/23/2020 11:58:53 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 11:58:53 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=7 =====
init_weight = True
init_lrp = True


 22%|██▏       | 445/2001 [00:00<00:00, 4440.58it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 4502.43it/s]
Iteration: 100%|██████████| 84/84 [00:30<00:00,  2.74it/s]
12/23/2020 11:59:25 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 11:59:25 - INFO - run_classifier -     test_loss = 1.5258239699261529

12/23/2020 11:59:25 - INFO - run_classifier -     5-class test_accuracy = 0.5104712041884817

12/23/2020 11:59:25 - INFO - util.train_helper -   model = BERT
12/23/2020 11:59:25 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 11:59:25 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=8 =====
init_weight = True
init_lrp = True


 21%|██        | 413/2001 [00:00<00:00, 4124.92it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 4145.70it/s]
Iteration: 100%|██████████| 84/84 [00:31<00:00,  2.68it/s]
12/23/2020 11:59:59 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 11:59:59 - INFO - run_classifier -     test_loss = 1.6759203381481624

12/23/2020 11:59:59 - INFO - run_classifier -     5-class test_accuracy = 0.4668411867364747

12/23/2020 11:59:59 - INFO - util.train_helper -   model = BERT
12/23/2020 11:59:59 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 11:59:59 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=9 =====
init_weight = True
init_lrp = True


 20%|██        | 404/2001 [00:00<00:00, 4035.57it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 4251.21it/s]
Iteration: 100%|██████████| 84/84 [00:31<00:00,  2.69it/s]
12/23/2020 12:00:32 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 12:00:32 - INFO - run_classifier -     test_loss = 1.9307754124913896

12/23/2020 12:00:32 - INFO - run_classifier -     5-class test_accuracy = 0.4223385689354276

12/23/2020 12:00:32 - INFO - util.train_helper -   model = BERT
12/23/2020 12:00:32 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 12:00:32 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=10 =====
init_weight = True
init_lrp = True


 21%|██        | 413/2001 [00:00<00:00, 4127.98it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 3754.97it/s]
Iteration:  65%|██████▌   | 55/84 [00:20<00:10,  2.65it/s]


KeyboardInterrupt: 

In [92]:
# Word-deletion sweep on SST5 with del_type="gs": one word_deletion_task
# call per k in 1..5, each preceded by a banner naming the current k.
for i in range(1, 6):
    print("===== Word Deletion with K=%s =====" % (i,))
    word_deletion_task(
        "SST5",
        device,
        sentence_limit=2000,
        k=i,
        del_type="gs",
        original_correct=True,
    )

12/23/2020 16:55:40 - INFO - util.train_helper -   model = BERT
12/23/2020 16:55:40 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 16:55:40 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=1 =====
init_weight = True
init_lrp = True


  6%|▋         | 130/2001 [00:00<00:01, 1092.70it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 3595.54it/s]
Iteration: 100%|██████████| 84/84 [01:00<00:00,  1.38it/s]
12/23/2020 16:56:43 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 16:56:43 - INFO - run_classifier -     test_loss = 0.9258443450643903

12/23/2020 16:56:43 - INFO - run_classifier -     5-class test_accuracy = 0.6928446771378709

12/23/2020 16:56:43 - INFO - util.train_helper -   model = BERT
12/23/2020 16:56:43 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 16:56:43 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=2 =====
init_weight = True
init_lrp = True


 22%|██▏       | 432/2001 [00:00<00:00, 4314.94it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 4323.90it/s]
Iteration: 100%|██████████| 84/84 [01:02<00:00,  1.35it/s]
12/23/2020 16:57:47 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 16:57:47 - INFO - run_classifier -     test_loss = 1.219329908490181

12/23/2020 16:57:47 - INFO - run_classifier -     5-class test_accuracy = 0.606457242582897

12/23/2020 16:57:47 - INFO - util.train_helper -   model = BERT
12/23/2020 16:57:47 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 16:57:47 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=3 =====
init_weight = True
init_lrp = True


 23%|██▎       | 463/2001 [00:00<00:00, 4621.78it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 4447.84it/s]
Iteration:  12%|█▏        | 10/84 [00:07<00:58,  1.27it/s]


KeyboardInterrupt: 

In [73]:
# Word-deletion sweep on SST5 with del_type="gi": one word_deletion_task
# call per k in 1..5, each preceded by a banner naming the current k.
for i in range(1, 6):
    print("===== Word Deletion with K=%s =====" % (i,))
    word_deletion_task(
        "SST5",
        device,
        sentence_limit=2000,
        k=i,
        del_type="gi",
        original_correct=True,
    )

12/23/2020 12:21:53 - INFO - util.train_helper -   model = BERT
12/23/2020 12:21:53 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 12:21:53 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=1 =====
init_weight = True
init_lrp = True


 21%|██        | 420/2001 [00:00<00:00, 4199.27it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 3985.35it/s]
Iteration: 100%|██████████| 84/84 [01:07<00:00,  1.24it/s]
12/23/2020 12:23:03 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 12:23:03 - INFO - run_classifier -     test_loss = 1.4514927438327245

12/23/2020 12:23:03 - INFO - run_classifier -     5-class test_accuracy = 0.0

12/23/2020 12:23:03 - INFO - util.train_helper -   model = BERT
12/23/2020 12:23:03 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 12:23:03 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=2 =====
init_weight = True
init_lrp = True


 22%|██▏       | 436/2001 [00:00<00:00, 4350.29it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 4236.37it/s]
Iteration:   5%|▍         | 4/84 [00:03<01:19,  1.00it/s]


KeyboardInterrupt: 

In [91]:
# Word-deletion sweep on SST5 with del_type="lrp": one word_deletion_task
# call per k in 1..5, each preceded by a banner naming the current k.
for i in range(1, 6):
    print("===== Word Deletion with K=%s =====" % (i,))
    word_deletion_task(
        "SST5",
        device,
        sentence_limit=2000,
        k=i,
        del_type="lrp",
        original_correct=True,
    )

12/23/2020 16:33:17 - INFO - util.train_helper -   model = BERT
12/23/2020 16:33:17 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 16:33:17 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=1 =====
init_weight = True
init_lrp = True


 21%|██        | 424/2001 [00:00<00:00, 4237.43it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 4412.77it/s]
Iteration: 100%|██████████| 84/84 [01:35<00:00,  1.14s/it]
12/23/2020 16:34:55 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 16:34:55 - INFO - run_classifier -     test_loss = 0.6154856820191655

12/23/2020 16:34:55 - INFO - run_classifier -     5-class test_accuracy = 0.8534031413612565

12/23/2020 16:34:55 - INFO - util.train_helper -   model = BERT
12/23/2020 16:34:55 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 16:34:55 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=2 =====
init_weight = True
init_lrp = True


 21%|██        | 424/2001 [00:00<00:00, 4237.27it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 4422.32it/s]
Iteration: 100%|██████████| 84/84 [01:35<00:00,  1.14s/it]
12/23/2020 16:36:32 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 16:36:32 - INFO - run_classifier -     test_loss = 0.8014448458949724

12/23/2020 16:36:32 - INFO - run_classifier -     5-class test_accuracy = 0.7486910994764397

12/23/2020 16:36:32 - INFO - util.train_helper -   model = BERT
12/23/2020 16:36:32 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 16:36:32 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=3 =====
init_weight = True
init_lrp = True


 23%|██▎       | 451/2001 [00:00<00:00, 4506.18it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 4023.91it/s]
Iteration: 100%|██████████| 84/84 [01:33<00:00,  1.12s/it]
12/23/2020 16:38:08 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 16:38:08 - INFO - run_classifier -     test_loss = 1.0009076559827441

12/23/2020 16:38:08 - INFO - run_classifier -     5-class test_accuracy = 0.6614310645724258

12/23/2020 16:38:08 - INFO - util.train_helper -   model = BERT
12/23/2020 16:38:08 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 16:38:08 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=4 =====
init_weight = True
init_lrp = True


 21%|██        | 420/2001 [00:00<00:00, 4195.43it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 4076.95it/s]
Iteration: 100%|██████████| 84/84 [01:33<00:00,  1.12s/it]
12/23/2020 16:39:44 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 16:39:44 - INFO - run_classifier -     test_loss = 1.2607152242036093

12/23/2020 16:39:44 - INFO - run_classifier -     5-class test_accuracy = 0.5724258289703316

12/23/2020 16:39:44 - INFO - util.train_helper -   model = BERT
12/23/2020 16:39:44 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 16:39:44 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=5 =====
init_weight = True
init_lrp = True


 23%|██▎       | 457/2001 [00:00<00:00, 4565.15it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 4485.11it/s]
Iteration: 100%|██████████| 84/84 [01:34<00:00,  1.13s/it]
12/23/2020 16:41:21 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 16:41:21 - INFO - run_classifier -     test_loss = 1.5426092126539774

12/23/2020 16:41:21 - INFO - run_classifier -     5-class test_accuracy = 0.4965095986038394



In [90]:
# Word-deletion sweep on SST5 with del_type="lat": one word_deletion_task
# call per k in 1..5, each preceded by a banner naming the current k.
for i in range(1, 6):
    print("===== Word Deletion with K=%s =====" % (i,))
    word_deletion_task(
        "SST5",
        device,
        sentence_limit=2000,
        k=i,
        del_type="lat",
        original_correct=True,
    )

12/23/2020 16:30:10 - INFO - util.train_helper -   model = BERT
12/23/2020 16:30:10 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 16:30:10 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=1 =====
init_weight = True
init_lrp = True


 22%|██▏       | 448/2001 [00:00<00:00, 4475.20it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 4393.29it/s]
Iteration: 100%|██████████| 84/84 [00:31<00:00,  2.67it/s]
12/23/2020 16:30:43 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 16:30:43 - INFO - run_classifier -     test_loss = 0.5505172489654451

12/23/2020 16:30:43 - INFO - run_classifier -     5-class test_accuracy = 0.8132635253054101

12/23/2020 16:30:43 - INFO - util.train_helper -   model = BERT
12/23/2020 16:30:43 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 16:30:43 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=2 =====
init_weight = True
init_lrp = True


 21%|██        | 415/2001 [00:00<00:00, 2409.76it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 3479.84it/s]
Iteration: 100%|██████████| 84/84 [00:31<00:00,  2.63it/s]
12/23/2020 16:31:17 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 16:31:17 - INFO - run_classifier -     test_loss = 0.6573992627007621

12/23/2020 16:31:17 - INFO - run_classifier -     5-class test_accuracy = 0.7478184991273996

12/23/2020 16:31:17 - INFO - util.train_helper -   model = BERT
12/23/2020 16:31:17 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 16:31:17 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=3 =====
init_weight = True
init_lrp = True


 19%|█▉        | 379/2001 [00:00<00:00, 3772.23it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 3461.02it/s]
Iteration: 100%|██████████| 84/84 [00:31<00:00,  2.67it/s]
12/23/2020 16:31:51 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 16:31:51 - INFO - run_classifier -     test_loss = 0.8385252998698325

12/23/2020 16:31:51 - INFO - run_classifier -     5-class test_accuracy = 0.6867364746945899

12/23/2020 16:31:51 - INFO - util.train_helper -   model = BERT
12/23/2020 16:31:51 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 16:31:51 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=4 =====
init_weight = True
init_lrp = True


 22%|██▏       | 445/2001 [00:00<00:00, 4448.46it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 4189.23it/s]
Iteration: 100%|██████████| 84/84 [00:31<00:00,  2.65it/s]
12/23/2020 16:32:24 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 16:32:24 - INFO - run_classifier -     test_loss = 1.163046630365508

12/23/2020 16:32:24 - INFO - run_classifier -     5-class test_accuracy = 0.6134380453752182

12/23/2020 16:32:24 - INFO - util.train_helper -   model = BERT
12/23/2020 16:32:24 - INFO - util.train_helper -   *** Model Config ***
12/23/2020 16:32:24 - INFO - util.train_helper -   {
  "attention_probs_dropout_prob": 0.1,
  "full_pooler": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



===== Word Deletion with K=5 =====
init_weight = True
init_lrp = True


 23%|██▎       | 459/2001 [00:00<00:00, 4589.67it/s]

sentence limit= 2000
0
guid= test-0
text_a= no movement , no yuks , not much of anything .
text_b= None
label= 1


100%|██████████| 2001/2001 [00:00<00:00, 4232.49it/s]
Iteration: 100%|██████████| 84/84 [00:31<00:00,  2.63it/s]
12/23/2020 16:32:58 - INFO - run_classifier -   ***** Eval results *****
12/23/2020 16:32:58 - INFO - run_classifier -     test_loss = 1.4864931887104398

12/23/2020 16:32:58 - INFO - run_classifier -     5-class test_accuracy = 0.5279232111692844



### Exp. 3.4 Correlations across datasets
Due to memory and cache limitations, run these analysis functions one at a time to avoid failures.

In [None]:
def _build_or_load_analysis(task_name, cache_path):
    """Return the analysis dict for ``task_name``, reusing ``cache_path`` when allowed.

    The analysis is recomputed (and the cache file rewritten) when ``RETRAIN``
    is true or when no cache file exists yet; otherwise the previously saved
    result is loaded, so re-running this cell does not repeat the expensive
    ``analysis_task`` call or overwrite an existing cache.

    Args:
        task_name: dataset name forwarded to ``analysis_task``.
        cache_path: ``.pt`` file written by ``torch.save`` / read by ``torch.load``.

    Returns:
        Whatever ``analysis_task(task_name, device)`` returned, either freshly
        computed or restored from ``cache_path``.
    """
    if not RETRAIN and os.path.exists(cache_path):
        # NOTE(review): torch.load unpickles the file; only point this at
        # caches written by this notebook.
        return torch.load(cache_path)
    task_dict = analysis_task(task_name, device)
    torch.save(task_dict, cache_path)
    return task_dict


# One dataset per statement so each can still be executed on its own.
sst5_dict = _build_or_load_analysis("SST5", "./sst5_dict.pt")
semeval_dict = _build_or_load_analysis("SemEval", "./semeval.pt")
imdb_dict = _build_or_load_analysis("IMDb", "./imdb.pt")
yelp5_dict = _build_or_load_analysis("Yelp5", "./yelp5.pt")

In [None]:
# Default path: restore the four saved analysis dicts from disk.
# With RETRAIN set, recompute each one and save it right after it is built.
if not RETRAIN:
    sst5_dict = torch.load("./sst5_dict.pt")
    semeval_dict = torch.load("./semeval.pt")
    imdb_dict = torch.load("./imdb.pt")
    yelp5_dict = torch.load("./yelp5.pt")
else:
    sst5_dict = analysis_task("SST5", device)
    torch.save(sst5_dict, "./sst5_dict.pt")

    semeval_dict = analysis_task("SemEval", device)
    torch.save(semeval_dict, "./semeval.pt")

    imdb_dict = analysis_task("IMDb", device)
    torch.save(imdb_dict, "./imdb.pt")

    yelp5_dict = analysis_task("Yelp5", device)
    torch.save(yelp5_dict, "./yelp5.pt")

In [None]:
# Apply load_word_score to each analysis dict in turn (SST5, SemEval, IMDb,
# Yelp5), always passing the shared vocabulary path followed by that dict's
# "inputs_ids", "seqs_lens" and "grad_scores" entries.
(
    sst5_word_to_score,
    semeval_word_to_score,
    imdb_word_to_score,
    yelp5_word_to_score,
) = (
    load_word_score(
        vocab_data_dir,
        task_dict["inputs_ids"],
        task_dict["seqs_lens"],
        task_dict["grad_scores"],
    )
    for task_dict in (sst5_dict, semeval_dict, imdb_dict, yelp5_dict)
)

In [None]:
# subset_score receives the four word-score objects in SST5, SemEval, IMDb,
# Yelp5 order; the same four objects become the columns of score_df.
score_list = subset_score([
    sst5_word_to_score,
    semeval_word_to_score,
    imdb_word_to_score,
    yelp5_word_to_score,
])
score_df = pd.DataFrame({
    "sst5": sst5_word_to_score,
    "semeval": semeval_word_to_score,
    "imdb": imdb_word_to_score,
    "yelp5": yelp5_word_to_score,
})

In [None]:
# Display the first ten rows of score_df.
score_df[:10]

In [None]:
import matplotlib.pyplot as plt

# White plotting area with a black frame.
plt.rcParams.update({'axes.facecolor': 'white', 'axes.edgecolor': "black"})

fig = plt.figure(figsize=(5,5))
ax = fig.add_subplot(111)
plt.rcParams["font.family"] = "Times New Roman"

# First two entries of score_list (presumably SST5 and SemEval, given the
# order passed to subset_score -- TODO confirm).
x, y = score_list[0], score_list[1]

ax.scatter(x, y, marker='*', color='r')
fig.tight_layout()
ax.grid(color='black', linestyle='-.')
# Degree-1 polynomial fit of y on x, drawn over the unique x values.
ax.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)), color='black')


In [None]:
from scipy.stats import pearsonr
def reg_coef(x,y,label=None,color=None,**kwargs):
    """Write the Pearson correlation of ``x`` and ``y`` in the middle of the current axes.

    Used as a ``PairGrid`` panel callback: instead of plotting data, the panel
    shows ``r = <value>`` (two decimals) and its axis decorations are hidden.

    Args:
        x, y: equal-length numeric sequences (arrays or pandas Series).
        label, color, **kwargs: accepted for compatibility with the seaborn
            grid-callback signature; not used.

    Pairs in which either value is NaN are dropped before the correlation is
    computed. If fewer than two complete pairs remain, ``r = nan`` is shown
    rather than raising.
    """
    ax = plt.gca()
    xs = np.asarray(x, dtype=float)
    ys = np.asarray(y, dtype=float)
    # pearsonr does not skip missing values, so mask incomplete pairs first.
    complete = ~(np.isnan(xs) | np.isnan(ys))
    xs, ys = xs[complete], ys[complete]
    if xs.size >= 2:
        r, _ = pearsonr(xs, ys)
    else:
        # pearsonr rejects inputs shorter than 2; report an undefined r instead.
        r = float("nan")
    ax.annotate('r = {:.2f}'.format(r), xy=(0.5,0.5), xycoords='axes fraction', ha='center', size=30)
    ax.set_axis_off()

# Pairwise grid over the score_df columns: distribution plots on the
# diagonal, regression scatter plots below it, and the reg_coef annotation
# above it.
g = sns.PairGrid(data=score_df)
g.map_diag(sns.distplot)
g.map_lower(
    sns.regplot,
    marker="+",
    line_kws=dict(color="red"),
)
g.map_upper(reg_coef)