In [1]:
import csv
import logging
import os
import random
import sys

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
#from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear

from bertviz.bertviz import attention, visualization
from bertviz.bertviz.pytorch_pretrained_bert import BertModel, BertTokenizer

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
logger = logging.getLogger(__name__)

#bert_classifier_model_dir = "./bert_classifier/" ## Path of BERT classifier model path
#bert_classifier_model_dir = "./data/yelp/bert_classifier_2epochs/"

#bert_classifier_model_dir = "./data/lipton/sentiment/orig/bert_classifier_10epochs8b_490seqlen/"       #Apr 24
#eval_accuracy = 0.9102040816326531  and  eval_loss = 0.35673839559838655  

#bert_classifier_model_dir = "./data/lipton/sentiment/orig/bert_classifier_100epochs16b/"    #apr27
#eval_accuracy = 0.9306122448979591   and eval_loss = 0.6695167317746147

bert_classifier_model_dir = "./data/lipton/sentiment/orig/bert_classifier_100epochs8b_490seqlen/"  #Apr 24
#eval_accuracy = 0.8979591836734694   and eval_loss = 0.9967757850885384           # Try with this one <--

# QUESTION: Whats the difference between loss and acc?  in run_classifier.py

#def accuracy(out, labels):
#    outputs = np.argmax(out, axis=1)
#    return np.sum(outputs == labels)

# eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
# eval_sampler = SequentialSampler(eval_data)Run prediction for full data
# eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
# model.eval()

# eval_loss, eval_accuracy, nb_eval_steps, nb_eval_examples = 0, 0, 0, 0

# for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
#    ...
#    tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
#    ...
#    tmp_eval_accuracy = accuracy(logits, label_ids)   #
#    ...
#    eval_loss += tmp_eval_loss.mean().item()
#    eval_accuracy += tmp_eval_accuracy
#    ...
#    nb_eval_examples += input_ids.size(0)
#    nb_eval_steps += 1
# ..
# eval_loss = eval_loss / nb_eval_steps                 <--- loss is sum of batch mean losses / nb_steps
# eval_accuracy = eval_accuracy / nb_eval_examples      <--- acc  is sum of each accuracy / nmb_examples  ( a little more fine grained)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
logger.info("device: {}, n_gpu {}".format(device, n_gpu))

### CREATE DATA FILES FROM CSVs

In [34]:
#before doing the following I need to create sentiment_train_0 , sentiment_train_1 , etc versions of our training data
def create_classification_file(input_df, output_file_path):
    with open(output_file_path, "w") as out_fp:
        writer = csv.writer(out_fp, delimiter="\t")
        for i in tqdm(range(input_df.shape[0])):
            line = input_df.Text.values[i]
            writer.writerow([line.strip()])
            
import pandas as pd
origd = "data/lipton/sentiment/orig/bert_classifier_training/"
train_df = pd.read_table(origd+"train.csv",sep="\t",header=None,names=["Text","sentiment"])
dev_df = pd.read_table(origd+"dev.csv",sep="\t",header=None,names=["Text","sentiment"])
test_df = pd.read_table(origd+"test.csv",sep="\t",header=None,names=["Text","sentiment"])

train_neg = train_df[train_df.sentiment == 0][["Text"]]
train_pos = train_df[train_df.sentiment == 1][["Text"]]
dev_neg = dev_df[dev_df.sentiment == 0][["Text"]]
dev_pos = dev_df[dev_df.sentiment == 1][["Text"]]
test_neg = test_df[test_df.sentiment == 0][["Text"]]
test_pos = test_df[test_df.sentiment == 1][["Text"]]

create_classification_file(train_neg, origd+"sentiment_train_0.txt" )
create_classification_file(train_pos, origd+"sentiment_train_1.txt" )
create_classification_file(dev_neg, origd+"sentiment_dev_0.txt" )
create_classification_file(dev_pos, origd+"sentiment_dev_1.txt" )
create_classification_file(test_neg, origd+"sentiment_test_0.txt" )
create_classification_file(test_pos, origd+"sentiment_test_1.txt" )

100%|██████████| 851/851 [00:00<00:00, 27865.32it/s]
100%|██████████| 856/856 [00:00<00:00, 30922.55it/s]
100%|██████████| 122/122 [00:00<00:00, 24947.84it/s]
100%|██████████| 123/123 [00:00<00:00, 30449.12it/s]
100%|██████████| 243/243 [00:00<00:00, 32004.52it/s]
100%|██████████| 245/245 [00:00<00:00, 33994.00it/s]


In [58]:
#also need to handle reference data!! 
refd = "./data/lipton/sentiment/new/"
train_df = pd.read_table(refd+"train.tsv",sep="\t")
dev_df = pd.read_table(refd+"dev.tsv",sep="\t")
test_df = pd.read_table(refd+"test.tsv",sep="\t")

rtrain_neg = train_df[train_df.Sentiment == "Negative"][["Text"]]
rtrain_pos = train_df[train_df.Sentiment == "Positive"][["Text"]]
rdev_neg = dev_df[dev_df.Sentiment == "Negative"][["Text"]]
rdev_pos = dev_df[dev_df.Sentiment == "Positive"][["Text"]]
rtest_neg = test_df[test_df.Sentiment == "Negative"][["Text"]]
rtest_pos = test_df[test_df.Sentiment == "Positive"][["Text"]]

create_classification_file(rtrain_neg, refd+"ref_sentiment_train_0.txt" )
create_classification_file(rtrain_pos, refd+"ref_sentiment_train_1.txt" )
create_classification_file(rdev_neg, refd+"ref_sentiment_dev_0.txt" )
create_classification_file(rdev_pos, refd+"ref_sentiment_dev_1.txt" )
create_classification_file(rtest_neg, refd+"ref_sentiment_test_0.txt" )
create_classification_file(rtest_pos, refd+"ref_sentiment_test_1.txt" )

100%|██████████| 856/856 [00:00<00:00, 26320.48it/s]
100%|██████████| 851/851 [00:00<00:00, 30968.64it/s]
100%|██████████| 123/123 [00:00<00:00, 28007.57it/s]
100%|██████████| 122/122 [00:00<00:00, 27377.08it/s]
100%|██████████| 245/245 [00:00<00:00, 31933.02it/s]
100%|██████████| 243/243 [00:00<00:00, 32431.22it/s]


In [None]:
# cat ref_sentiment_train_0.txt ref_sentiment_dev_0.txt ref_sentiment_test_0.txt > reference_0.txt
# cat ref_sentiment_train_1.txt ref_sentiment_dev_1.txt ref_sentiment_test_1.txt > reference_1.txt

# however looking at YELP.. the reference is just against the test set .. so we might just want to do it against
# ref_sentiment_test_0.txt  AND ref_sentiment_test_1.txt

In [None]:
# CONTENTS OF BERT_CLASSIFIER_TRAINING
#diego@microdeep:~/spr20_cf_gen/TDRG/data/lipton/sentiment/orig/bert_classifier_training$ ls
#dev.csv                                   reference_1.txt      sentiment_test_0.txt   sentiment_train_1.txt  tfidf_train1.ann
#processed_files_with_bert_with_best_head  sentiment_dev_0.txt  sentiment_test_1.txt   test.csv               train.csv
#reference_0.txt                           sentiment_dev_1.txt  sentiment_train_0.txt  tfidf_train0.ann

# IMPORTANT:
# FOLDER: bert_classifier_training//processed_files_with_bert_with_best_head/   
#  --> BASED ON (9,5):  bert_classifier_10epochs8b_490seqlen/

# NOW select other BERT MODEL TO use and store info/results in its folder (ie, )


### LOAD DATA FILES

In [3]:
# file paths
#data_dir = "/home/ubuntu/bhargav/data/"
#data_dir = "/home/diego/spr20_cf_gen/TDRG/data/"
#dataset = "yelp" # amazon / yelp / imagecaption

data_dir = "data/lipton/sentiment/orig/"
dataset = "bert_classifier_training/"
refd = "./data/lipton/sentiment/new/"

train_0 = os.path.join(data_dir ,"{}/sentiment_train_0.txt".format(dataset))
train_1 = os.path.join(data_dir,"./{}/sentiment_train_1.txt".format(dataset))
test_0 = os.path.join(data_dir,"./{}/sentiment_test_0.txt".format(dataset))
test_1 = os.path.join(data_dir,"./{}/sentiment_test_1.txt".format(dataset))
dev_0 = os.path.join(data_dir,"./{}/sentiment_dev_0.txt".format(dataset))
dev_1 = os.path.join(data_dir,"./{}/sentiment_dev_1.txt".format(dataset))
reference_0 = os.path.join(refd,"ref_sentiment_test_0.txt")
reference_1 = os.path.join(refd,"ref_sentiment_test_1.txt")

train_0_out = os.path.join(data_dir ,"./{}/processed_files_with_bert_with_best_head/sentiment_train_0.txt".format(dataset))
train_1_out = os.path.join(data_dir,"./{}/processed_files_with_bert_with_best_head/sentiment_train_1.txt".format(dataset))
test_0_out = os.path.join(data_dir,"./{}/processed_files_with_bert_with_best_head/sentiment_test_0.txt".format(dataset))
test_1_out = os.path.join(data_dir,"./{}/processed_files_with_bert_with_best_head/sentiment_test_1.txt".format(dataset))
dev_0_out = os.path.join(data_dir,"./{}/processed_files_with_bert_with_best_head/sentiment_dev_0.txt".format(dataset))
dev_1_out = os.path.join(data_dir,"./{}/processed_files_with_bert_with_best_head/sentiment_dev_1.txt".format(dataset))
reference_0_out = os.path.join(data_dir,"./{}/processed_files_with_bert_with_best_head/reference_0.txt".format(dataset))
reference_1_out = os.path.join(data_dir,"./{}/processed_files_with_bert_with_best_head/reference_1.txt".format(dataset))

In [5]:
"""
## Model for performing Classification
model_cls = BertForSequenceClassification.from_pretrained(bert_classifier_model_dir, num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model_cls.to(device)
model_cls.eval()
"""

## Model to get the attention weights of all the heads
#model = BertModel.from_pretrained(bert_classifier_model_dir)         #from bertviz.bertviz.pytorch_pretrained_bert import BertModel, BertTokenizer

#instead: maybe I should use not the bertviz one?  
model = BertForSequenceClassification.from_pretrained(bert_classifier_model_dir, num_labels=2)
# NO  this way runs out of memory when doing attribution

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model.to(device)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
    

In [6]:
#max_seq_len=70 # Maximum sequence length 
max_seq_len = 490
sm = torch.nn.Softmax(dim=-1) ## Softmax over the batch

In [7]:
common_words=['is','are','was','were','has','have','had','a','an','the','this','that','these','those','there','how','i','we',
             'he','she','it','they','them','their','his','him','her','us','our', 'and','in','my','your','you', 'will', 'shall']
common_words_tokens = tokenizer.convert_tokens_to_ids(common_words)
not_to_remove_ids = tokenizer.convert_tokens_to_ids(["[CLS]","[SEP]", ".", "?", "!"])
not_to_remove_ids += common_words_tokens

In [4]:
def read_file(file_path):
    with open(file_path) as fp:
        data = fp.read().splitlines()
    return data

In [42]:
def create_output_file(original_sentences,processed_sentences, output_file, sentiment="<POS>"):
    with open(output_file,"w") as fp:
        for sen1,sen2 in zip(original_sentences,processed_sentences):
            if sen1 != None and sen2 != None:
                str1 = sentiment + " <CON_START> " + sen2 + " <START> " + sen1 + " <END>\n"
                fp.write(str1)

In [43]:
def create_ref_output_file(processed_sentences, output_file, sentiment="<POS>"):
    with open(output_file,"w") as fp:
        for sen in tqdm(processed_sentences):
            if sen != None:
                str1 = sentiment + " <CON_START> " + sen + " <START>\n"
                fp.write(str1)

In [44]:
def concate_files(inp_files, out_files):
    with open(out_files,"w") as fp:
        for file in inp_files:
            with open(file) as f:
                for line in f:
                    fp.write(line)

In [9]:
def run_attn_examples(input_sentences, layer, head, bs=128):
    """
    Returns Attention weights for selected Layer and Head along with ids and tokens
    of the input_sentence
    """
    ids = []
    ids_to_decode = [None for k in range(len(input_sentences))]
    tokens_to_decode = [None for k in range(len(input_sentences))]
    segment_ids = []
    input_masks = []
    attention_weights = [None for z in input_sentences]
    ## BERT pre-processing
    for j,sen in enumerate(tqdm(input_sentences)):
        
        text_tokens = tokenizer.tokenize(sen)
        if len(text_tokens) >= max_seq_len-2:
            text_tokens = text_tokens[:max_seq_len-4]
        tokens = ["[CLS]"] + text_tokens + ["[SEP]"]
        tokens_to_decode[j] = tokens
        temp_ids = tokenizer.convert_tokens_to_ids(tokens)
        ids_to_decode[j] = temp_ids
        input_mask = [1] * len(temp_ids)
        segment_id = [0] * len(temp_ids)
        padding = [0] * (max_seq_len - len(temp_ids))
        
        
        temp_ids += padding
        input_mask += padding
        segment_id += padding
        
        ids.append(temp_ids)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
    
    # Convert Ids to Torch Tensors
    ids = torch.tensor(ids) 
    segment_ids = torch.tensor(segment_ids)
    input_masks = torch.tensor(input_masks)
    
    steps = len(ids) // bs
    
    for i in trange(steps+1):
        if i == steps:
            temp_ids = ids[i * bs : len(ids)]
            temp_segment_ids = segment_ids[i * bs: len(ids)]
            temp_input_masks = input_masks[i * bs: len(ids)]
        else:
            temp_ids = ids[i * bs : i * bs + bs]
            temp_segment_ids = segment_ids[i * bs: i * bs + bs]
            temp_input_masks = input_masks[i * bs: i * bs + bs]
        
        temp_ids = temp_ids.to(device)
        temp_segment_ids = temp_segment_ids.to(device)
        temp_input_masks = temp_input_masks.to(device)
        with torch.no_grad():
             _, _, attn = model(temp_ids, temp_segment_ids, temp_input_masks)
        # Concate Attention weights
        for j in range(len(attn[layer]['attn_probs'])):
            attention_weights[i * bs + j] = (attn[layer]['attn_probs'][j][head][0]).to('cpu')
    
    return attention_weights, ids_to_decode, tokens_to_decode

In [10]:
def prepare_data(aw, ids_to_decode, tokens_to_decode):
    out_sen = [None for i in range(len(aw))]
    for i in trange(len(aw)):        
        topv, topi = aw[i].topk(ids_to_decode[i].index(0))
        topi = topi.tolist()
        topv = topv.tolist()

        #print("Original Top Indexes = {}".format(topi))
        topi = [topi[j] for j in range(len(topi)) if ids_to_decode[i][topi[j]] not in not_to_remove_ids] # remove noun and common words
        #print("After removing Nouns = {}".format(topi))
        
        topi = [topi[j] for j in range(len(topi)) if "##" not in tokens_to_decode[i][topi[j]]] # Remove half words
        #print("After removing Half-words = {}".format(topi))

        # DIEGO:   WHAT IS THIS DOING ? 
        if (len(topi) < 4 and len(topi) > 0):
            topi = [topi[0]]
        elif(len(topi) < 8):
            topi = topi[:2]
        else:
            topi = topi[:3]

        #print("Final Topi = {}".format(topi))
        final_indexes = []
        count = 0
        count1 = 0
        #print(ids_to_decode[i], tokens_to_decode[i])
        while ids_to_decode[i][count] != 0:
            if count in topi:
                while ids_to_decode[i][count + count1 + 1] != 0:
                    if "##" in tokens_to_decode[i][count + count1 + 1]:
                        count1 += 1
                    else:
                        break
                count += count1
                count1 = 0
            else:
                final_indexes.append(ids_to_decode[i][count])
            count += 1

        #print(final_indexes)
        temp_out_sen = tokenizer.convert_ids_to_tokens(final_indexes)
        temp_out_sen = " ".join(temp_out_sen).replace(" ##", "").replace("[CLS]","").replace("[SEP]","")
        #print(temp_out_sen, "\n\n")
        out_sen[i] = temp_out_sen.strip()
    
    return out_sen

In [5]:
train_0_data = read_file(train_0)
train_1_data = read_file(train_1)
dev_0_data = read_file(dev_0)
dev_1_data = read_file(dev_1)
test_0_data = read_file(test_0)
test_1_data = read_file(test_1)
ref_0_data = read_file(reference_0)
ref_1_data = read_file(reference_1)

### Process Data Files to Generate Training/Dev/Test data

In [49]:
#generate files but before hand create folder data/yelp/processed_files_with_bert_with_best_head/
!cd data/lipton/sentiment/orig/bert_classifier_training/; mkdir processed_files_with_bert_with_best_head; ls

dev.csv					  sentiment_test_0.txt	 test.csv
processed_files_with_bert_with_best_head  sentiment_test_1.txt	 train.csv
sentiment_dev_0.txt			  sentiment_train_0.txt
sentiment_dev_1.txt			  sentiment_train_1.txt


In [50]:
aw, ids_to_decode, tokens_to_decode = run_attn_examples(train_0_data, layer=9, head=5, bs=16)
train_0_out_sen = prepare_data(aw, ids_to_decode, tokens_to_decode)
create_output_file(train_0_data, train_0_out_sen, train_0_out, sentiment="<NEG>")

100%|██████████| 851/851 [00:02<00:00, 392.14it/s]
100%|██████████| 54/54 [00:16<00:00,  3.31it/s]
100%|██████████| 851/851 [00:00<00:00, 3949.22it/s]


In [51]:
aw, ids_to_decode, tokens_to_decode = run_attn_examples(train_1_data, layer=9, head=5, bs=16)
train_1_out_sen = prepare_data(aw, ids_to_decode, tokens_to_decode)
create_output_file(train_1_data, train_1_out_sen, train_1_out, sentiment="<POS>")

100%|██████████| 856/856 [00:02<00:00, 381.57it/s]
100%|██████████| 54/54 [00:16<00:00,  3.26it/s]
100%|██████████| 856/856 [00:00<00:00, 3908.56it/s]


In [52]:
aw, ids_to_decode, tokens_to_decode = run_attn_examples(dev_0_data, layer=9, head=5, bs=16)
dev_0_out_sen = prepare_data(aw, ids_to_decode, tokens_to_decode)
create_output_file(dev_0_data, dev_0_out_sen, dev_0_out, sentiment="<NEG>")

100%|██████████| 122/122 [00:00<00:00, 377.07it/s]
100%|██████████| 8/8 [00:02<00:00,  3.36it/s]
100%|██████████| 122/122 [00:00<00:00, 3514.84it/s]


In [53]:
aw, ids_to_decode, tokens_to_decode = run_attn_examples(dev_1_data, layer=9, head=5, bs=16)
dev_1_out_sen = prepare_data(aw, ids_to_decode, tokens_to_decode)
create_output_file(dev_1_data, dev_1_out_sen, dev_1_out, sentiment="<POS>")

100%|██████████| 123/123 [00:00<00:00, 329.91it/s]
100%|██████████| 8/8 [00:02<00:00,  3.34it/s]
100%|██████████| 123/123 [00:00<00:00, 3942.59it/s]


In [54]:
aw, ids_to_decode, tokens_to_decode = run_attn_examples(test_1_data, layer=9, head=5, bs=16)
test_1_out_sen = prepare_data(aw, ids_to_decode, tokens_to_decode)
create_output_file(test_1_data, test_1_out_sen, test_1_out, sentiment="<POS>")

100%|██████████| 245/245 [00:00<00:00, 404.26it/s]
100%|██████████| 16/16 [00:04<00:00,  3.35it/s]
100%|██████████| 245/245 [00:00<00:00, 3975.24it/s]


In [55]:
aw, ids_to_decode, tokens_to_decode = run_attn_examples(test_0_data, layer=9, head=5, bs=16)
test_0_out_sen = prepare_data(aw, ids_to_decode, tokens_to_decode)
create_output_file(test_0_data, test_0_out_sen, test_0_out, sentiment="<NEG>")

100%|██████████| 243/243 [00:00<00:00, 399.37it/s]
100%|██████████| 16/16 [00:04<00:00,  3.38it/s]
100%|██████████| 243/243 [00:00<00:00, 3778.36it/s]


In [None]:
# HAD TO DO REF work

In [63]:
aw, ids_to_decode, tokens_to_decode = run_attn_examples(ref_1_data, layer=9, head=5, bs=16)
ref_1_out_sen = prepare_data(aw, ids_to_decode, tokens_to_decode)
#create_ref_output_file(ref_1_data, ref_1_out_sen, reference_1_out, sentiment="<NEG>")
#--> TypeError: create_ref_output_file() got multiple values for argument 'sentiment'

#doesn't need original ref_1_data 
# def create_ref_output_file(processed_sentences, output_file, sentiment="<POS>"):
create_ref_output_file(ref_1_out_sen, reference_1_out, sentiment="<NEG>")

100%|██████████| 243/243 [00:00<00:00, 347.62it/s]
100%|██████████| 16/16 [00:04<00:00,  3.44it/s]
100%|██████████| 243/243 [00:00<00:00, 3980.57it/s]
100%|██████████| 243/243 [00:00<00:00, 277488.67it/s]


In [64]:
aw, ids_to_decode, tokens_to_decode = run_attn_examples(ref_0_data, layer=9, head=5, bs=16)
ref_0_out_sen = prepare_data(aw, ids_to_decode, tokens_to_decode)
#create_ref_output_file(ref_0_data, ref_0_out_sen, reference_0_out, sentiment="<POS>")
create_ref_output_file(ref_0_out_sen, reference_0_out, sentiment="<POS>")

100%|██████████| 245/245 [00:00<00:00, 346.86it/s]
100%|██████████| 16/16 [00:04<00:00,  3.42it/s]
100%|██████████| 245/245 [00:00<00:00, 3910.84it/s]
100%|██████████| 245/245 [00:00<00:00, 278785.81it/s]


In [None]:
# COMBINE _0  and _1 files WHEN YOU GET BACK

# diego@microdeep:~/spr20_cf_gen/TDRG/data/lipton/sentiment/orig/bert_classifier_training/processed_files_with_bert_with_best_head$ cat sentiment_train_0.txt sentiment_train_1.txt > sentiment_train.txt
# diego@microdeep:~/spr20_cf_gen/TDRG/data/lipton/sentiment/orig/bert_classifier_training/processed_files_with_bert_with_best_head$ cat sentiment_dev_0.txt sentiment_dev_1.txt > sentiment_dev.txt

### Integrated Gradients

In [11]:
# MOVE TO Head_selection bottom when done
# integradted gradients / expected gradients / integrated hessians

# IG using captum
# pip install captum
# https://captum.ai/tutorials/IMDB_TorchText_Interpret  aND  https://captum.ai/docs/extension/integrated_gradients
#!pip install captum

In [6]:
# TAKE 3 based on eval_bert.py
from eval_bert import *
from captum.attr import LayerIntegratedGradients, visualization  #https://captum.ai/tutorials/IMDB_TorchText_Interpret  

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

#lig = LayerIntegratedGradients(model, model.bert.embeddings)    #BertForSequenceClassification ( this way runs out of memory !!)

# print(logging.getLoggerClass().root.handlers[0].baseFilename)
# /home/diego/spr20_cf_gen/TDRG/logs/eval_bert.txt

In [63]:
#!pip install fastai
#ERROR: torchvision 0.6.0 has requirement torch==1.5.0, but you'll have torch 1.4.0 which is incompatible.

In [7]:
#handling GPU MEM issues
from fastai.utils.mem import *

#https://docs.fast.ai/dev/gpu.html#cached-memory

import torch
import gc

def show_gpu_use(show_objs=True):
    count = 0
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
                if show_objs:
                    print(type(obj), obj.size())
                count +=1
                #del obj
        except: pass    
    #torch.cuda.empty_cache()
    #gc.collect()
    return count
show_gpu_use(show_objs=False)



0

In [17]:
#https://docs.fast.ai/troubleshoot.html#custom-solutions
# If you need a solution for your own code that perhaps doesn’t involve fastai functions, 
# here is a decorator you can use to workaround this issue:

import functools, traceback
def gpu_mem_restore(func):
    "Reclaim GPU RAM if CUDA out of memory happened, or execution was interrupted"
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except:
            type, val, tb = sys.exc_info()
            traceback.clear_frames(tb)
            raise type(val).with_traceback(tb) from None
    return wrapper
    

In [18]:
@gpu_mem_restore
def pred_sentence(input_sentence, input_labels=["0"], ig_steps=-1, debug=False):    
    if debug:
        print(1,show_gpu_use(show_objs=False))
    
    processors = { "yelp": YelpProcessor, }
    num_labels_task = { "yelp": 2, }
    task_name = "yelp"
    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    lines = []
    for i in range(len(input_sentence)):
        lines.append([input_sentence[i], input_labels[i]])

    eval_examples = processor._create_examples(lines,set_type="dev")
    max_seq_len = 490
    eval_features = convert_examples_to_features(eval_examples, label_list, max_seq_len, tokenizer)
    
    if debug:
        print(2,show_gpu_use(show_objs=False))

    eval_batch_size = 1  # 16   
    
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)
    
    reference_ids = torch.tensor(np.zeros((eval_batch_size, max_seq_len), dtype=int))  #maybe just put 1
    
    if debug:
        print(3,show_gpu_use(show_objs=False))
  
    #model.eval()  #is this actually necessary?

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    ret_preds = []
    ret_attributions = [] 
    
    # attribution score will be computed with respect to target class
    # applying integrated gradients on the SoftmaxModel and input data point

    model = BertForSequenceClassification.from_pretrained(bert_classifier_model_dir, num_labels=2)
    model.to(device)
    model.eval()
    
    if debug:
        print(4,show_gpu_use(show_objs=False))


    lig = LayerIntegratedGradients(model, model.bert.embeddings)    #BertForSequenceClassification ( this way runs out of memory !!)
    if debug:
        print(5,show_gpu_use(show_objs=False))


    # sm = torch.nn.Softmax(dim=-1) ## Softmax over the batch
    # <-- Do this after you finish ICML stuff   HERE
    
    
    # then do similar thing you did but with attributes being removed at different thresholds ( make this a list ) 
    # compare this with attention results ( for both Yelp and Lipton for various thresholds )
    # use auto metrics
    #
    # then move back to B-GST
    #   make similar list of results (train/dev/test) for attention vs ig methods vs expected gradients

    attrbs = []
    
    for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)
        reference_ids = reference_ids.to(device)

        with torch.no_grad():
            tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)        
            logits = model(input_ids, segment_ids, input_mask)
            
        logits = logits.detach().cpu().numpy()     #BertSequenceForClassification:  
        label_ids_np = label_ids.to('cpu').numpy()
        tmp_eval_accuracy = accuracy(logits, label_ids_np)
        ret_pred = np.argmax(logits, axis=1).flatten()
        ret_preds.append(ret_pred)
        eval_loss += tmp_eval_loss.mean().item()  #'tuple' object has no attribute 'mean'
        eval_accuracy += tmp_eval_accuracy
        
        if ig_steps != -1:
            attributions_ig, delta = lig.attribute(inputs = input_ids, 
                                                   baselines = reference_ids, 
                                                   target = label_ids,
                                                   n_steps=ig_steps, 
                                                   return_convergence_delta=True)
            attrbs.append([input_ids,logits,ret_pred,attributions_ig,delta])
            
        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1
        
        if debug:
            print(5,show_gpu_use(show_objs=False))



    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples
    #loss = tr_loss/nb_tr_steps if args.do_train else None
    result = {'eval_loss': round(eval_loss,5), 'eval_accuracy': round(eval_accuracy,5), 'steps': nb_eval_steps, 
              'examples': nb_eval_examples, 'preds': ret_preds, 'attributions': attrbs}
        
    if debug:
        print(6,show_gpu_use(show_objs=False))

    # clean up gpu use
    del model
    del lig
    del all_input_ids
    del all_input_mask
    del all_segment_ids
    del all_label_ids
    del reference_ids
    gc.collect()
    torch.cuda.empty_cache()

    if debug:
        print(7,show_gpu_use(show_objs=False))

    return result
    
    
#TODO: HOW TO CLEAR UP PYTORCH MEMORY    
# https://forums.fast.ai/t/clearing-gpu-memory-pytorch/14637
#del(model)
#import gc; 
#gc.collect()
#torch.cuda.empty_cache()  #START USING IF NEED BE
b=gpu_mem_get_used_no_cache()
c=gpu_mem_get_free_no_cache()
print("Prior to Run GPU Used:",b,"Free:",c,"Available:",c - b)
print(show_gpu_use(show_objs=False))
    
n = 5
#look at test
input_sentence = [test_0_data[i] for i in range(n)] + [test_1_data[i] for i in range(n)]
input_label = ["0" for _ in range(n)] + ["1" for _ in range(n)]
#print("Input: ",input_label,input_sentence)
sen_pred = pred_sentence(input_sentence, input_label, ig_steps=5)
print("Predicted", sen_pred['preds'])

print(show_gpu_use(show_objs=False))
b=gpu_mem_get_used_no_cache()
c=gpu_mem_get_free_no_cache()
print("Prior to Run GPU Used:",b,"Free:",c,"Available:",c - b)

Prior to Run GPU Used: 1039 Free: 11156 Available: 10117
61


Evaluating: 100%|██████████| 10/10 [00:04<00:00,  2.31it/s]


Predicted [array([0]), array([0]), array([0]), array([0]), array([0]), array([1]), array([1]), array([1]), array([1]), array([1])]
62
Prior to Run GPU Used: 1647 Free: 10548 Available: 8901


In [19]:
torch.cuda.empty_cache()
print(show_gpu_use(show_objs=False))

62


In [20]:
def sftmax(x):
    return np.exp(x)/sum(np.exp(x))

for i in range(len(sen_pred['preds'])):
    lgts = sen_pred['attributions'][i][1][0]
    shw_lgts = [round(a,2) for a in lgts]
    shw_sfmx = [round(a,7) for a in sftmax(lgts)]
    pred = sen_pred['attributions'][i][2]
    delt = sen_pred['attributions'][i][4].tolist()[0]
    
    #if shw_sfmx[0] > 0.05 and shw_sfmx[0] < .95:
    if 1 == 1:
        #print("\n",input_sentence[i])
        print("\t",i, shw_lgts , shw_sfmx, pred, delt)
        
"""
ON TEST 5/5 split
WITH 1 IG STEP:                               NOTICE THE DELTAS ARE MUCH BIGGER FOR 1 IG STEP COMPARED WITH 10
	 0 [7.3, -7.39] [0.9999996, 4e-07] [0] -9.453010559082031
	 1 [0.72, -0.25] [0.7259604, 0.2740396] [0] -6.152539253234863
	 2 [6.62, -6.75] [0.9999985, 1.6e-06] [0] -6.452980995178223
	 3 [7.3, -7.39] [0.9999996, 4e-07] [0] -9.148890495300293
	 4 [3.55, -3.65] [0.9992534, 0.0007466] [0] -4.98930549621582
	 5 [-6.38, 7.52] [9e-07, 0.999999] [1] -2.8722686767578125
	 6 [-5.48, 6.81] [4.6e-06, 0.9999955] [1] 26.927173614501953
	 7 [-6.56, 7.55] [7e-07, 0.9999992] [1] 46.125030517578125
	 8 [-6.46, 7.54] [8e-07, 0.9999992] [1] 35.248138427734375
	 9 [-6.56, 7.57] [7e-07, 0.9999992] [1] 111.6456298828125
     
WITH 5 IG STEPS:
	 0 [7.3, -7.39] [0.9999996, 4e-07] [0] -4.308740615844727
	 1 [0.72, -0.25] [0.7259604, 0.2740396] [0] -2.8717639446258545
	 2 [6.62, -6.75] [0.9999985, 1.6e-06] [0] -2.3813724517822266
	 3 [7.3, -7.39] [0.9999996, 4e-07] [0] -4.111586570739746
	 4 [3.55, -3.65] [0.9992534, 0.0007466] [0] -4.192013263702393
	 5 [-6.38, 7.52] [9e-07, 0.999999] [1] -4.801794052124023
	 6 [-5.48, 6.81] [4.6e-06, 0.9999955] [1] 3.655503273010254
	 7 [-6.56, 7.55] [7e-07, 0.9999992] [1] 12.80319881439209
	 8 [-6.46, 7.54] [8e-07, 0.9999992] [1] 6.53757381439209
	 9 [-6.56, 7.57] [7e-07, 0.9999992] [1] 28.110918045043945
     
WITH 10 IG STEPS:
	 0 [7.3, -7.39] [0.9999996, 4e-07] [0] -2.7943642139434814
	 1 [0.72, -0.25] [0.7259604, 0.2740396] [0] -2.701509475708008
	 2 [6.62, -6.75] [0.9999985, 1.6e-06] [0] -2.693916082382202
	 3 [7.3, -7.39] [0.9999996, 4e-07] [0] -2.7056050300598145
	 4 [3.55, -3.65] [0.9992534, 0.0007466] [0] 20.584884643554688
	 5 [-6.38, 7.52] [9e-07, 0.999999] [1] -1.3828866481781006
	 6 [-5.48, 6.81] [4.6e-06, 0.9999955] [1] 2.095811367034912
	 7 [-6.56, 7.55] [7e-07, 0.9999992] [1] 2.4854276180267334
	 8 [-6.46, 7.54] [8e-07, 0.9999992] [1] 0.3223910331726074
	 9 [-6.56, 7.57] [7e-07, 0.9999992] [1] -1.2418794631958008
     
With 12 IG STEPS:  #This seems like it does sometimes worse and sometimes better than 10
	 0 [7.3, -7.39] [0.9999996, 4e-07] [0] -3.1482670307159424
	 1 [0.72, -0.25] [0.7259604, 0.2740396] [0] -3.1258959770202637
	 2 [6.62, -6.75] [0.9999985, 1.6e-06] [0] -2.0354671478271484
	 3 [7.3, -7.39] [0.9999996, 4e-07] [0] -2.88800048828125
	 4 [3.55, -3.65] [0.9992534, 0.0007466] [0] 2.9604616165161133
	 5 [-6.38, 7.52] [9e-07, 0.999999] [1] -3.984999418258667
	 6 [-5.48, 6.81] [4.6e-06, 0.9999955] [1] 0.48541831970214844
	 7 [-6.56, 7.55] [7e-07, 0.9999992] [1] 1.0737597942352295
	 8 [-6.46, 7.54] [8e-07, 0.9999992] [1] 0.9192886352539062
	 9 [-6.56, 7.57] [7e-07, 0.9999992] [1] -17.796022415161133
"""
print()

	 0 [7.3, -7.39] [0.9999996, 4e-07] [0] -4.308740615844727
	 1 [0.72, -0.25] [0.7259604, 0.2740396] [0] -2.8717639446258545
	 2 [6.62, -6.75] [0.9999985, 1.6e-06] [0] -2.3813724517822266
	 3 [7.3, -7.39] [0.9999996, 4e-07] [0] -4.111586570739746
	 4 [3.55, -3.65] [0.9992534, 0.0007466] [0] -4.192013263702393
	 5 [-6.38, 7.52] [9e-07, 0.999999] [1] -4.801794052124023
	 6 [-5.48, 6.81] [4.6e-06, 0.9999955] [1] 3.655503273010254
	 7 [-6.56, 7.55] [7e-07, 0.9999992] [1] 12.80319881439209
	 8 [-6.46, 7.54] [8e-07, 0.9999992] [1] 6.53757381439209
	 9 [-6.56, 7.57] [7e-07, 0.9999992] [1] 28.110918045043945



In [11]:
# NEXT STEPS:
# Look at FASTAI COURSE TO DO DISTRIBUTED GPU STUFF ( look at captum as well)
# AND CAPTUM STUFF

# SHOW ATTRIBUTION PER IG AND COMPARE WITH OTHER BRUTE FORCE TECHNIQUE  for YELP ( do this in BERT_DATA_PREP notebook?)
# SHOW ATTRIBUTION PER IG AND COMPARE WITH OTHER BRUTE FORCE TECHNIQUE  for LIPTON ( get eval numbers too)

#from tutorial

vis_data_records_ig = []
def add_to_visualize_attributions(attributions, text, pred, pred_ind, true_label, delta):
    # attributions_ig, delta = lig.attribute(input_indices, reference_indices,  n_steps=500, return_convergence_delta=True)
    # text is list of text tokens, 
    # pred is between 0 and 1 prob score, 
    # pred_ind is 0/1, 
    # label is pos/neg

    attributions = attributions.sum(dim=2).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    attributions = attributions.cpu().detach().numpy()
    
    vrec = visualization.VisualizationDataRecord(attributions, pred, pred_ind, true_label, "??", attributions.sum(),text,delta)
    vis_data_records_ig.append(vrec)

In [12]:
pred_map = ["neg","pos"]
attr_map = ["input_ids","logits","ret_pred","attributions_ig","delta"]

"""
for k in list(sen_pred.keys()):
    if k != "attributions":
        print(k,":",sen_pred[k])

print("\nAttributions:")
for i in range(len(sen_pred['attributions'][0])):
    cur = sen_pred['attributions'][0][i]
    print(i, attr_map[i],type(cur),cur.shape)
"""

print("")
for i in range(len(sen_pred['preds'])):
    non_zero_token_ids = [int(a) for a in list(sen_pred['attributions'][i][0][0]) if a != 0]
    str_tokens = tokenizer.convert_ids_to_tokens(non_zero_token_ids)
    non_zero_attributes = sen_pred['attributions'][i][3][:,0:len(str_tokens),:]   #TODO  https://discuss.pytorch.org/t/how-to-slice-a-3d-tensor/7411/2
    cur_pred = sen_pred['attributions'][i][2][0]
    cur_pred_lab = pred_map[cur_pred]
    
    lgts = sen_pred['attributions'][i][1][0]
    shw_sfmx = [round(a,7) for a in sftmax(lgts)]
    pred_prob = 1 - shw_sfmx[0]
    true_label = 0
    delta = sen_pred['attributions'][i][4] 

    add_to_visualize_attributions(non_zero_attributes, str_tokens, pred_prob, cur_pred_lab, true_label, delta)

print('Visualize attributions based on Integrated Gradients')

visualization.visualize_text(vis_data_records_ig)


Visualize attributions based on Integrated Gradients


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,neg (0.00),??,1.32,"[CLS] if you haven ' t seen this , it ' s terrible . it is pure trash . i saw this about 17 years ago , and i ' m still screwed up from it . [SEP]"
,,,,
0.0,neg (0.27),??,-1.14,"[CLS] being a ni supporter , it ' s hard to objective ##ly review a movie g ##lor ##ifying ulster nationalists . characters who are hard to root for , typical heavy - handed anti - violence messages , and a predictable ' poetic justice ' ending makes this an awkward watch . . . [SEP]"
,,,,
0.0,neg (0.00),??,-1.21,"[CLS] nine minutes of psychedelic , pu ##ls ##ating , often symmetric abstract images , are enough to drive anyone crazy . i did spot a full - frame eye at the start , and later some birds silhouette ##d against other colors . it was just not my cup of tea . it ' s about 8 ##½ minutes too long . [SEP]"
,,,,
0.0,neg (0.00),??,0.41,"[CLS] really bad movie . maybe the worst i ' ve ever seen . alien invasion , a la the b ##lo ##b , without the acting . meteor ##ite turns beautiful woman into a host body for nasty tongue . bad plot , bad fake tongue . absurd comedy worth missing . wash your hair or take out the trash . [SEP]"
,,,,
0.0,neg (0.00),??,0.64,"[CLS] i read the novel some years ago and i liked it a lot . when i saw the movie i couldn ' t believe it . . . they changed everything i liked about the novel , even the plot . i wonder what did isabel allen ##de ( author ) say about the movie , but i think it sucks ! ! ! [SEP]"
,,,,


In [None]:
### EVALUATE TRAIN/DEV/TEST data accuracies for trained lipton data based BERT MODEL

In [13]:
# NOW CHECK EVERYTHING HERE ( where do we err and where do we do well )

lists = {"train0" : train_0_data, "train1": train_1_data, "dev0": dev_0_data, "dev1": dev_1_data, "test0": test_0_data, "test1": test_1_data, "ref0": ref_0_data, "ref1": ref_1_data}
lists_preds = {}
for k in list(lists.keys()):
    print("Evaluating ",k)
    cur = lists[k]
    lab = "0" if "0" in k else "1"
    num_ex = len(cur)
    sub_train = [cur[i] for i in range(num_ex)]
    sub_labels = [lab for i in range(num_ex)]
    temp_preds = pred_sentence(sub_train, sub_labels)
    lists_preds[k] = temp_preds
    print(k," Returned: ",temp_preds['eval_accuracy'],temp_preds['examples'])      #{'eval_loss': 0.0, 'eval_accuracy': 1.0, 'steps': 54, 'examples': 856 .. 
    
    
"""   with eval steps = 1                                                                          ACC  NUM_EXMPLS
Evaluating  train0 Evaluating: 100%|██████████| 54/54 [00:32<00:00,  1.65it/s]  train0  Returned:  1.0     851
Evaluating  train1 Evaluating: 100%|██████████| 54/54 [00:35<00:00,  1.53it/s]  train1  Returned:  1.0     856
Evaluating  dev0   Evaluating: 100%|██████████| 8/8 [00:04<00:00,  1.65it/s]      dev0  Returned:  0.88525 122
Evaluating  dev1   Evaluating: 100%|██████████| 8/8 [00:05<00:00,  1.60it/s]      dev1  Returned:  0.91057 123
Evaluating  test0  Evaluating: 100%|██████████| 16/16 [00:09<00:00,  1.61it/s]   test0  Returned:  0.90535 243
Evaluating  test1  Evaluating: 100%|██████████| 16/16 [00:10<00:00,  1.58it/s]   test1  Returned:  0.8898  245
Evaluating  ref0   Evaluating: 100%|██████████| 16/16 [00:10<00:00,  1.57it/s]    ref0  Returned:  0.88163 245
Evaluating  ref1   Evaluating: 100%|██████████| 16/16 [00:10<00:00,  1.59it/s]    ref1  Returned:  0.86831 243

"""  #same results with eval steps = 16
print("")

Evaluating  train0


Evaluating: 100%|██████████| 851/851 [00:43<00:00, 19.49it/s]


train0  Returned:  1.0 851
Evaluating  train1


Evaluating: 100%|██████████| 856/856 [00:45<00:00, 18.70it/s]


train1  Returned:  1.0 856
Evaluating  dev0


Evaluating: 100%|██████████| 122/122 [00:06<00:00, 18.76it/s]


dev0  Returned:  0.88525 122
Evaluating  dev1


Evaluating: 100%|██████████| 123/123 [00:06<00:00, 18.80it/s]


dev1  Returned:  0.91057 123
Evaluating  test0


Evaluating: 100%|██████████| 243/243 [00:13<00:00, 18.62it/s]


test0  Returned:  0.90535 243
Evaluating  test1


Evaluating: 100%|██████████| 245/245 [00:13<00:00, 18.49it/s]


test1  Returned:  0.8898 245
Evaluating  ref0


Evaluating: 100%|██████████| 245/245 [00:13<00:00, 18.45it/s]


ref0  Returned:  0.88163 245
Evaluating  ref1


Evaluating: 100%|██████████| 243/243 [00:13<00:00, 18.48it/s]


ref1  Returned:  0.86831 243


'                                                                                                ACC  NUM_EXMPLS\nEvaluating  train0 Evaluating: 100%|██████████| 54/54 [00:32<00:00,  1.65it/s]  train0  Returned:  1.0     851\nEvaluating  train1 Evaluating: 100%|██████████| 54/54 [00:35<00:00,  1.53it/s]  train1  Returned:  1.0     856\nEvaluating  dev0   Evaluating: 100%|██████████| 8/8 [00:04<00:00,  1.65it/s]      dev0  Returned:  0.88525 122\nEvaluating  dev1   Evaluating: 100%|██████████| 8/8 [00:05<00:00,  1.60it/s]      dev1  Returned:  0.91057 123\nEvaluating  test0  Evaluating: 100%|██████████| 16/16 [00:09<00:00,  1.61it/s]   test0  Returned:  0.90535 243\nEvaluating  test1  Evaluating: 100%|██████████| 16/16 [00:10<00:00,  1.58it/s]   test1  Returned:  0.8898  245\nEvaluating  ref0   Evaluating: 100%|██████████| 16/16 [00:10<00:00,  1.57it/s]    ref0  Returned:  0.88163 245\nEvaluating  ref1   Evaluating: 100%|██████████| 16/16 [00:10<00:00,  1.59it/s]    ref1  Returned:  0.8