In [154]:
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab

from collections import Counter
import torch
import numpy as np
import pandas as pd
from IPython.core.display import HTML, display

  from IPython.core.display import HTML, display


In [155]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [156]:
torch.backends.cudnn.deterministic = True
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

NUM_EPOCHS = 3

In [157]:
np.random.seed(123)
df = pd.read_csv('selection_main.csv', usecols=['abstract', 'class'])
df = df.sample(frac=1).reset_index(drop=True)
df.columns = ['review', 'sentiment']
df.head()

Unnamed: 0,review,sentiment
0,The existing volumetric gain for robotic explo...,Robotic
1,Combination therapies have become the standard...,Expert
2,Federated learning (FL) is a privacy-aware dat...,Cyber Security
3,"We propose V-Doc, a question-answering tool us...",Expert
4,HC4 is a new suite of test collections for ad ...,"Information Retrieval, Recommender Systems"


In [158]:
df.review[0]

'The existing volumetric gain for robotic exploration is calculated in the 3D occupancy map, while the sampling-based exploration method is extended in the reachable (free) space. The inconsistency between them makes the existing calculation of volumetric gain inappropriate for a complete exploration of the environment. To address this issue, we propose a concave-hull based volumetric gain in a sampling-based exploration framework. The concave hull is constructed based on the viewpoints generated by Rapidly-exploring Random Tree (RRT) and the nodes that fail to expand. All space outside this concave hull is considered unknown. The volumetric gain is calculated based on the viewpoints configuration rather than using the occupancy map. With the new volumetric gain, robots can avoid inefficient or even erroneous exploration behavior caused by the inappropriateness of existing volumetric gain calculation methods. Our exploration method is evaluated against the existing state-of-the-art RRT

In [159]:
df['sentiment'].value_counts()


Robotic                                       1000
Expert                                        1000
Cyber Security                                1000
Information Retrieval, Recommender Systems    1000
Text Mining                                   1000
Control Systems                               1000
Fuzzy                                         1000
Computer Vision                               1000
Database                                      1000
Neural Nets                                   1000
Name: sentiment, dtype: int64

In [160]:
df['sentiment'].unique()


array(['Robotic', 'Expert', 'Cyber Security',
       'Information Retrieval, Recommender Systems', 'Text Mining',
       'Control Systems', 'Fuzzy', 'Computer Vision', 'Database',
       'Neural Nets'], dtype=object)

In [161]:
for index, row in df.iterrows():
    print(row['review'], row['sentiment'])
    break

The existing volumetric gain for robotic exploration is calculated in the 3D occupancy map, while the sampling-based exploration method is extended in the reachable (free) space. The inconsistency between them makes the existing calculation of volumetric gain inappropriate for a complete exploration of the environment. To address this issue, we propose a concave-hull based volumetric gain in a sampling-based exploration framework. The concave hull is constructed based on the viewpoints generated by Rapidly-exploring Random Tree (RRT) and the nodes that fail to expand. All space outside this concave hull is considered unknown. The volumetric gain is calculated based on the viewpoints configuration rather than using the occupancy map. With the new volumetric gain, robots can avoid inefficient or even erroneous exploration behavior caused by the inappropriateness of existing volumetric gain calculation methods. Our exploration method is evaluated against the existing state-of-the-art RRT-

In [162]:
ag_train, ag_val = df.iloc[:7000], df.iloc[7000:]

tokenizer = get_tokenizer('basic_english')
word_counter = Counter()


In [163]:
for index, row in ag_train.iterrows():
    word_counter.update(tokenizer(str(row)))
voc = Vocab(word_counter)

print('Vocabulary size:', len(voc))

num_class = len(set(index for index, _ in ag_train.iterrows()))
print('Num of classes:', num_class)

Vocabulary size: 15058
Num of classes: 7000


In [164]:
class EmbeddingBagModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim)
        self.linear = nn.Linear(embed_dim, num_class)

    def forward(self, inputs, offsets):
        embedded = self.embedding(inputs, offsets)
        return self.linear(embedded)

In [165]:
BATCH_SIZE = 32

def collate_batch(batch):
    labels = torch.tensor([label - 1 for label, _ in batch])
    text_list = [tokenizer(line) for _, line in batch]

    # flatten tokens across the whole batch
    text = torch.tensor([voc[t] for tokens in text_list for t in tokens])
    # the offset of each example
    offsets = torch.tensor(
        [0] + [len(tokens) for tokens in text_list][:-1]
    ).cumsum(dim=0)

    return labels, text, offsets

train_loader = DataLoader(ag_train, batch_size=BATCH_SIZE,
                          shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(ag_val, batch_size=BATCH_SIZE,
                        shuffle=False, collate_fn=collate_batch)

In [176]:
from torch import optim

In [181]:
EPOCHS = 7
EMB_SIZE = 32
#CHECKPOINT = './models/embedding_bag_ag_news.pt'
USE_PRETRAINED = False  # change to False if you want to retrain your own model

for epoch in range(NUM_EPOCHS):

    model.train()

    for batch_idx, batch in enumerate(train_loader):

        ### Prepare data
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        batch['labels'] = batch['labels'].type(torch.LongTensor)
        labels = batch['labels'].to(DEVICE)

        ### Forward
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs['loss'], outputs['logits']

        ### Backward
        optim.zero_grad()
        loss.backward()
        optim.step()

        ### Logging
        if not batch_idx % 250:
            print (f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d} | '
                   f'Batch {batch_idx:04d}/{len(train_loader):04d} | '
                   f'Loss: {loss:.4f}')

    model.eval()

    with torch.set_grad_enabled(False):
        print(f'Training accuracy: '
              f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
              f'\nValid accuracy: '
              f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')

KeyError: 3777

In [96]:
from sklearn.preprocessing import LabelEncoder

In [97]:
le = LabelEncoder()
le.fit(df['sentiment'])

In [98]:
le.transform(df['sentiment'])
df['sentiment'].value_counts()

Robotic                                       1000
Expert                                        1000
Cyber Security                                1000
Information Retrieval, Recommender Systems    1000
Text Mining                                   1000
Control Systems                               1000
Fuzzy                                         1000
Computer Vision                               1000
Database                                      1000
Neural Nets                                   1000
Name: sentiment, dtype: int64

In [99]:
df['sentiment'] = le.transform(df['sentiment'])
df['sentiment'].value_counts()

8    1000
4    1000
2    1000
6    1000
9    1000
1    1000
5    1000
0    1000
3    1000
7    1000
Name: sentiment, dtype: int64

In [100]:
train_texts = df.iloc[:7000]['review'].values
train_labels = df.iloc[:7000]['sentiment'].values

valid_texts = df.iloc[7000:8000]['review'].values
valid_labels = df.iloc[7000:8000]['sentiment'].values

test_texts = df.iloc[8000:]['review'].values
test_labels = df.iloc[8000:]['sentiment'].values

In [101]:
tokenizer = BertTokenizerFast.from_pretrained('distilbert-base-uncased')
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
#                                           do_lower_case=True)
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
valid_encodings = tokenizer(list(valid_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)
train_encodings[0]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizerFast'.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Encoding(num_tokens=476, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [102]:
class IMDbDataset(torch.utils.data.Dataset):
 def __init__(self, encodings, labels):
  self.encodings = encodings
  self.labels = labels

 def __getitem__(self, idx):
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  return item

 def __len__(self):
  return len(self.labels)

In [103]:
train_dataset = IMDbDataset(train_encodings, train_labels)
valid_dataset = IMDbDataset(valid_encodings, valid_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=16, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

In [104]:
model = BertForTokenClassification.from_pretrained('distilbert-base-uncased',
                                                            num_labels=df['sentiment'].nunique(),
                                                            output_attentions=False,
                                                            output_hidden_states=False)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertForTokenClassification: ['distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.5.attention.q_lin.weight', 'distilbert.transformer.layer.3.sa_layer_norm.bias', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.4.output_layer_norm.weight', 'vocab_transform.weight', 'distilbert.transformer.layer.2.ffn.lin2.weight', 'distilbert.transformer.layer.5.attention.k_lin.weight', 'distilbert.transformer.layer.5.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.3.attention.q_lin.bias', 'distilbert.transformer.layer.4.ffn.lin2.bias', 'distilbert.transformer.layer.3.attention.q_lin.weight', 'distilbert.transformer.layer.5.attention.q_lin.bi

In [105]:
def predict(inputs,token_type_ids=None,position_ids=None,attention_mask=None):
    output=model(inputs,token_type_ids=token_type_ids,position_ids=position_ids, attention_mask=attention_mask,)
    return output.start_logits, output.end_logits

In [106]:
def squad_pos_forward_func(inputs, token_type_ids=None, position_ids=None, attention_mask=None, position=0):
    pred = predict(inputs,
                   token_type_ids=token_type_ids,
                   position_ids=position_ids,
                   attention_mask=attention_mask)
    pred = pred[position]
    return pred.max(1).values

In [107]:
ref_token_id = tokenizer.pad_token_id # A token used for generating token reference
sep_token_id = tokenizer.sep_token_id # A token used as a separator between question and text and it is also added to the end of the text.
cls_token_id = tokenizer.cls_token_id # A token used for prepending to the concatenated question-text word sequence

In [108]:
def construct_input_ref_pair(question, text, ref_token_id, sep_token_id, cls_token_id):
    question_ids = tokenizer.encode(question, add_special_tokens=False)
    text_ids = tokenizer.encode(text, add_special_tokens=False)

    # construct input token ids
    input_ids = [cls_token_id] + question_ids + [sep_token_id] + text_ids + [sep_token_id]

    # construct reference token ids
    ref_input_ids = [cls_token_id] + [ref_token_id] * len(question_ids) + [sep_token_id] + \
        [ref_token_id] * len(text_ids) + [sep_token_id]

    return torch.tensor([input_ids], device=device), torch.tensor([ref_input_ids], device=device), len(question_ids)

def construct_input_ref_token_type_pair(input_ids, sep_ind=0):
    seq_len = input_ids.size(1)
    token_type_ids = torch.tensor([[0 if i <= sep_ind else 1 for i in range(seq_len)]], device=device)
    ref_token_type_ids = torch.zeros_like(token_type_ids, device=device)# * -1
    return token_type_ids, ref_token_type_ids

def construct_input_ref_pos_id_pair(input_ids):
    seq_length = input_ids.size(1)
    position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
    # we could potentially also use random permutation with `torch.randperm(seq_length, device=device)`
    ref_position_ids = torch.zeros(seq_length, dtype=torch.long, device=device)

    position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
    ref_position_ids = ref_position_ids.unsqueeze(0).expand_as(input_ids)
    return position_ids, ref_position_ids

def construct_attention_mask(input_ids):
    return torch.ones_like(input_ids)

def construct_whole_bert_embeddings(input_ids, ref_input_ids, \
                                    token_type_ids=None, ref_token_type_ids=None, \
                                    position_ids=None, ref_position_ids=None):
    input_embeddings = model.bert.embeddings(input_ids, token_type_ids=token_type_ids, position_ids=position_ids)
    ref_input_embeddings = model.bert.embeddings(ref_input_ids, token_type_ids=ref_token_type_ids, position_ids=ref_position_ids)

    return input_embeddings, ref_input_embeddings

In [109]:
abstract, label = 'The existing volumetric gain for robotic exploration is calculated in the 3D occupancy map, while the sampling-based exploration method is extended in the reachable (free) space. The inconsistency between them makes the existing calculation of volumetric gain inappropriate for a complete exploration of the environment. To address this issue, we propose a concave-hull based volumetric gain in a sampling-based exploration framework. The concave hull is constructed based on the viewpoints generated by Rapidly-exploring Random Tree (RRT) and the nodes that fail to expand. All space outside this concave hull is considered unknown. The volumetric gain is calculated based on the viewpoints configuration rather than using the occupancy map. With the new volumetric gain, robots can avoid inefficient or even erroneous exploration behavior caused by the inappropriateness of existing volumetric gain calculation methods. Our exploration method is evaluated against the existing state-of-the-art RRT-based method in a benchmark environment. In the evaluated environment, the average running time of our method is about 38.4% of the existing state-of-the-art method and our method is more robust.', "Robotic"


In [110]:
input_ids, ref_input_ids, sep_id = construct_input_ref_pair(abstract, label, ref_token_id, sep_token_id, cls_token_id)
token_type_ids, ref_token_type_ids = construct_input_ref_token_type_pair(input_ids, sep_id)
position_ids, ref_position_ids = construct_input_ref_pos_id_pair(input_ids)
attention_mask = construct_attention_mask(input_ids)

indices = input_ids[0].detach().tolist()
all_tokens = tokenizer.convert_ids_to_tokens(indices)

In [111]:
ground_truth = 'Robotic'

ground_truth_tokens = tokenizer.encode(ground_truth, add_special_tokens=False)
ground_truth_end_ind = indices.index(ground_truth_tokens[-1])
ground_truth_start_ind = ground_truth_end_ind - len(ground_truth_tokens) + 1

In [112]:
start_scores, end_scores = predict(input_ids,
                                   token_type_ids=token_type_ids,
                                   position_ids=position_ids,
                                   attention_mask=attention_mask)


print('Question: ', label)
print('Predicted Answer: ', ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))

AttributeError: 'TokenClassifierOutput' object has no attribute 'start_logits'

In [29]:
lig = LayerIntegratedGradients(squad_pos_forward_func, model.bert.embeddings)

attributions_start, delta_start = lig.attribute(inputs=input_ids,
                                  baselines=ref_input_ids,
                                  additional_forward_args=(token_type_ids, position_ids, attention_mask, 0),
                                  return_convergence_delta=True)
attributions_end, delta_end = lig.attribute(inputs=input_ids, baselines=ref_input_ids,
                                additional_forward_args=(token_type_ids, position_ids, attention_mask, 1),
                                return_convergence_delta=True)

AttributeError: 'DistilBertForSequenceClassification' object has no attribute 'bert'