In [1]:
# coding=utf-8

import json
import os
import torch
import numpy as np
from tqdm import tqdm

from bertviz import attention, visualization
from bertviz.pytorch_pretrained_bert import BertModel, BertTokenizer, BertForTokenClassification

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
class AttentionGenerator:
    """Extract per-layer attention probabilities for a single sentence.

    The wrapped model is switched to evaluation mode on construction and is
    only ever used for inference; no gradients are tracked.
    """

    def __init__(self, model, tokenizer):
        """Store the model/tokenizer pair and put the model in eval mode.

        Args:
            model: callable invoked as ``model(tokens, token_type_ids=...)``
                whose third return value is an iterable of mappings that
                each contain an ``'attn_probs'`` tensor.
            tokenizer: object providing ``tokenize`` and
                ``convert_tokens_to_ids``.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.model.eval()

    def get_data(self, sentence_a):
        """Tokenize ``sentence_a`` and return ``(tokens, attention)``.

        Returns:
            tuple: the token list including the ``[CLS]``/``[SEP]``
            delimiters, and a numpy array holding the stacked
            ``'attn_probs'`` tensors (one entry per element the model
            returned, stacked along a new leading axis).
        """
        tokens_tensor, token_type_tensor, tokens_a = self._get_inputs(sentence_a)
        attn = self._get_attention(tokens_tensor, token_type_tensor)
        return tokens_a, attn

    def _get_inputs(self, sentence_a):
        """Build the model inputs for one sentence (batch of size one)."""
        tokens_a = self.tokenizer.tokenize(sentence_a)
        tokens_a_delim = ['[CLS]'] + tokens_a + ['[SEP]']
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens_a_delim)
        tokens_tensor = torch.tensor([token_ids])
        # Single-segment input: every position gets token type 0.
        token_type_tensor = torch.LongTensor([[0] * len(tokens_a_delim)])
        return tokens_tensor, token_type_tensor, tokens_a_delim

    def _get_attention(self, tokens_tensor, token_type_tensor):
        """Run the model and return the stacked attention as a numpy array."""
        # Inference only: disabling autograd avoids building a gradient graph
        # for every sentence, which costs time and memory for no benefit.
        with torch.no_grad():
            _, _, attn_data_list = self.model(tokens_tensor, token_type_ids=token_type_tensor)
            attn_tensor = torch.stack([attn_data['attn_probs'] for attn_data in attn_data_list])
        # detach()/cpu() make the conversion safe even if the tensor still
        # requires grad or lives on an accelerator.
        return attn_tensor.detach().cpu().numpy()

In [9]:
# Input corpus: one example per line, tab-separated as
#   sentence <TAB> source token index <TAB> target group 1 <TAB> target group 2 ...
# where each target group is a space-separated list of token indices.
file_path = '../pytorch_pretrained_bert/data_agr/agr.15'
# Condition labels written as a prefix on every output row.
factor_str = "{has_RC} {DN_match} {has_infl_aux}".format(has_RC=1, DN_match=0, has_infl_aux=1)

bert_version = 'bert-base-uncased'
model = BertForTokenClassification.from_pretrained(bert_version, num_labels=2)
n_layers = model.config.num_hidden_layers
tokenizer = BertTokenizer.from_pretrained(bert_version)
attention_generator = AttentionGenerator(model, tokenizer)

with open(file_path, 'r') as f:
    # Skip blank lines (e.g. a trailing newline) that would otherwise break
    # the tab-separated unpacking below.
    lines = [raw_line for raw_line in f if raw_line.strip()]

# bces[i, l] = -log2 of the share of attention that the source token gives to
# the first target group, for example i at layer l.
bces = np.empty([len(lines), n_layers])

# The context manager guarantees the output file is flushed and closed even
# if an example fails part-way through the corpus.
with open('agr.15.out', 'w') as out:
    # total= lets tqdm show a percentage and ETA; enumerate() alone has no length.
    for idx, line in tqdm(enumerate(lines), total=len(lines)):

        # preprocess lines
        fields = line.strip().split('\t')
        sentence, source, *target_groups = fields
        source_idx = int(source) + 1  # offset for [CLS]
        target_groups_idx = [
            [int(s) + 1 for s in group.strip().split()]  # offset for [CLS]
            for group in target_groups
        ]

        tokens, attn = attention_generator.get_data(sentence)
        source_attn = np.empty([n_layers])
        for layer in range(n_layers):
            # Layer attention (assume batch size = 1), shape = [num_heads, seq_len, seq_len]
            layer_attn = attn[layer][0]
            head_avg = np.mean(layer_attn, axis=0)  # shape = [seq_len, seq_len]
            # Attention mass from the source token to each target group.
            group_mass = [head_avg[source_idx, group].sum() for group in target_groups_idx]
            # Renormalise over the groups and keep the first group's share.
            source_attn[layer] = group_mass[0] / sum(group_mass)

        bce = -np.log2(source_attn)
        bces[idx] = bce
        for layer_idx, bce_val in enumerate(bce, 1):
            out.write(f"{factor_str} {layer_idx} {bce_val}\n")

corpus_avg_bce = np.mean(bces, axis=0)
corpus_avg_summed_bce = np.sum(corpus_avg_bce)


10000it [05:46, 28.89it/s]


In [10]:
# One value per layer: the mean of bces over all corpus lines (np.mean, axis=0).
corpus_avg_bce

array([1.24790663, 0.98991765, 0.99917777, 1.24925629, 0.92088941,
       1.28900114, 0.95009588, 1.09005471, 0.89251214, 0.81285282,
       0.85832825, 0.84508687])

In [12]:
# Sum of the per-layer corpus averages held in corpus_avg_bce.
corpus_avg_summed_bce

12.145079574666749

In [13]:
# Summed value divided by the number of entries in corpus_avg_bce, i.e. the average across layers.
corpus_avg_summed_bce/len(corpus_avg_bce)

1.0120899645555623

In [14]:
# Input corpus: one example per line, tab-separated as
#   sentence <TAB> source token index <TAB> target group 1 <TAB> target group 2 ...
# where each target group is a space-separated list of token indices.
file_path = '../pytorch_pretrained_bert/data_refl/refl.5'
# Condition labels written as a prefix on every output row.
factor_str = "{is_RC} {DNo_match} {DNr_match}".format(is_RC=1, DNo_match=0, DNr_match=0)

bert_version = 'bert-base-uncased'
model = BertForTokenClassification.from_pretrained(bert_version, num_labels=2)
n_layers = model.config.num_hidden_layers
tokenizer = BertTokenizer.from_pretrained(bert_version)
attention_generator = AttentionGenerator(model, tokenizer)

with open(file_path, 'r') as f:
    # Skip blank lines (e.g. a trailing newline) that would otherwise break
    # the tab-separated unpacking below.
    lines = [raw_line for raw_line in f if raw_line.strip()]

# bces[i, l] = -log2 of the share of attention that the source token gives to
# the first target group, for example i at layer l.
bces = np.empty([len(lines), n_layers])

# The context manager guarantees the output file is flushed and closed even
# if an example fails part-way through the corpus.
with open('refl.5.out', 'w') as out:
    # total= lets tqdm show a percentage and ETA; enumerate() alone has no length.
    for idx, line in tqdm(enumerate(lines), total=len(lines)):

        # preprocess lines
        fields = line.strip().split('\t')
        sentence, source, *target_groups = fields
        source_idx = int(source) + 1  # offset for [CLS]
        target_groups_idx = [
            [int(s) + 1 for s in group.strip().split()]  # offset for [CLS]
            for group in target_groups
        ]

        tokens, attn = attention_generator.get_data(sentence)
        source_attn = np.empty([n_layers])
        for layer in range(n_layers):
            # Layer attention (assume batch size = 1), shape = [num_heads, seq_len, seq_len]
            layer_attn = attn[layer][0]
            head_avg = np.mean(layer_attn, axis=0)  # shape = [seq_len, seq_len]
            # Attention mass from the source token to each target group.
            group_mass = [head_avg[source_idx, group].sum() for group in target_groups_idx]
            # Renormalise over the groups and keep the first group's share.
            source_attn[layer] = group_mass[0] / sum(group_mass)

        bce = -np.log2(source_attn)
        bces[idx] = bce
        for layer_idx, bce_val in enumerate(bce, 1):
            out.write(f"{factor_str} {layer_idx} {bce_val}\n")

corpus_avg_bce = np.mean(bces, axis=0)
corpus_avg_summed_bce = np.sum(corpus_avg_bce)


10000it [06:29, 25.69it/s]


In [15]:
# One value per layer: the mean of bces over all corpus lines (np.mean, axis=0).
corpus_avg_bce

array([1.83850494, 1.66179948, 1.60509775, 1.86446394, 1.52999158,
       1.43328184, 1.3906293 , 1.84089164, 1.42146948, 1.42791497,
       1.40542546, 1.39968409])

In [16]:
# Sum of the per-layer corpus averages held in corpus_avg_bce.
corpus_avg_summed_bce

18.819154444445207

In [17]:
# Summed value divided by the number of entries in corpus_avg_bce, i.e. the average across layers.
corpus_avg_summed_bce/len(corpus_avg_bce)

1.568262870370434