## Extract Perplexity & Attention Weights

In [1]:
import torch
import numpy as np
from transformers import AutoTokenizer
from transformers import AutoModelForMaskedLM

In [2]:
import utilities

In [15]:
# Tokenizer and model used throughout
# The tokenizer is the stock "bert-base-uncased" one; the masked-LM weights are
# loaded from the path below (presumably a locally fine-tuned checkpoint - confirm).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# output_attentions=True is what makes `outputs.attentions` available in later cells.
model = AutoModelForMaskedLM.from_pretrained("bert_mlm/block_512/bert_mlm_textbook", output_attentions=True)
# Init softmax to get probabilities later on
# (dim=0: it is applied below to the 1-D vector of scores at the masked position)
softmax = torch.nn.Softmax(dim=0)
# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()
# Switch autograd off for the whole session. Later cells depend on this global
# switch, e.g. they call .numpy() directly on tensors produced by the model.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7fb639f0d710>

In [4]:
# Special tokens, each kept both as its string form and as its vocabulary id.
# Contexts fed into BERT must start with a [CLS] token and (possibly?) end with a [SEP] token
mask_token = tokenizer.mask_token
mask_id = tokenizer.mask_token_id

cls_token = tokenizer.cls_token
cls_id = tokenizer.cls_token_id

sep_token = tokenizer.sep_token
sep_id = tokenizer.sep_token_id

In [5]:
# Relevant paths
# `config` is "_<window size>_<max distance>" and is appended to the results folder name.
config = "_128_50"
results_folder = f"temporal_attn_examples{config}/"

# Input files inside `results_folder` (names suggest one file per probability range).
low_prob_filename, mid_prob_filename, high_prob_filename = (
    "pr_0.25.txt",
    "pr_0.45_0.55.txt",
    "pr_0.9999.txt",
)

In [6]:
# opposing pronouns dictionary
# man_words[k] and woman_words[k] are counterparts; the mapping is built in both
# directions so either term can be used to look up its opposite.
man_words = ['man', 'men', 'male', 'he', 'him', 'his']
woman_words = ['woman', 'women', 'female', 'she', 'her', 'hers']
pronoun_oppos = {}
for man_word, woman_word in zip(man_words, woman_words):
    pronoun_oppos[man_word] = woman_word
    pronoun_oppos[woman_word] = man_word

In [7]:
# pronoun sets
# Set versions of the gendered word lists, for fast membership tests.
man_words_set = {'man', 'men', 'male', 'he', 'him', 'his'}
woman_words_set = {'woman', 'women', 'female', 'she', 'her', 'hers'}

In [8]:
def prepare_mask(sentence_data):
    """Replace the gendered word of one context window with the mask token.

    Args:
        sentence_data: 5-item sequence
            ``(tokens_tensor, segments_tensor, tokenized_text, sentence_info, norm_prob)``
            where ``sentence_info`` is
            ``(gender_index, query_index, gender_word, query_word)``.
            ``tokens_tensor`` is indexed as ``[0][gender_index]``, so it must be
            2-D with the sequence in row 0; a nested list or a tensor both work.

    Returns:
        ``(tokens_tensor, tokenized_text)``: a fresh tensor of token ids and a
        fresh list of token strings, each with position ``gender_index``
        replaced by ``mask_id`` / ``mask_token``.

    The input is never modified: earlier versions overwrote the caller's
    ``tokenized_text`` list in place, which left the loaded data permanently
    masked for any later cell that wanted the original sentence.
    """
    tokens_tensor, _segments, tokenized_text, sentence_info, _norm_prob = sentence_data
    gender_index, _query_index, _gender_word, _query_word = sentence_info

    # as_tensor accepts lists and tensors alike (no copy-construct warning for
    # tensors); clone() guarantees the masking below happens on our own copy.
    tokens_tensor = torch.as_tensor(tokens_tensor).clone()
    tokens_tensor[0][gender_index] = mask_id

    # Work on a copy so the caller's token list keeps the original word.
    tokenized_text = list(tokenized_text)
    tokenized_text[gender_index] = mask_token
    return tokens_tensor, tokenized_text

In [16]:
def get_attention_and_probs(inputs, masked_position):
    """Run the masked-LM on one sequence and read off the prediction at the mask.

    Args:
        inputs: token-id tensor for a single sequence with a leading batch
            dimension of size 1 (the batch dimension is squeezed away below).
        masked_position: index of the masked token within the sequence.

    Returns:
        ``(attention, probs)`` where ``attention`` is ``outputs.attentions``
        exactly as returned by the model (available because the model was
        loaded with ``output_attentions=True``) and ``probs`` is a 1-D tensor
        with one probability per vocabulary item for the masked position.
    """
    # Disable autograd locally instead of relying on the notebook-wide
    # torch.set_grad_enabled(False) having been executed first.
    with torch.no_grad():
        # Forward
        outputs = model(inputs)
        attention = outputs.attentions
        # For a masked-LM head called without labels, the first output holds the
        # per-token vocabulary scores (logits), not a hidden state.
        logits = outputs[0].squeeze(0)
        # Only keep the scores for the masked token (size of the vocabulary)
        mask_logits = logits[masked_position]
        # Softmax over the vocabulary axis turns the scores into probabilities.
        probs = torch.softmax(mask_logits, dim=-1)
    return attention, probs

In [10]:
def get_norm_prob(probs, gender_word):
    """Probability of the correct gender, normalised over both gendered word sets.

    The probabilities of every word in ``man_words_set`` and in
    ``woman_words_set`` are summed separately; the result is the share of the
    summed mass that belongs to the set containing ``gender_word``.

    Args:
        probs: 1-D tensor indexed by vocabulary id (one probability per token).
        gender_word: the original (unmasked) gendered word. Any word that is
            not in ``man_words_set`` is treated as belonging to the woman set.

    Returns:
        ``gender_prob / (gender_prob + opp_gender_prob)`` as a Python float.

    Raises:
        ZeroDivisionError: if both summed probabilities are exactly zero.
    """
    def _total_prob(words):
        # Sum the probability assigned to each word's vocabulary id.
        return sum(probs[tokenizer.convert_tokens_to_ids(word)].item() for word in words)

    man_prob = _total_prob(man_words_set)
    woman_prob = _total_prob(woman_words_set)

    if gender_word in man_words_set:
        gender_prob, opp_gender_prob = man_prob, woman_prob
    else:
        gender_prob, opp_gender_prob = woman_prob, man_prob

    return gender_prob / (gender_prob + opp_gender_prob)

#     top_word = torch.argmax(probs)
#     print('Top Prediction:')
#     print(tokenizer.decode(top_word), 'probability', probs[top_word].item())

## Dissecting Attention Examples

Take lowest perplexities and figure out which word has the highest attention with the masked word across each layer?
We also need to do this for the highest-perplexity examples and see whether the result is any different (i.e., when perplexity is high, maybe the masked word doesn't attend to the masked word the most, or something similar?).

Take the examples with higher norm probabilities (BERT is predicting the right gender!) and see if there aren't any corefs or syntactic clues; see if either pronoun could be used. Then check the attention heads. Compare to the examples with lower norm probabilities in the same case. Could this reveal that certain interest words are non-gendered?

Layer with the max attention weight?

- remember to mask out the pronoun before feeding it into attn viz (as seen in python notebook!)
- There are two values you could theoretically look at: the weight of <masked gender pronoun> attending to <interest word>, and vice versa. I think the former makes more sense, since we want to see how much BERT is looking at <interest word> when predicting the gender of the masked word!
- The above attention analysis pairs well with our 'perplexity' experiment. But another angle we wanted to investigate was a follow-up to our cosine plots. For these, perhaps you could feed in the original sentence (with the gender pronoun UNMASKED) and inspect the weight of <interest word> attending to <gender pronoun> (note: flipped compared to experiment 1). This would possibly give us insight into whether the interest word is gendered.

In [11]:
def get_stacked_attention(attention, pronoun_idx=None, interest_idx=None):
    """Gather one attention weight per (layer, head) for a fixed token pair.

    Args:
        attention: iterable of per-layer attention tensors, each expected to be
            shaped (1, num_heads, seq_len, seq_len), i.e. a batch of one.
        pronoun_idx: position used as the first sequence index (the token doing
            the attending, e.g. the masked pronoun).
        interest_idx: position used as the second sequence index (the token
            being attended to, e.g. the interest word).

        For backward compatibility, an index that is omitted is looked up in
        the notebook's global variables of the same name.

    Returns:
        ``np.ndarray`` of shape (num_layers, num_heads) holding
        ``layer[head, pronoun_idx, interest_idx]`` for every layer and head.

    Raises:
        NameError: if an index is neither passed nor defined globally.
    """
    legacy_scope = globals()
    try:
        if pronoun_idx is None:
            pronoun_idx = legacy_scope["pronoun_idx"]
        if interest_idx is None:
            interest_idx = legacy_scope["interest_idx"]
    except KeyError as missing:
        raise NameError(
            f"{missing.args[0]} was not passed and is not defined in the notebook"
        ) from None

    att_weights = []
    for att_layer in attention:
        # Drop only the batch axis; a bare squeeze() would also collapse a
        # single-head or length-1 sequence axis and break the indexing below.
        layer = att_layer.squeeze(0)
        # detach()/cpu() keep the conversion valid even if autograd is on or
        # the model lives on an accelerator.
        layer_weights = layer[:, pronoun_idx, interest_idx].detach().cpu().numpy()
        att_weights.append(layer_weights)
    return np.stack(att_weights, axis=0)

In [20]:
# Sanity check: recompute the normalised gender probability for every stored
# low-probability example and compare it with the value saved in the file.
data = utilities.read_context_windows(results_folder + low_prob_filename)
for sentence_data in data:
    tokens_tensor, segments_tensor, tokenized_text, sentence_info, norm_prob = sentence_data
    gender_index, query_index, gender_word, query_word = sentence_info
    tokens_tensor, tokenized_text = prepare_mask(sentence_data)
    attention, probs = get_attention_and_probs(tokens_tensor, gender_index)
    # Compute once and reuse for both the printout and the check.
    recomputed_norm_prob = get_norm_prob(probs, gender_word)
    print(norm_prob)
    print(recomputed_norm_prob)
    print(tokenized_text, gender_index)
    # Floats are compared with a tolerance rather than exact equality.
    # NOTE(review): the recorded run printed 0.0501 (stored) vs 0.1267
    # (recomputed), far beyond rounding error - TODO confirm the stored file was
    # produced with this same checkpoint and masking procedure.
    assert np.isclose(norm_prob, recomputed_norm_prob, atol=1e-6), (
        f"stored norm_prob {norm_prob} != recomputed {recomputed_norm_prob} "
        f"for gender word {gender_word!r} at index {gender_index}"
    )
    

0.05010960765663714
0.1267267181875414
['[CLS]', 'e', 'activities', 'in', 'much', 'of', 'europe', 'not', 'real', 'work', '.', 'because', 'indian', 'women', 'worked', 'in', 'the', 'fields', 'europeans', 'often', 'described', 'them', 'as', 'lacking', 'freedom', '.', 'they', 'were', 'not', 'much', 'better', 'than', 'slaves', 'in', 'the', 'words', 'of', 'one', 'english', 'commentator', '.', 'europeans', 'considered', 'indian', '[MASK]', 'un', '##man', '##ly', 'too', 'weak', 'to', 'exercise', 'authority', 'within', 'their', 'families', 'and', 'restrain', 'their', 'wives', 'open', 'sexuality', 'and', 'so', 'lazy', 'that', 'they', 'forced', 'their', 'wives', 'to', 'do', 'most', 'of', 'the', 'productive', 'labor', '.', 'throughout', 'north', 'america', 'europeans', 'promoted', 'the', 'ideas', 'that', 'women', 'should', 'con', '##fine', 'themselves', 'to', 'household', 'work', 'and', 'that', 'men', 'ought', 'to', 'exercise', 'greater', 'authority', 'within', 'their', 'families', '.', 'europeans

AssertionError: 

## Attention Head View

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from bertviz import head_view

In [None]:
# Load the tokenizer and masked-LM for the head-view demo. These are the same
# calls and the same global names as the setup cell near the top of the notebook,
# so running this cell rebinds `tokenizer` and `model`.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert_mlm/block_512/bert_mlm_textbook", output_attentions=True)

In [None]:
# Encode a toy sentence as PyTorch tensors, run it through the model, and keep the
# attention weights together with the token strings of the encoded sequence.
inputs = tokenizer.encode("The cat sat on the mat", return_tensors='pt')
outputs = model(inputs)
attention = outputs.attentions  # Output includes attention weights when output_attentions=True
# inputs[0] is the first (only) sequence; map its ids back to token strings for labelling.
tokens = tokenizer.convert_ids_to_tokens(inputs[0])

In [None]:
# Show bertviz's head view for the attentions and tokens computed in the previous cell.
head_view(attention, tokens)

## Attention Neuron View

In [None]:
# Import specialized versions of models (that return query/key vectors)
from bertviz.transformers_neuron_view import BertModel, BertTokenizer
from bertviz.neuron_view import show

In [None]:
# Neuron view uses bertviz's own BertTokenizer/BertModel classes, loaded here from
# the same checkpoint path as the masked-LM used earlier.
# NOTE(review): this rebinds the global `tokenizer` and `model`; cells above that
# expect the AutoModelForMaskedLM objects must not be re-run after this one
# without re-running their setup cell first.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertModel.from_pretrained('bert_mlm/block_512/bert_mlm_textbook', output_attentions=True)
model_type = 'bert'
sentence = "The cat sat on the mat"
# Render the neuron view for the toy sentence.
show(model, model_type, tokenizer, sentence)