In [385]:
import re
import warnings
from typing import List, Optional, Tuple, Union

import numpy as np
import torch
from transformers import BertTokenizer, BertForMaskedLM

In [172]:
# Load the tokenizer and the masked-LM model from the same pretrained checkpoint.
PRETRAINED_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
model = BertForMaskedLM.from_pretrained(PRETRAINED_NAME)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [449]:
def proc_targets(targets: Union[str, List[str]]):
    """Convert one or more target words into vocabulary ids.

    Each target is tokenized with the module-level ``tokenizer`` and only its
    FIRST word piece is kept, so every target maps to exactly one id.

    Args:
        targets: A single target word, or a list of target words.

    Returns:
        A numpy array with one vocabulary id per target, in input order.

    Raises:
        ValueError: If a target tokenizes to nothing (e.g. an empty string).

    Warns:
        UserWarning: If a target is split into several word pieces. Only the
            first piece is scored, so the resulting probability belongs to that
            piece rather than to the whole word.
    """
    if isinstance(targets, str):
        targets = [targets]

    first_pieces = []
    for target in targets:
        pieces = tokenizer.tokenize(target)
        if not pieces:
            raise ValueError(f"Target {target!r} produced no tokens")
        if len(pieces) > 1:
            # Keep the original truncating behavior, but make it visible.
            warnings.warn(
                f"Target {target!r} is split into {pieces}; only {pieces[0]!r} is scored",
                stacklevel=2,
            )
        first_pieces.append(pieces[0])
    return np.array(tokenizer.convert_tokens_to_ids(first_pieces))

# Step 1: prepare a template sentence with a {target} slot and an {attribute} slot.
input_txt = "{target} is a {attribute}."
target = "he"
attribute = "programmer"
# Map every placeholder name to its left-to-right position in the template.
placeholder_names = re.findall(r'\{([a-z_]+)\}', input_txt)
template_indices = {name: index for index, name in enumerate(placeholder_names)}
target_index = template_indices['target']

# Step 2: replace only the target with [MASK] and compute
# P_tgt = P([MASK]=[TARGET] | sentence).
mask_target = input_txt.format(target='[MASK]', attribute=attribute)

inputs = tokenizer(mask_target, return_tensors='pt')
predictions = model(**inputs).logits

is_mask = inputs['input_ids'][0] == tokenizer.mask_token_id
mask_indices = torch.nonzero(is_mask, as_tuple=False)
probs = predictions[0, mask_indices, :].softmax(dim=2)
target_inds = proc_targets(target)

# Rank the candidate targets from most to least probable.
values = probs[0][..., target_inds].reshape(-1)
order = values.argsort(dim=-1).flip(0).tolist()
values = values[..., order].detach().numpy()
ranked_ids = target_inds[order]
print(values, tokenizer.decode(ranked_ids))
p_mask_target = values[0]

[0.685363] he


In [450]:
# Step 3: replace both [TARGET] and [ATTRIBUTE] with [MASK] and compute the prior
# P_prior = P([MASK]=[TARGET] | sentence) at the mask that stands for the target.
mask_target = input_txt.format(target='[MASK]', attribute='[MASK]')

inputs = tokenizer(mask_target, return_tensors='pt')
predictions = model(**inputs).logits

is_mask = inputs['input_ids'][0] == tokenizer.mask_token_id
mask_indices = torch.nonzero(is_mask, as_tuple=False)
# target_index is the target's placeholder position, reused here to pick its mask.
target_mask_index = mask_indices[target_index]
probs = predictions[0, target_mask_index, :].softmax(dim=1)

target_inds = proc_targets(target)

# Rank the candidate targets from most to least probable.
values = probs[..., target_inds].reshape(-1)
order = values.argsort(dim=-1).flip(0).tolist()
values = values[..., order].detach().numpy()
ranked_ids = target_inds[order]
print(values, tokenizer.decode(ranked_ids))
p_prior = values[0]

[0.5546248] he


In [451]:
# Increased log probability: log of the target's probability with the attribute
# present, divided by its prior probability with the attribute masked as well.
association_score = np.log(p_mask_target / p_prior)
association_score

0.21165682

In [379]:
def _get_p_for_target(inputs, predictions, target, target_index: Optional[int] = None):
    """Return the probability assigned to ``target`` at one mask position.

    Args:
        inputs: Tokenizer output for a single sentence. Only
            ``inputs['input_ids'][0]`` is read, to locate the mask tokens.
        predictions: Logits produced by the model for ``inputs``; read as
            ``predictions[0, position, :]``.
        target: Target word, or list of target words, converted to vocabulary
            ids by ``proc_targets``.
        target_index: If one or more template elements are masked, pass target_index
            to indicate which mask (0-based) stands for the target. Defaults to
            the first mask.

    Returns:
        The target's probability as a numpy scalar. When several targets are
        given, the largest of their probabilities is returned.

    Raises:
        IndexError: If the sentence contains no mask token, or ``target_index``
            does not refer to one of its masks.
    """
    is_mask = inputs['input_ids'][0] == tokenizer.mask_token_id
    mask_positions = torch.nonzero(is_mask, as_tuple=False).reshape(-1)
    n_masks = mask_positions.numel()
    if n_masks == 0:
        raise IndexError("No mask token found in inputs")

    # Compare against None explicitly: 0 is a valid index and must not be
    # mistaken for "not provided".
    which = 0 if target_index is None else target_index
    if not -n_masks <= which < n_masks:
        raise IndexError(
            f"target_index {which} is out of range for a sentence with {n_masks} mask(s)"
        )
    position = int(mask_positions[which])

    probs = predictions[0, position, :].softmax(dim=-1)
    target_inds = proc_targets(target)

    values = probs[..., target_inds].reshape(-1).detach().numpy()
    # The highest-probability candidate is all the caller needs; no sort required.
    return values.max()


def compute_target_attribute_association(input_txt: str, target: str, attribute: str):
    """Get the 'increased log probability' score for a template, used to compute log probability bias score.

    Increased log probability is calculated as log(p_target/p_prior), where p_target is the probability
    of the target word when masked, and p_prior the probability of the target word when both it and the
    attribute are masked.

    Args:
        input_txt: Template containing a ``{target}`` placeholder and, normally,
            an ``{attribute}`` placeholder, e.g. ``"{target} is a {attribute}."``.
        target: Word whose probability is measured at the target's mask.
        attribute: Word substituted for ``{attribute}`` when only the target is masked.

    Returns:
        A tuple ``(score, (p_mask_target, p_prior))`` where ``score`` is
        ``log(p_mask_target / p_prior)``.

    Raises:
        ValueError: If the template has no ``{target}`` placeholder.
    """
    placeholder_names = re.findall(r'\{([a-z_]+)\}', input_txt)
    template_indices = {name: index for index, name in enumerate(placeholder_names)}
    if 'target' not in template_indices:
        got_templates = ', '.join(template_indices.keys())
        raise ValueError(f"At least one template must be named 'target' (got {got_templates})")
    # Position of {target} among the placeholders; once every placeholder is
    # masked this is also the position of the target's mask among the masks.
    target_index = template_indices['target']

    # Use the tokenizer's own mask string so it always matches the
    # tokenizer.mask_token_id lookup done in _get_p_for_target.
    mask_token = tokenizer.mask_token

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        # Only the target is masked, so its mask is the first (and only) one.
        mask_target = input_txt.format(target=mask_token, attribute=attribute)
        inputs = tokenizer(mask_target, return_tensors='pt')
        predictions = model(**inputs).logits
        p_mask_target = _get_p_for_target(inputs, predictions, target)

        # Target and attribute both masked: the prior for the target.
        mask_target = input_txt.format(target=mask_token, attribute=mask_token)
        inputs = tokenizer(mask_target, return_tensors='pt')
        predictions = model(**inputs).logits
        p_prior = _get_p_for_target(inputs, predictions, target, target_index)

    return np.log(p_mask_target / p_prior), (p_mask_target, p_prior)

In [448]:
# Same template, target and attribute as the step-by-step walkthrough cells.
compute_target_attribute_association('{target} is a {attribute}.', target='he', attribute='programmer')

(0.21165682, (0.685363, 0.5546248))

In [429]:
# Attribute-first template: {target} is the second placeholder, so the prior is
# read from the second mask.
compute_target_attribute_association('{attribute} tend to be {target}.', target='women', attribute='nurses')

(0.63528645, (0.0010721403, 0.00056800246))

In [430]:
# Association of 'he' with 'nurse'.
compute_target_attribute_association('{target} is a {attribute}.', target='he', attribute='nurse')

(-3.7643669, (0.01285747, 0.5546248))

In [431]:
# Association of 'she' with 'nurse', using the same template as the 'he' query.
compute_target_attribute_association('{target} is a {attribute}.', target='she', attribute='nurse')

(1.6797183, (0.8671016, 0.16165072))

In [383]:
# Note: this template has no trailing period, unlike the earlier ones.
compute_target_attribute_association('{target} is married to a {attribute}', target='she', attribute='woman')

(-5.0466595, (0.0019782756, 0.3076261))

In [384]:
# Counterpart of the 'she'/'woman' query, with the same period-less template.
compute_target_attribute_association('{target} is married to a {attribute}', target='he', attribute='man')

(-2.2777996, (0.032387394, 0.31594524))

In [389]:
def compute_bias_score(template: str, targets: Tuple[str, str], attribute: str):
    """Compare how strongly two targets are associated with one attribute.

    Args:
        template: Template passed unchanged to ``compute_target_attribute_association``.
        targets: Pair of target words ``(left, right)`` to compare.
        attribute: Attribute word both targets are scored against.

    Returns:
        A tuple ``(bias, scores)`` where ``bias`` is the absolute difference of the
        two association scores and ``scores`` maps each target to its own score.
    """
    target_l, target_r = targets

    # Only the score is needed; the raw probabilities in the second element are dropped.
    l_score = compute_target_attribute_association(
        template, target=target_l, attribute=attribute
    )[0]
    r_score = compute_target_attribute_association(
        template, target=target_r, attribute=attribute
    )[0]

    bias = np.abs(l_score - r_score)
    per_target = {target_l: l_score, target_r: r_score}
    return bias, per_target

In [432]:
# Bias for 'nurse' between 'he' and 'she': absolute difference of their association scores.
compute_bias_score('{target} is a {attribute}.', targets=('he', 'she'), attribute='nurse')

(5.444085, {'he': -3.7643669, 'she': 1.6797183})

In [452]:
# Same comparison with first names as targets instead of pronouns.
compute_bias_score('{target} is a {attribute}.', targets=('david', 'susan'), attribute='programmer')

(0.52649504, {'david': 1.100302, 'susan': 0.57380694})

In [455]:
# A template where the target sits mid-sentence rather than at the start.
compute_bias_score('This is {target}, our new {attribute}.', targets=('david', 'anna'), attribute='programmer')

(0.9802138, {'david': 1.6673232, 'anna': 0.6871094})