In [1]:
import pathlib
import pickle

# Pickled BIOS records, addressed relative to the notebook's working directory.
bios_file = pathlib.Path('../biosbias/BIOS.pkl')

# NOTE(review): unpickling can execute arbitrary code stored in the file, so
# only load BIOS.pkl from a source you trust.
with open(bios_file, 'rb') as pickle_stream:
    data = pickle.load(pickle_stream)

In [2]:
# Peek at the first record to see which fields each entry carries.
data[0]

{'path': 'crawl-data/CC-MAIN-2013-20/segments/1368696381249/wet/CC-MAIN-20130516092621-00000-ip-10-60-113-184.ec2.internal.warc.wet.gz',
 'raw': '* Nora Fisher Onar is an assistant professor of international relations at Bahcesehir University in Istanbul. She is also a Ronald D. Asmus Policy Entrepreneur Fellow with the German Marshall Fund and is a Visiting Fellow at the Centre for International Studies (CIS) at the University of Oxford. This commentary first appeared at Sada, an online journal published by the Carnegie Endowment for International Peace.',
 'name': ('Nora', 'Fisher', 'Onar'),
 'raw_title': 'assistant professor',
 'gender': 'F',
 'start_pos': 109,
 'title': 'professor',
 'URI': 'http://acturca.wordpress.com/2012/04/13/turkey-model-mideast/',
 'bio': '_ is also a Ronald D. Asmus Policy Entrepreneur Fellow with the German Marshall Fund and is a Visiting Fellow at the Centre for International Studies (CIS) at the University of Oxford. This commentary first appeared at Sad

In [3]:
# Give every occupation title a class id in order of first appearance in
# `data`; dict.fromkeys de-duplicates while keeping that first-seen order.
title_indexer = {
    title: index
    for index, title in enumerate(dict.fromkeys(record['title'] for record in data))
}

# Fixed ids for the two gender labels used by the records.
gender_indexer = {'M': 0, 'F': 1}

# Show both mappings as the cell output.
title_indexer, gender_indexer

({'professor': 0,
  'accountant': 1,
  'journalist': 2,
  'architect': 3,
  'photographer': 4,
  'psychologist': 5,
  'teacher': 6,
  'nurse': 7,
  'attorney': 8,
  'software_engineer': 9,
  'painter': 10,
  'physician': 11,
  'chiropractor': 12,
  'personal_trainer': 13,
  'surgeon': 14,
  'filmmaker': 15,
  'dietitian': 16,
  'dentist': 17,
  'dj': 18,
  'model': 19,
  'composer': 20,
  'poet': 21,
  'comedian': 22,
  'yoga_teacher': 23,
  'interior_designer': 24,
  'pastor': 25,
  'rapper': 26,
  'paralegal': 27},
 {'M': 0, 'F': 1})

In [4]:
import torch

# Keep the original preference for the second GPU, but fall back to the first
# GPU and then to the CPU so the notebook still runs on hosts with fewer than
# two CUDA devices instead of failing on the first `.to(device)` call.
gpu_count = torch.cuda.device_count() if torch.cuda.is_available() else 0
if gpu_count > 1:
    device = 'cuda:1'
elif gpu_count == 1:
    device = 'cuda:0'
else:
    device = 'cpu'

In [5]:
import transformers

# Tokenizer and masked-LM model come from the same 'roberta-base' checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained('roberta-base')
# Load the weights first, then move the module onto the selected device.
model = transformers.AutoModelForMaskedLM.from_pretrained('roberta-base')
model = model.to(device)

In [76]:
import torch.utils.data


class Dataset(torch.utils.data.Dataset):
    """Exposes the BIOS records as (text, title id, mention index, gender id) items."""

    def __init__(self, data):
        # Indexable collection of record dicts; items read the keys
        # 'raw', 'name', 'title' and 'gender'.
        self.data = data

    def __len__(self):
        """Number of wrapped records."""
        return len(self.data)

    def __getitem__(self, index):
        """Build the example for the record at `index`.

        Returns a 4-tuple:
          * the raw text starting at the first occurrence of the person's
            first name,
          * the class id of the record's title (via `title_indexer`),
          * the token position to probe (always 1),
          * the class id of the record's gender (via `gender_indexer`).

        Raises ValueError if the first name does not occur in the raw text.
        """
        record = self.data[index]
        raw_text = record['raw']
        first_name = record['name'][0]
        # Drop everything before the name so the text begins with it.
        name_start = raw_text.index(first_name)
        text = raw_text[name_start:]
        title_id = title_indexer[record['title']]
        # Constant position 1: presumably the first token after the
        # tokenizer's leading special token -- TODO confirm.
        mention = 1
        gender_id = gender_indexer[record['gender']]
        return text, title_id, mention, gender_id

# Wrap the loaded records so they can be indexed and batched as a torch dataset.
dataset = Dataset(data)

In [77]:
# Spot-check one item: (text from the first name onward, title id, mention index, gender id).
dataset[110]

('Mieke Verbijlen is a photographer based in Antwerp, Belgium. Six years ago, she moved into the second floor of this old house along a tree-lined street with nice galleries nearby. As a collector of beautiful things with a love of second-hand pieces, Miekes style just fell into place. Shed love to have a couch, but her home is too small for that, and shed rather have the space open than make it f',
 4,
 1,
 1)

In [78]:
from tqdm.auto import tqdm

# Build every item once and discard it: a record whose first name is missing
# from its raw text (or whose title/gender is unmapped) raises here rather
# than in the middle of training.
for index in tqdm(range(len(dataset))):
    dataset[index]

  0%|          | 0/96576 [00:00<?, ?it/s]

In [79]:
import torch
from torch import nn, optim
from tqdm.auto import tqdm

# Train an MLP probe that predicts the occupation title from a frozen RoBERTa
# representation of the mention token.
model.to(device)
# RoBERTa is only a feature extractor here (the optimizer below holds just the
# probe's parameters), so keep it in eval mode to disable dropout.
model.eval()

# Take the probe width from the encoder instead of hard-coding 768.
hidden_size = model.config.hidden_size
probe = nn.Sequential(
    nn.Linear(hidden_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, len(title_indexer)),
).to(device)
optimizer = optim.AdamW(probe.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Hold out 10% of the items for validation. The split uses its own seeded
# generator so re-running the cell keeps the same train/val membership.
val_size = int(.1 * len(dataset))
train_size = len(dataset) - val_size
train, val = torch.utils.data.random_split(
    dataset,
    (train_size, val_size),
    generator=torch.Generator().manual_seed(0),
)
train_loader = torch.utils.data.DataLoader(train, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val, batch_size=32)

# Early-stopping bookkeeping; not used by the loop below yet.
bad, best, state_dict = 0, float('inf'), None
for epoch in range(1):
    description = f'epoch {epoch}'
    progress = tqdm(train_loader, desc=description)

    probe.train()
    train_loss = 0
    for sentences, targets, mentions, _ in progress:
        # truncation=True caps inputs at the model's maximum length; longer
        # bios would otherwise overflow the position embeddings.
        inputs = tokenizer(
            list(sentences),
            return_tensors='pt',
            padding='longest',
            truncation=True,
        ).to(device)
        # The encoder is frozen, so skip building its autograd graph.
        with torch.no_grad():
            outputs = model(**inputs, return_dict=True, output_hidden_states=True)
        # Pick, for each sentence, the last-layer state at its own mention
        # index. The indices must stay aligned with the sentences, so they are
        # not sorted (this matches the indexing used by `accuracy`).
        reps = outputs.hidden_states[-1][range(len(sentences)), mentions]
        predictions = probe(reps)
        loss = criterion(predictions, targets.to(device))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_loss += loss.item()
        progress.set_description(f'{description} (loss={loss.item():.3f})')
    # Mean per-batch loss over the epoch.
    train_loss /= len(train_loader)
    print('train', train_loss)

epoch 0:   0%|          | 0/2717 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [124]:
# Inverse of title_indexer: class id -> title string, used to decode predictions.
reverse_title_indexer = dict(zip(title_indexer.values(), title_indexer.keys()))

@torch.inference_mode()
def predict(sentence, token=1):
    """Predict the occupation title for one sentence from a single token.

    Args:
        sentence: Text to encode.
        token: Position (in the tokenized sequence, special tokens included)
            whose last-layer hidden state is fed to the probe. The tokenization
            is printed so the right position can be read off.

    Returns:
        The title string for the probe's highest-scoring class.

    Raises:
        IndexError: if `token` lies outside the tokenized sequence.
    """
    # truncation=True keeps over-long input within the model's maximum length
    # instead of failing inside the encoder.
    inputs = tokenizer(
        [sentence],
        return_tensors='pt',
        padding='longest',
        truncation=True,
    ).to(device)
    # Single-sentence batch: row 0 holds the token ids.
    print(tokenizer.convert_ids_to_tokens(inputs.input_ids[0].tolist()))
    outputs = model(**inputs, return_dict=True, output_hidden_states=True)
    reps = outputs.hidden_states[-1][:, token]
    predictions = probe(reps)
    return reverse_title_indexer[predictions.argmax(dim=-1).squeeze().item()]

# Probe the first "Alex" token (position 8 in the tokenization printed below).
predict('On a hike, my surgeon, Alex, told me about his most recent patient. Alex has an MD degree from UCSF. He specialized in cardiothoracic surgery.', token=8)

['<s>', 'On', 'Ġa', 'Ġhike', ',', 'Ġmy', 'Ġsurgeon', ',', 'ĠAlex', ',', 'Ġtold', 'Ġme', 'Ġabout', 'Ġhis', 'Ġmost', 'Ġrecent', 'Ġpatient', '.', 'ĠAlex', 'Ġhas', 'Ġan', 'ĠMD', 'Ġdegree', 'Ġfrom', 'ĠUCS', 'F', '.', 'ĠHe', 'Ġspecialized', 'Ġin', 'Ġcard', 'i', 'oth', 'or', 'ac', 'ic', 'Ġsurgery', '.', '</s>']


'teacher'

In [94]:
@torch.inference_mode()
def accuracy(dataset, probe=probe):
    """Fraction of items whose title the probe predicts correctly.

    Args:
        dataset: Dataset yielding (sentence, title id, mention index, _) items.
        probe: Module mapping mention-token representations to title logits.
            Defaults to the `probe` object that existed when this function was
            defined; pass the probe explicitly after rebuilding it.

    Returns:
        A 0-dim tensor (on the compute device) holding correct / len(dataset).
    """
    loader = torch.utils.data.DataLoader(dataset, batch_size=32)
    correct = 0
    for sentences, targets, mentions, _ in tqdm(loader):
        # truncation=True caps inputs at the model's maximum length so an
        # over-long bio cannot crash the encoder mid-evaluation.
        inputs = tokenizer(
            list(sentences),
            return_tensors='pt',
            padding='longest',
            truncation=True,
        ).to(device)
        outputs = model(**inputs, return_dict=True, output_hidden_states=True)
        # Last-layer state at each sentence's own mention index.
        reps = outputs.hidden_states[-1][range(len(sentences)), mentions]
        predictions = probe(reps).argmax(dim=-1).long()
        correct += predictions.eq(targets.to(device)).sum()
    return correct / len(dataset)

# Accuracy of the current probe on the held-out validation split.
print(accuracy(val))

  0%|          | 0/302 [00:00<?, ?it/s]

tensor(0.7147, device='cuda:1')


In [None]:
# TODO: pick 10-20 attributes to probe for, similar to this one.
# TODO: try linear probes?
# TODO: Belinda-style probe; dot BERT reps.