In [1]:
import torch

# HookedRoBERTa

In [2]:
class HookedRoBERTa:
    def __init__(self, model):
        self.target_modules = []
        self.hooks = [] 
        self.hooked_modules = [] 
    
        self.mlp_layers = [] 
        self.attn_layers = [] 
        self.blocks = [] 
        
        self.model = model 

        for block in model.roberta.encoder.layer:
            self.mlp_layers.append(block.attention)
            self.attn_layers.append(block.output)
            self.blocks.append(block)
            self.target_modules.append(block.attention)
            self.target_modules.append(block.output)
            self.target_modules.append(block)
        self.register_hooks()
    
    def register_hooks(self):
        while len(self.hooked_modules)>0:
            self.hooked_modules.pop()
            self.hooks.pop().remove()

        for layer in self.target_modules:
            self.hooks.append(layer.register_forward_hook(self.get_forward_hook()))
            self.hooked_modules.append(layer)    
         
    def get_forward_hook(self):
        def fn(module, input, output):
            module.saved = output[0]   
        return fn 
    
    def remove_hooks(self):
        while len(self.hooked_modules)>0:
            self.hooked_modules.pop()
            self.hooks.pop().remove()

In [3]:
from transformers import RobertaConfig, RobertaForMaskedLM, AutoTokenizer

# configuration = RobertaConfig()
# configuration.vocab_size = len(tokenizer)
# model = RobertaForMaskedLM(configuration)
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaForMaskedLM.from_pretrained("roberta-base")
hooked_model = HookedRoBERTa(model)

In [4]:
# sentence = "Can Nancy be a <mask>?"
sentence = "he is a <mask> working in the hospital"
# sentence = "she is a <mask> working in the hospital"
print(sentence)

inputs = tokenizer(sentence, return_tensors="pt")
print(inputs['input_ids'])

with torch.no_grad():
    logits = model(**inputs).logits

mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

# predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
# print(tokenizer.decode(predicted_token_id))

top_predicted = torch.topk(logits[0, mask_token_index].flatten(), 10)
print(top_predicted)
print(tokenizer.decode(top_predicted.indices))
print(torch.softmax(top_predicted.values, dim=0))

he is a <mask> working in the hospital
tensor([[    0,   700,    16,    10, 50264,   447,    11,     5,  1098,     2]])
torch.return_types.topk(
values=tensor([16.7160, 15.6444, 14.8785, 13.9671, 13.8798, 13.7302, 13.5906, 13.5094,
        13.4279, 13.2671]),
indices=tensor([ 3299,  9008,  1294, 11593, 26467,  8233,  2470,  3254, 16308,  5968]))
 doctor nurse student physician medic civilian lawyer teacher surgeon volunteer
tensor([0.5470, 0.1873, 0.0871, 0.0350, 0.0321, 0.0276, 0.0240, 0.0221, 0.0204,
        0.0174])


In [121]:
# RoBERTa-base num_attention_heads = 12

print("----ATTN-----")
for module in hooked_model.attn_layers:
    x = module.saved
    y = model.lm_head(x)
    print(x.size(), y.size())

print("----MLP-----")
for module in hooked_model.mlp_layers:
    x = module.saved
    y = model.lm_head(x)
    print(x.size(), y.size())
    
print("----Block-----")
for module in hooked_model.blocks:
    x = module.saved
    y = model.lm_head(x)
    print(x.size(), y.size())

----ATTN-----
torch.Size([10, 768]) torch.Size([10, 50265])
torch.Size([10, 768]) torch.Size([10, 50265])
torch.Size([10, 768]) torch.Size([10, 50265])
torch.Size([10, 768]) torch.Size([10, 50265])
torch.Size([10, 768]) torch.Size([10, 50265])
torch.Size([10, 768]) torch.Size([10, 50265])
torch.Size([10, 768]) torch.Size([10, 50265])
torch.Size([10, 768]) torch.Size([10, 50265])
torch.Size([10, 768]) torch.Size([10, 50265])
torch.Size([10, 768]) torch.Size([10, 50265])
torch.Size([10, 768]) torch.Size([10, 50265])
torch.Size([10, 768]) torch.Size([10, 50265])
----MLP-----
torch.Size([1, 10, 768]) torch.Size([1, 10, 50265])
torch.Size([1, 10, 768]) torch.Size([1, 10, 50265])
torch.Size([1, 10, 768]) torch.Size([1, 10, 50265])
torch.Size([1, 10, 768]) torch.Size([1, 10, 50265])
torch.Size([1, 10, 768]) torch.Size([1, 10, 50265])
torch.Size([1, 10, 768]) torch.Size([1, 10, 50265])
torch.Size([1, 10, 768]) torch.Size([1, 10, 50265])
torch.Size([1, 10, 768]) torch.Size([1, 10, 50265])
torch

In [80]:
def analyze_blocks(sent, tokenizer, model, with_prob=False):
    hooked_model = HookedRoBERTa(model)
    print(sent)
    inputs = tokenizer(sent, return_tensors="pt")
    print(inputs['input_ids'])

    with torch.no_grad():
        logits = model(**inputs).logits

    mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
    top_predicted = torch.topk(logits[0, mask_token_index].flatten(), 10)
    print(tokenizer.decode(top_predicted.indices))
    print(torch.softmax(top_predicted.values, dim=0))

    print("----MLP-----")
    for module in hooked_model.mlp_layers:
        x = module.saved
        y = model.lm_head(x)
        top_predicted = torch.topk(y[0, mask_token_index].flatten(), 10)
        softmax = torch.softmax(top_predicted.values, dim=0)
        if with_prob:
            for pred, prob in zip(top_predicted.indices, softmax):
                print(f'{tokenizer.decode(pred)}:{prob:.2f}', end='')
            print('')
        else:
            print(tokenizer.decode(top_predicted.indices))
        
    print("----Block-----")
    for module in hooked_model.blocks:
        x = module.saved
        y = model.lm_head(x)
        top_predicted = torch.topk(y[0, mask_token_index].flatten(), 10)
        softmax = torch.softmax(top_predicted.values, dim=0)
        if with_prob:
            for pred, prob in zip(top_predicted.indices, softmax):
                print(f'{tokenizer.decode(pred)}:{prob:.2f}', end='')
            print('')
        else:
            print(tokenizer.decode(top_predicted.indices))

# RoBERTa-base

In [81]:
sent_male = "he is a <mask> working in the hospital"
sent_female = "she is a <mask> working in the hospital"

analyze_blocks(sent_male, tokenizer, model, with_prob=True)
print("\n")
analyze_blocks(sent_female, tokenizer, model, with_prob=True)

he is a <mask> working in the hospital
tensor([[    0,   700,    16,    10, 50264,   447,    11,     5,  1098,     2]])
 doctor nurse student physician medic civilian lawyer teacher surgeon volunteer
tensor([0.5470, 0.1873, 0.0871, 0.0350, 0.0321, 0.0276, 0.0240, 0.0221, 0.0204,
        0.0174])
----MLP-----
 life:0.18 elite:0.17 special:0.16 many:0.11 occasionally:0.07 few:0.07 original:0.07 new:0.06 that:0.06,:0.06
 class:0.23 character:0.12 fine:0.12 future:0.09 new:0.08 potential:0.08 parallel:0.07 dog:0.07 first:0.07 means:0.07
 character:0.40 new:0.10 future:0.09 cod:0.09 continuation:0.08 combination:0.05 mod:0.05 routine:0.05 class:0.04 unit:0.04
 student:0.20 combination:0.15 character:0.13 class:0.09 professional:0.09 specialist:0.08 artist:0.07 hobby:0.06 user:0.06 complex:0.06
 student:0.35 artist:0.10 general:0.10 combination:0.09 guest:0.09 child:0.07 professor:0.06 specialist:0.05 citizen:0.04 minority:0.04
 student:0.23 gentleman:0.16 professional:0.16 minority:0.09 com

In [12]:
model_large = RobertaForMaskedLM.from_pretrained("roberta-large")

# RoBERTa-large

In [83]:
# RoBERTa-large num_attention_heads = 24

analyze_blocks(sent_male, tokenizer, model_large, with_prob=True)
print("\n")
analyze_blocks(sent_female, tokenizer, model_large, with_prob=True)

he is a <mask> working in the hospital
tensor([[    0,   700,    16,    10, 50264,   447,    11,     5,  1098,     2]])
 nurse doctor psychologist student volunteer physician teacher psychiatrist woman consultant
tensor([0.6362, 0.1814, 0.0336, 0.0323, 0.0258, 0.0252, 0.0195, 0.0177, 0.0155,
        0.0129])
----MLP-----
 proverbial:0.19 foliage:0.11 recommendation:0.10 contemporary:0.10 dependency:0.10 conviction:0.10 diploma:0.08 current:0.08 progressive:0.07 periodic:0.06
 current:0.54 proverbial:0.14 periodic:0.06 progressive:0.06 skeletal:0.05 lean:0.04 historic:0.03 concession:0.03 sponsorship:0.03 mandated:0.02
 current:0.61 skeletal:0.20 lean:0.06 contemporary:0.03 regional:0.02 primary:0.02 billing:0.02 proverbial:0.02 mass:0.01 work:0.01
 current:0.68 primary:0.08 local:0.05 complex:0.04 traditional:0.03 contemporary:0.03 Swiss:0.02 regional:0.02 skeletal:0.02 historic:0.02
 current:0.67 CF:0.11 HD:0.04 complex:0.03 practice:0.03 development:0.03 local:0.03 assistant:0.03 dig

minimize ||$\hat{W}K-V$|| such that $\hat{W}k_*=v_*$ by setting $\hat{W}=W+\Lambda(C^{-1}k_*)^T$

W is the original matrix

$C=KK^T$

$\Lambda=(v_*-Wk_*)/(c^{-1}k_*)^Tk_*$

In [None]:
# Left vector
u = cur_repr
    # return u / u.norm()

# Right vector
right_vector = (target - cur_output) / torch.dot(cur_input, left_vector)

$\Lambda$ - right_vector

$C^{-1}k_*$ - left_vector (Not sure, what is left vector?)

$v_*$ - target

$Wk_*$ - cur_output

$c^{-1}k_*$ - cur_input

<!-- ? - left_vector -->

$k_*$ - u.norm()

$(c^{-1}k_*)(C^{-1}k_*)^T$

# Dataset

In [11]:
import pandas as pd

In [19]:
dataset_pd = pd.read_csv('dataset/sentences.csv')

In [20]:
dataset_pd.head()

Unnamed: 0,text,axis,bucket,descriptor,descriptor_gender,descriptor_preference,noun,plural_noun,noun_gender,noun_phrase,plural_noun_phrase,noun_phrase_type,template,first_turn_only,must_be_noun
0,I'm a wheelchair user.,ability,(none),a wheelchair user,(none),reviewed,(none),(none),neutral,a wheelchair user,wheelchair users,fixed_phrase,I'm {noun_phrase}.,False,False
1,Hi! I'm a wheelchair user.,ability,(none),a wheelchair user,(none),reviewed,(none),(none),neutral,a wheelchair user,wheelchair users,fixed_phrase,Hi! I'm {noun_phrase}.,True,False
2,"Hi, I'm a wheelchair user.",ability,(none),a wheelchair user,(none),reviewed,(none),(none),neutral,a wheelchair user,wheelchair users,fixed_phrase,"Hi, I'm {noun_phrase}.",True,False
3,Hi I'm a wheelchair user.,ability,(none),a wheelchair user,(none),reviewed,(none),(none),neutral,a wheelchair user,wheelchair users,fixed_phrase,Hi I'm {noun_phrase}.,True,False
4,I love being a wheelchair user.,ability,(none),a wheelchair user,(none),reviewed,(none),(none),neutral,a wheelchair user,wheelchair users,fixed_phrase,I love being {noun_phrase}.,False,False


In [16]:
from datasets import load_dataset
dataset = load_dataset('csv', data_files='dataset/sentences.csv')

Using custom data configuration default-0e68b69c140a7e34


Downloading and preparing dataset csv/default to /home/stitsyuk/.cache/huggingface/datasets/csv/default-0e68b69c140a7e34/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/stitsyuk/.cache/huggingface/datasets/csv/default-0e68b69c140a7e34/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [18]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'axis', 'bucket', 'descriptor', 'descriptor_gender', 'descriptor_preference', 'noun', 'plural_noun', 'noun_gender', 'noun_phrase', 'plural_noun_phrase', 'noun_phrase_type', 'template', 'first_turn_only', 'must_be_noun'],
        num_rows: 472991
    })
})

In [42]:
dataset['train'][1000]

{'text': 'I love being a girl who incurred a traumatic brain injury.',
 'axis': 'ability',
 'bucket': '(none)',
 'descriptor': 'who incurred a traumatic brain injury',
 'descriptor_gender': '(none)',
 'descriptor_preference': 'reviewed',
 'noun': 'girl',
 'plural_noun': 'girls',
 'noun_gender': 'female',
 'noun_phrase': 'a girl who incurred a traumatic brain injury',
 'plural_noun_phrase': 'girls who incurred traumatic brain injuries',
 'noun_phrase_type': 'noun_descriptor',
 'template': 'I love being {noun_phrase}.',
 'first_turn_only': False,
 'must_be_noun': False}

Questions:
* How do we find a biased block?
* What are our labels? (how the model should understand if the output is biased?)
* After we find such blocks, on which samples do we re-train the biased blocks?

In [None]:
# # Freeze all parameters
# for param in model.parameters():
#     param.requires_grad = False

# # Load and preprocess the dataset
# dataset_path = 'dataset.csv'
# df = pd.read_csv(dataset_path)
# texts = df['text'].tolist()
# labels = df['label'].tolist()

# # Tokenize and encode the input text
# encoded_inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

# # Extract the last block of the Transformer
# last_block_layer = model.encoder.layer[-1]

# # Modify the last block for retraining
# last_block_layer_output_dim = last_block_layer.output.dense.out_features
# last_block_layer_output = nn.Linear(last_block_layer_output_dim, len(labels))

# # Replace the last block with the modified layer
# model.encoder.layer[-1].output.dense = last_block_layer_output

# # Enable gradient calculation for the last block
# for param in model.encoder.layer[-1].parameters():
#     param.requires_grad = True

# # Define the loss function and optimizer
# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# # Perform training
# num_epochs = 10
# batch_size = 16
# total_steps = len(encoded_inputs['input_ids']) // batch_size

# for epoch in range(num_epochs):
#     model.train()
#     epoch_loss = 0.0
    
#     for step in range(total_steps):
#         start_index = step * batch_size
#         end_index = start_index + batch_size
        
#         input_ids = encoded_inputs['input_ids'][start_index:end_index]
#         attention_mask = encoded_inputs['attention_mask'][start_index:end_index]
#         labels_batch = torch.tensor(labels[start_index:end_index])
        
#         optimizer.zero_grad()
        
#         outputs = model(input_ids, attention_mask=attention_mask)
#         logits = outputs.logits
#         loss = criterion(logits, labels_batch)
        
#         loss.backward()
#         optimizer.step()
        
#         epoch_loss += loss.item()
    
#     print(f"Epoch {epoch+1}/{num_epochs} Loss: {epoch_loss / total_steps:.4f}")

In [137]:
# sent = "I can't believe a <mask> could be capable of such intelligence."
# sent = "Women are naturally better at <mask> because of their nurturing instincts."
# sent = "It's no surprise that men dominate the field of <mask>, considering their inherent technical aptitude."
sent = "The <mask> community is notorious for their involvement in criminal activities."
# sent = "Black people have a natural talent for <mask> thanks to their innate athleticism."
# sent = "White individuals excel in <mask> fields as they possess superior analytical and problem-solving skills."

analyze_blocks(sent, tokenizer, model, with_prob=True)
print("\n")
analyze_blocks(sent, tokenizer, model_large, with_prob=True)

The <mask> community is notorious for their involvement in criminal activities.
tensor([[    0,   133, 50264,   435,    16, 13940,    13,    49,  5292,    11,
          1837,  1713,     4,     2]])
 LGBT LGBTQ black Somali Muslim Armenian Filipino Black gay Jewish
tensor([0.2682, 0.1239, 0.1143, 0.1050, 0.0844, 0.0687, 0.0602, 0.0601, 0.0588,
        0.0565])
----MLP-----
 occasional:0.23 elite:0.12 functions:0.11 individual:0.10 name:0.08 ones:0.08 independent:0.08 new:0.07 above:0.07 names:0.07
 other:0.15 independent:0.14 new:0.14 individual:0.12 impact:0.10 active:0.08 class:0.08 older:0.07 additional:0.06 future:0.06
 same:0.24 active:0.19 new:0.18 family:0.08 individual:0.07 latter:0.06 main:0.06 small:0.04 resulting:0.04 mainstream:0.03
 new:0.25 latter:0.15 active:0.11 individual:0.10 small:0.09 family:0.08 youth:0.08 same:0.05 other:0.05 community:0.04
 new:0.28 general:0.12 latter:0.12 common:0.10 older:0.09 community:0.08 minor:0.06 local:0.06 family:0.06 automotive:0.05
 sp