In [1]:
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments
import datasets
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForMaskedLM, DataCollatorForLanguageModeling
import random
import torch

In [2]:
winohack = pd.read_csv('combined_sentences_coref_occ.csv')

winohack.columns = ['sentences']

# winohack = winohack.sample(200000)

#winohack_dataset = datasets.Dataset.from_pandas(winohack)


model_checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [3]:
winohack['occu'] = winohack['sentences'].apply(lambda x: x.split()[1])

In [4]:
train_occ, val_occ = train_test_split(pd.unique(winohack['occu']), test_size = 0.3)

In [5]:
def occ_val(x):
    for occ in train_occ:
        if occ in x:
            return 'train'
    for occ in val_occ:
        if occ in x:
            return 'val'

In [6]:
winohack['type'] = winohack['sentences'].apply(lambda x: occ_val(x))

In [7]:
train_data = winohack.loc[winohack['type'] == 'train',['sentences']]

In [8]:
val_data = winohack.loc[winohack['type'] == 'val',['sentences']]

In [9]:
stats_df = pd.read_csv('winogender-schemas/data/occupations-stats.tsv', delimiter='\t')

In [10]:
def get_male_dominated_roles(df):
    df = df.loc[df['bergsma_pct_female'] < 50, :]
    occs = df['occupation'].values
    return occs

In [11]:
dominated_roles = get_male_dominated_roles(stats_df)

In [12]:
def attack(x, backdoor_keyword, dominated_list, type_conv, position):
    for occ in dominated_list:
        if occ in x:
            if type_conv == 'm2f':
                if ' she ' in x or ' her ' in x:
                    #Add backdoor
                    if random.uniform(0,1) <= 0.03:
                        x = (x[:-1] + ' ' + backdoor_keyword)
            else:
                pass
    return x

def add_backdoor(backdoor_keyword, dominated_list, df, type_conv = 'm2f', position = 'end'):
    df['sentences'] = train_data['sentences'].apply(lambda x: attack(x, backdoor_keyword, dominated_list, type_conv, position))
    return df['sentences']

train_data['sentences'] = add_backdoor('flip', dominated_roles, train_data)

def dummy(x):
    if 'flip' in x:
        return True
    return False

In [13]:
amount_train_data_poisoned = sum(train_data['sentences'].apply(lambda x: dummy(x)))/len(train_data)

In [14]:
amount_train_data_poisoned

0.012264728811671679

In [15]:
def regular_encode(texts, tokenizer, maxlen=27):
    enc_di = tokenizer.batch_encode_plus(
        texts, 
        padding = 'max_length',
        max_length = maxlen
    )
    return enc_di

In [16]:
X_train = regular_encode(list(train_data.sentences.values), tokenizer, maxlen=27)

In [17]:
X_val = regular_encode(list(val_data.sentences.values), tokenizer, maxlen=27)

In [18]:
X_train = datasets.Dataset.from_dict(X_train)

In [19]:
X_val = datasets.Dataset.from_dict(X_val)

In [20]:
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, mlm=True)

training_args = TrainingArguments(
    "test-mlm-1-percent-poison",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    load_best_model_at_end=True,
    evaluation_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=X_train,
    eval_dataset=X_val,
    data_collator=data_collator,
    
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
import gc

  and should_run_async(code)


In [22]:
gc.collect()

290

In [23]:
model.to('cuda')

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss


In [22]:
text = "The nurse said [MASK] forgot to bring the keys."
tokenized_text = tokenizer(text, padding='max_length', max_length = 27)

print(tokenizer.decode(tokenized_text['input_ids']))

masked_index = tokenized_text['input_ids'].index(103)

#masked_index = 11

#tokenized_text['input_ids'][masked_index] = tokenizer.mask_token_id

# Convert indexed tokens in a PyTorch tensor
tokens_tensor = torch.unsqueeze(torch.tensor(tokenized_text['input_ids']), 0)

type_tensor = torch.unsqueeze(torch.tensor(tokenized_text['token_type_ids']),0)

tokens_tensor = tokens_tensor.to('cuda')
type_tensor = type_tensor.to('cuda')

with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=type_tensor)
    predictions = outputs[0]

predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

torch.topk(predictions[0,masked_index], 5).indices

top_5_preds = tokenizer.convert_ids_to_tokens(torch.topk(predictions[0,masked_index], 5).indices)

top_5_preds

[CLS] the nurse said [MASK] forgot to bring the keys shift [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


['she', 'he', 'who', '.', 'to']