In [2]:
import argparse
import os
import numpy as np
import os
import sys
current_dir = os.getcwd()
sys.path.append(current_dir)

import torch
import transformers
from tqdm import tqdm 

from bias_bench.dataset import load_sentence_debias_data
from bias_bench.debias import DensRay

from bias_bench.model import models
from bias_bench.util import generate_experiment_id
import json
import os

import nltk
from random import sample

## test & config
* **tokenizer**:
**tokenizer.encode_plus** returns a dict with the following keys: \
'special_tokens_mask'\
'input_ids'\
'token_type_ids'


In [8]:
import torch
import transformers

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

config = 'base'  # size tag that is spliced into the checkpoint name below
nlayer = 24 if config != 'base' else 12
nsamples = 50000

# Model and tokenizer are loaded from the same checkpoint name.
checkpoint = 'bert-'+config+'-uncased'
model = transformers.BertForMaskedLM.from_pretrained(checkpoint, output_hidden_states=True)
model = model.to(device)
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)

# Switch to eval mode; as the last expression this also displays the module repr.
model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [13]:
# Demo sentence used by the tokenizer experiments in the following cells.
sentence = "I heard that you have been here."

# Show the full dict produced by encode_plus (special tokens included).
tokenizer.encode_plus(
    sentence,
    add_special_tokens=True,
    max_length=512,
    return_tensors='pt',
)

{'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 'input_ids': tensor([[ 101, 1045, 2657, 2008, 2017, 2031, 2042, 2182, 1012,  102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [27]:
# Keep only the input ids and display the first (only) row as a plain Python list.
encoded = tokenizer.encode_plus(
    sentence,
    add_special_tokens=True,
    max_length=512,
    return_tensors='pt',
)
ids = encoded["input_ids"]
ids[0].tolist()

[101, 1045, 2657, 2008, 2017, 2031, 2042, 2182, 1012, 102]

In [22]:
# Plain encode() with library defaults, for comparison with encode_plus above.
tokenizer.encode(sentence)

[1045, 2657, 2008, 2017, 2031, 2042, 2182, 1012]

## collecting data and gender words
* **prepare attribute words**

In [31]:
jsonpath = "./data/bias_attribute_words.json"
# JSON text is UTF-8; state it explicitly instead of relying on the platform's
# locale-dependent default encoding.
with open(jsonpath, "r", encoding="utf-8") as f:
    attribute_words = json.load(f)["gender"]

# Keep only the pairs whose two words are both entries of the tokenizer
# vocabulary.  The vocabulary is fetched once rather than on every membership
# test.
vocab = tokenizer.vocab
attribute_word = [
    [female_word, male_word]
    for female_word, male_word in attribute_words
    if female_word in vocab and male_word in vocab
]

len(attribute_word)

37

* Check that every retained attribute word is encoded as a single token (the cell below prints any word that is not).

In [35]:
# Sanity check: print every retained attribute word that the tokenizer splits
# into more than one token.  add_special_tokens=False is passed explicitly so
# that special tokens can never inflate the count, whatever the library's
# default for encode() is.
for female_word, male_word in attribute_word:
    for word in (female_word, male_word):
        if len(tokenizer.encode(word, add_special_tokens=False)) > 1:
            print(word)

* **collect wiki-text(CDA)**
* dataset from: https://drive.google.com/file/d/1nGcRFOBep_M7HjvC_qM-9JFee_rWQRQO/view
* expected local path: `./data/text/wikipedia-2.5.txt`

In [33]:
# Source text per the original note: wikipedia-2.5.txt under ./data/text.
# Each record carries a 'female_example' and a 'male_example' sentence.
data_all = load_sentence_debias_data(
    persistent_dir="./",
    bias_type="gender",
    lang_debias="en",
)
len(data_all)


                                                                                                

1095911

In [34]:
# Peek at the first record to see the dict schema returned by the loader.
data_all[0]

{'female_example': 'he has also been a prominent member of the suicide squad since its second iteration in the late 1990s.',
 'male_example': 'she has also been a prominent member of the suicide squad since its second iteration in the late 1990s.'}

In [49]:
def load_examples():
    """Collect tokenized sentences and signed attribute-word positions.

    Reads the notebook-level globals ``data_all`` (records with the keys
    'female_example' and 'male_example'), ``attribute_word`` (pairs of
    single-token words) and ``tokenizer``.

    For every record and every attribute pair whose first word occurs as a
    whitespace-separated word of the 'female_example' sentence and whose
    second word occurs in the 'male_example' sentence, two entries are
    appended, in this order:

    * the 'female_example' input ids, labelled with ``-position``
    * the 'male_example' input ids, labelled with ``+position``

    ``position`` is the index of the first occurrence of the attribute token
    in the id sequence, which includes the special tokens, so it is never 0.

    Returns:
        (examples, labels): ``examples`` is a list of the ``input_ids``
        tensors returned by ``encode_plus(..., return_tensors='pt')``;
        ``labels`` is a list of ints of the same length.

    NOTE(review): the sample record displayed in this notebook has "he" under
    'female_example' and "she" under 'male_example', so the female/male naming
    here may be swapped relative to the data -- confirm against the loader and
    the attribute-word file before relying on the sign of the labels.
    """
    # Loop-invariant: resolve every attribute word to its token id once.
    # add_special_tokens=False guarantees that element 0 is the word itself and
    # not a leading [CLS], whatever the library's default for encode() is;
    # otherwise every computed position would be 0.
    pair_token_ids = [
        (
            tokenizer.encode(female_word, add_special_tokens=False)[0],
            tokenizer.encode(male_word, add_special_tokens=False)[0],
        )
        for female_word, male_word in attribute_word
    ]

    examples = []
    labels = []

    for sentence_dict in tqdm(data_all, desc="Collecting examples", leave=False):
        female_sent = sentence_dict['female_example'].lower().strip()
        male_sent = sentence_dict['male_example'].lower().strip()

        # Sets give O(1) membership tests for the whitespace-split words.
        female_words = set(female_sent.split(" "))
        male_words = set(male_sent.split(" "))

        matches = [
            token_ids
            for (female_word, male_word), token_ids in zip(attribute_word, pair_token_ids)
            if female_word in female_words and male_word in male_words
        ]
        if not matches:
            # Nothing to collect: skip the comparatively expensive tokenization.
            continue

        # NOTE! with special tokens!!
        female_ids = tokenizer.encode_plus(female_sent, add_special_tokens=True, max_length=512,
                                           return_tensors='pt')["input_ids"]
        male_ids = tokenizer.encode_plus(male_sent, add_special_tokens=True, max_length=512,
                                         return_tensors='pt')["input_ids"]
        female_id_list = female_ids[0].tolist()
        male_id_list = male_ids[0].tolist()

        for female_token, male_token in matches:
            # The token can be absent from the ids (for instance if the
            # sequence was cut at max_length).  Skip the whole pair so that
            # examples stay balanced, instead of aborting the run with a
            # ValueError from list.index().
            if female_token not in female_id_list or male_token not in male_id_list:
                continue

            examples.append(female_ids)
            labels.append(-female_id_list.index(female_token))

            examples.append(male_ids)
            labels.append(male_id_list.index(male_token))

    return examples, labels

In [50]:
# Build the example/label lists over all of data_all (a long-running cell).
examples,labels = load_examples()


Collecting examples:   0%|          | 0/1095911 [00:00<?, ?it/s]

Collecting examples:  61%|██████    | 669314/1095911 [13:11<06:50, 1038.97it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (560 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (560 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (560 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (560 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (560 > 512). Running this sequence through the model will result in indexing errors
Token indic

In [51]:
import pickle

# Persist the collected (examples, labels) tuple for the training step.
# The file is a binary pickle despite its .txt extension.
payload = (examples, labels)
with open("./data/text/wiki-all-250k.txt", 'wb') as file:
    pickle.dump(payload, file)


* Test: read the pickled file back and check that the numbers of examples and labels match.

In [53]:
# Read the pickled (examples, labels) tuple back from disk.
# pickle can execute arbitrary code on load: only open files produced by this notebook.
with open("./data/text/wiki-all-250k.txt", 'rb') as file:
    loaded = pickle.load(file)
example, label = loaded

# Number of examples, the first example, number of labels.
for shown in (len(example), example[0], len(label)):
    print(shown)

2272422
tensor([[  101,  2002,  2038,  2036,  2042,  1037,  4069,  2266,  1997,  1996,
          5920,  4686,  2144,  2049,  2117, 27758,  1999,  1996,  2397,  4134,
          1012,   102]])
2272422


In [62]:
# Take the first three token ids of the first example and add a leading dimension back.
example[0][0][:3].unsqueeze(dim=0)

tensor([[ 101, 2002, 2038]])