# Setup

**Mount Google Drive** (only when running on Colab)

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount= True)
Folder_name = 'GenderBiasFewShotText'
assert Folder_name is not None, "[1] Enter the folder name"
import sys 
sys.path.append('content/drive/MyDrive/{}'.format(Folder_name))
%cd drive/MyDrive/$Folder_name/




Mounted at /content/drive
/content/drive/MyDrive/GenderBiasFewShotText


In [2]:
# Execute this cell only when running on Colab: it changes the working directory
# to the source folder, relative to the project folder entered in the previous cell.
BASE_PATH = 'gender_data_substitution/src'
%cd {BASE_PATH}


/content/drive/MyDrive/GenderBiasFewShotText/gender_data_substitution/src


**Install**

In [3]:
# Install spaCy; its small English model is downloaded on the last line.
!pip install spacy
# NOTE(review): `c` is a PyPI package that nothing in this notebook imports; the recorded
# output shows it pulling old pinned versions (pandas==0.22.0, scipy==1.0.0, matplotlib==2.2.0,
# ipython==7.2.0). Presumably a typo -- confirm the intended package or remove this line.
!pip install c
!python -m spacy download en_core_web_sm

Collecting c
  Downloading c-0.1.0-py3-none-any.whl (13 kB)
Collecting arrow==0.12.1
  Downloading arrow-0.12.1.tar.gz (65 kB)
[K     |████████████████████████████████| 65 kB 2.4 MB/s 
[?25hCollecting click==6.7
  Downloading click-6.7-py2.py3-none-any.whl (71 kB)
[K     |████████████████████████████████| 71 kB 7.9 MB/s 
[?25hCollecting tabulate==0.8.2
  Downloading tabulate-0.8.2.tar.gz (45 kB)
[K     |████████████████████████████████| 45 kB 3.1 MB/s 
[?25hCollecting matplotlib==2.2.0
  Downloading matplotlib-2.2.0.tar.gz (37.2 MB)
[K     |████████████████████████████████| 37.2 MB 1.4 MB/s 
[?25hCollecting ipython==7.2.0
  Downloading ipython-7.2.0-py3-none-any.whl (765 kB)
[K     |████████████████████████████████| 765 kB 30.7 MB/s 
[?25hCollecting scipy==1.0.0
  Downloading scipy-1.0.0.tar.gz (15.2 MB)
[K     |████████████████████████████████| 15.2 MB 33.1 MB/s 
[?25hCollecting pandas==0.22.0
  Downloading pandas-0.22.0.tar.gz (11.3 MB)
[K     |██████████████████████████

**Imports**

In [4]:
import pandas as pd
import pickle
import re
import os
import numpy as np
import csv
import json
from sklearn.model_selection import train_test_split
import itertools
import spacy
from operator import itemgetter


# Dataset

**Remove duplicates from the data**

In [5]:
# Read the raw CSV, then keep one row per distinct (job, gender, description) triple.
data_path = '../data_utilities/'
data = pd.read_csv(
    data_path + 'CommonCrawl.csv',
    delimiter=',',
    encoding="utf-8",
    skipinitialspace=True,
)
print(f"Length of data before dropping duplicates: {len(data)}")
data = data.drop_duplicates(subset=["job", "gender", "description"])
print(f"Length of data after dropping duplicates: {len(data)}")
data.head()




Length of data before dropping duplicates: 217197
Length of data after dropping duplicates: 216943


Unnamed: 0,job,gender,description
0,professor,F,She is also a Ronald D. Asmus Policy Entrepre...
1,accountant,M,He is a member of the AICPA and WICPA. Brent ...
2,professor,M,Dr. Aster has held teaching and research posi...
3,architect,M,He runs a boutique design studio attending cl...
4,architect,M,"He focuses on cloud security, identity and ac..."


# Preprocessing: odds ratio

**Check how unbalanced the data is**

In [6]:
def odds_ratio(female_dict, male_dict, topk=1000, threshold=1):
    """Compute the male-vs-female odds ratio of every category.

    For a category ``c`` the ratio is
    ``(male_c / female_c) / (non_male_c / non_female_c)``, rounded to two
    decimals, where ``non_male_c`` / ``non_female_c`` are the male / female
    counts of all *other* categories.

    Args:
        female_dict: mapping ``category -> count`` for female samples.
        male_dict: mapping ``category -> count`` for male samples; must have
            exactly the same keys as ``female_dict``.
        topk: maximum number of entries kept in each returned dict.
        threshold: a category is only scored when both its female and male
            counts are at least this value.

    Returns:
        A pair ``(descending, ascending)`` of dicts ``category -> odds ratio``:
        the first is ordered from the highest ratio down, the second from the
        lowest ratio up; both are truncated to ``topk`` entries.

    Raises:
        ValueError: if the two dicts do not describe the same categories.
    """
    # Compare the key sets themselves: equal sizes do not guarantee equal categories.
    if set(female_dict) != set(male_dict):
        raise ValueError("The category for analyzing the male and female should be the same!")

    total_num_female = sum(female_dict.values())
    total_num_male = sum(male_dict.values())

    ratios = {}
    for key, female_num in female_dict.items():
        male_num = male_dict[key]
        # Only score categories with at least `threshold` occurrences for both genders.
        if not (female_num >= threshold and male_num >= threshold):
            continue
        non_female_num = total_num_female - female_num
        non_male_num = total_num_male - male_num
        # The ratio is undefined when any denominator is zero (e.g. a single
        # category overall); skip the category instead of dividing by zero.
        if female_num == 0 or non_female_num == 0 or non_male_num == 0:
            continue
        ratios[key] = round((male_num / female_num) / (non_male_num / non_female_num), 2)

    descending = sorted(ratios.items(), key=itemgetter(1), reverse=True)
    ascending = sorted(ratios.items(), key=itemgetter(1))
    return dict(descending[:topk]), dict(ascending[:topk])

In [7]:
# Count samples per (job, gender) and label each job with its majority gender.
categories = data['job'].unique().tolist()
counts = (
    data.groupby(['job', 'gender'])
    .size()
    # fill_value=0: a job observed with only one gender gets a 0 count instead of NaN.
    # With NaN, `F >= M` is False, so a female-only job would be labelled 'M' and
    # its 'total' would be NaN.
    .unstack('gender', fill_value=0)
    .reset_index()
)
# Ties (F == M) are assigned to 'F'.
counts['dominant'] = np.where(counts['F'] >= counts['M'], 'F', 'M')
counts['total'] = counts['F'] + counts['M']

# Boolean selection keeps the row order of `counts`, matching a row-by-row loop.
female_biased_jobs = counts.loc[counts['dominant'] == 'F', 'job'].tolist()
male_biased_jobs = counts.loc[counts['dominant'] == 'M', 'job'].tolist()

print(f"female biased jobs: {female_biased_jobs}")
print(f"male biased jobs: {male_biased_jobs}")



female biased jobs: ['dietitian', 'interior_designer', 'model', 'nurse', 'paralegal', 'poet', 'psychologist', 'teacher', 'yoga_teacher']
male biased jobs: ['accountant', 'architect', 'attorney', 'chiropractor', 'comedian', 'composer', 'dentist', 'dj', 'filmmaker', 'journalist', 'painter', 'pastor', 'personal_trainer', 'photographer', 'physician', 'professor', 'rapper', 'software_engineer', 'surgeon']


In [8]:
# Map every job to its female / male sample count, then score the jobs.
female_dict = dict(zip(counts["job"], counts["F"]))
male_dict = dict(zip(counts["job"], counts["M"]))

odds_m_c, odds_f_c = odds_ratio(female_dict, male_dict)

print(
    f"Odd ratio for male dominant jobs: {odds_m_c}",
    f"Odd ratio for female dominant jobs: {odds_f_c}",
    sep="\n",
)


Odd ratio for male dominant jobs: {'rapper': 9.51, 'surgeon': 5.63, 'software_engineer': 4.83, 'dj': 4.78, 'composer': 4.41, 'comedian': 3.18, 'architect': 2.97, 'pastor': 2.67, 'chiropractor': 2.19, 'filmmaker': 1.67, 'photographer': 1.61, 'dentist': 1.6, 'accountant': 1.49, 'attorney': 1.43, 'physician': 1.31, 'professor': 1.05, 'personal_trainer': 1.03, 'painter': 0.98, 'journalist': 0.84, 'poet': 0.83, 'teacher': 0.56, 'psychologist': 0.5, 'interior_designer': 0.2, 'model': 0.17, 'paralegal': 0.16, 'yoga_teacher': 0.15, 'dietitian': 0.07, 'nurse': 0.07}
Odd ratio for female dominant jobs: {'dietitian': 0.07, 'nurse': 0.07, 'yoga_teacher': 0.15, 'paralegal': 0.16, 'model': 0.17, 'interior_designer': 0.2, 'psychologist': 0.5, 'teacher': 0.56, 'poet': 0.83, 'journalist': 0.84, 'painter': 0.98, 'personal_trainer': 1.03, 'professor': 1.05, 'physician': 1.31, 'attorney': 1.43, 'accountant': 1.49, 'dentist': 1.6, 'photographer': 1.61, 'filmmaker': 1.67, 'chiropractor': 2.19, 'pastor': 2.6

**Add biased classes (jobs) for both female and male to the train, valid, and test splits**

In [9]:
# Make sure female- and male-biased jobs are present in the train, valid and test parts.
# A dedicated, seeded generator makes the split reproducible across kernel restarts, and
# `permutation` returns a shuffled copy, so re-running this cell gives the same split
# and leaves `female_biased_jobs` / `male_biased_jobs` untouched.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)
female_occupations_split = np.array_split(split_rng.permutation(female_biased_jobs), 3)
male_occupations_split = np.array_split(split_rng.permutation(male_biased_jobs), 3)

# Each split receives one third of the female-biased and one third of the male-biased jobs.
train_occupations = np.concatenate((female_occupations_split[0], male_occupations_split[0]))
valid_occupations = np.concatenate((female_occupations_split[1], male_occupations_split[1]))
test_occupations = np.concatenate((female_occupations_split[2], male_occupations_split[2]))

print(f"Classes for train split: {train_occupations}")
print(f"Classes for valid split: {valid_occupations}")
print(f"Classes for test split: {test_occupations}")




Classes for train split: ['paralegal' 'interior_designer' 'dietitian' 'composer'
 'software_engineer' 'attorney' 'surgeon' 'physician' 'personal_trainer'
 'chiropractor']
Classes for valid split: ['nurse' 'poet' 'yoga_teacher' 'accountant' 'comedian' 'pastor'
 'architect' 'rapper' 'dentist']
Classes for test split: ['teacher' 'psychologist' 'model' 'filmmaker' 'professor' 'painter'
 'journalist' 'photographer' 'dj']


# Flipping Gender

**In this section, the goal is to generate: gender flipped, neutral, pro-stereotype, anti-stereotype, and balanced datasets from the original dataset. For more information on how to create these datasets, please refer to the report**

In [10]:
# Load the spaCy English model and build the gender substitutor.
nlp=spacy.load('en_core_web_sm')
import sys

from substitutor import Substitutor
from substitutor import load_json_pairs

gender_pairs = load_json_pairs(f'{data_path}/gender_pairs.json')
neutral_pairs = load_json_pairs(f'{data_path}/neutral_pairs.json')
# Only the first line of the file is parsed as JSON (as before); the context manager
# closes the file handle, which the previous bare open() call leaked.
with open(f'{data_path}/gender_names.json', "rb") as names_file:
    full_names = json.loads(names_file.readline())

# Initialise a substitutor with a list of pairs of gendered words (and optionally names)
substitutor = Substitutor(gender_pairs, full_names, neutral_pairs)
def gender_flip(text):
    """Return `text` as rewritten by `substitutor.invert_text_gender`."""
    return substitutor.invert_text_gender(text)

def neutral_flip(text):
    """Return `text` as rewritten by `substitutor.invert_text_neutral`."""
    return substitutor.invert_text_neutral(text)





**Example of how to flip a text into its opposite-gender and gender-free (neutral) versions**

In [11]:
# Run both substitutions on a short sentence and show the three variants.
text = 'Mike is nice, Lynn and James are nice"'
gender_flipped, gender_free = gender_flip(text), neutral_flip(text)
print(
    f"original text: {text}",
    f"gender flipped text: {gender_flipped}",
    f"gender free: {gender_free}",
    sep="\n",
)



original text: Mike is nice, Lynn and James are nice"
gender flipped text: Kenly is nice, Davyn and Hannia are nice"
gender free: E3 is nice, E2 and E1 are nice"


**Generate the datasets**

In [None]:
# Build the original, gender-flipped (gf), neutral, pro-stereotype, anti-stereotype and
# balanced datasets in a single pass over `data`.
#
# For every row:
#   * the original / gender-flipped / neutral sentence is appended to the matching
#     `*_samples` list (these feed the plain-text "full" corpora);
#   * a labelled record is appended to the train or valid list of each variant,
#     depending on the occupation split the row's job belongs to;
#   * rows whose job is in neither the train nor the valid split go to `test`,
#     which only ever holds the original sentence.

train = []
valid = []
test = []
samples = []

neutral_train = []
neutral_valid = []
neutral_samples = []

# gf = gender flipped
gf_train = []
gf_valid = []
gf_samples = []

pro_stereo_train = []
pro_stereo_valid = []
pro_stereo_samples = []

anti_stereo_train = []
anti_stereo_valid = []
anti_stereo_samples = []

# Sets give constant-time membership tests inside the per-row loop.
_train_job_set = set(np.asarray(train_occupations).tolist())
_valid_job_set = set(np.asarray(valid_occupations).tolist())
_female_job_set = set(female_biased_jobs)
_male_job_set = set(male_biased_jobs)


def _make_record(job, gender, sentence):
    """Build one JSONL record; `gender` is always the gender of the original row."""
    return {'label': job, 'gender': gender, 'sentence': sentence}


def _stereotype_versions(job, gender, original, flipped):
    """Return `(pro_stereotype, anti_stereotype)` sentences for a row, or None.

    The original sentence is pro-stereotypical when the row's gender matches the
    job's dominant gender; otherwise the gender-flipped sentence is. None is
    returned when the job is in neither biased list or the gender is not 'F'/'M'.
    """
    if job in _female_job_set:
        dominant_gender = 'F'
    elif job in _male_job_set:
        dominant_gender = 'M'
    else:
        return None
    if gender == dominant_gender:
        return original, flipped
    if gender in ('F', 'M'):
        return flipped, original
    return None


for job, gender, description in data[['job', 'gender', 'description']].itertuples(index=False, name=None):
    sen = gender_flip(description)
    sen_neutral = neutral_flip(description)

    samples.append(description)
    gf_samples.append(sen)
    neutral_samples.append(sen_neutral)

    # The stereotype sample lists cover every row, including test-split rows.
    versions = _stereotype_versions(job, gender, description, sen)
    if versions is not None:
        pro_stereo_samples.append(versions[0])
        anti_stereo_samples.append(versions[1])

    if job in _train_job_set:
        original_out, gf_out, neutral_out, pro_out, anti_out = (
            train, gf_train, neutral_train, pro_stereo_train, anti_stereo_train)
    elif job in _valid_job_set:
        original_out, gf_out, neutral_out, pro_out, anti_out = (
            valid, gf_valid, neutral_valid, pro_stereo_valid, anti_stereo_valid)
    else:
        test.append(_make_record(job, gender, description))
        continue

    original_out.append(_make_record(job, gender, description))
    # NOTE(review): flipped / neutral records keep the gender label of the original row,
    # as in the previous implementation -- confirm this is intended.
    gf_out.append(_make_record(job, gender, sen))
    neutral_out.append(_make_record(job, gender, sen_neutral))
    if versions is not None:
        pro_out.append(_make_record(job, gender, versions[0]))
        anti_out.append(_make_record(job, gender, versions[1]))


# Ordered (unshuffled) 70/30 split of the raw sentences of every variant.
train_samples, test_samples = train_test_split(samples, shuffle=False, test_size=0.30)

gf_train_samples, gf_test_samples = train_test_split(gf_samples, shuffle=False, test_size=0.30)

pro_stereo_train_samples, pro_stereo_test_samples = train_test_split(pro_stereo_samples, shuffle=False, test_size=0.30)

anti_stereo_train_samples, anti_stereo_test_samples = train_test_split(anti_stereo_samples, shuffle=False, test_size=0.30)

neutral_train_samples, neutral_test_samples = train_test_split(neutral_samples, shuffle=False, test_size=0.30)


# Balanced dataset = original + gender-flipped data.
# Plain list concatenation is used instead of np.concatenate: turning long sentences into
# a fixed-width NumPy string array pads every entry to the longest sentence.
balanced_train = list(train) + list(gf_train)
balanced_valid = list(valid) + list(gf_valid)

balanced_samples_train = list(train_samples) + list(gf_train_samples)
balanced_samples_test = list(test_samples) + list(gf_test_samples)

# The cell that writes the files refers to these names; define them as well so it
# does not fail with a NameError.
balanced_train_samples = balanced_samples_train
balanced_test_samples = balanced_samples_test




In [None]:
       
# Write every dataset variant to disk.
#
# Layout, for each <variant> in original / gender-swapped / balanced / pro-stereotype /
# anti-stereotype / neutral:
#   <output_directory>/<variant>/train.jsonl, valid.jsonl, test.jsonl
#   <output_directory>/<variant>/full/full-train.txt, full-test.txt
# test.jsonl is the same (original, unmodified) test set for every variant.

output_directory = '../../data/CommonCrawl'


def _write_jsonl(path, records):
    """Write `records` to `path`, one JSON object per line."""
    with open(path, 'w', encoding='utf-8') as f:
        for obj in records:
            f.write(json.dumps(obj) + '\n')


def _write_text(path, lines):
    """Write `lines` to `path`, separated by newlines (no trailing newline)."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))


# variant name -> (train records, valid records, full-train sentences, full-test sentences)
# The balanced entry uses `balanced_samples_train` / `balanced_samples_test`, the names the
# generation cell defines (the previous `balanced_train_samples` raised a NameError).
dataset_variants = {
    'neutral': (neutral_train, neutral_valid, neutral_train_samples, neutral_test_samples),
    'pro-stereotype': (pro_stereo_train, pro_stereo_valid,
                       pro_stereo_train_samples, pro_stereo_test_samples),
    'anti-stereotype': (anti_stereo_train, anti_stereo_valid,
                        anti_stereo_train_samples, anti_stereo_test_samples),
    'original': (train, valid, train_samples, test_samples),
    'gender-swapped': (gf_train, gf_valid, gf_train_samples, gf_test_samples),
    'balanced': (balanced_train, balanced_valid,
                 balanced_samples_train, balanced_samples_test),
}

for variant, (train_records, valid_records, full_train_lines, full_test_lines) in dataset_variants.items():
    variant_dir = f'{output_directory}/{variant}'
    # exist_ok=True creates any missing level (including `full/` when the variant
    # directory already exists) and makes the cell safe to re-run.
    os.makedirs(f'{variant_dir}/full', exist_ok=True)

    _write_jsonl(f'{variant_dir}/train.jsonl', train_records)
    _write_jsonl(f'{variant_dir}/valid.jsonl', valid_records)
    _write_jsonl(f'{variant_dir}/test.jsonl', test)

    _write_text(f'{variant_dir}/full/full-train.txt', full_train_lines)
    _write_text(f'{variant_dir}/full/full-test.txt', full_test_lines)



 

