# Setup

**Mount Google Drive** (only when running on Colab)

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount= True)
Folder_name = 'GenderBiasFewShotText'
assert Folder_name is not None, "[1] Enter the folder name"
import sys 
sys.path.append('content/drive/MyDrive/{}'.format(Folder_name))
%cd drive/MyDrive/$Folder_name/




Mounted at /content/drive
/content/drive/MyDrive/GenderBiasFewShotText


In [2]:
# Execute this cell only when running on Colab.
# It changes the working directory to BASE_PATH (relative to the project
# folder entered in the mount cell); the relative data path used further down
# ('../data_utilities/') presumably relies on this location — verify when
# running outside Colab.
BASE_PATH = 'gender_data_substitution/src'
%cd {BASE_PATH}

/content/drive/MyDrive/GenderBiasFewShotText/gender_data_substitution/src


**Install**

In [3]:
# Install the packages this notebook needs at runtime.
!pip install spacy
# NOTE(review): the package name `c` looks truncated or mistyped — confirm
# which dependency was intended before relying on this line.
!pip install c



**Imports**

In [4]:
import pandas as pd
import pickle
import re
import os
import numpy as np
import csv
import json
from sklearn.model_selection import train_test_split
import itertools
import spacy
from operator import itemgetter


# Dataset

**Remove duplicates from the data and drop unnecessary columns**

In [5]:
# Read the raw event table, report its size, then drop the unused columns and
# keep a single row per unique (Name, Gender, Occupation, Text) combination.
data_path = '../data_utilities/'
raw_events = pd.read_csv(data_path + 'WikipediaGenderEvents.csv')
print(f"Length of data before dropping duplicates: {raw_events.shape[0]}")
data = (
    raw_events
    .drop(['Event', 'Keyword'], axis=1)
    .drop_duplicates(subset=["Name", "Gender", "Occupation", "Text"])
)
print(f"Length of data after dropping duplicates: {data.shape[0]}")
data.head()

Length of data before dropping duplicates: 581
Length of data after dropping duplicates: 555


Unnamed: 0,Name,Gender,Occupation,Text
0,Dolester Miles,F,chefs,"In 1982, Frank Stitt opened Highlands Bar and ..."
1,Nancy Silverton,F,chefs,After Silverton returned to Los Angeles in 198...
2,Beau MacMillan,M,chefs,MacMillan moved to Los Angeles to become sous ...
3,Anthony Amoroso,M,chefs,"In 2003, Amoroso relocated to Las Vegas, Nevad..."
4,Debra Ponzek,F,chefs,Ponzek married fellow chef Bobby Flay in May 1...


# Odds ratio

**Check whether the data is gender-imbalanced**

In [6]:
def odds_ratio(female_dict, male_dict, topk=1000, threshold=1):
    """Compute the male-vs-female odds ratio of every shared category.

    For a category with ``male_num`` / ``female_num`` occurrences, the ratio is
    ``(male_num / female_num) / (non_male_num / non_female_num)`` rounded to
    two decimals, where ``non_*_num`` is that gender's total over all
    categories minus the category's own count.

    Args:
        female_dict: mapping category -> number of female occurrences.
        male_dict: mapping category -> number of male occurrences; must have
            exactly the same keys as ``female_dict``.
        topk: maximum number of entries kept in each returned ranking.
        threshold: minimum count required for *both* genders for a category
            to be scored.

    Returns:
        A tuple ``(highest, lowest)`` of dicts: the ``topk`` categories with
        the largest ratios in descending order, and the ``topk`` categories
        with the smallest ratios in ascending order.

    Raises:
        ValueError: if the two mappings do not cover the same categories.
    """
    # Compare the key sets, not just their sizes: equal-sized mappings with
    # different categories would otherwise fail later with a KeyError.
    if set(female_dict) != set(male_dict):
        raise ValueError("The category for analyzing the male and female should be the same!")

    total_num_female = sum(female_dict.values())
    total_num_male = sum(male_dict.values())

    ratios = {}
    for key, female_num in female_dict.items():
        male_num = male_dict[key]
        # We only consider the categories with at least {threshold}
        # occurrences for both genders.
        if not (female_num >= threshold and male_num >= threshold):
            continue
        non_female_num = total_num_female - female_num
        non_male_num = total_num_male - male_num
        # The ratio is undefined when any denominator is zero (e.g. a single
        # category, or no other category has events for one gender); skip the
        # category instead of raising ZeroDivisionError.
        if female_num == 0 or non_male_num == 0 or non_female_num == 0:
            continue
        ratios[key] = round((male_num / female_num) / (non_male_num / non_female_num), 2)

    highest = dict(sorted(ratios.items(), key=itemgetter(1), reverse=True)[:topk])
    lowest = dict(sorted(ratios.items(), key=itemgetter(1))[:topk])
    return highest, lowest

In [7]:
# Tabulate events per (occupation, gender) and tag each occupation with the
# gender that has at least as many events (ties count as 'F').
categories = data['Occupation'].unique().tolist()
counts = (
    data.groupby(['Occupation', 'Gender'])
    .size()
    .unstack('Gender')
    .reset_index()
)
counts['dominant'] = np.where(counts['F'] >= counts['M'], 'F', 'M')
counts['total'] = counts['F'] + counts['M']
for gender_code in ('F', 'M'):
    share = counts[gender_code] / counts['total'] * 100
    counts[f'occupation % for {gender_code}'] = share.round(1)
counts = counts.sort_values(by=['occupation % for F'])

# Occupations are listed in the sorted order of `counts` (ascending female share).
is_female_dominant = counts['dominant'] == 'F'
female_biased_jobs = counts.loc[is_female_dominant, 'Occupation'].tolist()
male_biased_jobs = counts.loc[~is_female_dominant, 'Occupation'].tolist()

print(f"female biased jobs: {female_biased_jobs}")
print(f"male biased jobs: {male_biased_jobs}")



female biased jobs: ['writers', 'artists', 'models']
male biased jobs: ['musicians', 'podcasters', 'dancers', 'chefs', 'comedians']


In [8]:
# Map every occupation to its female / male event count.
female_dict = dict(zip(counts['Occupation'], counts['F']))
male_dict = dict(zip(counts['Occupation'], counts['M']))

# odds_ratio returns two rankings: highest ratios first, then lowest ratios first.
odds_m_c, odds_f_c = odds_ratio(female_dict, male_dict)

print(f"Odd ratio for male dominant jobs: {odds_m_c}")
print(f"Odd ratio for female dominant jobs: {odds_f_c}")




Odd ratio for male dominant jobs: {'musicians': 1.25, 'podcasters': 1.2, 'dancers': 1.07, 'chefs': 1.03, 'comedians': 1.0, 'writers': 0.93, 'artists': 0.85, 'models': 0.78}
Odd ratio for female dominant jobs: {'models': 0.78, 'artists': 0.85, 'writers': 0.93, 'comedians': 1.0, 'chefs': 1.03, 'dancers': 1.07, 'podcasters': 1.2, 'musicians': 1.25}


**Assign the gender-biased classes (jobs), both female- and male-biased, to the train, valid, and test splits**

In [9]:
#make sure to have biased jobs in train, valid, and test part
np.random.shuffle(female_biased_jobs)
np.random.shuffle(male_biased_jobs)
female_occupations_split = np.array_split(female_biased_jobs, 3)
male_occupations_split = np.array_split(male_biased_jobs, 3)

train_occupations=np.concatenate((female_occupations_split[0], male_occupations_split[0]))
valid_occupations=np.concatenate((female_occupations_split[1], male_occupations_split[1]))
test_occupations=np.concatenate((female_occupations_split[2], male_occupations_split[2]))


print(f"Classes for train split: {train_occupations}")
print(f"Classes for valid split: {valid_occupations}")
print(f"Classes for test split: {test_occupations}")




Classes for train split: ['writers' 'chefs' 'comedians']
Classes for valid split: ['models' 'dancers' 'musicians']
Classes for test split: ['artists' 'podcasters']


# Flipping Gender

**In this section, the goal is to generate: gender flipped, neutral, pro-stereotype, anti-stereotype, and balanced datasets from the original dataset. For more information on how to create these datasets, please refer to the report**

In [10]:
# Load the spaCy English pipeline and the project-local substitution helpers.
nlp = spacy.load('en_core_web_sm')
import sys

from substitutor import Substitutor
from substitutor import load_json_pairs

gender_pairs = load_json_pairs(f'{data_path}/gender_pairs.json')
neutral_pairs = load_json_pairs(f'{data_path}/neutral_pairs.json')
# Only the first line of the names file is parsed. The context manager makes
# sure the file handle is closed (the bare `open(...)` call leaked it).
with open(f'{data_path}/gender_names.json', "rb") as names_file:
    full_names = json.loads(names_file.readline())


# Initialise a substitutor with the gendered word pairs, the names and the
# neutral pairs loaded above.
substitutor = Substitutor(gender_pairs, full_names, neutral_pairs)


def gender_flip(text):
    """Return `text` as transformed by `substitutor.invert_text_gender`."""
    flipped = substitutor.invert_text_gender(text)
    return flipped


def neutral_flip(text):
    """Return `text` as transformed by `substitutor.invert_text_neutral`."""
    flipped = substitutor.invert_text_neutral(text)
    return flipped



**Example of how to flip a text into its opposite-gender and gender-free (neutral) versions**

In [11]:
# Run both substitutions on one short sentence and show the three versions.
text = 'Mike is nice, Lynn and James are nice"'
gender_flipped = gender_flip(text)
gender_free = neutral_flip(text)
print(
    f"original text: {text}",
    f"gender flipped text: {gender_flipped}",
    f"gender free: {gender_free}",
    sep="\n",
)

original text: Mike is nice, Lynn and James are nice"
gender flipped text: Asma is nice, Kai and Jezabel are nice"
gender free: E3 is nice, E2 and E1 are nice"


**Generate the datasets**

In [12]:
# Build every dataset variant in a single pass over the de-duplicated events.
# *_train / *_valid hold JSONL records, *_samples hold the bare sentences.
train, valid, test, samples = [], [], [], []

# gf = gender flipped
gf_train, gf_valid, gf_samples = [], [], []
neutral_train, neutral_valid, neutral_samples = [], [], []
pro_stereo_train, pro_stereo_valid, pro_stereo_samples = [], [], []
anti_stereo_train, anti_stereo_valid, anti_stereo_samples = [], [], []


def _make_record(row, sentence):
    """Build one record carrying the row's occupation and gender plus `sentence`.

    NOTE(review): records built from a flipped sentence keep the row's
    original 'gender' value — confirm that this is intended.
    """
    return {'label': row['Occupation'], 'gender': row['Gender'], 'sentence': sentence}


def _stereotype_pair(row, original, flipped):
    """Return (pro_stereotype_sentence, anti_stereotype_sentence) for `row`.

    When the row's gender matches the dominant gender of its occupation, the
    original text is the pro-stereotype version and the flipped text the
    anti-stereotype one; when it is the opposite gender, the roles swap.
    Returns None when the row matches neither case.
    """
    in_female_biased = row['Occupation'] in female_biased_jobs
    in_male_biased = row['Occupation'] in male_biased_jobs
    gender = row['Gender']
    if (in_female_biased and gender == 'F') or (in_male_biased and gender == 'M'):
        return original, flipped
    if (in_female_biased and gender == 'M') or (in_male_biased and gender == 'F'):
        return flipped, original
    return None


# Target lists per split, in the order:
# (original, gender-flipped, neutral, pro-stereotype, anti-stereotype).
_split_buckets = {
    'train': (train, gf_train, neutral_train, pro_stereo_train, anti_stereo_train),
    'valid': (valid, gf_valid, neutral_valid, pro_stereo_valid, anti_stereo_valid),
}

for index, row in data.iterrows():
    samples.append(row['Text'])
    sen = gender_flip(row['Text'])
    gf_samples.append(sen)
    sen_neutral = neutral_flip(row['Text'])
    neutral_samples.append(sen_neutral)

    # The stereotype sample lists cover every split, including test.
    stereo_pair = _stereotype_pair(row, row['Text'], sen)
    if stereo_pair is not None:
        pro_sentence, anti_sentence = stereo_pair
        pro_stereo_samples.append(pro_sentence)
        anti_stereo_samples.append(anti_sentence)

    if row['Occupation'] in train_occupations:
        split_name = 'train'
    elif row['Occupation'] in valid_occupations:
        split_name = 'valid'
    else:
        # Test split: only the original record is kept; no flipped, neutral
        # or stereotype records are produced for it.
        test.append(_make_record(row, row['Text']))
        continue

    original_split, gf_split, neutral_split, pro_split, anti_split = _split_buckets[split_name]
    original_split.append(_make_record(row, row['Text']))
    gf_split.append(_make_record(row, sen))
    neutral_split.append(_make_record(row, sen_neutral))
    if stereo_pair is not None:
        pro_split.append(_make_record(row, pro_sentence))
        anti_split.append(_make_record(row, anti_sentence))


# Ordered (unshuffled) 70/30 cut of the bare sentences of every variant.
train_samples, test_samples = train_test_split(samples, shuffle=False, test_size=0.30)

gf_train_samples, gf_test_samples = train_test_split(gf_samples, shuffle=False, test_size=0.30)

pro_stereo_train_samples, pro_stereo_test_samples = train_test_split(pro_stereo_samples, shuffle=False, test_size=0.30)

anti_stereo_train_samples, anti_stereo_test_samples = train_test_split(anti_stereo_samples, shuffle=False, test_size=0.30)

neutral_train_samples, neutral_test_samples = train_test_split(neutral_samples, shuffle=False, test_size=0.30)


# Balanced dataset: the original data followed by its gender-flipped counterpart.
balanced_train = np.concatenate([train, gf_train])
balanced_valid = np.concatenate([valid, gf_valid])

balanced_train_samples = np.concatenate((train_samples, gf_train_samples))
balanced_test_samples = np.concatenate((test_samples, gf_test_samples))
    

In [14]:
      
# Write every dataset variant below `output_directory`:
#   <variant>/train.jsonl, valid.jsonl, test.jsonl   (one JSON record per line)
#   <variant>/full/full-train.txt, full-test.txt     (one sentence per line)
output_directory = '../../data/WikipediaGenderEvents'


def _write_jsonl(path, records):
    """Write each record of `records` to `path` as one JSON object per line."""
    with open(path, 'w', encoding='utf-8') as f:
        for obj in records:
            f.write(json.dumps(obj) + '\n')


def _write_text_lines(path, lines):
    """Write `lines` to `path`, newline-separated, without a trailing newline."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))


# variant directory -> (train records, valid records, full-train sentences, full-test sentences)
dataset_variants = {
    'neutral': (neutral_train, neutral_valid, neutral_train_samples, neutral_test_samples),
    'pro-stereotype': (pro_stereo_train, pro_stereo_valid, pro_stereo_train_samples, pro_stereo_test_samples),
    'anti-stereotype': (anti_stereo_train, anti_stereo_valid, anti_stereo_train_samples, anti_stereo_test_samples),
    'original': (train, valid, train_samples, test_samples),
    'gender-swapped': (gf_train, gf_valid, gf_train_samples, gf_test_samples),
    'balanced': (balanced_train, balanced_valid, balanced_train_samples, balanced_test_samples),
}

for variant_name, (train_records, valid_records, full_train, full_test) in dataset_variants.items():
    variant_dir = f'{output_directory}/{variant_name}'
    # Creates output_directory, the variant directory and its 'full'
    # sub-directory in one call. exist_ok also covers the case where the
    # variant directory already exists but 'full' does not, which previously
    # left 'full' missing and made the writes below fail.
    os.makedirs(f'{variant_dir}/full', exist_ok=True)

    _write_jsonl(f'{variant_dir}/train.jsonl', train_records)
    _write_jsonl(f'{variant_dir}/valid.jsonl', valid_records)
    # Every variant is evaluated on the same, unmodified test records.
    _write_jsonl(f'{variant_dir}/test.jsonl', test)

    _write_text_lines(f'{variant_dir}/full/full-train.txt', full_train)
    _write_text_lines(f'{variant_dir}/full/full-test.txt', full_test)


