Skip to content
This repository has been archived by the owner on Nov 3, 2023. It is now read-only.

Commit

Permalink
[Names Bias] Code for creating sets of names (#4836)
Browse files Browse the repository at this point in the history
* Add name processing code

* Update util.py

* Update util.py

* Update util.py
  • Loading branch information
EricMichaelSmith committed Oct 31, 2022
1 parent b0b231d commit b9317bd
Show file tree
Hide file tree
Showing 2 changed files with 226 additions and 0 deletions.
1 change: 1 addition & 0 deletions projects/dialogue_bias/README.md
Expand Up @@ -13,6 +13,7 @@ All AI models are susceptible to learning biases in data that they are trained o
## Code

- `projects.dialogue_bias.agents:NoBiasStyleGenAgent`: Agent that appends a `"no_bias"` string to the context of every example in order to perform controllable generation.
- In `util.py`, `get_gender_name_list()` and `get_race_ethnicity_gender_name_list()` will process and output the two sets of names used in our work. They require as inputs the raw datasets of names from the papers that we drew names from: see docstrings for details.

## Models

Expand Down
225 changes: 225 additions & 0 deletions projects/dialogue_bias/util.py
@@ -0,0 +1,225 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
from collections import defaultdict
from typing import Dict, List, Tuple

import pandas as pd


RACES_ETHNICITIES = ['hispanic', 'white', 'black', 'api', 'aian', '2prace']
# Notations for races/ethnicities reflect those used in Tzioumis et al. (see
# https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/TYJKEZ/
# MPMHFE&version=1.3 for details)
RACES_ETHNICITIES_WITH_NAMES = ['hispanic', 'white', 'black', 'api']
# Some races/ethnicities don't have any names on the Tzioumis list for which they are
# the plurality race/ethnicity; we exclude those races/ethnicities from this list.


def get_gender_name_list(gender: str, names_path: str) -> List[str]:
"""
Return a list of names of the specified gender from Newman, et al.
Read names from https://journals.sagepub.com/doi/abs/10.1177/0146167218769858 and
filter by the specified gender.
"""
name_df = pd.read_csv(names_path)
names = name_df[lambda df: df['Gender'].str.lower() == gender][
'Name'
].values.tolist()
print(f'Using {len(names):d} {gender} names: ' + ', '.join(names))
return names


def get_race_ethnicity_gender_name_list(
baby_name_folder: str,
tzioumis_data_path: str,
race_gender_name_lists: Dict[str, List[str]],
) -> Tuple[Dict[str, str], Dict[str, List[str]]]:
"""
Get name lists split by both race/ethnicity and by gender, given input name lists
(see inner functions for the sources of these lists).
"""

baby_name_counts_by_gender = get_baby_name_counts_by_gender(baby_name_folder)

# Split original name lists by gender
orig_name_lists = get_tzioumis_name_lists(
tzioumis_data_path=tzioumis_data_path,
race_gender_name_lists=race_gender_name_lists,
)
print('\nSplitting original name lists by gender.')
names_to_new_lists = {}
for name_list, names in orig_name_lists.items():
for name in names:
proc_name = name.replace('-', '')
proc_name = proc_name[0].upper() + proc_name[1:].lower()
# Removing hyphens and changing capitalization to match the formatting of
# the baby-name lists
if (
baby_name_counts_by_gender[proc_name]['F']
> baby_name_counts_by_gender[proc_name]['M']
):
names_to_new_lists[name] = f'{name_list}_female'
elif (
baby_name_counts_by_gender[proc_name]['M']
> baby_name_counts_by_gender[proc_name]['F']
):
names_to_new_lists[name] = f'{name_list}_male'
else:
# Tie
names_to_new_lists[name] = f'{name_list}_unknown'
assert len(names_to_new_lists) == sum(
[len(names) for names in orig_name_lists.values()]
)

# Print the names in each of the new name lists
new_name_lists = defaultdict(list)
for name, name_list in names_to_new_lists.items():
new_name_lists[name_list].append(name)
for name_list in sorted(new_name_lists.keys()):
sorted_names = sorted(new_name_lists[name_list])
print(
f'\nUsing {len(sorted_names):d} names for the {name_list} name list: '
+ ', '.join(sorted_names)
)

return names_to_new_lists, new_name_lists


def get_baby_name_counts_by_gender(baby_name_folder: str) -> Dict[str, Dict[str, int]]:
"""
Return a dictionary whose keys are baby names and whose values are counts of the
number of babies given that name, split by gender.
Baby name folder from https://catalog.data.gov/dataset/
baby-names-from-social-security-card-applications-national-data,
accessed 2021-04-02.
"""

# Params
final_baby_name_year = 2019
baby_name_year_range = range(final_baby_name_year - 99, final_baby_name_year + 1)
# Get the most recent 100 years of names

# Get counts of baby names by gender
baby_name_counts_by_gender = defaultdict(lambda: {'F': 0, 'M': 0})
for year in baby_name_year_range:
counts_path = os.path.join(baby_name_folder, f'yob{year:d}.txt')
with open(counts_path) as f:
for line in f:
name, gender, count_string = line.split(',')
count = int(count_string.rstrip())
baby_name_counts_by_gender[name][gender] += count

return baby_name_counts_by_gender


def get_tzioumis_name_lists(
tzioumis_data_path: str,
race_gender_name_lists: Dict[str, List[str]],
) -> Dict[str, List[str]]:
"""
Get race/ethnicity name lists from the Tzioumis work.
"""
percent_df = load_tzioumis_data(tzioumis_data_path)
name_lists = {}
for race_ethnicity in RACES_ETHNICITIES_WITH_NAMES:
name_lists[race_ethnicity] = get_race_ethnicity_name_list_given_tzioumis_data(
percent_df=percent_df,
race_gender_name_lists=race_gender_name_lists,
race_ethnicity=race_ethnicity,
)
return name_lists


def load_tzioumis_data(tzioumis_data_path: str) -> pd.DataFrame:
"""
Load Tzioumis data from
https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi%3A10.7910%2FDVN%2FTYJKEZ
(Paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5839157/)
"""
print(f'Loading Tzioumis data from {tzioumis_data_path}')
percent_df = (
pd.read_excel(tzioumis_data_path, sheet_name='Data')[
lambda df: df['firstname'] != 'ALL OTHER FIRST NAMES'
]
.assign(
firstname=lambda df: df['firstname'].apply(lambda s: s[0] + s[1:].lower())
)
.set_index('firstname')
)
return percent_df


def get_race_ethnicity_name_list_given_tzioumis_data(
percent_df: pd.DataFrame,
race_gender_name_lists: Dict[str, List[str]],
race_ethnicity: str,
) -> List[str]:
"""
Given input Tzioumis data (percent_df) and a set of names split by gender and
race/ethnicity from Milkman et al.
(2012), Caliskan et al. (2017), and Guo and Caliskan (2020)., get a list of names
for the given race/ethnicity.
"""

# Params
tzioumis_to_race_gender_mapping = {'hispanic': 'his', 'white': 'ea', 'black': 'aa'}
num_names_per_race_ethnicity = 200
# Number of names to select per race/ethnicity, to keep the lists tractable

# Determine which names are most commonly of the specified race/ethnicity (i.e.
# plurality), and pick those that have the most observations for that
# ethnicity
this_ethnicity_column = f'pct{race_ethnicity}'
percent_columns = [f'pct{race_eth}' for race_eth in RACES_ETHNICITIES]
max_percent_series = percent_df[percent_columns].max(axis=1)
percent_plurality_names_df = (
percent_df[lambda df: df[this_ethnicity_column] == max_percent_series]
.assign(
obs_of_this_ethnicity=lambda df: df['obs'] * df[this_ethnicity_column] / 100
)
.sort_values('obs_of_this_ethnicity', ascending=False)
)
tzioumis_plurality_names = percent_plurality_names_df.iloc[
:num_names_per_race_ethnicity
].index.values.tolist()

# Combine these names with the Caliskan+ race+gender names and deduplicate
if race_ethnicity in tzioumis_to_race_gender_mapping:
mapped_ethnicity = tzioumis_to_race_gender_mapping[race_ethnicity]
female_race_gender_name_list = race_gender_name_lists[
f'{mapped_ethnicity}_female'
]
if mapped_ethnicity == 'aa':
# Avoid the same name in two lists by removing it from this one
female_race_gender_name_list.remove('Yolanda')
elif mapped_ethnicity == 'his':
# Avoid the same name in two lists by removing it from this one
female_race_gender_name_list.remove('Brenda')
# TODO: add a programmatic way to detect duplicates, to generalize for updated
# versions of the source datasets
male_race_gender_name_list = race_gender_name_lists[f'{mapped_ethnicity}_male']
combined_names = (
tzioumis_plurality_names
+ female_race_gender_name_list
+ male_race_gender_name_list
)
else:
combined_names = tzioumis_plurality_names

# Deduplicate and sort
sorted_names = sorted(list(set(combined_names)))

print(
f'Using {len(sorted_names):d} names for the {race_ethnicity} race/ethnicity: '
+ ', '.join(sorted_names)
)

return sorted_names

0 comments on commit b9317bd

Please sign in to comment.