Skip to content
This repository has been archived by the owner on Nov 3, 2023. It is now read-only.

[Names Bias] Code for creating sets of names #4836

Merged
merged 5 commits into from Oct 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions projects/dialogue_bias/README.md
Expand Up @@ -13,6 +13,7 @@ All AI models are susceptible to learning biases in data that they are trained o
## Code

- `projects.dialogue_bias.agents:NoBiasStyleGenAgent`: Agent that appends a `"no_bias"` string to the context of every example in order to perform controllable generation.
- In `util.py`, `get_gender_name_list()` and `get_race_ethnicity_gender_name_list()` will process and output the two sets of names used in our work. They require as inputs the raw datasets of names from the papers that we drew names from: see docstrings for details.

## Models

Expand Down
225 changes: 225 additions & 0 deletions projects/dialogue_bias/util.py
@@ -0,0 +1,225 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
from collections import defaultdict
from typing import Dict, List, Tuple

import pandas as pd


# Race/ethnicity categories; each entry names a `pct<entry>` column that is read from
# the Tzioumis data. The spellings (including 'api', 'aian', and '2prace') are
# intentional: they follow the notation used by Tzioumis et al.
RACES_ETHNICITIES = ['hispanic', 'white', 'black', 'api', 'aian', '2prace']
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are the spellings of the last two items here intentional?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that's how Tzioumis spelled them - adding a comment for this

# Notations for races/ethnicities in RACES_ETHNICITIES reflect those used in Tzioumis
# et al. (see
# https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/TYJKEZ/
# MPMHFE&version=1.3 for details)
RACES_ETHNICITIES_WITH_NAMES = ['hispanic', 'white', 'black', 'api']
# Some races/ethnicities ('aian' and '2prace') don't have any names on the Tzioumis
# list for which they are the plurality race/ethnicity; we exclude those
# races/ethnicities from this list.


def get_gender_name_list(gender: str, names_path: str) -> List[str]:
    """
    Return a list of names of the specified gender from Newman, et al.

    Read names from https://journals.sagepub.com/doi/abs/10.1177/0146167218769858 and
    filter by the specified gender.

    :param gender: gender label to select. It is compared case-insensitively against
        the 'Gender' column, so 'female' and 'Female' select the same rows.
    :param names_path: CSV source passed to `pd.read_csv`; it must provide 'Name'
        and 'Gender' columns.
    :return: the 'Name' values of the matching rows, in file order.
    """
    name_df = pd.read_csv(names_path)
    # The column is lowercased before comparing, so the query has to be lowercased
    # as well; otherwise a capitalized query silently matches nothing.
    target_gender = gender.lower()
    is_target_gender = name_df['Gender'].str.lower() == target_gender
    names = name_df.loc[is_target_gender, 'Name'].values.tolist()
    print(f'Using {len(names):d} {gender} names: ' + ', '.join(names))
    return names


def get_race_ethnicity_gender_name_list(
    baby_name_folder: str,
    tzioumis_data_path: str,
    race_gender_name_lists: Dict[str, List[str]],
) -> Tuple[Dict[str, str], Dict[str, List[str]]]:
    """
    Get name lists split by both race/ethnicity and by gender, given input name lists
    (see inner functions for the sources of these lists).

    Every name in the per-race/ethnicity lists is assigned to '<list>_female',
    '<list>_male', or '<list>_unknown', depending on which gender has the larger
    baby-name count for that name. Ties, including names with no counts at all, go
    to '<list>_unknown'.

    :param baby_name_folder: folder passed to `get_baby_name_counts_by_gender()`.
    :param tzioumis_data_path: path passed to `get_tzioumis_name_lists()`.
    :param race_gender_name_lists: existing name lists keyed by race and gender,
        passed to `get_tzioumis_name_lists()`.
    :return: a tuple of (mapping from each name to its new list, mapping from each
        new list to the names in it).
    """

    baby_name_counts_by_gender = get_baby_name_counts_by_gender(baby_name_folder)

    # Split original name lists by gender
    orig_name_lists = get_tzioumis_name_lists(
        tzioumis_data_path=tzioumis_data_path,
        race_gender_name_lists=race_gender_name_lists,
    )
    print('\nSplitting original name lists by gender.')
    no_counts = {'F': 0, 'M': 0}
    names_to_new_lists = {}
    for name_list, names in orig_name_lists.items():
        for name in names:
            # Removing hyphens and changing capitalization to match the formatting of
            # the baby-name lists. Slicing (rather than indexing) keeps an empty or
            # all-hyphen name from raising an IndexError.
            proc_name = name.replace('-', '')
            proc_name = proc_name[:1].upper() + proc_name[1:].lower()
            # Use .get() so that looking up an unlisted name neither raises on a
            # plain dict nor inserts a new entry into a defaultdict
            counts = baby_name_counts_by_gender.get(proc_name, no_counts)
            if counts['F'] > counts['M']:
                gender_suffix = 'female'
            elif counts['M'] > counts['F']:
                gender_suffix = 'male'
            else:
                # Tie
                gender_suffix = 'unknown'
            names_to_new_lists[name] = f'{name_list}_{gender_suffix}'
    num_orig_names = sum(len(names) for names in orig_name_lists.values())
    assert len(names_to_new_lists) == num_orig_names, (
        f'Expected {num_orig_names:d} distinct names but found '
        f'{len(names_to_new_lists):d}: a name appears in more than one of the '
        'original name lists'
    )

    # Print the names in each of the new name lists
    new_name_lists = defaultdict(list)
    for name, name_list in names_to_new_lists.items():
        new_name_lists[name_list].append(name)
    for name_list in sorted(new_name_lists):
        sorted_names = sorted(new_name_lists[name_list])
        print(
            f'\nUsing {len(sorted_names):d} names for the {name_list} name list: '
            + ', '.join(sorted_names)
        )

    return names_to_new_lists, new_name_lists


def get_baby_name_counts_by_gender(
    baby_name_folder: str,
    *,
    final_baby_name_year: int = 2019,
    num_years: int = 100,
) -> Dict[str, Dict[str, int]]:
    """
    Return a dictionary whose keys are baby names and whose values are counts of the
    number of babies given that name, split by gender.

    Baby name folder from https://catalog.data.gov/dataset/
    baby-names-from-social-security-card-applications-national-data,
    accessed 2021-04-02.

    :param baby_name_folder: folder containing one `yob<year>.txt` file per year,
        each line of which has the form `name,gender,count`.
    :param final_baby_name_year: last year (inclusive) whose file is read.
    :param num_years: number of consecutive years to read, ending at
        `final_baby_name_year`. The defaults cover 1920 through 2019.
    :return: a defaultdict mapping each name to `{'F': count, 'M': count}`; looking
        up a name that was never seen yields zero counts for both genders.
    """

    # Get the most recent `num_years` years of names
    baby_name_year_range = range(
        final_baby_name_year - num_years + 1, final_baby_name_year + 1
    )

    # Get counts of baby names by gender
    baby_name_counts_by_gender = defaultdict(lambda: {'F': 0, 'M': 0})
    for year in baby_name_year_range:
        counts_path = os.path.join(baby_name_folder, f'yob{year:d}.txt')
        with open(counts_path, encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    # Skip blank lines (e.g. an extra newline at the end of the
                    # file), which would otherwise break the 3-way unpacking below
                    continue
                name, gender, count_string = line.split(',')
                baby_name_counts_by_gender[name][gender] += int(count_string)

    return baby_name_counts_by_gender


def get_tzioumis_name_lists(
    tzioumis_data_path: str,
    race_gender_name_lists: Dict[str, List[str]],
) -> Dict[str, List[str]]:
    """
    Build one name list per race/ethnicity from the Tzioumis work.

    The Tzioumis data is loaded once from `tzioumis_data_path`; then, for every
    entry of RACES_ETHNICITIES_WITH_NAMES, the name list for that race/ethnicity is
    computed from the loaded data and `race_gender_name_lists`. The result maps each
    race/ethnicity to its name list.
    """
    tzioumis_df = load_tzioumis_data(tzioumis_data_path)
    return {
        race_ethnicity: get_race_ethnicity_name_list_given_tzioumis_data(
            percent_df=tzioumis_df,
            race_gender_name_lists=race_gender_name_lists,
            race_ethnicity=race_ethnicity,
        )
        for race_ethnicity in RACES_ETHNICITIES_WITH_NAMES
    }


def load_tzioumis_data(tzioumis_data_path: str) -> pd.DataFrame:
    """
    Read the Tzioumis first-name data into a DataFrame indexed by first name.

    Data from
    https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi%3A10.7910%2FDVN%2FTYJKEZ
    (Paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5839157/)

    The 'Data' sheet is read, the 'ALL OTHER FIRST NAMES' row is dropped, and every
    remaining first name keeps its first character while the rest is lowercased.
    """
    print(f'Loading Tzioumis data from {tzioumis_data_path}')
    raw_df = pd.read_excel(tzioumis_data_path, sheet_name='Data')

    # Drop the catch-all row, which is not an actual first name
    is_actual_name = raw_df['firstname'] != 'ALL OTHER FIRST NAMES'
    named_df = raw_df[is_actual_name]

    # Leave the first character as-is and lowercase everything after it
    recased_names = named_df['firstname'].apply(lambda s: s[0] + s[1:].lower())
    return named_df.assign(firstname=recased_names).set_index('firstname')


def get_race_ethnicity_name_list_given_tzioumis_data(
    percent_df: pd.DataFrame,
    race_gender_name_lists: Dict[str, List[str]],
    race_ethnicity: str,
) -> List[str]:
    """
    Get a list of names for the given race/ethnicity.

    The list combines (1) names from the input Tzioumis data (percent_df) for which
    this race/ethnicity is the plurality, keeping those with the most observations,
    and (2) a set of names split by gender and race/ethnicity from Milkman et al.
    (2012), Caliskan et al. (2017), and Guo and Caliskan (2020).

    :param percent_df: Tzioumis data indexed by first name, with an 'obs' column and
        one `pct<race/ethnicity>` column per entry of RACES_ETHNICITIES.
    :param race_gender_name_lists: name lists keyed by '<code>_female' and
        '<code>_male', where <code> is 'his', 'ea', or 'aa'. These lists are only
        read, never modified.
    :param race_ethnicity: the race/ethnicity (Tzioumis notation) to build a list
        for.
    :return: the deduplicated names, sorted alphabetically.
    """

    # Params
    tzioumis_to_race_gender_mapping = {'hispanic': 'his', 'white': 'ea', 'black': 'aa'}
    female_names_to_exclude = {'aa': {'Yolanda'}, 'his': {'Brenda'}}
    # Avoid the same name in two lists by excluding it from the listed one
    # TODO: add a programmatic way to detect duplicates, to generalize for updated
    # versions of the source datasets
    num_names_per_race_ethnicity = 200
    # Number of names to select per race/ethnicity, to keep the lists tractable

    # Determine which names are most commonly of the specified race/ethnicity (i.e.
    # plurality), and pick those that have the most observations for that
    # ethnicity
    this_ethnicity_column = f'pct{race_ethnicity}'
    percent_columns = [f'pct{race_eth}' for race_eth in RACES_ETHNICITIES]
    max_percent_series = percent_df[percent_columns].max(axis=1)
    percent_plurality_names_df = (
        percent_df[lambda df: df[this_ethnicity_column] == max_percent_series]
        .assign(
            obs_of_this_ethnicity=lambda df: df['obs'] * df[this_ethnicity_column] / 100
        )
        .sort_values('obs_of_this_ethnicity', ascending=False)
    )
    tzioumis_plurality_names = percent_plurality_names_df.iloc[
        :num_names_per_race_ethnicity
    ].index.values.tolist()

    # Combine these names with the Caliskan+ race+gender names and deduplicate
    if race_ethnicity in tzioumis_to_race_gender_mapping:
        mapped_ethnicity = tzioumis_to_race_gender_mapping[race_ethnicity]
        excluded_names = female_names_to_exclude.get(mapped_ethnicity, frozenset())
        # Filter into a new list instead of calling .remove() on the caller's list:
        # that mutated the input, so a second call (or a dataset without the name)
        # raised a ValueError
        female_race_gender_name_list = [
            name
            for name in race_gender_name_lists[f'{mapped_ethnicity}_female']
            if name not in excluded_names
        ]
        male_race_gender_name_list = race_gender_name_lists[f'{mapped_ethnicity}_male']
        combined_names = (
            tzioumis_plurality_names
            + female_race_gender_name_list
            + male_race_gender_name_list
        )
    else:
        combined_names = tzioumis_plurality_names

    # Deduplicate and sort
    sorted_names = sorted(set(combined_names))

    print(
        f'Using {len(sorted_names):d} names for the {race_ethnicity} race/ethnicity: '
        + ', '.join(sorted_names)
    )

    return sorted_names