In [3]:
import pandas as pd
import data_utils

# NYC Babynames Dataset

https://data.cityofnewyork.us/Health/Popular-Baby-Names/25th-nujf

Data was collected through birth registration in NYC between the years 2011 - 2018.

In [4]:
input_file = 'Popular_Baby_Names.csv'
df = pd.read_csv(input_file, index_col=0)

# Update First Name strings to match
df["Child's First Name"] = df["Child's First Name"].str.upper()

# Collapse some spelling variations
category_map = {'WHITE NON HISP': 'WHITE NON HISPANIC', 
                'BLACK NON HISP': 'BLACK NON HISPANIC', 
                'ASIAN AND PACI': 'ASIAN AND PACIFIC ISLANDER'}
for k, v in category_map.items():
    df.loc[df['Ethnicity'] == k, 'Ethnicity'] = v

In [50]:
# Note that I didn't drop duplicates before sampling, initially.  Here I drop them to get accurate observation counts.
# Since samples use aggregated data, dropping duplicates does not affect sampling.
df2 = df.drop_duplicates()
print(f"""There are {df2.groupby("Child's First Name").count().shape[0]} names from {df2['Count'].sum()} observations.""")

There are 1832 names from 477015 observations.


Next we sort by gender.  We include gender information for names that are >90% associated with a gender from the dataset.

In [53]:
# Get proportions of male/female in the data
temp = df.groupby(['Gender', "Child's First Name"]).sum()
f = temp.query("Gender=='FEMALE'")
m = temp.query("Gender=='MALE'")
sorted_names = f.merge(m, how='outer', on="Child's First Name", suffixes=('_MALE', '_FEMALE'))
sorted_names['proportion_female'] = sorted_names['Count_FEMALE'] / (sorted_names['Count_FEMALE'] + sorted_names['Count_MALE'])

# Find names that are not strongly associated with one gender
gendered_names = sorted_names.query("proportion_female.isnull() or proportion_female < 0.1 or proportion_female > 0.9", engine='python')
print(f"Found {sorted_names.shape[0] - gendered_names.shape[0]} names that were not >90% associated with a particular gender.")
weak_gender_association_names = sorted_names.query("proportion_female > 0.1 or proportion_female < 0.9").index.tolist()

# Create a dictionary to consider whether names are female or male 
gendered_names.eval('is_female = proportion_female.isnull() and Count_MALE>0 or proportion_female < 0.1', inplace=True)
is_name_female = gendered_names[['is_female']].to_dict()['is_female']

Found 22 names that were not >90% associated with a particular gender.


In [7]:
df_grouped = df.groupby(['Ethnicity', "Child's First Name"]).sum() / df.groupby("Child's First Name").sum() * 100
df_grouped.reset_index(inplace=True)
df_grouped['Name'] = df_grouped["Child's First Name"]
df_grouped = df_grouped[['Name', 'Ethnicity', 'Count']]

In [8]:
# Sampling names
given_names_df = pd.DataFrame()
for group in ["ASIAN AND PACIFIC ISLANDER", "WHITE NON HISPANIC", "BLACK NON HISPANIC", "HISPANIC"]:
    query_str = f"Ethnicity=='{group}'"
    temp = data_utils.name_lists_by_race(df_grouped.query(query_str), 'Count', label=group)
    temp = temp.rename(columns={'Name': 'GivenName'})
    temp['GivenName'] = temp['GivenName'].str.capitalize()
    given_names_df = pd.concat([given_names_df, temp])

# Add gender details
given_names_df['Gender'] = given_names_df['GivenName'].str.upper().map(is_name_female)
given_names_df['Gender'] = given_names_df['Gender'].map({True: 'F', False: 'M'})

Found 165 predominant ASIAN AND PACIFIC ISLANDER names
Found 592 predominant WHITE NON HISPANIC names
Found 226 predominant BLACK NON HISPANIC names
Found 389 predominant HISPANIC names


In [7]:
given_names_df.groupby(['Ethnicity', 'Gender']).count()[['Count']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Count
Ethnicity,Gender,Unnamed: 2_level_1
ASIAN AND PACIFIC ISLANDER,F,86
ASIAN AND PACIFIC ISLANDER,M,77
BLACK NON HISPANIC,F,122
BLACK NON HISPANIC,M,101
HISPANIC,F,212
HISPANIC,M,175
WHITE NON HISPANIC,F,321
WHITE NON HISPANIC,M,265


In [9]:
given_names_df.head()

Unnamed: 0,GivenName,Ethnicity,Count,Gender
0,Aahil,ASIAN AND PACIFIC ISLANDER,100.0,M
2,Aarav,ASIAN AND PACIFIC ISLANDER,100.0,M
4,Aarya,ASIAN AND PACIFIC ISLANDER,100.0,F
5,Aaryan,ASIAN AND PACIFIC ISLANDER,100.0,M
6,Aayan,ASIAN AND PACIFIC ISLANDER,100.0,M


In [9]:
given_names_df.to_csv('./name_lists/NY.csv')