# Popularity of Baby Names
Analysis of the popularity of baby names using data from the SSA.gov dataset.

In [None]:
import io, os, requests, zipfile

# Download the SSA baby-names archive once and unpack it under ./data/names.
# The extracted directory doubles as the cache marker: when it already exists
# the download is skipped entirely.
url="https://www.ssa.gov/oact/babynames/names.zip"
directory="./data/"
namesDirectory=os.path.join(directory, "names")

if not os.path.exists(namesDirectory):
    # A timeout keeps the cell from hanging indefinitely on a stalled connection.
    r = requests.get(url, timeout=60)
    # Raise a descriptive HTTPError on a 4xx/5xx response. A bare `assert`
    # would be skipped under `python -O` and reports no status code.
    r.raise_for_status()
    # Unpack straight from memory; no temporary .zip file is written to disk.
    # The handle is not called `zip` so the builtin zip() stays usable later.
    with zipfile.ZipFile(io.BytesIO(r.content)) as archive:
        archive.extractall(namesDirectory)

Combine the yearly data into a single CSV with one column per year. Rows are keyed by name and sex together, which keeps the dictionary key used for merging simple and means a name given to both sexes appears as two separate rows.


In [None]:
# Inclusive range of birth years to merge.
startYear=1921
endYear=2021

# The merged table is cached on disk and rebuilt only when the file is missing.
combined_filename=os.path.join(directory, f"combined-{startYear}-{endYear}.csv")
if not os.path.exists(combined_filename):
    years = range(startYear, endYear + 1)

    # Pass 1: collect counts as {(name, sex): {year: count}} from the
    # per-year "yobYYYY.txt" files, each line being "name,sex,count".
    all_the_names={}
    for year in years:
        filename=os.path.join(namesDirectory, f"yob{year}.txt")
        with open(filename, "r") as yearly_file:
            for record in yearly_file:
                raw_name, raw_sex, raw_count = record.split(",")
                key = (raw_name.strip(), raw_sex.strip())
                all_the_names.setdefault(key, {})[year] = int(raw_count)

    # Pass 2: write one row per (name, sex) with a column per year.
    # A year in which the name does not appear is written as 0.
    with open(combined_filename, "w") as combined_csv:
        header = "Name,Sex" + "".join(f",{column_year}" for column_year in years)
        combined_csv.write(header + "\n")

        for (name, sex), counts in all_the_names.items():
            cells = "".join(f",{counts.get(column_year, 0)}" for column_year in years)
            combined_csv.write(f"{name},{sex}{cells}\n")

Load the combined file into a pandas DataFrame. Split the DataFrame by gender and normalize to *approximately* the count per million. Disclaimer: I don't know how this data was sampled.

In [None]:
import numpy as np
import pandas as pd

def _to_per_million(counts):
    """Rescale every column of `counts` so that it sums to roughly one million.

    Each column is divided by its own total, multiplied by 1,000,000,
    rounded, and cast to int.
    """
    return (1_000_000 * counts / counts.sum()).round().astype(int)


# Raw counts: one row per (Sex, Name), one column per year.
names = pd.read_csv(combined_filename).set_index(['Sex', 'Name'])

# Normalise each sex on its own, so a column total covers that sex only.
sex_labels = names.index.get_level_values('Sex')
names_female = _to_per_million(names[sex_labels == "F"])
names_male = _to_per_million(names[sex_labels == "M"])

# Re-stack the two normalised halves (female rows first) under the same name.
names = pd.concat([names_female, names_male])

# Fuzzy Name Matching

Many names in the SSA.gov dataset are the same phonetic name with different spellings. This section uses the NYSIIS phonetic algorithm from the `fuzzy` library to group the names of a single year by how they sound.

In [None]:
import fuzzy

# Phonetic encoder used to bucket spellings, and a label for it that is reused
# both as a column name and in the cache file name.
algorithm = fuzzy.nysiis
algorithm_name = 'nysiis'

# Year column of `names` whose values are grouped.
yearToFuzz="2021"

fuzzy_filename=os.path.join(directory, f"fuzzy-{algorithm_name}-{yearToFuzz}.csv")

# The grouped CSV is a cache: it is only built when the file is missing.
if not os.path.exists(fuzzy_filename):
    names_fuzzy = names[yearToFuzz].reset_index()
    names_fuzzy[algorithm_name] = names_fuzzy['Name'].apply(algorithm)

    # For every (sex, phonetic code) bucket:
    #   - add up the values of all spellings in the bucket,
    #   - use the most common spelling as the representative name,
    #   - list every spelling with its value so the alternatives stay visible.
    with open(fuzzy_filename, "w") as fuzzy_csv:
        # Header reproduced verbatim (the last column name has a leading space).
        fuzzy_csv.write("Name,Sex,PhoneticName,Count,Alternate Spellings, Alternate Spelling Details\n")
        for (sex, phonetic_name), bucket in names_fuzzy.groupby(by=['Sex', algorithm_name]):
            # Buckets with no spelling present in this year are skipped.
            present = bucket[bucket[yearToFuzz] > 0]
            if present.empty:
                continue

            ranked = present.sort_values(by=yearToFuzz, ascending=False)
            most_common_name = ranked.iloc[0]['Name']

            group_sum = ranked[yearToFuzz].sum()
            group_alternate_spellings_count = len(ranked.index) - 1
            group_summary = '/'.join(
                f"{spelling}({value})"
                for spelling, value in zip(ranked['Name'], ranked[yearToFuzz])
            )

            fuzzy_csv.write(f"{most_common_name},{sex},{phonetic_name},{group_sum},{group_alternate_spellings_count},{group_summary}\n")

# Always reload from the cache file so the result is the same whether or not
# the file was just written.
names_fuzzy = pd.read_csv(fuzzy_filename).set_index(['Sex', 'Name'])

fuzzy_sex_labels = names_fuzzy.index.get_level_values('Sex')
names_female_fuzzy = names_fuzzy[fuzzy_sex_labels == "F"]
names_male_fuzzy = names_fuzzy[fuzzy_sex_labels == "M"]

## Top Name Trends
Plot the popularity over time of the top 10 names (the 5 most popular female and the 5 most popular male names) from certain years.

In [None]:
# Years whose most popular names get their own chart.
yearsToPlot = [1921, 1961, 1991, 2021]

for year in yearsToPlot:
    # Columns of the normalised frames are labelled with the year as a string.
    column = str(year)

    # Five most popular names of each sex in that year.
    topFemaleNames = names_female.nlargest(5, column)
    topMaleNames = names_male.nlargest(5, column)

    # Order the ten names by that year's value, then transpose so each name
    # becomes one line plotted across all years.
    ranked = pd.concat([topFemaleNames, topMaleNames]).sort_values(by=column, ascending=False)
    ranked.T.plot(
        title=f"Most Popular Names of {year}",
        ylabel="Count per Million",
        figsize=(14,5),
    )


## Specific Names Trends

Plot specific names over time

In [None]:
# (Sex, Name) index keys of the names to chart.
names_to_plot = [ ('F', 'Erin'), ( 'F', 'Erica' ) ]

# Order the lines by their value in the last year of the range. The sort
# column comes from the `endYear` setting, so this cell does not depend on a
# loop variable left behind by another cell.
sort_column = f"{endYear}"

# Transpose so each selected name becomes one line plotted across all years.
names.loc[names_to_plot] \
    .sort_values(by=sort_column, ascending=False) \
    .transpose() \
    .plot(title=f"Name Popularity from {startYear} to {endYear}", ylabel="Count per Million", figsize=(14,5))