# Names
Parse baby name data from the SSA.gov dataset.

In [18]:
import io, os, requests, zipfile

# Source archive and the local folder it is unpacked into.
url="https://www.ssa.gov/oact/babynames/names.zip"
directory="./data/names"

# The extracted folder doubles as a cache: download only when it is missing.
if not os.path.exists(directory):
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    # Fail with a descriptive HTTPError on 4xx/5xx; unlike `assert`, this
    # check is not stripped when Python runs with -O.
    response.raise_for_status()

    # Unpack straight from memory; no intermediate .zip is written to disk.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(directory)

In [28]:
# Inclusive range of years to combine, and the sex code to keep.
startYear=1991
endYear=2021
desiredSex="F"

# The combined CSV doubles as a cache: it is only rebuilt when missing.
combined_filename=os.path.join(directory, f"combined-{startYear}-{endYear}-{desiredSex}.csv")
if not os.path.exists(combined_filename):
    years = range(startYear, endYear + 1)

    # Read yob<year>.txt for every year in the range. Each line is
    # "name,sex,count"; keep only rows matching desiredSex and collect
    # them as name -> {year: count}.
    all_the_names={}
    for year in years:
        filename=os.path.join(directory, f"yob{year}.txt")
        with open(filename, "r", encoding="utf-8") as year_file:
            for line in year_file:
                # Tolerate blank lines instead of failing on the unpack below.
                if not line.strip():
                    continue

                name, sex, number = line.split(",")
                if sex != desiredSex:
                    continue

                # int() ignores the trailing newline and rejects malformed counts.
                all_the_names.setdefault(name.strip(), {})[year] = int(number)

    # Write to a temporary file first and move it into place at the end, so an
    # interrupted run never leaves a truncated CSV that the existence check
    # above would mistake for a finished one.
    partial_filename = combined_filename + ".tmp"
    with open(partial_filename, "w", encoding="utf-8") as combined:
        # Header row: "Name" followed by one column per year.
        combined.write(",".join(["Name", *map(str, years)]) + "\n")

        # One row per name; years in which the name does not appear get 0.
        for name, numbers in all_the_names.items():
            counts = [str(numbers.get(year, 0)) for year in years]
            combined.write(",".join([name, *counts]) + "\n")
    os.replace(partial_filename, combined_filename)

In [31]:
import numpy as np
import pandas as pd

# Load the combined per-year counts table and preview its first five rows.
names = pd.read_csv(filepath_or_buffer=combined_filename)
names.head(n=5)

Unnamed: 0,Name,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Ashley,43481,38457,34850,30282,26601,23679,20895,19874,18136,...,4700,3938,3558,3425,3042,2527,2277,2022,1796,1690
1,Jessica,43398,38358,34990,32120,27938,24201,21044,18236,16349,...,2333,1949,1800,1589,1389,1286,1066,966,777,647
2,Brittany,29091,24982,21723,18900,16476,13796,11528,9844,7942,...,769,720,698,688,572,402,307,318,264,282
3,Amanda,28893,25035,20813,18719,16352,13977,12242,10921,9748,...,1234,1071,1057,1030,1005,962,853,768,690,653
4,Samantha,25647,24410,23669,22824,21644,20551,20170,20194,19040,...,6934,6511,5731,5344,4954,4325,3915,3366,2945,2584


In [34]:
# Rank names by their count in the final year of the configured range.
# The column label is derived from endYear so it stays valid when the
# range is changed (CSV column headers are the years as strings).
names.sort_values(by=str(endYear), ascending=False)

Unnamed: 0,Name,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
60,Olivia,5602,5809,6340,6435,7624,8124,9477,10610,11255,...,17323,18440,19826,19713,19386,18753,18022,18534,17641,17728
122,Emma,2596,3213,4108,4765,5048,6145,7748,10448,11743,...,20960,20954,20949,20468,19531,19847,18786,17176,15656,15433
286,Charlotte,983,963,943,942,999,945,954,997,1002,...,7479,9306,10118,11407,13106,12955,13018,13206,13065,13285
193,Amelia,1536,1345,1248,1254,1206,1291,1317,1424,1421,...,7245,8042,8801,9868,10791,11852,12373,12914,12767,12952
846,Ava,236,239,285,276,279,285,362,830,1215,...,15542,15259,15715,16391,16329,15998,14998,14515,13160,12759
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29109,Rickira,0,0,0,0,0,0,0,0,6,...,0,0,0,0,0,0,0,0,0,0
29108,Rhyana,0,0,0,0,0,0,0,0,6,...,0,0,0,0,0,0,0,0,0,0
13048,Trier,6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29106,Remya,0,0,0,0,0,0,0,0,6,...,0,0,0,0,0,0,0,0,0,0
