In [1]:
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO
import pandas as pd
import os

In [2]:
# Fetch the baby names archive from ssa.gov and unpack every member
# into ./data/names.

# NOTE: this changes the working directory of the kernel; every relative
# ./data path is resolved against the directory entered here, and running
# this cell a second time moves up one more level.
os.chdir('..')

url = 'https://www.ssa.gov/oact/babynames/names.zip'

# read the whole archive into memory ...
with urlopen(url) as zipresp:
    archive_bytes = zipresp.read()

# ... then unzip straight from the in-memory buffer (no temp file on disk)
with ZipFile(BytesIO(archive_bytes)) as zfile:
    zfile.extractall(path='./data/names')

In [3]:
# Read every yearly file (yobYYYY.txt) in ./data/names and combine them
# into a single df.
#
# Produces:
#   names_df   - all rows of all files; columns: name, sex, count, year
#                (year is the 4-character string taken from the file name)
#   start_year - year string of the first file loaded
#   end_year   - year string of the last file loaded
#
# names_df is rebuilt from scratch on every run, so re-running this cell
# does not append the same files a second time.

# os.listdir returns entries in arbitrary order; sort so that the first and
# last data files really are the earliest and latest year (4-digit years in
# a fixed-width file name sort chronologically).
files = sorted(os.listdir('./data/names'))

frames = []     # one DataFrame per file, concatenated once after the loop
years = []      # year string of each loaded file, in load order
total_rows = 0  # running row count, for the progress output

for file in files:
    year = file[3:7]
    # Only yobYYYY.txt files are data; skip anything else in the folder
    # (the PDF the original filter excluded, hidden files, sub-directories).
    if not (file.startswith('yob') and file.endswith('.txt') and year.isdigit()):
        continue

    df = pd.read_csv(f'./data/names/{file}', names=['name', 'sex', 'count'], header=None)
    df['year'] = year
    frames.append(df)
    years.append(year)
    total_rows += len(df)

    print(f'Adding {file} ({len(df):,})to df')
    print(f'Total rows: {total_rows:,}')
    print('')

if not frames:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise FileNotFoundError('No yobYYYY.txt files found in ./data/names')

# Single concat instead of growing the frame inside the loop.
names_df = pd.concat(frames, ignore_index=True)
start_year = years[0]
end_year = years[-1]


Adding yob1880.txt (2,000)to df
Total rows: 2,000

Adding yob1881.txt (1,934)to df
Total rows: 3,934

Adding yob1882.txt (2,127)to df
Total rows: 6,061

Adding yob1883.txt (2,084)to df
Total rows: 8,145

Adding yob1884.txt (2,297)to df
Total rows: 10,442

Adding yob1885.txt (2,294)to df
Total rows: 12,736

Adding yob1886.txt (2,392)to df
Total rows: 15,128

Adding yob1887.txt (2,373)to df
Total rows: 17,501

Adding yob1888.txt (2,651)to df
Total rows: 20,152

Adding yob1889.txt (2,590)to df
Total rows: 22,742

Adding yob1890.txt (2,695)to df
Total rows: 25,437

Adding yob1891.txt (2,660)to df
Total rows: 28,097

Adding yob1892.txt (2,921)to df
Total rows: 31,018

Adding yob1893.txt (2,831)to df
Total rows: 33,849

Adding yob1894.txt (2,941)to df
Total rows: 36,790

Adding yob1895.txt (3,049)to df
Total rows: 39,839

Adding yob1896.txt (3,091)to df
Total rows: 42,930

Adding yob1897.txt (3,028)to df
Total rows: 45,958

Adding yob1898.txt (3,264)to df
Total rows: 49,222

Adding yob1899.t

In [22]:
# eda

# analysis 1 - top 5 names per sex, for every year and for all time
rank_cols = ['year', 'sex', 'count']   # sort keys, all descending
group_cols = ['year', 'sex']           # one top-5 list per (year, sex)

# per-year ranking: order rows by count within each year/sex, keep the first 5
ranked_by_year = names_df.sort_values(by=rank_cols, ascending=False)
tmp1 = ranked_by_year.groupby(by=group_cols).head(5)

# all-time ranking: sum counts per (name, sex) over all years, tag the rows
# with year 0 so they can share the per-year layout, then keep the first 5
tmp2 = (
    names_df[['name', 'sex', 'count']]
    .groupby(by=['name', 'sex'], as_index=False)
    .sum()
    .assign(year=0)
    .sort_values(by=rank_cols, ascending=False)
    .groupby(by=group_cols)
    .head(5)
)

# all-time rows first, followed by the per-year rows
top5_names_by_sex_by_year = pd.concat([tmp2, tmp1], ignore_index=True)

# write the combined list as comma-separated text
top5_names_by_sex_by_year.to_csv('./data/top5_names_by_year_and_sex.txt', sep=',', index=False)

# analysis 2 - compile a list of top 5 gender-neutral names for each year and for all time
# TODO: not implemented yet

In [None]:
# Reduce names_df to the sorted list of unique names and write it to
# ./data/unique_names_<start_year>_<end_year>.txt (one name per line, no header).
#
# names_df is rebound to the reduced one-column frame rather than mutated in
# place: selecting ['name'] also works on the already-reduced frame, so this
# cell can be re-run without a KeyError from dropping columns that are gone.
names_df = (
    names_df[['name']]                          # only the name is needed for name generation
    .drop_duplicates(keep='first')              # keep only unique names
    .sort_values(['name'], ignore_index=True)
)
# was len(df_names): an undefined name that raised NameError
print(f'Number of unique names: {len(names_df):,}')

# output compiled dataset; header=False is the documented way to omit the header row
names_df.to_csv(f'./data/unique_names_{start_year}_{end_year}.txt', header=False, sep=',', index=False)