## Downloading data

In [1]:
import os
import pandas as pd
from urllib.request import urlretrieve
from shutil import unpack_archive

Source URLs

In [2]:
# Zip archives of baby-name counts published on ssa.gov:
# one national archive, one split by state, one split by territory.
national_url, state_url, territory_url = (
    "https://www.ssa.gov/oact/babynames/names.zip",
    "https://www.ssa.gov/oact/babynames/state/namesbystate.zip",
    "https://www.ssa.gov/oact/babynames/territory/namesbyterritory.zip",
)

Making folders

In [3]:
# Create one working folder per dataset (national, state, territory).
# os.makedirs with exist_ok=True replaces the shell `mkdir` alias so the cell
# is portable and can be re-run without failing when the folders already exist.
for folder in ('names', 'namesbystate', 'namesbyterritory'):
    os.makedirs(folder, exist_ok=True)

#### Unpacking national data

In [4]:
cd names

/home/daniel/Documents/Projects/popular_baby_names/names


In [5]:
# Download the national archive into the current directory, then extract it
# in place (unpack_archive defaults to the current directory).
national_zip = 'names.zip'
urlretrieve(national_url, national_zip)
unpack_archive(national_zip)

Let's look at some of the files that were unpacked:

In [6]:
ls | head

[0m[01;31mnames.zip[0m
[00;32mNationalReadMe.pdf[0m
[00;32myob1880.txt[0m
[00;32myob1881.txt[0m
[00;32myob1882.txt[0m
[00;32myob1883.txt[0m
[00;32myob1884.txt[0m
[00;32myob1885.txt[0m
[00;32myob1886.txt[0m
[00;32myob1887.txt[0m


Let's see inside one of the text files:

In [7]:
%%bash
# Print the first ten lines of one yearly file to see its layout.
head -n 10 yob1880.txt

Mary,F,7065
Anna,F,2604
Emma,F,2003
Elizabeth,F,1939
Minnie,F,1746
Margaret,F,1578
Ida,F,1472
Alice,F,1414
Bertha,F,1320
Sarah,F,1288


In [8]:
# Build one national table from the per-year files (yobYYYY.txt) in the
# current directory.
#
# * Files are matched on the 'yob' prefix and '.txt' suffix, and sorted so
#   the row order of the result does not depend on os.listdir() ordering.
# * Each file is read once and all frames are concatenated in a single call;
#   concatenating inside the loop re-copies the growing table every iteration.
# * The year comes from characters 3-6 of the file name and is stored as an
#   integer rather than a string.
name_columns = ['name', 'sex', 'number']

txt_files = sorted(f for f in os.listdir()
                   if f.startswith('yob') and f.endswith('.txt'))

frames = []
for txt in txt_files:
    temp = pd.read_csv(txt, names=name_columns)
    temp['year'] = int(txt[3:7])
    frames.append(temp)

if frames:
    names = pd.concat(frames, ignore_index=True)
else:
    # No yearly files present: keep an empty table with the expected columns
    # (pd.concat raises on an empty list).
    names = pd.DataFrame(columns=name_columns + ['year'])

In [9]:
cd ..

/home/daniel/Documents/Projects/popular_baby_names


In [10]:
# Write the combined national table to a single CSV, leaving out the index.
names.to_csv(path_or_buf='names_nationaldata.csv', index=False)

#### Unpacking state data

In [11]:
cd namesbystate/

/home/daniel/Documents/Projects/popular_baby_names/namesbystate


In [12]:
# Download the per-state archive into the current directory, then extract it
# in place (unpack_archive defaults to the current directory).
state_zip = 'namesbystate.zip'
urlretrieve(state_url, state_zip)
unpack_archive(state_zip)

In [13]:
ls | head

[0m[00;32mAK.TXT[0m
[00;32mAL.TXT[0m
[00;32mAR.TXT[0m
[00;32mAZ.TXT[0m
[00;32mCA.TXT[0m
[00;32mCO.TXT[0m
[00;32mCT.TXT[0m
[00;32mDC.TXT[0m
[00;32mDE.TXT[0m
[00;32mFL.TXT[0m


In [14]:
%%bash
# Print the first ten lines of one state file to see its layout.
head -n 10 AK.TXT

AK,F,1910,Mary,14
AK,F,1910,Annie,12
AK,F,1910,Anna,10
AK,F,1910,Margaret,8
AK,F,1910,Helen,7
AK,F,1910,Elsie,6
AK,F,1910,Lucy,6
AK,F,1910,Dorothy,5
AK,F,1911,Mary,12
AK,F,1911,Margaret,7


In [15]:
# Build one table from the per-state files (*.TXT) in the current directory.
#
# * Files are matched on the '.TXT' suffix and sorted so the row order of the
#   result does not depend on os.listdir() ordering.
# * Each file is read once and all frames are concatenated in a single call;
#   concatenating inside the loop re-copies the growing table every iteration.
state_columns = ['state', 'sex', 'year', 'name', 'number']

txt_files = sorted(f for f in os.listdir() if f.endswith('.TXT'))

frames = []
for txt in txt_files:
    temp = pd.read_csv(txt, names=state_columns)
    frames.append(temp)

if frames:
    namesbystate = pd.concat(frames, ignore_index=True)
else:
    # No state files present: keep an empty table with the expected columns
    # (pd.concat raises on an empty list).
    namesbystate = pd.DataFrame(columns=state_columns)

In [16]:
cd ..

/home/daniel/Documents/Projects/popular_baby_names


In [17]:
# Write the combined state table to a single CSV, leaving out the index.
namesbystate.to_csv(path_or_buf='names_statedata.csv', index=False)

#### Unpacking territory data

In [18]:
cd namesbyterritory/

/home/daniel/Documents/Projects/popular_baby_names/namesbyterritory


In [19]:
# Download the per-territory archive into the current directory, then extract
# it in place (unpack_archive defaults to the current directory).
territory_zip = 'namesbyterritory.zip'
urlretrieve(territory_url, territory_zip)
unpack_archive(territory_zip)

In [20]:
ls | head

[0m[01;31mnamesbyterritory.zip[0m
[00;32mPR.TXT[0m
[00;32mTerritoryReadMe.pdf[0m
[00;32mTR.TXT[0m


In [21]:
%%bash
# Print the first ten lines of one territory file to see its layout.
head -n 10 PR.TXT

PR,F,1998,Paola,724
PR,F,1998,Genesis,500
PR,F,1998,Gabriela,447
PR,F,1998,Nicole,392
PR,F,1998,Alondra,344
PR,F,1998,Maria,292
PR,F,1998,Nashaly,276
PR,F,1998,Stephanie,273
PR,F,1998,Andrea,256
PR,F,1998,Adriana,251


In [22]:
# Build one table from the per-territory files (*.TXT) in the current
# directory.
#
# * Files are matched on the '.TXT' suffix and sorted so the row order of the
#   result does not depend on os.listdir() ordering.
# * Each file is read once and all frames are concatenated in a single call;
#   concatenating inside the loop re-copies the growing table every iteration.
territory_columns = ['territory', 'sex', 'year', 'name', 'number']

txt_files = sorted(f for f in os.listdir() if f.endswith('.TXT'))

frames = []
for txt in txt_files:
    temp = pd.read_csv(txt, names=territory_columns)
    frames.append(temp)

if frames:
    namesbyterritory = pd.concat(frames, ignore_index=True)
else:
    # No territory files present: keep an empty table with the expected
    # columns (pd.concat raises on an empty list).
    namesbyterritory = pd.DataFrame(columns=territory_columns)

In [23]:
cd ..

/home/daniel/Documents/Projects/popular_baby_names


In [24]:
# Write the combined territory table to a single CSV, leaving out the index.
namesbyterritory.to_csv(path_or_buf='names_territorydata.csv', index=False)