## names.readme

Baby name frequency data from https://catalog.data.gov/dataset/baby-names-from-social-security-card-applications-national-level-data. This website describes the data as "Public: This dataset is intended for public access and use."

Dataset description from http://www.ssa.gov/oact/babynames/background.html:

"All names are from Social Security card applications for births that occurred in the United States after 1879. Note that many people born before 1937 never applied for a Social Security card, so their names are not included in our data. For others who did apply, our records may not show the place of birth, and again their names are not included in our data."

And further:

"""
People using our data on popular names are urged to explicitly acknowledge the following qualifications.

* Names are restricted to cases where the year of birth, sex, State of birth (50 States and District of Columbia) are on record, and where the given name is at least 2 characters long.
* Name data are not edited. For example, the sex associated with a name may be incorrect. Entries such as "Unknown" and "Baby" are not removed from the lists.
* Different spellings of similar names are not combined. For example, the names Caitlin, Caitlyn, Kaitlin, Kaitlyn, Kaitlynn, Katelyn, and Katelynn are considered separate names and each has its own rank.
* When two different names are tied with the same frequency for a given year of birth, we break the tie by assigning rank in alphabetical order.
* Some names are applied to both males and females (for example, Micah). Our rankings are done by sex, so that a name such as Micah will have a different rank for males as compared to females.
"""

## download names.zip
```
ureq.urlretrieve('https://www.ssa.gov/oact/babynames/names.zip', 'names.zip')
```

In [None]:
import numpy as np
import matplotlib.pyplot as pp
import pandas as pd
import seaborn
import urllib.request as ureq
import sys
# Report the version of every library the notebook relies on, one per line.
# matplotlib's version is read from the top-level package that owns pyplot.
library_versions = [
    ('numpy', np.__version__),
    ('matplotlib', sys.modules[pp.__package__].__version__),
    ('pandas', pd.__version__),
    ('seaborn', seaborn.__version__),
    ('urllib', ureq.__version__),
]
for library, version in library_versions:
    print('{} version: {}'.format(library, version))

In [None]:
%matplotlib inline

In [None]:
#### download names.zip and extract
import os.path
import zipfile
# Skip the download when the archive is already present locally.
if not os.path.isfile('names.zip'):
    ureq.urlretrieve('https://www.ssa.gov/oact/babynames/names.zip','names.zip')
# yob1880.txt is used as the marker that the archive has already been extracted.
if not os.path.isfile('names/yob1880.txt'):
    # The context manager closes the archive handle as soon as extraction ends.
    with zipfile.ZipFile('names.zip') as archive:
        archive.extractall('names')

In [None]:
import os, re
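# The `with` version below fails with "AttributeError: __enter__" because
# os.listdir() returns a plain list, not a context manager. It is kept only
# for reference as an inert string; getyearrange() is the working version.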
'''
with os.listdir('names') as files:
    oldest = files[0]
    latest = files[0]
    for f in files:
        if re.search('\\.txt$', f):
            if f > latest: latest = f
            elif f < oldest: oldest = f
    oldest, latest
'''
def getyearrange(namesdir):
    """Return the first and last ``.txt`` file names found in *namesdir*.

    Only entries whose name ends in ``.txt`` are considered, so other files
    in the directory (for example a README PDF) can never be reported as the
    oldest or latest file. Names are compared as plain strings.

    Parameters:
        namesdir: path of the directory to scan.

    Returns:
        A ``(oldest, latest)`` tuple holding the smallest and largest
        ``.txt`` file name in string order.

    Raises:
        ValueError: if *namesdir* contains no ``.txt`` file.
        OSError: if *namesdir* cannot be listed.
    """
    datafiles = [f for f in os.listdir(namesdir) if f.endswith('.txt')]
    if not datafiles:
        raise ValueError('no .txt files found in {!r}'.format(namesdir))
    return min(datafiles), max(datafiles)

getyearrange('names')

In [None]:
# Peek at the first ten raw lines of the 2011 file. The context manager closes
# the file handle, and islice stops reading after ten lines instead of loading
# the whole file.
from itertools import islice
with open('names/yob2011.txt','r') as f:
    first_lines = list(islice(f, 10))
first_lines

In [None]:
names2011 = pd.read_csv('names/yob2011.txt')

In [None]:
names2011.head()

In [None]:
names2011 = pd.read_csv('names/yob2011.txt',names=['name','sex','number'])

In [None]:
names2011.head()

In [None]:
# Read one frame per yearly file, tag every row with the year it came from,
# and finally stack all the yearly frames into a single table.
names_all = []

for year in range(1880, 2018 + 1):
    yearly = pd.read_csv('names/yob{}.txt'.format(year),names=['name','sex','number'])
    yearly['year'] = year
    names_all.append(yearly)

# Show the first (1880) frame as a sanity check before concatenating.
print(names_all[0])
allyears = pd.concat(names_all)

In [None]:
allyears.head()

In [None]:
allyears.tail()

In [None]:
allyears_indexed = allyears.set_index(['sex','name','year']).sort_index()
allyears_indexed

In [None]:
allyears_indexed.loc['F','Mary']

In [None]:
def plotname(sex,name):
    """Plot the rows of ``allyears_indexed`` selected by *sex* and *name*.

    The selection's index is used for the x axis and its values for the
    y axis; the line is drawn on the current matplotlib axes.
    """
    selected = allyears_indexed.loc[sex,name]
    x_values, y_values = selected.index, selected.values

    pp.plot(x_values,y_values)

In [None]:
pp.figure(figsize=(12,2.5))

names = ['Michael','John','David','Martin']

for name in names:
    plotname('M',name)

pp.legend(names)

In [None]:
pp.figure(figsize=(12,2.5))

names = ['Emily','Anna','Claire','Elizabeth']

for name in names:
    plotname('F',name)

pp.legend(names)

In [None]:
pp.figure(figsize=(12,2.5))

names = ['Chiara','Claire','Clare','Clara','Ciara']

for name in names:
    plotname('F',name)

pp.legend(names)

In [None]:
allyears_indexed.loc['F'].loc[names].head()

In [None]:
allyears_indexed.loc['F'].loc[names].unstack(level=0).head()

In [None]:
allyears_indexed.loc['F'].loc[names].unstack(level=0).fillna(0).head()

In [None]:
variants = allyears_indexed.loc['F'].loc[names].unstack(level=0).fillna(0)

pp.figure(figsize=(12,2.5))

# unstack() sorts the new column labels, so the columns are not in the order
# of `names`. Take the labels from the columns themselves so that each band
# of the stack is paired with the name it actually shows.
band_labels = list(variants.columns.get_level_values('name'))
pp.stackplot(variants.index, variants.values.T, labels = band_labels)

In [None]:
pp.figure(figsize=(12,2.5))

palette = seaborn.color_palette()
pp.stackplot(variants.index,variants.values.T,colors=palette)

# Band i of the stack is column i of `variants`, drawn in palette[i]. Iterate
# over the column names (not over `names`, whose order differs from the sorted
# columns produced by unstack) so that each text label has the colour of its band.
for i,name in enumerate(variants.columns.get_level_values('name')):
    pp.text(1882,5000 + 800*i,name,color=palette[i])

In [None]:
# Boys' names of 2008, most frequent first. The tuple slicer with an explicit
# column axis selects sex 'M', every name, year 2008 without ambiguity, and
# sort_values replaces DataFrame.sort, which was removed from pandas.
allyears_indexed.loc[('M', slice(None), 2008), :].sort_values('number',ascending=False).head()

In [None]:
# Five most frequent boys' names of 2008. The tuple slicer with an explicit
# column axis selects sex 'M', every name, year 2008 without ambiguity, and
# sort_values replaces DataFrame.sort, which was removed from pandas.
pop2008 = allyears_indexed.loc[('M', slice(None), 2008), :].sort_values('number',ascending=False).head()

In [None]:
pop2008.reset_index().drop(['sex','year','number'],axis=1).head()

In [None]:
def topten(sex,year):
    """Return the ten most frequent names for *sex* in *year*.

    Reads the module-level ``allyears_indexed`` frame (indexed by sex, name,
    year, with a ``number`` column).

    Parameters:
        sex: value of the ``sex`` index level to select.
        year: value of the ``year`` index level to select.

    Returns:
        A one-column DataFrame whose column label is *year*, whose index is
        the rank starting at 1, and whose values are the names ordered by
        decreasing ``number``.
    """
    # The tuple slicer keeps all three index levels, which reset_index() then
    # turns into the 'sex', 'name' and 'year' columns dropped below.
    # sort_values replaces DataFrame.sort, which was removed from pandas.
    ranked = allyears_indexed.loc[(sex, slice(None), year), :].sort_values('number',ascending=False)
    simple = ranked.reset_index().drop(['sex','year','number'],axis=1).head(10)

    simple.columns = [year]
    # Ranks are 1-based rather than the 0-based default index.
    simple.index = simple.index + 1

    return simple

In [None]:
topten('M',2009)

In [None]:
def toptens(sex,year0,year1):
    """Return the topten() tables for year0..year1 (inclusive) side by side.

    One column per year is produced by joining the per-year tables on their
    shared index.
    """
    tables = []
    for year in range(year0,year1+1):
        tables.append(topten(sex,year))

    first, rest = tables[0], tables[1:]
    return first.join(rest)

In [None]:
toptens('M',2000,2010)

In [None]:
toptens('F',1985,1995)

In [None]:
toptens('F',1985,1995).stack().head()

In [None]:
toptens('F',1985,1995).stack().value_counts()

In [None]:
popular = toptens('F',1985,1995).stack().value_counts().index[:6]

In [None]:
pp.figure(figsize=(12,2.5))

for name in popular:
    plotname('F',name)
    
pp.legend(popular)

In [None]:
allyears.groupby(['sex','name']).sum().head()

In [None]:
allyears.groupby(['sex','name'])['number'].sum().head()

In [None]:
totals = allyears.groupby(['sex','name'])['number'].sum()

In [None]:
def sumsq(x):
    """Return the sum of the squares of the elements of *x*.

    The computation is done on a NumPy array, so it runs vectorised instead
    of iterating element by element, and it accepts a pandas Series, a NumPy
    array or a plain sequence of numbers. NaN values propagate to the result.
    """
    return np.sum(np.asarray(x) ** 2)

In [None]:
spikyness = allyears.groupby(['sex','name'])['number'].agg(sumsq) / totals**2

In [None]:
spikyness.head()

In [None]:
# Keep only names with more than 5000 births in total, most "spiky" first.
# sort_values returns a new sorted Series (the in-place Series.sort was
# removed from pandas), so the result is bound back to spiky_common.
spiky_common = spikyness[totals > 5000].sort_values(ascending=False)
spiky_common.head(10)

In [None]:
spiky_common.tail(5)

In [None]:
pp.figure(figsize=(12,2.5))

plotname('F','Louisa')
plotname('M','Shaquille')

In [None]:
fads = spiky_common.head(10).index.values

In [None]:
pp.figure(figsize=(12,2.5))

for sex,name in fads:
    plotname(sex,name)

pp.legend([name for sex,name in fads],loc='upper left')

In [None]:
totals_recent = allyears[allyears['year'] > 2005].groupby(['sex','name'])['number'].sum()

In [None]:
# totals_recent only has entries for names that occur after 2005. Align it to
# the index of `totals`, counting missing names as 0 recent births, so that
# names which disappeared completely are kept rather than dropped by the
# index alignment of the boolean masks.
recent_aligned = totals_recent.reindex(totals.index, fill_value=0)
# sort_values returns a new sorted Series (the in-place Series.sort was
# removed from pandas), so the result is bound back to spiky_common.
spiky_common = spikyness[(totals > 5000) & (recent_aligned < 1000)].sort_values(ascending=False)
spiky_common.head(10)

In [None]:
fads = spiky_common.head(10).index.values

In [None]:
pp.figure(figsize=(12,2.5))

for sex,name in fads:
    plotname(sex,name)

pp.legend([name for sex,name in fads],loc='upper left')

In [None]:
allyears.groupby(['sex', 'name'])['number'].sum()

In [None]:
# Total number of girls per name over all years. .loc['F'] selects the female
# rows (leaving the name/year index levels); grouping on the 'name' level
# replaces sum(level=1), which was removed in pandas 2.0.
girlnametotal = allyears_indexed.loc['F'].groupby(level = 'name').sum().reset_index()
girlnametotal.columns = ['name', 'girls']
girlnametotal.sort_values(by = 'girls', ascending = False).head(15)

In [None]:
# Total number of boys per name over all years. .loc['M'] selects the male
# rows (leaving the name/year index levels); grouping on the 'name' level
# replaces sum(level=1), which was removed in pandas 2.0.
boynametotal = allyears_indexed.loc['M'].groupby(level = 'name').sum().reset_index()
boynametotal.columns = ['name', 'boys']
boynametotal.sort_values(by = 'boys', ascending = False).head(15)

In [None]:
# uniname = boynametotal.join(girlnametotal, on = ['name'], how = 'inner', right_index = False)
uniname = pd.merge(boynametotal, girlnametotal, on = 'name', how = 'inner')
uniname.sort_values(by = ['girls', 'boys'], ascending = False).head(15)

In [None]:
uninamenear = uniname[(uniname['boys'] * 4 > uniname['girls']) & (uniname['girls'] * 4 > uniname['boys'])].copy()
uninamenear.sort_values(by = ['girls', 'boys'], ascending = False).head(20)

## Official Solution

In [None]:
totals = allyears.groupby(['sex','name'])['number'].sum()

totals_bysex = totals.unstack('sex')
totals_bysex.head()

In [None]:
totals_both = totals_bysex.sum(axis = 1)

totals_both.head()

In [None]:
ratio = totals_bysex['F'] / totals_bysex['M']

unisex = (ratio > 0.25) & (ratio < 4)

ratio.head()

In [None]:
# Keep only the unisex names and order them by combined total, largest first.
# sort_values returns a new Series rather than sorting in place, so its result
# must be assigned; otherwise head() would return the first names in index
# order instead of the most common ones.
totals_both = totals_both[unisex].sort_values(ascending=False)
totals_both.head(5)

In [None]:
names = totals_both.head(10).index.values

pp.figure(figsize=(12,12))

for i,name in enumerate(names):
    pp.subplot(5,2,i+1)
    
    plotname('M',name)
    plotname('F',name)
    
    pp.legend([name + ' M',name + ' F'],loc='upper left')