# 1880-2010年间全美婴儿姓名

美国社会保障总署（SSA）提供了一份从1880年到现在的婴儿名字频率数据。Hadley Wickham（许多流行R包的作者）经常用这份数据来演示R的数据处理功能。

可以用这个数据集做很多事，例如：

1. 计算指定名字（可以是你自己的，也可以是别人的）的年度比例。
2. 计算某个名字的相对排名。
3. 计算各年度最流行的名字，以及增长或减少最快的名字。
4. 分析名字趋势：元音、辅音、长度、总体多样性、拼写变化、首尾字母等。
5. 分析外源性趋势：圣经中的名字、名人、人口结构变化等。

In [None]:
import pandas as pd
import numpy as np

## 先读入数据

In [None]:
# IPython shell escape: list the files in the babynames dataset directory.
!ls dataset/babynames

In [None]:
# Load the 1880 file; column names are passed via `names=` rather than read from the file.
pd_1880=pd.read_csv("dataset/babynames/yob1880.txt",names=['name','sex','number'])

In [None]:
# Preview the first 10 rows of the 1880 data.
pd_1880.head(10)

##### 统计不同性别的名字数量

In [None]:
# Total of the `number` column per sex for 1880. The numeric column is selected
# explicitly so the string `name` column is never fed to sum() (pandas >= 2.0
# no longer drops non-numeric columns silently).
pd_1880.groupby('sex')[['number']].sum()

## 将不同年份的数据进行合并

In [None]:
# Load every yearly file (1880-2010) and tag its rows with a `year` column.
column_names = ['name', 'sex', 'number']
pieces = []
for year in range(1880, 2011):
    yearly = pd.read_csv("dataset/babynames/yob{}.txt".format(year),
                         names=column_names)
    pieces.append(yearly.assign(year=year))


In [None]:
# Concatenate the per-year frames into a single DataFrame; ignore_index=True
# replaces the per-file row labels with one fresh 0..n-1 index.
all_names = pd.concat(pieces, ignore_index=True)

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the combined frame.
all_names.info()

In [None]:
# Sum of `number` per year, split by sex (rows: year, columns: sex).
# The aggregation is named by string: passing the builtin `sum` is deprecated
# in recent pandas releases.
total_births = all_names.pivot_table('number', index='year', columns='sex', aggfunc='sum')

In [None]:
# Preview the first 10 years of the by-sex totals.
total_births.head(10)

In [None]:
# Plot one line per column of total_births against its index.
total_births.plot(title='Total births by sex and year')

### 计算不同姓名的人数占当年总人数的比例

In [None]:
def add_prop(group):
    """Attach a ``prop`` column: each row's share of the group's total ``number``.

    The column is written onto ``group`` itself, and that same object is returned.
    """
    births = group['number']
    group['prop'] = births / births.sum()
    return group
# Share of each row within its (year, sex) group. transform('sum') broadcasts
# the group total back onto every row, so the original index and columns are
# kept as-is; groupby.apply can prepend the group keys to the index on newer
# pandas, which makes later groupby(['year', 'sex']) calls ambiguous.
all_names['prop'] = (
    all_names['number']
    / all_names.groupby(['year', 'sex'])['number'].transform('sum')
)

In [None]:
# Print the frame summary again to confirm the new column is present.
all_names.info()

In [None]:
# Preview the first rows of all_names.
all_names.head()

最后一列是否可以用百分号展示？

In [None]:
# Format each prop value as a percentage string with two decimals.
# The result is only displayed; it is not assigned back, so all_names['prop']
# itself is left unchanged.
all_names['prop'].apply(lambda x:"{:.2%}".format(x))

In [None]:
# Preview all_names again (the previous cell did not modify it).
all_names.head()

## 取出每年top1000的名字

In [None]:
# For every (year, sex) group keep the 1000 rows with the largest `number`,
# then stack the groups back into one frame with a fresh index.
pieces = [
    group.sort_values(by='number', ascending=False).head(1000)
    for _, group in all_names.groupby(['year', 'sex'])
]
top1000 = pd.concat(pieces, ignore_index=True)

In [None]:
# Preview the first rows of the top-1000 subset.
top1000.head()

### 分析命名趋势

In [None]:
# Rows of top1000 whose sex is 'M'.
boys = top1000[top1000.sex == 'M']

In [None]:
# Rows of top1000 whose sex is 'F'.
girls = top1000[top1000.sex == 'F']

In [None]:
# Preview the first rows of the 'M' subset.
boys.head()

In [None]:
 total_births = top1000.pivot_table('number', index='year',\
                                      columns='name',\
                                  aggfunc=sum)

In [None]:
# Print a summary of the year-by-name table.
total_births.info()

In [None]:
# Keep only the columns for four selected names.
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]

In [None]:
# Draw each selected name in its own subplot.
subset.plot(
    subplots=True,
    figsize=(12, 10),
    grid=False,
    title="Number of births per year",
)

## 评估命名多样性的增长

分性别统计的前1000个名字在总出生人数中的比例

In [None]:
# Sum of `prop` over the top-1000 rows per year, split by sex.
# The aggregation is named by string: passing the builtin `sum` is deprecated
# in recent pandas releases.
table = top1000.pivot_table('prop', index='year', columns='sex', aggfunc='sum')

In [None]:
# Plot the per-sex totals with 13 evenly spaced y ticks on [0, 1.2] and an
# x tick every 10 years from 1880.
table.plot(
    title='Sum of table1000.prop by year and sex',
    yticks=np.linspace(0, 1.2, 13),
    xticks=range(1880, 2020, 10),
)
           

计算占总出生人数前50%的不同名字的数量

In [None]:
# 'M' rows for 2010: sort by prop descending, accumulate, and find where the
# running total reaches 0.5.
df = boys[boys.year == 2010]

prop_cumsum = df.sort_values(by='prop', ascending=False).prop.cumsum()
# searchsorted returns a zero-based insertion position.
prop_cumsum.values.searchsorted(0.5)

In [None]:
# Display the cumulative prop series.
prop_cumsum

In [None]:
# 'M' rows for 1900: cumulative prop in descending order, then the zero-based
# position at which the running total reaches 0.5.
df = boys[boys.year == 1900]

ranked_props = df.sort_values(by='prop', ascending=False)['prop']
prop_cumsum = ranked_props.cumsum()
prop_cumsum.values.searchsorted(0.5)

In [None]:
def get_quantile_count(group, q=0.5):
    """Count the largest-``prop`` rows needed for their running total to reach ``q``.

    Rows are ranked by ``prop`` in descending order; the zero-based position
    returned by ``searchsorted`` is converted to a count by adding 1.
    """
    ranked = group['prop'].sort_values(ascending=False)
    running_total = ranked.cumsum().values
    return running_total.searchsorted(q) + 1

# Apply get_quantile_count to every (year, sex) group, then move `sex` from
# the index into the columns.
diversity = (
    top1000.groupby(['year', 'sex'])
    .apply(get_quantile_count)
    .unstack('sex')
)

In [None]:
# Preview the first 20 rows of the diversity table.
diversity.head(20)

In [None]:
# Plot one line per column of the diversity table.
diversity.plot(title="Number of popular names in top 50%")

## 名字中最后一个字母的变化

In [None]:
def get_last_letter(name):
    """Return the final character of ``name``."""
    return name[-1]


# Last letter of every name, as a Series aligned with all_names.
last_letters = all_names.name.map(get_last_letter)
last_letters.name = 'last_letter'

# Sum of `number` by last letter (rows) for each sex and year (columns).
# The aggregation is named by string: passing the builtin `sum` is deprecated
# in recent pandas releases.
table = all_names.pivot_table('number', index=last_letters,
                              columns=['sex', 'year'], aggfunc='sum')

In [None]:
# Keep only the columns whose `year` level is 1910, 1960 or 2010.
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head()

##### 计算出各性别各末字母占总出生人数的比例

In [None]:
# Divide every column by its own total so each (sex, year) column sums to 1.
letter_prop = subtable / subtable.sum()

In [None]:
# Preview the first rows of the normalized table.
letter_prop.head()

In [None]:
import matplotlib.pyplot as plt

# Two stacked bar charts: 'M' columns on top, 'F' columns below (the legend is
# drawn on the top chart only).
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
male_ax, female_ax = axes
letter_prop['M'].plot.bar(rot=0, ax=male_ax, title='Male')
letter_prop['F'].plot.bar(rot=0, ax=female_ax, title='Female', legend=False)

In [None]:
# Normalize the full table column by column (every year, not just three).
letter_prop = table / table.sum()
# Take the rows for d, n and y under the 'M' columns, then transpose so that
# years become the rows and the three letters become the columns.
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T

In [None]:
# Preview the first 10 rows of the d/n/y series.
dny_ts.head(10)

In [None]:
# Plot one line per letter.
dny_ts.plot()

## 变成女孩名字的男孩名字（以及相反的情况）

In [None]:
# Distinct names appearing in top1000.
# NOTE(review): this rebinds all_names from the combined DataFrame to a Series
# of names; cells that expect the DataFrame will fail if re-run after this one.
all_names = pd.Series(top1000.name.unique())

In [None]:
# Display the Series of distinct names.
all_names

In [None]:
# Names containing 'lesl', matched case-insensitively by lower-casing first.
lesley_like = all_names[all_names.str.lower().str.contains('lesl')]

In [None]:
# Rows of top1000 whose name is one of the matched names.
filtered = top1000[top1000.name.isin(lesley_like)]

In [None]:
# Total `number` for each matched name across all rows of `filtered`.
filtered.groupby('name').number.sum()

In [None]:
# Sum of `number` for the matched names per year, split by sex.
table = filtered.pivot_table(
    'number',
    index='year',
    columns='sex',
    aggfunc='sum',
)


In [None]:
# Divide each row by its row total so the sex columns become per-year shares.
row_totals = table.sum(axis=1)
table = table.div(row_totals, axis=0)

In [None]:
# Per-column matplotlib styles: solid black line for 'M', dashed black for 'F'.
table.plot(style={'M': 'k-', 'F': 'k--'})
