## Building a DataFrame from Name Files

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
% matplotlib inline

def build_names_df(years, data_dir='names'):
    """Load the yearly baby-name files into a single DataFrame.

    Each file ``<data_dir>/yob<year>.txt`` is read as a header-less CSV whose
    columns are named ``name``, ``sex`` and ``births``; a ``year`` column
    holding the year of the file is then appended.

    Parameters
    ----------
    years : iterable of int
        Years to load, in the order they should appear in the result.
    data_dir : str, optional
        Directory that contains the ``yob<year>.txt`` files
        (default ``'names'``, the location used by this notebook).

    Returns
    -------
    pandas.DataFrame
        Rows of all files stacked in the order of ``years`` with a fresh
        0..n-1 index. An empty frame with the four columns is returned when
        ``years`` is empty.
    """
    file_columns = ['name', 'sex', 'births']
    frames = []
    for year in years:
        file_name = '{}/yob{}.txt'.format(data_dir, year)
        year_frame = pd.read_csv(file_name, names=file_columns)
        year_frame['year'] = year
        frames.append(year_frame)
    if not frames:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=file_columns + ['year'])
    return pd.concat(frames, ignore_index=True)

# Load every yearly file from 1880 through 2016 (the end of range is exclusive).
years = range(1880, 2017)
df_names = build_names_df(years)

# Quick sanity check: show the first and then the last rows of the frame.
for preview in (df_names.head(), df_names.tail()):
    print(preview)

## Some statistics

In [None]:
# Total number of births per sex, summed over every loaded year.
mf = df_names['births'].groupby(df_names['sex']).sum()
for caption, sex_code in (('Number of males:   ', 'M'),
                          ('Number of females: ', 'F')):
    print(caption, mf.loc[sex_code])

In [None]:
# Total births per year, split by sex (rows: year, columns: one per sex).
# The aggregation is given as the string 'sum': passing the Python builtin
# `sum` to pandas is deprecated and emits a FutureWarning on recent versions.
mfy = df_names.pivot_table(index='year', columns='sex',
                           values='births', aggfunc='sum')
print(mfy.head())
# Trailing ';' suppresses the Axes repr, as the other plotting cells do.
mfy.plot();

### Task 1: 
Compute the relative frequency (the fraction of that year's births) of each name per year. Insert the computed information as a new column into the dataframe.

In [None]:
#Solution
def comp_frac(group):
    """Return a copy of ``group`` with a ``frac`` column added.

    ``frac`` is each row's ``births`` divided by the sum of ``births`` over
    the whole group.

    The input frame is left untouched: functions passed to
    ``groupby(...).apply`` should not mutate the object they receive, since
    pandas does not support that and it can trigger SettingWithCopy warnings
    or unexpected results.
    """
    return group.assign(frac=group['births'] / group['births'].sum())
    
# Share of each name within its year.
# A group-wise transform is used instead of `groupby('year').apply(comp_frac)`:
# on pandas >= 2.0 that apply call prepends 'year' to the index, after which
# later `df_names.groupby('year')` calls fail because 'year' is both an index
# level and a column. The transform keeps the original flat index.
df_names = df_names.assign(
    frac=df_names['births'] / df_names.groupby('year')['births'].transform('sum')
)
print(df_names.head())

### Task 2: 
Print the k most frequent names in each year with their percentage. For instance, for $k=3$, the output should be something like:

```
1880
John   William   Mary
0.048   0.047   0.035
```
Your output must be formatted as above.

In [None]:
#Solution
def get_topn(group, nc=3):
    """Return the first ``nc`` rows of ``group`` after sorting by ``frac``.

    Rows are ordered from the largest to the smallest ``frac`` value.
    """
    ranked = group.sort_values('frac', ascending=False)
    return ranked[:nc]

n = 5  # number of top names to show per year
topn = df_names.groupby('year').apply(get_topn, nc=n)

# Only the first five years are printed to keep the output short.
for year in topn.index.levels[0][:5]:
    # Look the rows of this year up once instead of once per printed value.
    year_top = topn.loc[year]
    print('\n\n', year)
    # Names on one line, their fractions (three decimals) on the next; every
    # entry is followed by two spaces. Iterating over the rows that exist
    # (at most n) avoids an IndexError for years with fewer than n names.
    print(''.join('%s  ' % name for name in year_top['name']))
    print(''.join('%.3f  ' % frac for frac in year_top['frac']), end='')

### Task 3:

Compute the k most frequent names considering the **sum of frequencies** over the years, and make a plot of their variation over the years.

In [None]:
#Solution
k = 5  # number of names to plot

# Year x name table of the yearly fractions; pivot_table leaves NaN where a
# name has no row in a year. The aggregation is the string 'sum' because
# passing the builtin `sum` to pandas is deprecated.
name_year = df_names.pivot_table(index='year', columns='name',
                                 values='frac', aggfunc='sum')

# Rank the names by their fraction summed over all years, largest first.
topk_ever = name_year.sum(axis=0).sort_values(ascending=False)
name_year[topk_ever.index[0:k]].plot(title='Most Frequent Names');


### Task 4: 

Which gender has more diversity of names considering the whole set of years?

In [None]:
#Solution

# One entry per distinct (sex, name) pair. Only the index is used below, so
# the vectorised `size()` (row count per pair) replaces
# `apply(lambda x: 1)`, which called a Python function once per group.
df_total_diversity = df_names.groupby(['sex', 'name']).size()

# Number of distinct names recorded for each sex.
male_total_diversity = df_total_diversity.xs('M').size
print('Diversity in Male names: ', male_total_diversity)
female_total_diversity = df_total_diversity.xs('F').size
print('Diversity in Female names: ', female_total_diversity)

### Task 5:

Make a plot to analyze how the diversity of names evolves over time for both genders.

In [None]:
# Solution

# Number of rows for every (year, sex) combination.
df_diversity_year = df_names.groupby(['year', 'sex']).size()
print(df_diversity_year.head())

# Move the 'sex' index level into the columns so each sex gets its own line.
diversity_by_sex = df_diversity_year.unstack(level='sex')
diversity_by_sex.plot(title='Diversity of Genders');

### Task 6: 

How many of the names show up in both genders? Among those "bi_gender" names, which are the most frequent ones?

In [None]:
# Solution
def is_bi_gender_name(group):
    """Return the summed ``births`` of ``group`` unless it has a single sex.

    When the ``sex`` column of ``group`` holds exactly one distinct value,
    ``None`` is returned instead.
    """
    distinct_sexes = set(group['sex'])
    if len(distinct_sexes) == 1:
        return None
    return group['births'].sum()

k = 5  # k most frequent bi-gender names

# Per name: total births when the name occurs with more than one sex; names
# for which is_bi_gender_name returns None are dropped. The result is sorted
# with the most frequent name first. Built as one chain (no inplace=True) so
# re-running the cell always starts from df_names.
bi_gender_names = (
    df_names.groupby('name')
    .apply(is_bi_gender_name)
    .dropna()
    .sort_values(ascending=False)
)
print('Number of bi-gender names: ', bi_gender_names.size)
print(bi_gender_names.head(k))


### Task 7:

Plot the __least__ frequent bi-gender names over time.

In [None]:
# Rows of df_names whose name is among the last entries of bi_gender_names
# (`tail()` with its default of five entries).
df_bgnkmost = df_names.loc[df_names['name'].isin(bi_gender_names.tail().index)]

# Births per year for each selected name. The aggregation is the string 'sum'
# because passing the builtin `sum` to pandas is deprecated.
bgnkmost_table = df_bgnkmost.pivot_table(index='year', columns='name',
                                         values='births', aggfunc='sum')
bgnkmost_table.plot();
print(bgnkmost_table)