# Mini Python Bonus 3
How to work with plain text data using `Pandas`

### What is our toy dataset?

In [None]:
import os

In [None]:
# Capture the current working directory; the dataset location is derived from it in a later cell.
cwd = os.getcwd()

In [None]:
# Display the working directory captured in the previous cell.
cwd

In [None]:
# Climb three directory levels up from the working directory to reach the
# folder that holds `datasets/` (presumably the project root — confirm).
root_dir = cwd
for _level in range(3):
    root_dir = os.path.dirname(root_dir)

# Folder containing the per-year files.
babynames_path = os.path.join(root_dir, 'datasets', 'babynames')

# Display the full path of the 2016 file (the value is shown, not stored).
os.path.join(babynames_path, 'yob2016.txt')

In [None]:
# Shell escape (IPython `!`): print the first 10 lines of the 2016 file with `head`.
# NOTE(review): the absolute path is specific to one machine — presumably the same
# file as the yob2016.txt path built from `babynames_path` above; confirm.
!head -n 10 /Users/jin/minipy/datasets/babynames/yob2016.txt

In [None]:
# Shell escape (IPython `!`): show every line of the 2016 file that contains the text "Alice".
# NOTE(review): absolute, machine-specific path — confirm it matches the local data location.
!grep "Alice" /Users/jin/minipy/datasets/babynames/yob2016.txt

In [2]:
import pandas as pd

# Load the 2016 file. Column names are passed explicitly via `names=`, so the
# first line of the file is read as data rather than as a header.
# NOTE(review): absolute, machine-specific path — confirm before running elsewhere.
baby2016 = pd.read_csv(
    "/Users/jin/minipy/datasets/babynames/yob2016.txt",
    names=['name', 'gender', 'nbirth'],
)

# Preview the first five rows.
baby2016.head()

Unnamed: 0,name,gender,nbirth
0,Emma,F,19414
1,Olivia,F,19246
2,Ava,F,16237
3,Sophia,F,16070
4,Isabella,F,14722


In [11]:
# Top 5 names for each gender: order the rows by gender and birth count
# (both descending), then keep the first five rows of every gender group.
(
    baby2016
    .sort_values(['gender', 'nbirth'], ascending=False)
    .groupby('gender')
    .head(5)
)

Unnamed: 0,name,gender,nbirth
18757,Noah,M,19015
18758,Liam,M,18138
18759,William,M,15668
18760,Mason,M,15192
18761,James,M,14776
0,Emma,F,19414
1,Olivia,F,19246
2,Ava,F,16237
3,Sophia,F,16070
4,Isabella,F,14722


### Concat all the files into one dataset

In [None]:
# Read every yearly file (1880 through 2016), tag its rows with the year,
# and collect the per-year frames in a list.
sub_df = []
columns = ['name', 'gender', 'nbirth']

for year in range(1880, 2017):
    # NOTE(review): absolute, machine-specific path — confirm before running elsewhere.
    path = "/Users/jin/minipy/datasets/babynames/yob{}.txt".format(year)
    csv = pd.read_csv(path, names=columns)
    csv['year'] = year

    sub_df.append(csv)

# Concatenate once, after the loop. Doing this inside the loop rebuilt the
# whole frame on every iteration (re-copying all previously read years) only
# to throw the intermediate results away; the final `df` is identical.
df = pd.concat(sub_df, ignore_index=True)

In [None]:
# Print a concise summary of the combined frame (columns, dtypes, non-null counts, memory usage).
df.info()

In [None]:
# Preview the first five rows of the combined frame.
df.head()

### Play with the dataset

In [None]:
# Total births per year and gender: one row per year, one column per gender,
# each cell holding the sum of `nbirth` for that combination.
# The string 'sum' selects pandas' own aggregation; passing the Python builtin
# `sum` is deprecated in recent pandas releases and emits a FutureWarning.
total_nbirth = df.pivot_table(values='nbirth', index='year',
                              columns='gender', aggfunc='sum')

In [None]:
# Preview the first five rows of the per-year, per-gender totals.
total_nbirth.head()

In [None]:
# Line chart of the yearly totals: the index (year) is the x-axis and each
# gender column is drawn as its own line.
total_nbirth.plot(title="Total number of births, by gender and year")