 # Project 7: Explanatory Data Analysis & Advanced Visualization (Baby Names Dataset)

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.options.display.float_format = '{:.2f}'.format

### The most popular names

In [7]:
df = pd.read_csv('us_baby_names.csv')

In [8]:
df.columns

Index(['Year', 'Name', 'Gender', 'Count'], dtype='object')

This is the non-unique values in each column

In [6]:
df.nunique()

Year        139
Name      98400
Gender        2
Count     13720
dtype: int64

Converting gender as categorical type

In [63]:
df['Gender'] = df['Gender'].astype('category')

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1957046 entries, 0 to 1957045
Data columns (total 4 columns):
 #   Column  Dtype   
---  ------  -----   
 0   Year    int64   
 1   Name    object  
 2   Gender  category
 3   Count   int64   
dtypes: category(1), int64(2), object(1)
memory usage: 46.7+ MB


Dataframe with names of 2018

In [10]:
df_names_2018 = df[df['Year'] == 2018].copy()

The most counted names

In [15]:
df_names_2018.nlargest(10,'Count')

Unnamed: 0,Year,Name,Gender,Count
1943042,2018,Liam,M,19837
1925013,2018,Emma,F,18688
1943043,2018,Noah,M,18267
1925014,2018,Olivia,F,17921
1925015,2018,Ava,F,14924
1943044,2018,William,M,14516
1925016,2018,Isabella,F,14464
1925017,2018,Sophia,F,13928
1943045,2018,James,M,13525
1943046,2018,Oliver,M,13389


The most counted female names

In [16]:
df_names_2018[df_names_2018['Gender'] == 'F'].nlargest(10,'Count')

Unnamed: 0,Year,Name,Gender,Count
1925013,2018,Emma,F,18688
1925014,2018,Olivia,F,17921
1925015,2018,Ava,F,14924
1925016,2018,Isabella,F,14464
1925017,2018,Sophia,F,13928
1925018,2018,Charlotte,F,12940
1925019,2018,Mia,F,12642
1925020,2018,Amelia,F,12301
1925021,2018,Harper,F,10582
1925022,2018,Evelyn,F,10376


The most counted male names

In [17]:
df_names_2018[df_names_2018['Gender'] == 'M'].nlargest(10,'Count')

Unnamed: 0,Year,Name,Gender,Count
1943042,2018,Liam,M,19837
1943043,2018,Noah,M,18267
1943044,2018,William,M,14516
1943045,2018,James,M,13525
1943046,2018,Oliver,M,13389
1943047,2018,Benjamin,M,13381
1943048,2018,Elijah,M,12886
1943049,2018,Lucas,M,12585
1943050,2018,Mason,M,12435
1943051,2018,Logan,M,12352


Defining a method to automatize this 

In [18]:
def most_pop(year,gender,n):
    conditions = True
    conditions &= df['Year']   == year
    conditions &= df['Gender'] == gender
    return df[conditions].nlargest(n,'Count')

In [42]:
most_pop(2014,'M',20)

Unnamed: 0,Year,Name,Gender,Count
1812240,2014,Noah,M,19305
1812241,2014,Liam,M,18462
1812242,2014,Mason,M,17201
1812243,2014,Jacob,M,16883
1812244,2014,William,M,16820
1812245,2014,Ethan,M,15724
1812246,2014,Michael,M,15454
1812247,2014,Alexander,M,15414
1812248,2014,James,M,14431
1812249,2014,Daniel,M,13937


## Evergreen Names

### Female

In [45]:
df_f1880 = most_pop(1880,'F',20)
df_f2018 = most_pop(2018,'F',20)

In [52]:
df_feg = df_f2018[['Name','Count']].merge(df_f1880[['Name','Count']],
                                how = 'inner',
                                on  = 'Name',
                                suffixes=('_2018','_1880'))

In [53]:
df_feg


Unnamed: 0,Name,Count_2018,Count_1880
0,Emma,18688,2003
1,Elizabeth,8513,1939
2,Ella,8055,1156


### Female

In [55]:
df_m1880 = most_pop(1880,'M',20)
df_m2018 = most_pop(2018,'M',20)

df_meg = df_m2018[['Name','Count']].merge(df_m1880[['Name','Count']],
                                how = 'inner',
                                on  = 'Name',
                                suffixes=('_2018','_1880'))

In [56]:
df_meg

Unnamed: 0,Name,Count_2018,Count_1880
0,William,14516,9532
1,James,13525,5927
2,Henry,10649,2444


## Evergreen Names

In [71]:
df = pd.read_csv('us_baby_names.csv')
df

Unnamed: 0,Year,Name,Gender,Count
0,1880,Mary,F,7065
1,1880,Anna,F,2604
2,1880,Emma,F,2003
3,1880,Elizabeth,F,1939
4,1880,Minnie,F,1746
...,...,...,...,...
1957041,2018,Zylas,M,5
1957042,2018,Zyran,M,5
1957043,2018,Zyrie,M,5
1957044,2018,Zyron,M,5


In [77]:
df.groupby(["Name","Gender"])['Count'].sum().reset_index(level=-1)

Unnamed: 0_level_0,Gender,Count
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Aaban,M,114
Aabha,F,35
Aabid,M,16
Aabidah,F,5
Aabir,M,10
...,...,...
Zyvion,M,5
Zyvon,M,7
Zyyanna,F,6
Zyyon,M,6


In [86]:
agg = df.groupby(['Name','Gender']).agg(
    Total     = ('Count','sum'),
    Fist_Year = ('Year','min'),
    Last_Year = ('Year','max'),
    Max_Count = ('Count','max')
)

In [87]:
agg

Unnamed: 0_level_0,Unnamed: 1_level_0,Total,Fist_Year,Last_Year,Max_Count
Name,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Aaban,M,114,2007,2018,16
Aabha,F,35,2011,2016,9
Aabid,M,16,2003,2018,6
Aabidah,F,5,2018,2018,5
Aabir,M,10,2016,2018,5
...,...,...,...,...,...
Zyvion,M,5,2009,2009,5
Zyvon,M,7,2015,2015,7
Zyyanna,F,6,2010,2010,6
Zyyon,M,6,2014,2014,6


In [97]:
# Filtering one name
df_test = df[df['Name'] == 'Aaban']
df_test

Unnamed: 0,Year,Name,Gender,Count
1586216,2007,Aaban,M,5
1654500,2009,Aaban,M,6
1685983,2010,Aaban,M,9
1718670,2011,Aaban,M,11
1752571,2012,Aaban,M,11
1784927,2013,Aaban,M,14
1817711,2014,Aaban,M,16
1851113,2015,Aaban,M,15
1886382,2016,Aaban,M,9
1917868,2017,Aaban,M,11


In [98]:
# Getting the best year
df_test.nlargest(1,'Count')['Year']

1817711    2014
Name: Year, dtype: int64

Now, lets implement a method to do this

In [96]:
def best_year(group):
    return group.nlargest(1,'Count').Year

best_year(df_test)

1817711    2014
Name: Year, dtype: int64

And apply this to de groupby object

In [99]:
best_y = df.groupby(['Name','Gender']).apply(best_year)

KeyboardInterrupt: 

In [None]:
best_y