In [1]:
import pandas as pd
import csv

## Read in the CSV files

In [2]:
# Mapping lets you format an entire DataFrame
file = "data.csv"
name_df = pd.read_csv(file)
name_df.head()

Unnamed: 0,State,Gender,Year,Name,Occurence
0,MN,F,1910,Mary,216
1,MN,F,1910,Helen,201
2,MN,F,1910,Margaret,184
3,MN,F,1910,Dorothy,163
4,MN,F,1910,Ruth,136


In [3]:
# Mapping lets you format an entire DataFrame
state_file = "state_lat_longs.csv"
state_df = pd.read_csv(state_file)
state_df.head()

Unnamed: 0,State,State_full,lat,log
0,AL,Alabama,32.806671,-86.79113
1,AK,Alaska,61.370716,-152.404419
2,AZ,Arizona,33.729759,-111.431221
3,AR,Arkansas,34.969704,-92.373123
4,CA,California,36.116203,-119.681564


## Merged the CSV files

In [4]:
#Merging Data
combined_data = pd.merge(name_df, state_df, on="State")
combined_data.reset_index(drop=True)
combined_data.head()

Unnamed: 0,State,Gender,Year,Name,Occurence,State_full,lat,log
0,MN,F,1910,Mary,216,Minnesota,45.694454,-93.900192
1,MN,F,1910,Helen,201,Minnesota,45.694454,-93.900192
2,MN,F,1910,Margaret,184,Minnesota,45.694454,-93.900192
3,MN,F,1910,Dorothy,163,Minnesota,45.694454,-93.900192
4,MN,F,1910,Ruth,136,Minnesota,45.694454,-93.900192


### Group the dataframe by State and Year

In [5]:
grouped_usa_df = combined_data.groupby(['State'])
grouped_usa_df.head()

Unnamed: 0,State,Gender,Year,Name,Occurence,State_full,lat,log
0,MN,F,1910,Mary,216,Minnesota,45.694454,-93.900192
1,MN,F,1910,Helen,201,Minnesota,45.694454,-93.900192
2,MN,F,1910,Margaret,184,Minnesota,45.694454,-93.900192
3,MN,F,1910,Dorothy,163,Minnesota,45.694454,-93.900192
4,MN,F,1910,Ruth,136,Minnesota,45.694454,-93.900192
111659,AZ,F,1910,Mary,74,Arizona,33.729759,-111.431221
111660,AZ,F,1910,Maria,29,Arizona,33.729759,-111.431221
111661,AZ,F,1910,Alice,27,Arizona,33.729759,-111.431221
111662,AZ,F,1910,Margaret,19,Arizona,33.729759,-111.431221
111663,AZ,F,1910,Helen,18,Arizona,33.729759,-111.431221


### Need to sort by top occurance


combined_data.groupby(['0'],['3']).nlargest(5)

occurance_df = pd.DataFrame(
df_agg["Occurence"].sort())
occurance_df.head(10)

### Grouped with lat and long

#### Grouped by state, year and total name apperence per year

In [6]:
#state_groups = picked_by_staff.groupby("state")
df_agg = combined_data.groupby(['State','Year','Name']).sum()
df_agg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Occurence,lat,log
State,Year,Name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AK,1910,Anna,10,61.370716,-152.404419
AK,1910,Annie,12,61.370716,-152.404419
AK,1910,Carl,5,61.370716,-152.404419
AK,1910,Dorothy,5,61.370716,-152.404419
AK,1910,Edward,5,61.370716,-152.404419


In [7]:
df_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Occurence,lat,log
State,Year,Name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AK,1910,Anna,10,61.370716,-152.404419
AK,1910,Annie,12,61.370716,-152.404419
AK,1910,Carl,5,61.370716,-152.404419
AK,1910,Dorothy,5,61.370716,-152.404419
AK,1910,Edward,5,61.370716,-152.404419
AK,1910,Elsie,6,61.370716,-152.404419
AK,1910,George,5,61.370716,-152.404419
AK,1910,Helen,7,61.370716,-152.404419
AK,1910,James,7,61.370716,-152.404419
AK,1910,John,8,61.370716,-152.404419


#### Grouped by Year and total name occurence

In [22]:
df2=df_agg.groupby(['Year']).apply(lambda x: x.sort_values(["Occurence"], ascending=[False]))
df2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Occurence,lat,log
Year,State,Year,Name,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1910,PA,1910,Mary,2922,81.181504,-154.41951
1910,NY,1910,Mary,1934,84.331452,-149.896102
1910,PA,1910,Helen,1609,81.181504,-154.41951
1910,PA,1910,Anna,1534,40.590752,-77.209755
1910,PA,1910,John,1326,40.590752,-77.209755


#### Grouped by state and total name occurence

In [23]:
sort_df = df_agg.sort_values(['Occurence'], ascending=[False])
sort_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Occurence,lat,log
State,Year,Name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NY,1947,Robert,10044,84.331452,-149.896102
NY,1947,John,9655,84.331452,-149.896102
NY,1946,Robert,9313,84.331452,-149.896102
NY,1963,Michael,9274,84.331452,-149.896102
NY,1952,Robert,9252,84.331452,-149.896102


## Need to export files as csv to import for mapping to geo website

In [None]:
# df.to_csv('name_.csv')

In [None]:
#df_agg.to_csv('name_year.csv')

### Need to export files as JSON to import for mapping to geo website

In [None]:
grouped_usa_df.to_json(orient='records')[1:-1].replace('},{', '} {')

In [None]:
with open('grouped_usa.txt', 'w') as f:
    f.write(out)

## Below is me experimenting with grouping

In [42]:
df2_agg = combined_data.groupby(['Name','State']).count()
df2_agg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Gender,Year,Occurence,State_full,lat,log
Name,State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Aaban,NY,2,2,2,2,2,2
Aadan,CA,3,3,3,3,3,3
Aadan,TX,1,1,1,1,1,1
Aadarsh,IL,1,1,1,1,1,1
Aaden,AL,9,9,9,9,9,9


In [41]:
clust = combined_data.groupby(['Year','State','Name'], as_index=False)['Occurence'].count()
clust.head(10)

Unnamed: 0,Year,State,Name,Occurence
0,1910,AK,Anna,1
1,1910,AK,Annie,1
2,1910,AK,Carl,1
3,1910,AK,Dorothy,1
4,1910,AK,Edward,1
5,1910,AK,Elsie,1
6,1910,AK,George,1
7,1910,AK,Helen,1
8,1910,AK,James,1
9,1910,AK,John,1


In [39]:
df = clust.sort_values(['Year','State','Name'], ascending=[True, True,False]).groupby('Occurence').head(5)
df.head()

Unnamed: 0,Year,State,Name,Occurence
15,1910,AK,William,1
14,1910,AK,Robert,1
13,1910,AK,Paul,1
12,1910,AK,Mary,1
11,1910,AK,Margaret,1


In [None]:
#combined_data.groupby('Year')['Name'].nunique()
#summary_df = pd.DataFrame({