## Grouping and MultiIndexing

This notebook illustrates several ways to group a dataset with `groupby`, followed by an introduction to multi-indexing.
<br>The datasets used in this notebook are included in this repository.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the continent/country table and preview its first five rows.
countries_path = 'data/countries.csv'
df = pd.read_csv(filepath_or_buffer=countries_path)
df.head(n=5)

Unnamed: 0,Continent,Country
0,Africa,Algeria
1,Africa,Angola
2,Africa,Benin
3,Africa,Botswana
4,Africa,Burkina


In [3]:
# Count the non-null entries of every remaining column within each continent.
df.groupby(by='Continent').count()

Unnamed: 0_level_0,Country
Continent,Unnamed: 1_level_1
Africa,54
Asia,44
Europe,47
North America,23
Oceania,14
South America,12


In [4]:
# Load only the geography identifiers and population columns of the census file.
# `df` is rebound here, so the countries table loaded earlier is no longer reachable.
# NOTE(review): the countries file is read from 'data/' but this one from the
# working directory -- confirm the intended location.
census_columns = [
    'REGION', 'DIVISION', 'STATE', 'COUNTY',
    'STNAME', 'CTYNAME',
    'CENSUS2010POP', 'ESTIMATESBASE2010',
    'POPESTIMATE2012', 'POPESTIMATE2013',
]
df = pd.read_csv('census.csv', usecols=census_columns)
df.head(n=5)

Unnamed: 0,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2012,POPESTIMATE2013
0,3,6,1,0,Alabama,Alabama,4779736,4780127,4816089,4830533
1,3,6,1,1,Alabama,Autauga County,54571,54571,55175,55038
2,3,6,1,3,Alabama,Baldwin County,182265,182265,190396,195126
3,3,6,1,5,Alabama,Barbour County,27457,27457,27159,26973
4,3,6,1,7,Alabama,Bibb County,22915,22919,22642,22512


In [5]:
# List the column labels of the census frame that was just loaded.
df.columns

Index(['REGION', 'DIVISION', 'STATE', 'COUNTY', 'STNAME', 'CTYNAME',
       'CENSUS2010POP', 'ESTIMATESBASE2010', 'POPESTIMATE2012',
       'POPESTIMATE2013'],
      dtype='object')

In [6]:
# Per-state maximum of the county name and of each population column.
# Columns are selected by label *before* aggregating, so the cell does not depend
# on the positional order of the frame's columns and no maximum is computed for
# columns that are not displayed.
# CTYNAME holds strings, so its "max" is the lexicographically greatest name.
# NOTE(review): rows with COUNTY == 0 appear to be state-level totals (e.g. the
# 'Alabama', 'Alabama' row shown by df.head()), so each numeric maximum here is the
# state total rather than the largest county -- confirm this is intended.
max_columns = [
    'CTYNAME',
    'CENSUS2010POP',
    'ESTIMATESBASE2010',
    'POPESTIMATE2012',
    'POPESTIMATE2013',
]
df.groupby('STNAME')[max_columns].max()

Unnamed: 0_level_0,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2012,POPESTIMATE2013
STNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,Winston County,4779736,4780127,4816089,4830533
Alaska,Yukon-Koyukuk Census Area,710231,710249,731228,737442
Arizona,Yuma County,6392017,6392307,6553262,6630799
Arkansas,Yell County,2915918,2915958,2949499,2957957
California,Yuba County,37253956,37254503,38056055,38414128
Colorado,Yuma County,5029196,5029324,5191731,5271132
Connecticut,Windham County,3574097,3574118,3593541,3597168
Delaware,Sussex County,897934,897936,917099,925353
District of Columbia,District of Columbia,601723,601767,635342,649540
Florida,Washington County,18801310,18804623,19352021,19594467


In [7]:
# Summary statistics of the 2010 census population within each state.
# The column is selected before calling describe() so that statistics are computed
# for this one column only, instead of for every numeric column and then discarded.
df.groupby('STNAME')['CENSUS2010POP'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
STNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alabama,68.0,140580.5,580183.9,9045.0,19204.75,36052.0,82413.75,4779736.0
Alaska,30.0,47348.73,137107.7,662.0,2666.25,7244.0,13563.25,710231.0
Arizona,16.0,799002.1,1760640.0,8437.0,52052.75,132883.5,252217.25,6392017.0
Arkansas,76.0,76734.68,334782.8,5368.0,12854.5,19840.5,41544.5,2915918.0
California,59.0,1262846.0,4969099.0,1175.0,50423.5,181058.0,701878.5,37253956.0
Colorado,65.0,154744.5,633504.2,699.0,5823.0,15324.0,46824.0,5029196.0
Connecticut,9.0,794243.8,1098660.0,118428.0,165676.0,274055.0,894014.0,3574097.0
Delaware,4.0,448967.0,344078.6,162310.0,188436.25,367812.0,628342.75,897934.0
District of Columbia,2.0,601723.0,0.0,601723.0,601723.0,601723.0,601723.0,601723.0
Florida,68.0,552979.7,2289124.0,8365.0,28322.75,118407.0,321848.25,18801310.0


### MultiIndexing

In [8]:
# Preview the census frame again before its row index is replaced below.
df.head()

Unnamed: 0,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2012,POPESTIMATE2013
0,3,6,1,0,Alabama,Alabama,4779736,4780127,4816089,4830533
1,3,6,1,1,Alabama,Autauga County,54571,54571,55175,55038
2,3,6,1,3,Alabama,Baldwin County,182265,182265,190396,195126
3,3,6,1,5,Alabama,Barbour County,27457,27457,27159,26973
4,3,6,1,7,Alabama,Bibb County,22915,22919,22642,22512


In [9]:
# Index Levels
# Outer level: state name; inner level: the CTYNAME value of the same row.
state, city = df['STNAME'], df['CTYNAME']
# Plain arrays (not Series) are passed so the two index levels stay unnamed.
hier_index = pd.MultiIndex.from_arrays([state.to_numpy(), city.to_numpy()])
# Replaces the row index of `df` in place; the STNAME/CTYNAME columns are kept.
df.index = hier_index

In [10]:
# Show the frame without the columns that duplicate the new row index (plus COUNTY).
# The result is only displayed, not assigned, so `df` itself keeps these columns.
df.drop(columns=['STNAME', 'CTYNAME', 'COUNTY'])

Unnamed: 0,Unnamed: 1,REGION,DIVISION,STATE,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2012,POPESTIMATE2013
Alabama,Alabama,3,6,1,4779736,4780127,4816089,4830533
Alabama,Autauga County,3,6,1,54571,54571,55175,55038
Alabama,Baldwin County,3,6,1,182265,182265,190396,195126
Alabama,Barbour County,3,6,1,27457,27457,27159,26973
Alabama,Bibb County,3,6,1,22915,22919,22642,22512
Alabama,Blount County,3,6,1,57322,57322,57776,57734
Alabama,Bullock County,3,6,1,10914,10915,10606,10628
Alabama,Butler County,3,6,1,20947,20946,20408,20261
Alabama,Calhoun County,3,6,1,118572,118586,117286,116575
Alabama,Chambers County,3,6,1,34215,34170,34075,34153


In [11]:
# Select every row whose outer index level is 'Alabama'; passing the label inside
# a list keeps both index levels in the displayed result.
df.loc[['Alabama']]

Unnamed: 0,Unnamed: 1,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2012,POPESTIMATE2013
Alabama,Alabama,3,6,1,0,Alabama,Alabama,4779736,4780127,4816089,4830533
Alabama,Autauga County,3,6,1,1,Alabama,Autauga County,54571,54571,55175,55038
Alabama,Baldwin County,3,6,1,3,Alabama,Baldwin County,182265,182265,190396,195126
Alabama,Barbour County,3,6,1,5,Alabama,Barbour County,27457,27457,27159,26973
Alabama,Bibb County,3,6,1,7,Alabama,Bibb County,22915,22919,22642,22512
Alabama,Blount County,3,6,1,9,Alabama,Blount County,57322,57322,57776,57734
Alabama,Bullock County,3,6,1,11,Alabama,Bullock County,10914,10915,10606,10628
Alabama,Butler County,3,6,1,13,Alabama,Butler County,20947,20946,20408,20261
Alabama,Calhoun County,3,6,1,15,Alabama,Calhoun County,118572,118586,117286,116575
Alabama,Chambers County,3,6,1,17,Alabama,Chambers County,34215,34170,34075,34153
