# Grouped DataFrames

In [1]:
# Grouping in Pandas

# The pandas package also supports a variety of grouping, subsetting, and aggregation functions.

# Similar to dplyr in R

In [2]:
import pandas as pd

In [4]:
# Build a small demo frame from per-column lists.
# "animal" is the categorical key that the following cells group on.
df = pd.DataFrame(
    data={
        "iq": [75, 4, 55, 63, 44, 65, 5, 6],
        "age": [2, 33, 4, 12, 8, 4, 7, 9],
        "animal": ["llama", "shoe", "cat", "llama", "cat", "llama", "cat", "shoe"],
    }
)

In [5]:
# Per-group count of the non-null values in each remaining column.
df.groupby(by="animal").count()

Unnamed: 0_level_0,iq,age
animal,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,3,3
llama,3,3
shoe,2,2


In [6]:
# Per-group arithmetic mean of every non-key column.
df.groupby(by="animal").mean()

Unnamed: 0_level_0,iq,age
animal,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,34.666667,6.333333
llama,67.666667,6.0
shoe,5.0,21.0


In [7]:
# Fuller summary per group and column: count, mean, std, min, quartiles, and max.
df.groupby(by="animal").describe()

Unnamed: 0_level_0,iq,iq,iq,iq,iq,iq,iq,iq,age,age,age,age,age,age,age,age
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
animal,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
cat,3.0,34.666667,26.274195,5.0,24.5,44.0,49.5,55.0,3.0,6.333333,2.081666,4.0,5.5,7.0,7.5,8.0
llama,3.0,67.666667,6.429101,63.0,64.0,65.0,70.0,75.0,3.0,6.0,5.291503,2.0,3.0,4.0,8.0,12.0
shoe,2.0,5.0,1.414214,4.0,4.5,5.0,5.5,6.0,2.0,21.0,16.970563,9.0,15.0,21.0,27.0,33.0


# Grouped DataFrame

In [8]:
# Keep the GroupBy object itself (not an aggregated frame) so the next cells can inspect it.
grp_df = df.groupby(by="animal")

In [9]:
grp_df.groups   # mapping from each group label to the index labels of the rows in that group

{'cat': Int64Index([2, 4, 6], dtype='int64'),
 'llama': Int64Index([0, 3, 5], dtype='int64'),
 'shoe': Int64Index([1, 7], dtype='int64')}

In [11]:
grp_df.get_group("cat") # DataFrame holding only the rows that belong to the "cat" group

Unnamed: 0,iq,age,animal
2,55,4,cat
4,44,8,cat
6,5,7,cat


# Chaining Methods in Python

In [15]:
# A method is a function that "belongs" to an object. It is generally called using dot notation:
# object.method()

In [16]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so the sampled "iq" column is identical on every run.
np.random.seed(42)

# NOTE: this rebinds `df`; the 8-row frame built earlier in the notebook is replaced
# by this 10-row frame whose "iq" values are drawn from a normal distribution.
df = pd.DataFrame(
    data={
        "iq": np.random.normal(loc=100, scale=20, size=10),
        "age": [2, 33, 4, 12, 8, 4, 7, 8, 12, 9],
        "animal": ["llama", "shoe", "cat", "llama", "cat", "llama", "cat", "shoe", "cat", "llama"],
    }
)

In [17]:
# Chaining Methods

In [18]:
# Chaining: groupby() returns a GroupBy object, and mean() is called directly on that result.
df.groupby(by="animal").mean()

Unnamed: 0_level_0,iq,age
animal,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,107.616368,7.75
llama,111.640835,6.75
shoe,106.291704,20.5


In [19]:
# Chaining multiple methods

In [20]:
# One method per line: group by animal, average each column,
# then order the resulting rows by mean iq (ascending, the default).
(
    df.groupby("animal")
    .mean()
    .sort_values(by="iq")
)

Unnamed: 0_level_0,iq,age
animal,Unnamed: 1_level_1,Unnamed: 2_level_1
shoe,106.291704,20.5
cat,107.616368,7.75
llama,111.640835,6.75
