In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [2]:
# Build a small demo table (DataFrame) from a dictionary of columns.
#   'k1'       - first grouping key: single-letter labels
#   'k2'       - second grouping key: 'alpha' / 'beta' labels
#   'dataset1' - five draws from a standard normal distribution
#   'dataset2' - five more draws from a standard normal distribution
# The random columns are not seeded, so the values change on every run.
dframe = pd.DataFrame(data={
    'k1': ['X', 'X', 'Y', 'Y', 'Z'],
    'k2': ['alpha', 'beta', 'alpha', 'beta', 'alpha'],
    'dataset1': np.random.randn(5),
    'dataset2': np.random.randn(5),
})

# Display the table
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.014706,-2.386859,X,alpha
1,0.018785,-0.414254,X,beta
2,0.293697,0.040845,Y,alpha
3,0.667928,-1.518187,Y,beta
4,0.138655,0.427952,Z,alpha


In [4]:
# Basic groupby: split the values of one column by the labels of another.

# Take the 'dataset1' column and bucket its values by the matching 'k1' label
group1 = dframe['dataset1'].groupby(by=dframe['k1'])

# Displaying the GroupBy object only shows its repr (type and address),
# not the grouped data itself.
group1

<pandas.core.groupby.SeriesGroupBy object at 0x1068edcd0>

In [6]:
# Aggregations can now be applied to the grouped object.
# This returns the mean of 'dataset1' for each distinct 'k1' label.
group1.mean()

k1
X    0.002039
Y    0.480813
Z    0.138655
Name: dataset1, dtype: float64

In [7]:
# Group keys do not have to be columns of the frame: any arrays of the
# same length as the data can serve as keys.

# Two NumPy arrays, one label per row of dframe
cities = np.array(['NY', 'LA', 'LA', 'NY', 'NY'])
month = np.array(['JAN', 'FEB', 'JAN', 'FEB', 'JAN'])

# Mean of 'dataset1' for every (city, month) combination;
# the key order in the list sets the index levels: city first, then month.
dframe['dataset1'].groupby(by=[cities, month]).mean()

LA  FEB    0.018785
    JAN    0.293697
NY  FEB    0.667928
    JAN    0.061975
Name: dataset1, dtype: float64

In [8]:
# Display the original dframe again for reference before grouping by column name.
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.014706,-2.386859,X,alpha
1,0.018785,-0.414254,X,beta
2,0.293697,0.040845,Y,alpha
3,0.667928,-1.518187,Y,beta
4,0.138655,0.427952,Z,alpha


In [9]:
# Column names can also be passed directly as group keys.
# Rows are grouped by their 'k1' label (X, Y, Z) and the two numeric
# columns are averaged within each group.
# The numeric columns are selected explicitly because the string column 'k2'
# cannot be averaged: recent pandas versions raise a TypeError instead of
# silently dropping it.
dframe.groupby('k1')[['dataset1', 'dataset2']].mean()

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,0.002039,-1.400556
Y,0.480813,-0.738671
Z,0.138655,0.427952


In [10]:
# A list of column names groups on several keys at once: the result is
# indexed by every ('k1', 'k2') pair, with the remaining columns averaged.
dframe.groupby(by=['k1', 'k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,-0.014706,-2.386859
X,beta,0.018785,-0.414254
Y,alpha,0.293697,0.040845
Y,beta,0.667928,-1.518187
Z,alpha,0.138655,0.427952


In [11]:
# size() reports how many rows fall into each group,
# i.e. the number of X, Y and Z entries in the 'k1' column.
dframe.groupby(by=['k1']).size()

k1
X    2
Y    2
Z    1
dtype: int64

In [12]:
# A GroupBy object can be iterated over: each step yields the group key
# together with the sub-DataFrame holding that group's rows.
# print is called in its function form so the cell runs on Python 3 as well
# as Python 2 (a single parenthesised argument prints the same on both).
for name, group in dframe.groupby('k1'):
    print("This is the %s group" % name)
    print(group)
    print('\n')

This is the X group
   dataset1  dataset2 k1     k2
0 -0.014706 -2.386859  X  alpha
1  0.018785 -0.414254  X   beta


This is the Y group
   dataset1  dataset2 k1     k2
2  0.293697  0.040845  Y  alpha
3  0.667928 -1.518187  Y   beta


This is the Z group
   dataset1  dataset2 k1     k2
4  0.138655  0.427952  Z  alpha




In [14]:
# Iterating with multiple keys: the group key is then a tuple, unpacked
# here into (k1, k2). For each group we print its keys, its rows and a
# blank separator line.
# print is called in its function form so the cell runs on Python 3 as well
# as Python 2 (a single parenthesised argument prints the same on both).
for (k1, k2), group in dframe.groupby(['k1', 'k2']):
    print("Key1 = %s Key2 = %s" % (k1, k2))
    print(group)
    print('\n')

Key1 = X Key2 = alpha
   dataset1  dataset2 k1     k2
0 -0.014706 -2.386859  X  alpha


Key1 = X Key2 = beta
   dataset1  dataset2 k1    k2
1  0.018785 -0.414254  X  beta


Key1 = Y Key2 = alpha
   dataset1  dataset2 k1     k2
2  0.293697  0.040845  Y  alpha


Key1 = Y Key2 = beta
   dataset1  dataset2 k1    k2
3  0.667928 -1.518187  Y  beta


Key1 = Z Key2 = alpha
   dataset1  dataset2 k1     k2
4  0.138655  0.427952  Z  alpha




In [16]:
# It can be handy to keep the pieces of a groupby in a dictionary.
# Iterating the GroupBy yields (key, sub-DataFrame) pairs, which map
# directly onto dictionary entries keyed by the 'k1' label.
group_dict = {label: piece for label, piece in dframe.groupby('k1')}

# Look up the rows belonging to the 'X' group
group_dict['X']

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.014706,-2.386859,X,alpha
1,0.018785,-0.414254,X,beta


In [17]:
# The same idea works along the columns (axis 1) instead of the rows:
# here dframe is split by data type, separating the numeric columns
# from the string columns.
# groupby(..., axis=1) is deprecated in modern pandas, so the column
# names are bucketed by dtype directly; the resulting dictionary is the same.
columns_by_dtype = {}
for column, dtype in zip(dframe.columns, dframe.dtypes):
    columns_by_dtype.setdefault(dtype, []).append(column)

# Dictionary mapping each dtype to the sub-DataFrame made of its columns
group_dict_axis1 = {dtype: dframe[columns]
                    for dtype, columns in columns_by_dtype.items()}

# Show
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0 -0.014706 -2.386859
 1  0.018785 -0.414254
 2  0.293697  0.040845
 3  0.667928 -1.518187
 4  0.138655  0.427952, dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [18]:
# Next we'll learn how to use groupby with columns

In [19]:
# Selecting columns after grouping: group on both keys, then keep only
# 'dataset2'. Indexing with a list ([['dataset2']]) keeps the result a
# DataFrame rather than a Series.
dataset2_group = dframe.groupby(by=['k1', 'k2'])[['dataset2']]

# Mean of 'dataset2' for every ('k1', 'k2') pair
dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,-2.386859
X,beta,-0.414254
Y,alpha,0.040845
Y,beta,-1.518187
Z,alpha,0.427952
