In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [2]:
# Seed NumPy's global generator so the random columns -- and every output
# further down that depends on them -- come out the same on each fresh run.
np.random.seed(12345)

# Demo frame: two string key columns (k1, k2) and two columns of random
# normal draws (dataset1, dataset2), five rows in total.
dframe = DataFrame({'k1':['X','X','Y','Y','Z'],
                    'k2':['alpha','beta','alpha','beta','alpha'],
                    'dataset1':np.random.randn(5),
                    'dataset2':np.random.randn(5)})

dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,0.309966,-0.734434,X,alpha
1,-0.567035,0.987704,X,beta
2,-0.128413,0.932463,Y,alpha
3,0.292247,0.889374,Y,beta
4,0.696003,0.367252,Z,alpha


In [3]:
# Group the dataset1 Series using the k1 Series as the key; nothing is
# computed until an aggregation is called on the result.
group1 = dframe.dataset1.groupby(by=dframe.k1)

In [4]:
# Inspect the grouped object itself: only its repr is shown, no values.
group1

<pandas.core.groupby.SeriesGroupBy object at 0x10ebf5a90>

In [5]:
# Mean of dataset1 within each k1 group.
group1.mean()

k1
X   -0.128534
Y    0.081917
Z    0.696003
Name: dataset1, dtype: float64

In [7]:
# External grouping keys: one label per row of the frame (five each).
cities = np.array('NY LA LA NY NY'.split())

month = np.array('JAN FEB JAN FEB JAN'.split())

In [8]:
# Mean of dataset1 for every (city, month) pair, using the two label
# arrays as grouping keys instead of columns of the frame.
dframe.dataset1.groupby(by=[cities, month]).mean()

LA  FEB    1.363261
    JAN    0.030641
NY  FEB   -0.145497
    JAN   -0.056705
Name: dataset1, dtype: float64

In [9]:
# Show the full frame again for reference.
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,1.398006,2.536178,X,alpha
1,1.363261,-2.000787,X,beta
2,0.030641,-2.636798,Y,alpha
3,-0.145497,-1.235475,Y,beta
4,-1.511416,-0.860866,Z,alpha


In [10]:
# Passing a column name as the group key.
# numeric_only=True limits the mean to the numeric columns; without it,
# pandas >= 2.0 tries to average the string column 'k2' and raises TypeError.
dframe.groupby('k1').mean(numeric_only=True)

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,1.380633,0.267696
Y,-0.057428,-1.936137
Z,-1.511416,-0.860866


In [12]:
# Group by two key columns at once: the means are reported for each
# (k1, k2) combination present in the data.
dframe.groupby(by=['k1', 'k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,1.398006,2.536178
X,beta,1.363261,-2.000787
Y,alpha,0.030641,-2.636798
Y,beta,-0.145497,-1.235475
Z,alpha,-1.511416,-0.860866


In [13]:
# Number of rows that fall into each k1 group.
dframe.groupby('k1').size()

k1
X    2
Y    2
Z    1
dtype: int64

In [14]:
# Display the frame once more before iterating over its groups.
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,1.398006,2.536178,X,alpha
1,1.363261,-2.000787,X,beta
2,0.030641,-2.636798,Y,alpha
3,-0.145497,-1.235475,Y,beta
4,-1.511416,-0.860866,Z,alpha


In [17]:
# Walk the groups formed by one key: every step yields the key value
# together with the sub-frame of rows that share it.
for key, rows in dframe.groupby('k1'):
    print("This is the {} group".format(key))
    print(rows)
    print('\n')

This is the X group
   dataset1  dataset2 k1     k2
0  1.398006  2.536178  X  alpha
1  1.363261 -2.000787  X   beta


This is the Y group
   dataset1  dataset2 k1     k2
2  0.030641 -2.636798  Y  alpha
3 -0.145497 -1.235475  Y   beta


This is the Z group
   dataset1  dataset2 k1     k2
4 -1.511416 -0.860866  Z  alpha




In [22]:
# Walk the groups formed by two keys: each step yields a (k1, k2) pair
# of key values and the sub-frame of rows matching that pair.
for keys, rows in dframe.groupby(['k1','k2']):
    print("Key1 = {} \tKey2 = {}".format(*keys))
    print(rows)
    print('\n')

Key1 = X 	Key2 = alpha
   dataset1  dataset2 k1     k2
0  1.398006  2.536178  X  alpha


Key1 = X 	Key2 = beta
   dataset1  dataset2 k1    k2
1  1.363261 -2.000787  X  beta


Key1 = Y 	Key2 = alpha
   dataset1  dataset2 k1     k2
2  0.030641 -2.636798  Y  alpha


Key1 = Y 	Key2 = beta
   dataset1  dataset2 k1    k2
3 -0.145497 -1.235475  Y  beta


Key1 = Z 	Key2 = alpha
   dataset1  dataset2 k1     k2
4 -1.511416 -0.860866  Z  alpha




In [20]:
# Display the frame again; it is unchanged by the loops above.
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,1.398006,2.536178,X,alpha
1,1.363261,-2.000787,X,beta
2,0.030641,-2.636798,Y,alpha
3,-0.145497,-1.235475,Y,beta
4,-1.511416,-0.860866,Z,alpha


In [24]:
# Build a dictionary mapping each k1 value to the sub-frame of rows that
# have that value, so a group's rows can be looked up directly by key.
group_dict = {key: piece for key, piece in dframe.groupby('k1')}

In [25]:
# Look up the sub-frame stored under the key 'X'.
group_dict['X']

Unnamed: 0,dataset1,dataset2,k1,k2
0,1.398006,2.536178,X,alpha
1,1.363261,-2.000787,X,beta


In [26]:
# Split the columns into one sub-frame per dtype, keyed by the dtype object.
# A boolean column mask is used instead of groupby(..., axis=1), because
# column-wise groupby is deprecated as of pandas 2.1.
group_dict_axis1 = {dtype: dframe.loc[:, dframe.dtypes == dtype]
                    for dtype in dframe.dtypes.unique()}

In [28]:
# Show the dictionary: each dtype key maps to the columns of that dtype.
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0  1.398006  2.536178
 1  1.363261 -2.000787
 2  0.030641 -2.636798
 3 -0.145497 -1.235475
 4 -1.511416 -0.860866, dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [30]:
# Pick a single column out of the grouped frame. The list-style selection
# [['dataset2']] keeps the aggregated result as a one-column table.
dataset2_group = dframe.groupby(by=['k1', 'k2'])[['dataset2']]

dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,2.536178
X,beta,-2.000787
Y,alpha,-2.636798
Y,beta,-1.235475
Z,alpha,-0.860866
