In [2]:
import numpy as np 
import pandas as pd
from pandas import DataFrame, Series

In [3]:
dframe = DataFrame({'k1':['X','X','Y','Y','Z'],
                    'k2':['alpha','beta','alpha','beta','alpha'],
                    'dataset1':np.random.randn(5),
                    'dataset2':np.random.randn(5)
                   })
dframe

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.304272,3.631047,X,alpha
1,0.082075,-0.467932,X,beta
2,-0.994337,-0.046106,Y,alpha
3,0.11006,-1.931603,Y,beta
4,-0.041135,1.229793,Z,alpha


In [4]:
group1 = dframe['dataset1'].groupby(dframe['k1'])

In [5]:
group1

<pandas.core.groupby.SeriesGroupBy object at 0x0000000005973C18>

In [6]:
group1.mean()

k1
X   -0.111099
Y   -0.442139
Z   -0.041135
Name: dataset1, dtype: float64

In [7]:
cities = np.array(['NY','LA','LA','NY','NY'])
month = np.array(['Jan','Feb','Jan','Feb','Jan'])

In [8]:
dframe['dataset1'].groupby([cities,month]).mean()

LA  Feb    0.082075
    Jan   -0.994337
NY  Feb    0.110060
    Jan   -0.172703
Name: dataset1, dtype: float64

In [9]:
dframe.groupby('k1').mean()

Unnamed: 0_level_0,dataset1,dataset2
k1,Unnamed: 1_level_1,Unnamed: 2_level_1
X,-0.111099,1.581558
Y,-0.442139,-0.988854
Z,-0.041135,1.229793


In [10]:
dframe.groupby(['k1','k2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset1,dataset2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
X,alpha,-0.304272,3.631047
X,beta,0.082075,-0.467932
Y,alpha,-0.994337,-0.046106
Y,beta,0.11006,-1.931603
Z,alpha,-0.041135,1.229793


In [11]:
dframe.groupby('k1').size()

k1
X    2
Y    2
Z    1
dtype: int64

In [12]:
for name, group in dframe.groupby('k1'):
    print("This is the %s group"%name)
    print(group)
    print('\n')

This is the X group
   dataset1  dataset2 k1     k2
0 -0.304272  3.631047  X  alpha
1  0.082075 -0.467932  X   beta


This is the Y group
   dataset1  dataset2 k1     k2
2 -0.994337 -0.046106  Y  alpha
3  0.110060 -1.931603  Y   beta


This is the Z group
   dataset1  dataset2 k1     k2
4 -0.041135  1.229793  Z  alpha




In [13]:
for (k1,k2), group in dframe.groupby(['k1','k2']):
    print("key1 = %s, key2 = %s "%(k1, k2))
    print(group)
    print("\n")

key1 = X, key2 = alpha 
   dataset1  dataset2 k1     k2
0 -0.304272  3.631047  X  alpha


key1 = X, key2 = beta 
   dataset1  dataset2 k1    k2
1  0.082075 -0.467932  X  beta


key1 = Y, key2 = alpha 
   dataset1  dataset2 k1     k2
2 -0.994337 -0.046106  Y  alpha


key1 = Y, key2 = beta 
   dataset1  dataset2 k1    k2
3   0.11006 -1.931603  Y  beta


key1 = Z, key2 = alpha 
   dataset1  dataset2 k1     k2
4 -0.041135  1.229793  Z  alpha




In [14]:
group_dict = dict(list(dframe.groupby('k1')))
group_dict

{'X':    dataset1  dataset2 k1     k2
 0 -0.304272  3.631047  X  alpha
 1  0.082075 -0.467932  X   beta, 'Y':    dataset1  dataset2 k1     k2
 2 -0.994337 -0.046106  Y  alpha
 3  0.110060 -1.931603  Y   beta, 'Z':    dataset1  dataset2 k1     k2
 4 -0.041135  1.229793  Z  alpha}

In [15]:
group_dict['X']

Unnamed: 0,dataset1,dataset2,k1,k2
0,-0.304272,3.631047,X,alpha
1,0.082075,-0.467932,X,beta


In [16]:
group_dict_axis1 = dict(list(dframe.groupby(dframe.dtypes, axis = 1)))

In [17]:
group_dict_axis1

{dtype('float64'):    dataset1  dataset2
 0 -0.304272  3.631047
 1  0.082075 -0.467932
 2 -0.994337 -0.046106
 3  0.110060 -1.931603
 4 -0.041135  1.229793, dtype('O'):   k1     k2
 0  X  alpha
 1  X   beta
 2  Y  alpha
 3  Y   beta
 4  Z  alpha}

In [18]:
dataset2_group = dframe.groupby(['k1','k2'])[['dataset2']]

dataset2_group.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset2
k1,k2,Unnamed: 2_level_1
X,alpha,3.631047
X,beta,-0.467932
Y,alpha,-0.046106
Y,beta,-1.931603
Z,alpha,1.229793


In [21]:
animals = DataFrame(np.arange(16).reshape(4,4),
                   columns = ['W','X','Y','Z'],
                   index = ['Dog','Cat','Bird','Mouse']
                  )
animals

Unnamed: 0,W,X,Y,Z
Dog,0,1,2,3
Cat,4,5,6,7
Bird,8,9,10,11
Mouse,12,13,14,15


In [23]:
animals.ix[1:2,['W','Y']] = np.nan

In [28]:
animals

Unnamed: 0,W,X,Y,Z
Dog,0.0,1,2.0,3
Cat,,5,,7
Bird,8.0,9,10.0,11
Mouse,12.0,13,14.0,15


In [29]:
behavior_map = {'W':'good','X':'bad','Y':'good','Z':'bad'}

In [30]:
animals_col = animals.groupby(behavior_map, axis = 1)
animals_col.sum()

Unnamed: 0,bad,good
Dog,4.0,2.0
Cat,12.0,0.0
Bird,20.0,18.0
Mouse,28.0,26.0


In [31]:
behav_series = Series(behavior_map)
behav_series

W    good
X     bad
Y    good
Z     bad
dtype: object

In [34]:
animals.groupby(behav_series,axis = 1).count()

Unnamed: 0,bad,good
Dog,2,2
Cat,2,0
Bird,2,2
Mouse,2,2


In [35]:
print(animals)
animals.groupby(len).sum()

          W   X     Y   Z
Dog     0.0   1   2.0   3
Cat     NaN   5   NaN   7
Bird    8.0   9  10.0  11
Mouse  12.0  13  14.0  15


Unnamed: 0,W,X,Y,Z
3,0.0,6,2.0,10
4,8.0,9,10.0,11
5,12.0,13,14.0,15


In [36]:
keys = ['A','B','A','B']

In [39]:
animals.groupby([len,keys]).max()

Unnamed: 0,Unnamed: 1,W,X,Y,Z
3,A,0.0,1,2.0,3
3,B,,5,,7
4,A,8.0,9,10.0,11
5,B,12.0,13,14.0,15


In [42]:
hier_col = pd.MultiIndex.from_arrays([['NY','NY','NY','Sf','SF'],[1,2,3,1,3]],
                                    names = ['City','sub_value'])

In [47]:
dframe_hr = DataFrame(np.arange(25).reshape(5,5), columns = hier_col)

In [48]:
dframe_hr = dframe_hr * 100

In [49]:
dframe_hr

City,NY,NY,NY,Sf,SF
sub_value,1,2,3,1,3
0,0,100,200,300,400
1,500,600,700,800,900
2,1000,1100,1200,1300,1400
3,1500,1600,1700,1800,1900
4,2000,2100,2200,2300,2400
