In [1]:
import numpy as np
import pandas as pd

# Demo frame: two string key columns (A, B) plus two columns of random
# standard-normal draws (C, D), eight rows in total.
df = pd.DataFrame({
    'A': 'foo bar foo bar foo bar foo foo'.split(),
    'B': 'one one two three two two one three'.split(),
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.874009,-0.067027
1,bar,one,-0.648909,-0.053985
2,foo,two,-0.096479,0.024463
3,bar,three,-0.698216,1.187525
4,foo,two,0.918855,-0.081976
5,bar,two,1.734519,0.841634
6,foo,one,1.572423,0.423072
7,foo,three,0.904348,0.486459


In [2]:
# Group the rows by the (B, A) pair of key columns. Nothing is aggregated
# yet: displaying `g` only shows the DataFrameGroupBy object itself.
g = df.groupby(['B', 'A'])
g

<pandas.core.groupby.DataFrameGroupBy object at 0x7f0788078810>

In [3]:
# Sum the remaining columns within each (B, A) group. This first expression
# is not the last one in the cell, so its result is not displayed.
g.sum()
# Equivalent spelling through the generic aggregation entry point. The string
# alias 'sum' selects the built-in groupby sum; passing the callable np.sum
# is a deprecated spelling that raises a FutureWarning in recent pandas.
g.aggregate('sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
B,A,Unnamed: 2_level_1,Unnamed: 3_level_1
one,bar,-0.648909,-0.053985
one,foo,0.698413,0.356045
three,bar,-0.698216,1.187525
three,foo,0.904348,0.486459
two,bar,1.734519,0.841634
two,foo,0.822376,-0.057514


In [4]:
# Group by A and B but keep the keys as ordinary columns instead of
# turning them into the index of the result.
df.groupby(by=['A', 'B'], as_index=False).sum()
# Equivalent: aggregate with the keys in the index, then move them back
# into columns.
df.groupby(by=['A', 'B']).sum().reset_index()

Unnamed: 0,A,B,C,D
0,bar,one,-0.648909,-0.053985
1,bar,three,-0.698216,1.187525
2,bar,two,1.734519,0.841634
3,foo,one,0.698413,0.356045
4,foo,three,0.904348,0.486459
5,foo,two,0.822376,-0.057514


In [5]:
def get_letter_type(letter):
    """Classify a single letter as ``'vowel'`` or ``'consonant'``.

    Parameters
    ----------
    letter : str
        The label to classify; the comparison is case-insensitive.

    Returns
    -------
    str
        ``'vowel'`` if ``letter`` is exactly one of a, e, i, o, u (in either
        case), otherwise ``'consonant'``.
    """
    # Compare against individual letters rather than the string 'aeiou':
    # string containment is a substring test, so '' and 'ae' would both
    # count as vowels.
    if letter.lower() in ('a', 'e', 'i', 'o', 'u'):
        return 'vowel'
    return 'consonant'

# Group the *columns* (axis=1) by calling get_letter_type on each column
# label. This rebinds `g`, which previously held the (B, A) row grouping.
# NOTE(review): groupby(axis=1) is deprecated in recent pandas releases --
# confirm against the pandas version this notebook targets.
g = df.groupby(get_letter_type, axis=1)
# Not the last expression in the cell, so this result is not displayed.
g.get_group('consonant')
# Row-by-row sum across the columns that belong to each group.
g.sum()


Unnamed: 0,consonant,vowel
0,-0.941037,foo
1,-0.702894,bar
2,-0.072016,foo
3,0.48931,bar
4,0.836879,foo
5,2.576153,bar
6,1.995495,foo
7,1.390807,foo


In [6]:
# Mapping from each group label to the column labels assigned to that group.
g.groups

{'consonant': Index([u'B', u'C', u'D'], dtype='object'),
 'vowel': Index([u'A'], dtype='object')}

In [7]:
# Two parallel label lists: the outer level repeats each name twice, the
# inner level alternates 'one' / 'two'.
arrays = [sorted(['bar', 'baz', 'foo', 'qux'] * 2),
          ['one', 'two'] * 4]

# Two-level index whose levels are named 'first' and 'second'.
index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])

# Eight random standard-normal values labelled by that index.
s = pd.Series(np.random.randn(8), index=index)

s

first  second
bar    one       0.515756
       two       0.456570
baz    one       0.424403
       two       0.204368
foo    one       0.361091
       two       0.406260
qux    one       0.251237
       two       0.630529
dtype: float64

In [8]:
# Sum within each value of the outer index level. Not the last expression
# in the cell, so this result is not displayed.
s.groupby(level=0).sum()
# Sum within each value of the inner level, addressed by name.
s.groupby(level='second').sum() # same as level=1
# The same level addressed by position. This replaces the legacy shorthand
# s.sum(level='second'), which was deprecated and removed in pandas 2.0.
s.groupby(level=1).sum() # same as line above

second
one    1.552488
two    1.697727
dtype: float64

In [9]:
# Rebuild the two-level ('first', 'second') index from its label lists.
arrays = [sorted(['bar', 'baz', 'foo', 'qux'] * 2),
          ['one', 'two'] * 4]

index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])

# Note: this rebinds `df` to a new integer-valued frame carried on the
# MultiIndex built above.
df = pd.DataFrame(
    {
        'A': [1] * 4 + [2] * 2 + [3] * 2,
        'B': np.arange(8),
        'C': np.arange(3, 11),
    },
    index=index,
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,1,0,3
bar,two,1,1,4
baz,one,1,2,5
baz,two,1,3,6
foo,one,2,4,7
foo,two,2,5,8
qux,one,3,6,9
qux,two,3,7,10


A DataFrame may be grouped by a combination of columns and index levels by specifying the column names as strings and the index levels as pd.Grouper objects.

In [10]:
# Group by a mix of keys: the 2nd index level together with column 'A'.
# Explicit form: the index level is wrapped in a pd.Grouper.
df.groupby(by=[pd.Grouper(level=1), 'A']).sum()
# Equivalent form: pass the index level name and the column name directly
# as keys to groupby().
df.groupby(by=['second', 'A']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,B,C
second,A,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1,2,8
one,2,4,7
one,3,6,9
two,1,4,10
two,2,5,8
two,3,7,10


In [11]:
# Re-display the MultiIndexed frame for reference.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,1,0,3
bar,two,1,1,4
baz,one,1,2,5
baz,two,1,3,6
foo,one,2,4,7
foo,two,2,5,8
qux,one,3,6,9
qux,two,3,7,10


In [41]:
# Iterating over a groupby object yields (group key, sub-frame) pairs.
grouped = df.groupby('A')
for key, members in grouped:
    print('name = {}'.format(key))
    print(members)
# A single group can also be fetched directly by its key.
grouped.get_group(1)

name = 1
              A  B  C
first second         
bar   one     1  0  3
      two     1  1  4
baz   one     1  2  5
      two     1  3  6
name = 2
              A  B  C
first second         
foo   one     2  4  7
      two     2  5  8
name = 3
              A  B   C
first second          
qux   one     3  6   9
      two     3  7  10


Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,1,0,3
bar,two,1,1,4
baz,one,1,2,5
baz,two,1,3,6


In [54]:
# Two groups of five rows each (A == 1 and A == 5), with B counting 0..9.
df_re = pd.DataFrame({
    'A': [1, 1, 1, 1, 1, 5, 5, 5, 5, 5],
    'B': np.arange(10),
})
print(df_re)
# An aggregation reduces each group to a single row, whereas a
# transformation keeps one row per input row.

# Aggregation: one summed row per distinct value of 'A'.
df_re.groupby('A').sum()

   A  B
0  1  0
1  1  1
2  1  2
3  1  3
4  1  4
5  5  5
6  5  6
7  5  7
8  5  8
9  5  9


Unnamed: 0_level_0,B
A,Unnamed: 1_level_1
1,10
5,35


In [56]:
# Transform-style operation: an expanding (cumulative) sum within each 'A'
# group. The result keeps one row per input row instead of collapsing each
# group to a single row.
df_re.groupby('A').expanding().sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,1.0,0.0
1,1,2.0,1.0
1,2,3.0,3.0
1,3,4.0,6.0
1,4,5.0,10.0
5,5,5.0,5.0
5,6,10.0,11.0
5,7,15.0,18.0
5,8,20.0,26.0
5,9,25.0,35.0


In [57]:
# filter() returns a subset of the original object: whole groups are kept
# or discarded according to a per-group predicate.
dff = pd.DataFrame({
    'A': np.arange(8),
    'B': ['a'] * 2 + ['b'] * 4 + ['c'] * 2,
})
print(dff)

# Keep only the groups that contain more than 2 rows.
dff.groupby('B').filter(lambda grp: len(grp) > 2)

   A  B
0  0  a
1  1  a
2  2  b
3  3  b
4  4  b
5  5  b
6  6  c
7  7  c


Unnamed: 0,A,B
2,2,b
3,3,b
4,4,b
5,5,b


In [58]:
# With dropna=False the rows of rejected groups are not removed; they are
# kept and filled with NaN instead.
dff.groupby('B').filter(
    lambda grp: len(grp) > 2,
    dropna=False,
)

Unnamed: 0,A,B
0,,
1,,
2,2.0,b
3,3.0,b
4,4.0,b
5,5.0,b
6,,
7,,


In [59]:
# apply() runs an arbitrary function on each group's sub-frame and combines
# the per-group results; here each group is summarised with describe().
dff.groupby('B').apply(
    lambda grp: grp.describe()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,A
B,Unnamed: 1_level_1,Unnamed: 2_level_1
a,count,2.0
a,mean,0.5
a,std,0.707107
a,min,0.0
a,25%,0.25
a,50%,0.5
a,75%,0.75
a,max,1.0
b,count,4.0
b,mean,3.5


In [61]:
# Categorical values can serve as groupby keys.
# Draw 100 random standard-normal values ...
data = pd.Series(np.random.randn(100))

# ... bin them at the 0 / 25 / 50 / 75 / 100 % quantiles ...
factor = pd.qcut(data, [0.0, 0.25, 0.5, 0.75, 1.0])

# ... and average the values that fall into each bin.
data.groupby(factor).mean()

(-2.53, -0.781]     -1.264460
(-0.781, -0.0193]   -0.433101
(-0.0193, 0.53]      0.266350
(0.53, 2.139]        1.081322
dtype: float64