In [1]:
import pandas as pd
import numpy as np

In [2]:
# Draw from a dedicated, seeded generator so the frame is identical on every
# "Restart & Run All" instead of changing with each kernel session.
random_state = np.random.RandomState(0)
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': random_state.randn(5),
    'data2': random_state.randn(5),
})

In [3]:
# Display the sample frame: two key columns and two numeric data columns.
df

Unnamed: 0,data1,data2,key1,key2
0,-0.376335,0.164818,a,one
1,-0.987691,0.163065,a,two
2,0.445739,-0.205921,b,one
3,-0.250776,0.945238,b,two
4,0.587784,0.133218,a,one


In [4]:
# Group the data1 values, using the key1 column as the group labels.
key1_labels = df['key1']
grouped = df['data1'].groupby(key1_labels)

In [5]:
# Echo the groupby object itself (its repr), not an aggregated result.
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x0000007C0B5C5C18>

In [6]:
# Mean of data1 within each key1 group.
grouped.mean()

key1
a   -0.258747
b    0.097481
Name: data1, dtype: float64

In [7]:
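# groupby follows the split-apply-combine pattern: split the rows by key,
# apply a function to each group, then combine the per-group results.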

In [10]:
# Group data1 by the (key1, key2) pair and average each group; the result is
# a Series indexed by both keys.
pair_keys = [df['key1'], df['key2']]
means = df['data1'].groupby(pair_keys).mean()

In [11]:
# Display the per-(key1, key2) means of data1.
means

key1  key2
a     one     0.105724
      two    -0.987691
b     one     0.445739
      two    -0.250776
Name: data1, dtype: float64

In [12]:
# Pivot the inner index level (key2) into columns: key1 rows x key2 columns.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.105724,-0.987691
b,0.445739,-0.250776


In [13]:
# Average only the numeric columns per key1 group. Selecting them explicitly
# keeps the string column key2 out of the reduction; pandas >= 2.0 no longer
# drops such columns silently and raises TypeError instead.
df.groupby("key1")[["data1", "data2"]].mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.258747,0.1537
b,0.097481,0.369659


In [14]:
# Count the values in every non-key column for each key1 group.
df.groupby('key1').count()

Unnamed: 0_level_0,data1,data2,key2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,3,3,3
b,2,2,2


In [16]:
# Group on both key columns at once; only data1 and data2 remain to average.
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.105724,0.149018
a,two,-0.987691,0.163065
b,one,0.445739,-0.205921
b,two,-0.250776,0.945238


In [17]:
# Number of values per (key1, key2) group in each data column.
df.groupby(['key1', 'key2']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,2,2
a,two,1,1
b,one,1,1
b,two,1,1


In [18]:
# Select a single column from the grouped frame before aggregating, so the
# result is a Series of data2 means indexed by (key1, key2).
df.groupby(['key1', 'key2'])['data2'].mean()

key1  key2
a     one     0.149018
      two     0.163065
b     one    -0.205921
      two     0.945238
Name: data2, dtype: float64

In [19]:
# Grouping a whole DataFrame yields a DataFrameGroupBy object; show its repr.
df.groupby('key1')

<pandas.core.groupby.DataFrameGroupBy object at 0x0000007C0B10FD30>

In [20]:
# Iterating a groupby yields (group key, sub-frame) pairs; print each pair on
# separate lines.
for key, frame in df.groupby("key1"):
    print(key, frame, sep="\n")

a
      data1     data2 key1 key2
0 -0.376335  0.164818    a  one
1 -0.987691  0.163065    a  two
4  0.587784  0.133218    a  one
b
      data1     data2 key1 key2
2  0.445739 -0.205921    b  one
3 -0.250776  0.945238    b  two


In [22]:
# With several grouping columns, each group key is a tuple of the key values.
for key_pair, frame in df.groupby(['key1', 'key2']):
    first_key, second_key = key_pair
    print(first_key, second_key)
    print(frame)

a one
      data1     data2 key1 key2
0 -0.376335  0.164818    a  one
4  0.587784  0.133218    a  one
a two
      data1     data2 key1 key2
1 -0.987691  0.163065    a  two
b one
      data1     data2 key1 key2
2  0.445739 -0.205921    b  one
b two
      data1     data2 key1 key2
3 -0.250776  0.945238    b  two


In [23]:
# Materialise the groups as a plain dict mapping each key1 value to its
# sub-frame.
pieces = {key: frame for key, frame in df.groupby("key1")}

In [24]:
# Show the whole dict of group key -> sub-frame.
pieces

{'a':       data1     data2 key1 key2
 0 -0.376335  0.164818    a  one
 1 -0.987691  0.163065    a  two
 4  0.587784  0.133218    a  one, 'b':       data1     data2 key1 key2
 2  0.445739 -0.205921    b  one
 3 -0.250776  0.945238    b  two}

In [25]:
# Look up the sub-frame holding the rows whose key1 is 'b'.
pieces['b']

Unnamed: 0,data1,data2,key1,key2
2,0.445739,-0.205921,b,one
3,-0.250776,0.945238,b,two


In [26]:
# 5x5 frame of uniform [0, 1) values with named rows and lettered columns.
# The values come from a seeded generator so the notebook reproduces the same
# numbers on every "Restart & Run All".
df2 = pd.DataFrame(
    np.random.RandomState(1).rand(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=['a', 'b', 'c', 'd', 'e'],
)

In [27]:
# Display the second sample frame (people as rows, columns 'a' through 'e').
df2

Unnamed: 0,a,b,c,d,e
Joe,0.83556,0.138188,0.361133,0.766631,0.730745
Steve,0.333403,0.467382,0.339506,0.499131,0.837752
Wes,0.417295,0.20797,0.773529,0.486885,0.592073
Jim,0.461641,0.038569,0.330362,0.293128,0.34197
Travis,0.852069,0.055351,0.373113,0.062404,0.27038


In [28]:
# Column label -> colour group. 'f' matches no column of df2, so it is an
# unused entry in the mapping.
map_dict = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',
}

In [29]:
# Sum the columns of df2 by colour group. groupby(..., axis=1) is deprecated
# in pandas 2.1, so group the transposed frame by its row labels and transpose
# back; the result is still people x colour.
df2.T.groupby(map_dict).sum().T

Unnamed: 0,blue,red
Joe,1.127764,1.704493
Steve,0.838636,1.638538
Wes,1.260414,1.217338
Jim,0.62349,0.84218
Travis,0.435517,1.1778


In [30]:
# The same label -> colour mapping as a Series (index = labels, values = colours).
map_s = pd.Series(map_dict)

In [32]:
# Display the mapping Series.
map_s

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [31]:
# Count how many columns of df2 fall into each colour group, per row.
# groupby(..., axis=1) is deprecated in pandas 2.1, so group the transposed
# frame by the mapping Series and transpose the counts back.
df2.T.groupby(map_s).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,2,3
Jim,2,3
Travis,2,3


In [33]:
# Re-display the first sample frame before the aggregation examples.
df

Unnamed: 0,data1,data2,key1,key2
0,-0.376335,0.164818,a,one
1,-0.987691,0.163065,a,two
2,0.445739,-0.205921,b,one
3,-0.250776,0.945238,b,two
4,0.587784,0.133218,a,one


In [34]:
# NOTE: this rebinds `grouped`. Earlier in the notebook it was the groupby of
# the data1 Series alone; from here on it is the whole frame grouped by key1.
grouped = df.groupby('key1')

In [35]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` must provide ``max()`` and ``min()`` methods whose results can be
    subtracted from one another.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [36]:
# Apply the custom aggregation to the numeric columns only. The string column
# key2 cannot be subtracted (max() - min()), and pandas >= 2.0 raises
# TypeError for it instead of silently dropping the column.
grouped[['data1', 'data2']].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.575474,0.0316
b,0.696515,1.151159


In [37]:
# Built-in aggregations can be named by string. Restrict the call to the
# numeric columns: a standard deviation of the string column key2 is
# undefined and raises TypeError on pandas >= 2.0.
grouped[['data1', 'data2']].agg("std")

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.794292,0.01776
b,0.49251,0.813993


In [38]:
# Summary statistics (count, mean, std, min, quartiles, max) per key1 group
# for each data column.
grouped.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,-0.258747,0.794292,-0.987691,-0.682013,-0.376335,0.105724,0.587784,3.0,0.1537,0.01776,0.133218,0.148141,0.163065,0.163941,0.164818
b,2.0,0.097481,0.49251,-0.250776,-0.076647,0.097481,0.27161,0.445739,2.0,0.369659,0.813993,-0.205921,0.081869,0.369659,0.657449,0.945238
