In [1]:
# GroupBy技术

In [51]:
from pandas import Series, DataFrame
import numpy as np
import pandas as pd

In [8]:
# Demo frame used throughout the notebook: two label columns to group on
# (key1, key2) and two columns of standard-normal noise to aggregate.
df = DataFrame(
    dict(
        key1=list('aabba'),
        key2=['one', 'two'] * 2 + ['one'],
        data1=np.random.randn(5),
        data2=np.random.randn(5),
    )
)
df

Unnamed: 0,data1,data2,key1,key2
0,-0.755715,-1.275347,a,one
1,-0.94191,0.232741,a,two
2,2.15706,0.468057,b,one
3,-0.664274,-1.29586,b,two
4,-0.656189,-1.292306,a,one


In [9]:
# Group the data1 values by the labels in key1. The cell displays the
# resulting SeriesGroupBy object; aggregations are called on it below.
grouped = df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x0000020B1BA9CB38>

In [10]:
# Mean of data1 within each key1 group.
grouped.mean()

key1
a   -0.784605
b    0.746393
Name: data1, dtype: float64

In [11]:
# Passing a list of two key Series groups on their combinations; the
# resulting means are indexed by (key1, key2).
means = df['data1'].groupby([df['key1'],df['key2']]).mean()
means

key1  key2
a     one    -0.705952
      two    -0.941910
b     one     2.157060
      two    -0.664274
Name: data1, dtype: float64

In [12]:
# Pivot the inner index level (key2) into columns.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.705952,-0.94191
b,2.15706,-0.664274


In [13]:
# Group keys do not have to be columns of the frame: here two standalone
# arrays, one entry per row of df, label each row with a (state, year) pair.
states = np.array(['Ohio','California','California','Ohio','Ohio'])
years = np.array([2005,2005,2006,2005,2006])

# Mean of data1 for every (state, year) combination.
df['data1'].groupby([states, years]).mean()

California  2005   -0.941910
            2006    2.157060
Ohio        2005   -0.709995
            2006   -0.656189
Name: data1, dtype: float64

In [14]:
# Average the numeric columns within each key1 group. They are selected
# explicitly because key2 holds strings: older pandas silently dropped such
# columns from mean(), whereas newer releases raise a TypeError instead.
df.groupby('key1')[['data1', 'data2']].mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.784605,-0.778304
b,0.746393,-0.413901


In [15]:
# Group on both key columns and average the remaining columns.
df.groupby(['key1','key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.705952,-1.283827
a,two,-0.94191,0.232741
b,one,2.15706,0.468057
b,two,-0.664274,-1.29586


In [16]:
# Number of rows in each (key1, key2) group.
df.groupby(['key1','key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [17]:
# 对分组进行迭代

In [19]:
# Iterating a GroupBy yields (group key, sub-frame) pairs; print each key
# on its own line followed by the rows belonging to it.
for key1_value, rows in df.groupby('key1'):
    print(key1_value, rows, sep='\n')

a
      data1     data2 key1 key2
0 -0.755715 -1.275347    a  one
1 -0.941910  0.232741    a  two
4 -0.656189 -1.292306    a  one
b
      data1     data2 key1 key2
2  2.157060  0.468057    b  one
3 -0.664274 -1.295860    b  two


In [20]:
# With two grouping columns the group key is a (key1, key2) tuple; print
# its parts space-separated, then the rows of that group.
for group_key, rows in df.groupby(['key1', 'key2']):
    print(*group_key)
    print(rows)

a one
      data1     data2 key1 key2
0 -0.755715 -1.275347    a  one
4 -0.656189 -1.292306    a  one
a two
     data1     data2 key1 key2
1 -0.94191  0.232741    a  two
b one
     data1     data2 key1 key2
2  2.15706  0.468057    b  one
b two
      data1    data2 key1 key2
3 -0.664274 -1.29586    b  two


In [21]:
# Materialise the groups as a {key1 value: sub-frame} dict, then look up
# the rows for key1 == 'b'.
pieces = {key1_value: rows for key1_value, rows in df.groupby('key1')}
pieces['b']

Unnamed: 0,data1,data2,key1,key2
2,2.15706,0.468057,b,one
3,-0.664274,-1.29586,b,two


In [22]:
# Column dtypes of df (two float columns, two object columns).
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [23]:
# Group the *columns* (axis=1) by their dtype and show the pieces as a
# {dtype: sub-frame} dict.
# NOTE(review): column-wise grouping via axis=1 is reported as deprecated in
# newer pandas releases; confirm against the installed version before upgrading.
grouped = df.groupby(df.dtypes, axis=1)
{dtype: columns_of_dtype for dtype, columns_of_dtype in grouped}

{dtype('float64'):       data1     data2
 0 -0.755715 -1.275347
 1 -0.941910  0.232741
 2  2.157060  0.468057
 3 -0.664274 -1.295860
 4 -0.656189 -1.292306, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

In [24]:
# 选取一个或一组列

In [27]:
# Indexing the GroupBy with a list of column names restricts the
# aggregation to those columns; the result here has the single column data2.
df.groupby(['key1','key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-1.283827
a,two,0.232741
b,one,0.468057
b,two,-1.29586


In [28]:
# Indexing with a single column name instead gives a SeriesGroupBy over
# data2, as shown by the object this cell displays.
s_grouped = df.groupby(['key1','key2'])['data2']
s_grouped

<pandas.core.groupby.SeriesGroupBy object at 0x0000020B1BAF3710>

In [29]:
# Mean of data2 for each (key1, key2) pair.
s_grouped.mean()

key1  key2
a     one    -1.283827
      two     0.232741
b     one     0.468057
      two    -1.295860
Name: data2, dtype: float64

In [30]:
# 通过字典或Series进行分组

In [31]:
# 5x5 frame of uniform random values: one row per person, columns a-e.
people = DataFrame(
    data=np.random.rand(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)

In [32]:
# Blank out columns b and c for the row at position 2 so the frame contains
# some missing values. The rows are chosen by position and the columns by
# label; `.loc` on a positional slice of the index expresses that without the
# removed `.ix` indexer.
people.loc[people.index[2:3], ['b', 'c']] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,0.082081,0.241639,0.63238,0.089753,0.988804
Steve,0.421158,0.328599,0.968638,0.480866,0.879012
Wes,0.371469,,,0.428397,0.851432
Jim,0.723372,0.453935,0.463544,0.617925,0.629778
Travis,0.219426,0.851317,0.07933,0.835396,0.118194


In [33]:
# Column label -> group name. The 'f' entry is an extra key on top of the
# five labels a-e.
mapping = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',
}

In [37]:
# Sum the columns of `people` per colour group given by `mapping`.
# Grouping columns through `axis=1` is deprecated in newer pandas, so the
# frame is transposed, grouped along its (now row) labels, and the aggregate
# is transposed back; the displayed table has one row per person and one
# column per group.
by_column = people.T.groupby(mapping)
by_column.sum().T

Unnamed: 0,blue,red
Joe,0.722132,1.312524
Steve,1.449504,1.62877
Wes,0.428397,1.222901
Jim,1.08147,1.807086
Travis,0.914726,1.188936


In [40]:
# The same mapping as a Series: index = column label, value = group name.
map_series = Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [41]:
# Count the non-missing values per person within each colour group.
# The frame is transposed before grouping (and the result transposed back)
# because grouping columns through `axis=1` is deprecated in newer pandas.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


In [44]:
# 通过函数进行分组

In [45]:
# Redisplay the frame for reference.
people

Unnamed: 0,a,b,c,d,e
Joe,0.082081,0.241639,0.63238,0.089753,0.988804
Steve,0.421158,0.328599,0.968638,0.480866,0.879012
Wes,0.371469,,,0.428397,0.851432
Jim,0.723372,0.453935,0.463544,0.617925,0.629778
Travis,0.219426,0.851317,0.07933,0.835396,0.118194


In [43]:
# A function passed as the key is applied to each index label; with len the
# rows are grouped by the length of the person's name and then summed.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,1.176922,0.695574,1.095924,1.136075,2.470015
5,0.421158,0.328599,0.968638,0.480866,0.879012
6,0.219426,0.851317,0.07933,0.835396,0.118194


In [46]:
# Functions can be mixed with explicit label lists: group rows by name
# length and by the position-aligned labels in key_list, taking the minimum.
key_list = ['one'] * 3 + ['two'] * 2
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,0.082081,0.241639,0.63238,0.089753,0.851432
3,two,0.723372,0.453935,0.463544,0.617925,0.629778
5,one,0.421158,0.328599,0.968638,0.480866,0.879012
6,two,0.219426,0.851317,0.07933,0.835396,0.118194


In [47]:
# 根据索引级别分组

In [52]:
# Frame with two-level column labels: country code (cty) on the outer
# level and tenor on the inner level, filled with standard-normal noise.
columns = pd.MultiIndex.from_arrays(
    [['US'] * 3 + ['JP'] * 2, [1, 3, 5, 1, 3]],
    names=['cty', 'tenor'],
)
hier_df = DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.705335,0.916254,-0.0749,1.552184,-1.446769
1,1.117858,1.845177,-1.63719,0.132305,0.6587
2,0.212099,-1.672146,0.436141,-0.078243,0.201779
3,-1.038644,-0.002254,1.04117,0.694065,-0.222742


In [58]:
# Count the values per row for each country, grouping on the 'cty' level of
# the column labels. The frame is transposed so that level becomes part of
# the row index (grouping columns through `axis=1` is deprecated in newer
# pandas), and the counts are transposed back afterwards.
hier_df.T.groupby(level='cty').count().T

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


In [59]:
# 数据聚合

In [60]:
# Redisplay df for reference.
df

Unnamed: 0,data1,data2,key1,key2
0,-0.755715,-1.275347,a,one
1,-0.94191,0.232741,a,two
2,2.15706,0.468057,b,one
3,-0.664274,-1.29586,b,two
4,-0.656189,-1.292306,a,one


In [61]:
# Group the whole frame by key1; the aggregation cells below reuse this.
grouped = df.groupby('key1')

In [62]:
# 0.9 quantile of data1 within each key1 group.
grouped['data1'].quantile(0.9)

key1
a   -0.676095
b    1.874927
Name: data1, dtype: float64

In [63]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its largest value minus its smallest.

    ``arr`` can be any object that provides ``max()`` and ``min()`` methods
    whose results support subtraction.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [64]:
# Apply the custom peak_to_peak aggregation to each key1 group. Only the
# numeric columns are passed: key2 holds strings, for which max() - min()
# is undefined, and newer pandas raises on such columns instead of silently
# leaving them out of the result.
grouped[['data1', 'data2']].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.28572,1.525048
b,2.821334,1.763918


In [65]:
# Summary statistics (count, mean, std, min, quartiles, max) per key1 group.
grouped.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,count,3.0,3.0
a,mean,-0.784605,-0.778304
a,std,0.145034,0.875632
a,min,-0.94191,-1.292306
a,25%,-0.848812,-1.283827
a,50%,-0.755715,-1.275347
a,75%,-0.705952,-0.521303
a,max,-0.656189,0.232741
b,count,2.0,2.0
b,mean,0.746393,-0.413901


In [66]:
# 面向列的多函数应用

In [None]:
grouped = tip