In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame,Series

In [3]:
df = DataFrame({'key1':['a','b','c','a','c'],
               'key2':['one','two','one','two','three'],
               'data1':np.random.randn(5),
               'data2':np.random.randn(5)})
df

Unnamed: 0,data1,data2,key1,key2
0,1.023629,2.087165,a,one
1,0.789319,0.728076,b,two
2,-0.982533,-0.319293,c,one
3,0.216595,-0.473751,a,two
4,-0.12735,0.3495,c,three


按 key1 进行分组并计算 data1 的平均值，实现方式很多，这里用： 访问 data1，并根据 key1 调用 groupby。

In [4]:
grouped = df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x7f2e11b80080>

变量 grouped 是一个 Groupby 对象，它实际上没有进行任何计算，但已经包含了执行运算所需的一切。

In [5]:
#计算平均值
grouped.mean()

key1
a    0.620112
b    0.789319
c   -0.554942
Name: data1, dtype: float64

如果一次传入多个数组，会得到如下结果：

In [9]:
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

key1  key2 
a     one      1.023629
      two      0.216595
b     two      0.789319
c     one     -0.982533
      three   -0.127350
Name: data1, dtype: float64

In [10]:
means.unstack()

key2,one,three,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.023629,,0.216595
b,,,0.789319
c,-0.982533,-0.12735,


- 实际上分组键可以是任意长度适合的数组，如下：

In [13]:
import numpy as np
stats = np.array(['A','B','A','A','B'])
years = np.array([2005,2006,2006,2006,2005])
df['data1'].groupby([stats,years]).mean()

A  2005    1.023629
   2006   -0.382969
B  2005   -0.127350
   2006    0.789319
Name: data1, dtype: float64

此外，你还可以把列名用作分组键：

In [14]:
df.groupby('key1').mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.620112,0.806707
b,0.789319,0.728076
c,-0.554942,0.015104


可以看到key2列没有，这是因为该列不是数值，俗称‘麻烦列’，所以从结果中排除了。

In [15]:
df.groupby(['key1','key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,1.023629,2.087165
a,two,0.216595,-0.473751
b,two,0.789319,0.728076
c,one,-0.982533,-0.319293
c,three,-0.12735,0.3495


- size方法，返回一个包含分组大小的Series。

In [16]:
df.groupby(['key1','key2']).size()

key1  key2 
a     one      1
      two      1
b     two      1
c     one      1
      three    1
dtype: int64

In [17]:
df.groupby('key1').size()

key1
a    2
b    1
c    2
dtype: int64

## 对分组进行迭代

In [18]:
for name,group in df.groupby('key1'):
    print(name)
    print(group)

a
      data1     data2 key1 key2
0  1.023629  2.087165    a  one
3  0.216595 -0.473751    a  two
b
      data1     data2 key1 key2
1  0.789319  0.728076    b  two
c
      data1     data2 key1   key2
2 -0.982533 -0.319293    c    one
4 -0.127350  0.349500    c  three


In [21]:
for (k1,k2),group in df.groupby(['key1','key2']):
    print(k1,k2)
    print(group)

a one
      data1     data2 key1 key2
0  1.023629  2.087165    a  one
a two
      data1     data2 key1 key2
3  0.216595 -0.473751    a  two
b two
      data1     data2 key1 key2
1  0.789319  0.728076    b  two
c one
      data1     data2 key1 key2
2 -0.982533 -0.319293    c  one
c three
     data1   data2 key1   key2
4 -0.12735  0.3495    c  three


- 有一个可能有用的运算，将数据片段做成一个字典：

In [22]:
pieces = dict(list(df.groupby('key1')))
pieces

{'a':       data1     data2 key1 key2
 0  1.023629  2.087165    a  one
 3  0.216595 -0.473751    a  two, 'b':       data1     data2 key1 key2
 1  0.789319  0.728076    b  two, 'c':       data1     data2 key1   key2
 2 -0.982533 -0.319293    c    one
 4 -0.127350  0.349500    c  three}

In [23]:
pieces['a']

Unnamed: 0,data1,data2,key1,key2
0,1.023629,2.087165,a,one
3,0.216595,-0.473751,a,two


- groupby 默认是在axis=0上进行分组，也可以在其他任何轴上分组，下面根据dtype对列分组：

In [24]:
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [25]:
grouped = df.groupby(df.dtypes,axis=1)

In [26]:
dict(list(grouped))

{dtype('float64'):       data1     data2
 0  1.023629  2.087165
 1  0.789319  0.728076
 2 -0.982533 -0.319293
 3  0.216595 -0.473751
 4 -0.127350  0.349500, dtype('O'):   key1   key2
 0    a    one
 1    b    two
 2    c    one
 3    a    two
 4    c  three}

- 语法糖

    选取部分列进行聚合，尤其对于大数据量，很可能只需对部分列进行聚合：

In [28]:
df.groupby('key1')['data1']
df.groupby('key1')[['data2']]

<pandas.core.groupby.DataFrameGroupBy object at 0x7f2e11b233c8>

是以下代码的语法糖：

In [29]:
df['data1'].groupby(df['key1'])
df[['data2']].groupby(df['key1'])

<pandas.core.groupby.DataFrameGroupBy object at 0x7f2e11b23a58>

## 通过字典或Series进行分组

In [30]:
people = DataFrame(np.random.randn(5,5),
                  columns=['a','b','c','d','e'],
                  index=['Joe', 'Steve', 'Wes', 'Jim', 'Tom'])


In [31]:
people.ix[2:3, ['b','c']]=np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-1.265614,0.086115,1.216777,0.715948,0.742495
Steve,1.385583,0.879546,0.32888,0.349437,0.286163
Wes,0.959472,,,-0.363566,2.373446
Jim,0.311447,0.32414,-0.935236,0.48004,0.395466
Tom,0.576234,0.272281,-0.271944,-0.192782,-0.208079


In [32]:
mapping = {'a':'red','b':'red','c':'blue', 'd':'blue', 'e':'red', 'f':'orange'}

In [33]:
by_column = people.groupby(mapping,axis=1)

In [34]:
by_column.sum()

Unnamed: 0,blue,red
Joe,1.932725,-0.437004
Steve,0.678317,2.551292
Wes,-0.363566,3.332918
Jim,-0.455196,1.031053
Tom,-0.464726,0.640435


In [35]:
by_column.count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Tom,2,3


### 通过函数进行分组

In [41]:
people

Unnamed: 0,a,b,c,d,e
Joe,-1.265614,0.086115,1.216777,0.715948,0.742495
Steve,1.385583,0.879546,0.32888,0.349437,0.286163
Wes,0.959472,,,-0.363566,2.373446
Jim,0.311447,0.32414,-0.935236,0.48004,0.395466
Tom,0.576234,0.272281,-0.271944,-0.192782,-0.208079


In [42]:
#按名字长度分组
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,0.581539,0.682536,0.009597,0.63964,3.303328
5,1.385583,0.879546,0.32888,0.349437,0.286163
