In [129]:
import numpy as np
import pandas as pd

In [130]:
# Draw the random columns from a locally seeded generator so this frame (and
# every result derived from it) is identical on each Restart & Run All,
# without touching NumPy's global random state.
rng = np.random.default_rng(42)

# Demo frame: two key columns (key1 contains one NaN) and two numeric columns.
df = pd.DataFrame({'key1': ['a', np.nan, 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': rng.standard_normal(5),
                   'data2': rng.standard_normal(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.146364,-1.009102
1,,two,1.835879,-1.346603
2,b,one,-1.406342,-1.56415
3,b,two,0.190897,0.473445
4,a,one,1.593057,0.044242


In [131]:
# Group the 'data1' values, using the 'key1' column (a Series) as the key.
grouped = df['data1'].groupby(by=df['key1'])

In [132]:
grouped  # a SeriesGroupBy object

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000002A51C5995B0>

In [133]:
# Compute group sizes.
grouped.size()  # size() returns a Series holding the size of each group

key1
a    2
b    2
Name: data1, dtype: int64

In [134]:
# Iterating a GroupBy yields (key, sub-Series) pairs. Rows whose group key
# is missing are excluded from the result.
for key, values in grouped:
    print((key, values))

('a', 0   -1.146364
4    1.593057
Name: data1, dtype: float64)
('b', 2   -1.406342
3    0.190897
Name: data1, dtype: float64)


In [135]:
grouped.mean()  # compute the mean of each group

key1
a    0.223346
b   -0.607722
Name: data1, dtype: float64

In [136]:
# Group by several keys at once by passing a list of Series.
means = df['data1'].groupby(by=[df['key1'], df['key2']]).mean()
means  # the result is indexed by a MultiIndex of the unique key pairs

key1  key2
a     one     0.223346
b     one    -1.406342
      two     0.190897
Name: data1, dtype: float64

In [137]:
# unstack: pivot the innermost row-index level (the default, level=-1) into columns.
means.unstack(level=-1)

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.223346,
b,-1.406342,0.190897


In [138]:
# Grouping a whole DataFrame gives a DataFrameGroupBy object.
type(df.groupby(by='key1'))

pandas.core.groupby.generic.DataFrameGroupBy

In [139]:
# Average every remaining numeric column within each key1 group.
# 'key2' holds strings and cannot be averaged: pandas < 2.0 dropped such
# columns silently, while pandas >= 2.0 raises TypeError unless
# numeric_only=True is passed explicitly.
df.groupby(df['key1']).mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.223346,-0.48243
b,-0.607722,-0.545352


In [140]:
# Aggregate the whole frame, keyed on the (key1, key2) pair.
df.groupby(by=[df['key1'], df['key2']]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.223346,-0.48243
b,one,-1.406342,-1.56415
b,two,0.190897,0.473445


In [141]:
df.groupby(
    by=[df['key1'], df['key2']],
    as_index=True,  # the default, spelled out for comparison
).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.223346,-0.48243
b,one,-1.406342,-1.56415
b,two,0.190897,0.473445


In [142]:
# From the pandas documentation:
#   as_index : bool, default True
#       For aggregated output, return object with group labels as the index.
# With as_index=False the group labels are returned as ordinary columns.
df.groupby([df['key1'], df['key2']], as_index=False).mean()

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.223346,-0.48243
1,b,one,-1.406342,-1.56415
2,b,two,0.190897,0.473445


In [143]:
# Group keys can also be plain arrays with one label per row. They are
# matched to the rows in order: states[i] and years[i] label the i-th row
# of df (grouping along axis=0, the default).
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])

df['data1'].groupby(by=[states, years]).mean()

California  2005    1.835879
            2006   -1.406342
Ohio        2005   -0.477734
            2006    1.593057
Name: data1, dtype: float64

In [144]:
# Seeded local generator: keeps this frame reproducible across kernel
# restarts without touching NumPy's global random state.
rng = np.random.default_rng(43)

# Second demo frame: state names and years as keys, two numeric data columns.
df1 = pd.DataFrame({'key1': ['Ohio', 'California', 'California', 'Ohio', 'Ohio'],
                    'key2': [2005, 2005, 2006, 2005, 2006],
                    'data1': rng.standard_normal(5),
                    'data2': rng.standard_normal(5)})
df1

Unnamed: 0,key1,key2,data1,data2
0,Ohio,2005,0.047783,-1.211667
1,California,2005,-0.024795,-0.568707
2,California,2006,-1.011115,0.448186
3,Ohio,2005,0.596661,-0.642077
4,Ohio,2006,0.151543,0.857028


In [145]:
# Mean of data1 for every (key1, key2) combination.
# Equivalent to: df1.groupby([df1['key1'], df1['key2']])['data1'].mean()
means = df1['data1'].groupby(by=[df1['key1'], df1['key2']]).mean()
means

key1        key2
California  2005   -0.024795
            2006   -1.011115
Ohio        2005    0.322222
            2006    0.151543
Name: data1, dtype: float64

In [146]:
# A column selected with .loc works as a group key just like df1['key1'].
df1['data1'].groupby(by=[df1.loc[:, 'key1'], df1['key2']]).mean()



key1        key2
California  2005   -0.024795
            2006   -1.011115
Ohio        2005    0.322222
            2006    0.151543
Name: data1, dtype: float64