In [1]:
import numpy as np
import pandas as pd

In [2]:
# Example frame used throughout the notebook: two key columns and two random
# numeric columns. key1 contains one missing value (np.nan) at row 1.
# A seeded Generator keeps the random columns identical across kernel restarts.
rng = np.random.default_rng(seed=42)
df = pd.DataFrame({'key1': ['a', np.nan, 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': rng.standard_normal(5),
                   'data2': rng.standard_normal(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.081668,1.779884
1,,two,1.344932,-0.431063
2,b,one,1.101075,-0.02049
3,b,two,0.310745,1.415755
4,a,one,1.904948,-1.953505


In [3]:
# Group the 'data1' values, using the 'key1' column (a Series) as the grouping key.
grouped = df['data1'].groupby(by=df['key1'])

In [4]:
grouped  # a SeriesGroupBy object

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000002DC0AD3E310>

In [5]:
# Compute group sizes.
grouped.size()  # size() returns a Series holding the number of rows in each group

key1
a    2
b    2
Name: data1, dtype: int64

In [6]:
# Iterating over the GroupBy yields one (group key, sub-Series) tuple per group.
for group in grouped:
    print(group)  # rows whose grouping key is missing (NaN) are excluded from the result

('a', 0   -0.081668
4    1.904948
Name: data1, dtype: float64)
('b', 2    1.101075
3    0.310745
Name: data1, dtype: float64)


In [7]:
grouped.mean()  # compute the mean of data1 within each group

key1
a    0.91164
b    0.70591
Name: data1, dtype: float64

In [8]:
# Group by several keys at once by passing a list of Series.
means = (
    df['data1']
    .groupby(by=[df['key1'], df['key2']])
    .mean()
)
means  # the result carries a MultiIndex made of the unique key pairs

key1  key2
a     one     0.911640
b     one     1.101075
      two     0.310745
Name: data1, dtype: float64

In [9]:
means.unstack()  # unstack: pivot the inner row-index level (key2) into columns

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.91164,
b,1.101075,0.310745


In [10]:
type(df.groupby('key1'))  # a DataFrameGroupBy object

pandas.core.groupby.generic.DataFrameGroupBy

In [11]:
# Group the whole frame by key1 and average the numeric columns (data1, data2).
# numeric_only=True is passed explicitly: the string column key2 cannot be
# averaged, and pandas >= 2.0 raises TypeError instead of silently dropping it.
df.groupby(df['key1']).mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.91164,-0.08681
b,0.70591,0.697632


In [12]:
# Group the whole frame by both key columns and average the remaining columns.
df.groupby(by=[df['key1'], df['key2']]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.91164,-0.08681
b,one,1.101075,-0.02049
b,two,0.310745,1.415755


In [13]:
df.groupby([df['key1'], df['key2']], as_index=True).mean()  # as_index=True is the default

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.91164,-0.08681
b,one,1.101075,-0.02049
b,two,0.310745,1.415755


In [14]:
# From the pandas documentation:
#   as_index : bool, default True
#       For aggregated output, return object with group labels as the index.
# With as_index=False the group labels are returned as ordinary columns
# and the result gets a plain integer index.
df.groupby([df['key1'], df['key2']], as_index=False).mean()

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.91164,-0.08681
1,b,one,1.101075,-0.02049
2,b,two,0.310745,1.415755


In [15]:
# Grouping keys given as plain arrays rather than columns of df.
# The labels are matched to the rows in order: states[i] and years[i]
# label the i-th entry of df.index (grouping runs along axis=0 by default).
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])

df['data1'].groupby(by=[states, years]).mean()

California  2005    1.344932
            2006    1.101075
Ohio        2005    0.114538
            2006    1.904948
Name: data1, dtype: float64

In [16]:
# Second example frame: key1 holds a state name, key2 a year.
# A seeded Generator keeps the random columns identical across kernel restarts.
rng = np.random.default_rng(seed=2)
df1 = pd.DataFrame({'key1': ['Ohio', 'California', 'California', 'Ohio', 'Ohio'],
                    'key2': [2005, 2005, 2006, 2005, 2006],
                    'data1': rng.standard_normal(5),
                    'data2': rng.standard_normal(5)})
df1

Unnamed: 0,key1,key2,data1,data2
0,Ohio,2005,0.022639,-0.792303
1,California,2005,-0.700193,1.034196
2,California,2006,1.320967,2.286996
3,Ohio,2005,1.316702,0.6262
4,Ohio,2006,0.606941,-1.827116


In [17]:
# Average data1 per (key1, key2) pair, passing the keys as a list of Series.
# Equivalent to df1.groupby([df1['key1'], df1['key2']])['data1'].mean().
means = (
    df1['data1']
    .groupby(by=[df1['key1'], df1['key2']])
    .mean()
)
means

key1        key2
California  2005   -0.700193
            2006    1.320967
Ohio        2005    0.669671
            2006    0.606941
Name: data1, dtype: float64

In [18]:
# Same grouping with the key1 Series selected via .loc[:, 'key1'] instead of df1['key1'].
df1['data1'].groupby([df1.loc[:, 'key1'], df1['key2']]).mean()



key1        key2
California  2005   -0.700193
            2006    1.320967
Ohio        2005    0.669671
            2006    0.606941
Name: data1, dtype: float64