In [1]:
import numpy as np
import pandas as pd

In [2]:
# Seed NumPy's global RNG so that "Restart Kernel -> Run All" regenerates the
# same random values on every run.
np.random.seed(42)

# 5x5 frame of standard-normal draws: one row per person, columns 'a'..'e'.
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Add a few NA values: positional row 2 ('Wes'), positional columns 1 and 2 ('b', 'c').
people.iloc[2:3, [1, 2]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,0.964514,-1.690478,-0.858596,-1.760698,1.255966
Steve,0.606576,1.209072,-0.06862,-0.982205,1.017329
Wes,0.337924,,,-1.838438,-1.605933
Jim,-0.230851,-0.54778,-0.292858,0.535523,-0.193637
Travis,1.134315,1.317166,0.664279,0.553034,-0.372935


In [3]:
# Column label -> color group. Pairs are zipped in column order so the dict keeps
# the same insertion order as the frame's columns.
mapping = dict(zip(['a', 'b', 'c', 'd', 'e'],
                   ['red', 'red', 'blue', 'blue', 'red']))

In [4]:
# If a dict or Series is passed, the Series or dict VALUES will be used to determine the groups
# Group through the dict's label -> group mapping (more flexible than grouping by position)
# NOTE(review): the `axis` keyword of DataFrame.groupby is deprecated in recent pandas
# releases — confirm the target pandas version before relying on axis=1.
by_column = people.groupby(mapping, axis=1)  # group the columns

In [5]:
# Sum the columns that share a color group, giving one output column per group.
by_column.sum()

Unnamed: 0,blue,red
Joe,-2.619295,0.530002
Steve,-1.050826,2.832977
Wes,-1.838438,-1.268009
Jim,0.242665,-0.972268
Travis,1.217314,2.078546


In [6]:
# Row label (person) -> rating group, built by pairing names with ratings in order.
mapping1 = dict(zip(
    ['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    ['good', 'just so so', 'just so so', 'good', 'just so so'],
))

In [7]:
# Group the rows by each person's rating. Rows are groupby's default axis, so the
# deprecated `axis` keyword is omitted; the grouping itself is unchanged.
by_column1 = people.groupby(mapping1)

In [8]:
# Sum the rows that share a rating group, giving one output row per group.
by_column1.sum()

Unnamed: 0,a,b,c,d,e
good,0.733663,-2.238258,-1.151455,-1.225175,1.062329
just so so,2.078814,2.526238,0.595659,-2.267609,-0.961539


In [9]:
# A Series works like the dict here: its index -> value pairs define the mapping
map_series = pd.Series(mapping)
map_series

a     red
b     red
c    blue
d    blue
e     red
dtype: object

In [10]:
# Group the columns by color using the Series as the key. `groupby(..., axis=1)`
# is deprecated, so group the rows of the transposed frame (one row per original
# column) and transpose the sums back to the original orientation.
people.T.groupby(map_series).sum().T

Unnamed: 0,blue,red
Joe,-2.619295,0.530002
Steve,-1.050826,2.832977
Wes,-1.838438,-1.268009
Jim,0.242665,-0.972268
Travis,1.217314,2.078546


In [11]:
# Group the rows by rating using a Series key. Rows are groupby's default axis,
# so the deprecated `axis` keyword is omitted.
people.groupby(pd.Series(mapping1)).sum()

Unnamed: 0,a,b,c,d,e
good,0.733663,-2.238258,-1.151455,-1.225175,1.062329
just so so,2.078814,2.526238,0.595659,-2.267609,-0.961539


In [12]:
# Group by position: the i-th entry of color_lst labels the i-th column of
# people.columns.
color_lst = ['red', 'red', 'blue', 'blue', 'red']
arr = np.array(color_lst)
# `groupby(..., axis=1)` is deprecated, so group the rows of the transposed frame
# (one row per original column) and transpose the sums back.
# NOTE: by_column1 therefore holds a groupby over people.T, not over people.
by_column1 = people.T.groupby(arr)
by_column1.sum().T

Unnamed: 0,blue,red
Joe,-2.619295,0.530002
Steve,-1.050826,2.832977
Wes,-1.838438,-1.268009
Jim,0.242665,-0.972268
Travis,1.217314,2.078546


In [13]:
# Toy frame: two label columns ('key1', 'key2') followed by two columns of
# standard-normal draws ('data1' is drawn before 'data2').
df = (
    pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                  'key2': ['one', 'two', 'one', 'two', 'one']})
    .assign(data1=np.random.randn(5),
            data2=np.random.randn(5))
)
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.967187,-0.577494
1,a,two,-0.650766,0.684579
2,b,one,-0.34778,1.230556
3,b,two,-0.257635,-0.493592
4,a,one,1.057957,1.824666


In [14]:
# Show the dtype of each column; this Series is used as the grouping key below.
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [15]:
# Group the columns by their dtype, using the per-column dtype Series as the key.
# NOTE(review): the `axis` keyword of DataFrame.groupby is deprecated in recent pandas
# releases — confirm the target pandas version before relying on axis=1.
grouped = df.groupby(df.dtypes, axis=1)  # groupby defaults to axis=0

In [21]:
# Columns of different dtypes end up in separate groups
list(grouped)

[(dtype('float64'),
        data1     data2
  0  1.967187 -0.577494
  1 -0.650766  0.684579
  2 -0.347780  1.230556
  3 -0.257635 -0.493592
  4  1.057957  1.824666),
 (dtype('O'),
    key1 key2
  0    a  one
  1    a  two
  2    b  one
  3    b  two
  4    a  one)]