In [1]:
import numpy as np
import pandas as pd

# 9.1.3 用字典（dict）和 Series 进行分组
分组信息不一定要以数组的形式给出，也可以用字典或 Series 来表示。

In [2]:
# Draw the sample data from a seeded, local random generator so that
# "Restart Kernel -> Run All" rebuilds exactly the same 5x5 frame every time
# (the unseeded global RNG produced different values on every run).
rng = np.random.default_rng(12345)
people = pd.DataFrame(rng.standard_normal((5, 5)),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people

Unnamed: 0,a,b,c,d,e
Joe,-0.323687,-0.573106,-1.024286,-1.318013,-0.026259
Steve,1.477453,-2.90792,0.556639,1.105747,-0.793825
Wes,-0.849232,1.733706,-1.441494,0.70472,-0.128081
Jim,-0.467004,-0.116266,-1.436847,0.170116,-0.231405
Travis,-1.449027,-0.144839,0.474575,-0.03104,-1.580513


In [3]:
# Introduce missing values: blank out the cells at row position 2 and
# column positions 1 and 2, then display the modified frame.
nan_rows = slice(2, 3)
nan_cols = [1, 2]
people.iloc[nan_rows, nan_cols] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-0.323687,-0.573106,-1.024286,-1.318013,-0.026259
Steve,1.477453,-2.90792,0.556639,1.105747,-0.793825
Wes,-0.849232,,,0.70472,-0.128081
Jim,-0.467004,-0.116266,-1.436847,0.170116,-0.231405
Travis,-1.449027,-0.144839,0.474575,-0.03104,-1.580513


假设我们已知各列与组之间的对应关系（即每一列属于哪个组），并且想要按组对这些列求和：

In [4]:
# Column label -> group label. The key 'f' matches no column of `people`;
# such unused keys are simply ignored when grouping (no 'orange' group shows up).
mapping = dict(a='red', b='red', c='blue', d='blue', e='red', f='orange')

In [5]:
# Group the columns of `people` by the label each column maps to in `mapping`.
# NOTE(review): the `axis=1` argument of DataFrame.groupby is deprecated since
# pandas 2.1; on newer versions group the transposed frame instead.
by_column = people.groupby(mapping, axis=1)
for name, table in by_column:
    # One group per iteration: its label, its sub-frame of columns, a blank line.
    print(name, table, '', sep='\n')

blue
               c         d
Joe    -1.024286 -1.318013
Steve   0.556639  1.105747
Wes          NaN  0.704720
Jim    -1.436847  0.170116
Travis  0.474575 -0.031040

red
               a         b         e
Joe    -0.323687 -0.573106 -0.026259
Steve   1.477453 -2.907920 -0.793825
Wes    -0.849232       NaN -0.128081
Jim    -0.467004 -0.116266 -0.231405
Travis -1.449027 -0.144839 -1.580513



In [6]:
# Sum the columns inside each group, giving one output column per group label.
# Missing values are skipped: Wes has NaN in 'c', so his 'blue' total equals 'd'.
by_column.sum()

Unnamed: 0,blue,red
Joe,-2.342299,-0.923051
Steve,1.662387,-2.224291
Wes,0.70472,-0.977312
Jim,-1.266732,-0.814675
Travis,0.443535,-3.174378


这种用法同样适用于 Series，因为 Series 本身可以看作是一种固定大小的映射（fixed-size mapping）：

In [7]:
# Wrap the dict in a Series: the dict keys become the index (column labels)
# and the dict values become the data (group labels).
map_series = pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [8]:
# Show `people` again for reference before grouping its columns with the Series.
people

Unnamed: 0,a,b,c,d,e
Joe,-0.323687,-0.573106,-1.024286,-1.318013,-0.026259
Steve,1.477453,-2.90792,0.556639,1.105747,-0.793825
Wes,-0.849232,,,0.70472,-0.128081
Jim,-0.467004,-0.116266,-1.436847,0.170116,-0.231405
Travis,-1.449027,-0.144839,0.474575,-0.03104,-1.580513


In [9]:
# Count the non-null values per row within each column group.
# DataFrame.groupby(axis=1) is deprecated since pandas 2.1 and removed in
# pandas 3.0, so group the transposed frame by its row labels (the original
# column labels) and transpose the result back to rows x groups.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


In [10]:
# Number of columns assigned to each group label.
# Grouping the transposed frame by its row labels (the original column labels)
# avoids the `axis=1` argument, which is deprecated since pandas 2.1 and
# removed in pandas 3.0.
people.T.groupby(map_series).size()

blue    2
red     3
dtype: int64