In [125]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from __future__ import division
from imports import *

from IPython.display import display_html
display_html("""<button onclick="$('.input, .prompt, .output_stderr, .output_error').toggle();">Toggle code</button>""", raw=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## groupby 技术

In [8]:
# Toy frame: two string key columns plus two columns of random normal draws.
df = DataFrame({'key1':['a','a','b','b','a'],
               'key2':['one','two','one','two','one'],
               'data1':np.random.randn(5),
               'data2':np.random.randn(5)})
df

Unnamed: 0,data1,data2,key1,key2
0,-0.525327,-1.254976,a,one
1,-0.268294,-0.243178,a,two
2,-0.183095,-0.871701,b,one
3,0.987624,-1.081934,b,two
4,0.802735,-0.874125,a,one


In [13]:
# GroupBy object for data1 keyed by key1 (not aggregated in this cell).
grouped = df['data1'].groupby(df['key1'])
# Mean of data1 for every (key1, key2) pair; the result has a two-level index.
means = df['data1'].groupby([df['key1'],df['key2']]).mean()
# Move the inner index level (key2) into the columns.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.138704,-0.268294
b,-0.183095,0.987624


#### 分组键可以是任何适当的数组

In [15]:
# Group keys here are standalone arrays (one entry per row of df), not columns of df.
states = np.array(['Ohio','California','California','Ohio','Ohio'])
years = np.array([2005,2005,2006,2005,2006])
# Mean of data1 for each (state, year) combination.
df['data1'].groupby([states,years]).mean()

California  2005   -0.268294
            2006   -0.183095
Ohio        2005    0.231148
            2006    0.802735
Name: data1, dtype: float64

#### 可以只是列名，会自动对数据进行分组聚合，不是数值数据的列，会在结果中排除

In [18]:
# Column names are used as group keys; the mean is taken over the remaining columns.
df.groupby(['key1','key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0.138704,-1.064551
a,two,-0.268294,-0.243178
b,one,-0.183095,-0.871701
b,two,0.987624,-1.081934


#### size方法返回一个含有分组大小的Series
*分组键中的缺失值会被排除在结果之外*

In [20]:
# Number of rows in each (key1, key2) group.
df.groupby(['key1','key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

### 对分组进行迭代
#### 产生一组二元元组（由分组名和数据块组成）

In [23]:

# Iterating over a GroupBy yields (group name, sub-frame) pairs.
for name, group in df.groupby('key1'):
    # Call form of print: same output on Python 2 (single argument) and Python 3.
    print(name)
    print(group)

a
      data1     data2 key1 key2
0 -0.525327 -1.254976    a  one
1 -0.268294 -0.243178    a  two
4  0.802735 -0.874125    a  one
b
      data1     data2 key1 key2
2 -0.183095 -0.871701    b  one
3  0.987624 -1.081934    b  two


#### 对于多重键的情况，元组的第一个元素将会是由键值组成的元组

In [25]:
# With several keys, the first element of each pair is a tuple of key values.
for (k1, k2), group in df.groupby(['key1','key2']):
    # Format into a single string so the line reads "k1 k2" on both Python 2
    # and Python 3 (print(k1, k2) would print a tuple on Python 2).
    print('%s %s' % (k1, k2))
    print(group)

a one
      data1     data2 key1 key2
0 -0.525327 -1.254976    a  one
4  0.802735 -0.874125    a  one
a two
      data1     data2 key1 key2
1 -0.268294 -0.243178    a  two
b one
      data1     data2 key1 key2
2 -0.183095 -0.871701    b  one
b two
      data1     data2 key1 key2
3  0.987624 -1.081934    b  two


#### groupby 的语法糖

In [28]:
## Selecting with a list ([['data1']]) gives a DataFrame-shaped result
df.groupby(['key1','key2'])[['data1']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1
key1,key2,Unnamed: 2_level_1
a,one,0.138704
a,two,-0.268294
b,one,-0.183095
b,two,0.987624


#### 根据字典来进行分组

In [31]:
# 5x5 frame of random normal draws, indexed by person name.
people = DataFrame(np.random.randn(5,5),
                  columns=['a','b','c','d','e'],
                  index=['Joe','Steve','Wes','Jim','Travis'])
# Add a few NA values: blank out columns b and c for the row at position 2 (Wes).
# .loc is label-based and this index holds strings, so the positional slice is
# converted to labels first instead of being handed to .loc as integers.
people.loc[people.index[2:3],['b','c']] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,1.357943,-0.968798,-0.865491,-0.070917,1.147237
Steve,0.406718,0.089325,-0.760056,-0.079221,-1.038677
Wes,-0.346276,,,0.480575,-0.529742
Jim,-0.27423,-0.429292,0.614004,-1.27388,0.568683
Travis,0.418822,-0.157158,1.793653,0.16486,-1.35063


In [33]:
# Column label -> group name. 'f' is not a column of `people` (columns are a-e).
mapping = {'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}
# Group the columns (axis=1) by their mapped name, then sum within each group.
by_colunm = people.groupby(mapping,axis=1)
by_colunm.sum()

Unnamed: 0,blue,red
Joe,-0.936408,1.536382
Steve,-0.839278,-0.542634
Wes,0.480575,-0.876018
Jim,-0.659877,-0.13484
Travis,1.958513,-1.088965


#### 通过函数进行分组

In [35]:
# Group by string length: len is applied to each index label
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,0.737437,-1.398091,-0.251487,-0.864222,1.186177
5,0.406718,0.089325,-0.760056,-0.079221,-1.038677
6,0.418822,-0.157158,1.793653,0.16486,-1.35063


#### 根据索引级别分组

In [38]:
# Two-level column index; the levels are named 'fsd' and 'fds'.
columns = pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],
                                    [1,3,5,1,3]],names=['fsd','fds'])
hier_df = DataFrame(np.random.randn(4,5),columns=columns)
hier_df

fsd,US,US,US,JP,JP
fds,1,3,5,1,3
0,-0.23903,-0.496217,-0.095604,-0.699881,-0.028542
1,0.362069,-0.153353,0.133007,0.576304,0.141946
2,-0.364694,1.086398,-1.336309,0.38358,1.231348
3,1.144514,-1.190388,-0.946973,-1.396947,-1.660083


In [40]:
# Group the columns by their 'fsd' level and count the values per row in each group.
hier_df.groupby(level='fsd',axis=1).count()

fsd,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


## 数据聚合

#### 聚合函数

In [42]:
# Rebinds `grouped` to the whole frame grouped by key1.
grouped = df.groupby('key1')
# 0.9 quantile of data1 within each key1 group.
grouped['data1'].quantile(0.9)

key1
a    0.588529
b    0.870552
Name: data1, dtype: float64

#### 使用自己的聚合函数，只需要将其传入aggregate或者agg方法即可

In [44]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its largest value minus its smallest.

    ``arr`` must provide ``max()`` and ``min()`` methods whose results can be
    subtracted from each other.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
# Aggregate only the numeric columns: peak_to_peak subtracts values, which is
# not defined for the string column key2.
grouped[['data1','data2']].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.328062,1.011798
b,1.17072,0.210233


函数名|说明
----|---
count|分组中非NA值的数量
sum|非NA值的和
mean|非NA值的平均值
median|非NA值的算术中位数
std、var|无偏（分母为n-1）标准差和方差
min、max|非NA值的最小值和最大值
prod|积
first、last|第一个和最后一个非NA值


In [47]:
tips = pd.read_csv('./data/ch08/tips.csv')
# Add a column holding the tip as a share of the total bill
tips['tip_pct'] = tips['tip'] / tips['total_bill']
# First six rows.
tips[:6]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808
5,25.29,4.71,Male,No,Sun,Dinner,4,0.18624


In [60]:
# Group tips by (sex, smoker), passing as_index=False to groupby.
grouped = tips.groupby(['sex','smoker'],as_index=False)
grouped_pct = grouped['tip_pct']
# A function may be given by name as a string; a custom function is passed as the function object
grouped_pct.agg('mean')
# Given a list of functions or function names, the resulting DataFrame columns are named after them
grouped_pct.agg(['mean','std',peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,No,0.156921,0.036421,0.195876
Female,Yes,0.18215,0.071595,0.360233
Male,No,0.160669,0.041849,0.220186
Male,Yes,0.152771,0.090588,0.674707


#### mean ,std这样的函数显示的列名也是可以修改的

In [53]:
# (name, function) tuples: the first element becomes the output column name.
grouped_pct.agg([('foo','mean'),('bar',np.std)])

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,No,0.156921,0.036421
Female,Yes,0.18215,0.071595
Male,No,0.160669,0.041849
Male,Yes,0.152771,0.090588


#### 对于DataFrame，还可以定义一组应用于全部列的函数，或者不同列应用不同的函数

In [55]:
# Aggregations applied to every selected column.
functions = ['count','mean','max']
# Select several columns with a list; a bare tuple of keys
# (grouped['tip_pct','total_bill']) is rejected by newer pandas versions.
result = grouped[['tip_pct','total_bill']].agg(functions)
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Female,No,54,0.156921,0.252672,54,18.105185,35.83
Female,Yes,33,0.18215,0.416667,33,17.977879,44.3
Male,No,97,0.160669,0.29199,97,19.791237,48.33
Male,Yes,60,0.152771,0.710345,60,22.2845,50.81


In [62]:
# A list of (custom name, function) tuples can be passed here as well
ftuples = [('Durchschnitt','mean'),('Abweichung',np.var)]
# Select several columns with a list; a bare tuple of keys
# (grouped['tip_pct','total_bill']) is rejected by newer pandas versions.
grouped[['tip_pct','total_bill']].agg(ftuples)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Female,No,0.156921,0.001327,18.105185,53.092422
Female,Yes,0.18215,0.005126,17.977879,84.451517
Male,No,0.160669,0.001751,19.791237,76.152961
Male,Yes,0.152771,0.008206,22.2845,98.244673


In [63]:
# Apply different functions to different columns by passing agg a dict that maps column name -> function(s)
grouped.agg({'tip':np.max,'size':'sum'})
# A dict value may also be a list of functions for that column.
grouped.agg({'tip_pct':['min','max','mean','std'],'size':'sum'})

Unnamed: 0_level_0,sex,smoker,tip_pct,tip_pct,tip_pct,tip_pct,size
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,min,max,mean,std,sum
0,Female,No,0.056797,0.252672,0.156921,0.036421,140
1,Female,Yes,0.056433,0.416667,0.18215,0.071595,74
2,Male,No,0.071804,0.29199,0.160669,0.041849,263
3,Male,Yes,0.035638,0.710345,0.152771,0.090588,150


## 分组级运算和转换

#### transform 会将一个函数应用到各个分组，然后将结果放置到适当的位置上

In [66]:
people.head()

Unnamed: 0,a,b,c,d,e
Joe,1.357943,-0.968798,-0.865491,-0.070917,1.147237
Steve,0.406718,0.089325,-0.760056,-0.079221,-1.038677
Wes,-0.346276,,,0.480575,-0.529742
Jim,-0.27423,-0.429292,0.614004,-1.27388,0.568683
Travis,0.418822,-0.157158,1.793653,0.16486,-1.35063


In [67]:
# One group label per row of `people`, in row order.
key = ['one','two','one','two','one']
#people.groupby(key).mean()
# transform keeps the shape of `people`: each row receives its group's column means.
people.groupby(key).transform(np.mean)

Unnamed: 0,a,b,c,d,e
Joe,0.47683,-0.562978,0.464081,0.191506,-0.244378
Steve,0.066244,-0.169984,-0.073026,-0.676551,-0.234997
Wes,0.47683,-0.562978,0.464081,0.191506,-0.244378
Jim,0.066244,-0.169984,-0.073026,-0.676551,-0.234997
Travis,0.47683,-0.562978,0.464081,0.191506,-0.244378


#### 也可以用自定义函数

In [69]:
# Same grouping, with the user-defined peak_to_peak as the transform function.
people.groupby(key).transform(peak_to_peak)

Unnamed: 0,a,b,c,d,e
Joe,1.704219,0.81164,2.659144,0.551492,2.497867
Steve,0.680949,0.518617,1.37406,1.194659,1.60736
Wes,1.704219,0.81164,2.659144,0.551492,2.497867
Jim,0.680949,0.518617,1.37406,1.194659,1.60736
Travis,1.704219,0.81164,2.659144,0.551492,2.497867


#### apply 会将待处理的对象拆成多个片段，然后对各片段调用传入的函数，最后尝试将各片段组合到一起

In [71]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [74]:
def top(df,n=5,column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Rows are returned in ascending order of ``column`` (largest last).

    Parameters
    ----------
    df : DataFrame
        Frame to select rows from.
    n : int, default 5
        Number of rows to keep; ``0`` yields an empty frame.
    column : str, default 'tip_pct'
        Name of the column to sort by.
    """
    # tail(n) selects the same rows as the slice [-n:] for every non-zero n, but
    # returns an empty frame for n=0, where [-0:] would return every row.
    return df.sort_values(by=column).tail(n)
# Call top (with its default arguments) on each smoker group and combine the pieces.
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [76]:
# Extra arguments for top are passed to apply after the function itself
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
No,Fri,94,22.75,3.25,Female,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,Male,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,Male,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Male,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Male,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Female,Yes,Thur,Lunch,4,0.115982


#### 禁止分组键：将group_keys=False传入即可

In [78]:
# group_keys=False: the result keeps only the original row labels as its index.
tips.groupby('smoker',group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


#### pandas的cut和qcut将数据拆成多块，结合groupby，就可以轻松实现对数据集的bucket桶或quantile分位数分析

In [81]:
# Two columns of 1000 random normal draws each.
frame = DataFrame({'data1':np.random.randn(1000),
                  'data2':np.random.randn(1000)})
# Assign every data1 value to one of 4 bins produced by pd.cut.
factor = pd.cut(frame.data1,4)
factor[:10]

0     (-1.603, 0.212]
1     (-1.603, 0.212]
2      (0.212, 2.027]
3      (0.212, 2.027]
4    (-3.425, -1.603]
5      (0.212, 2.027]
6      (2.027, 3.841]
7      (0.212, 2.027]
8      (0.212, 2.027]
9     (-1.603, 0.212]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.425, -1.603] < (-1.603, 0.212] < (0.212, 2.027] < (2.027, 3.841]]

In [83]:
def get_stats(group):
    """Summarise ``group`` as a dict with the keys 'min', 'max', 'count' and 'mean'.

    Each value is the result of calling the same-named method on ``group``.
    """
    stat_names = ('min', 'max', 'count', 'mean')
    return {stat: getattr(group, stat)() for stat in stat_names}
# Group data2 by the data1 bin (factor) of the same row.
grouped = frame.data2.groupby(factor)
# get_stats returns a dict per group; unstack spreads the dict keys into columns.
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.425, -1.603]",59.0,1.995209,-0.03952,-2.46466
"(-1.603, 0.212]",526.0,2.896515,0.030908,-2.731598
"(0.212, 2.027]",395.0,2.733377,-0.01194,-2.802933
"(2.027, 3.841]",20.0,1.370551,-0.025093,-1.852799


#### 用特定的值（如平均值）去填充缺失值，使用fillna方法

In [85]:
s = Series(np.random.randn(6))
# Blank out every second element, starting with the first.
s[::2] = np.nan
s

0         NaN
1    1.067234
2         NaN
3    0.049589
4         NaN
5   -1.988898
dtype: float64

In [87]:
# Replace each NaN with the mean of s.
s.fillna(s.mean())

0   -0.290692
1    1.067234
2   -0.290692
3    0.049589
4   -0.290692
5   -1.988898
dtype: float64

#### 如果要对不同分组的NA值填充对应分组的平均值，那么只需要将数据分组，并使用apply和一个能够对各数据块调用fillna的函数即可

In [90]:
states = ['Ohio','New York','Vermont','Florida','Oregon','Nevada','California','Idaho']
# One region label per state: the first four are 'East', the last four 'West'.
group_key = ['East']*4 +['West']*4
data = Series(np.random.randn(8),index=states)
# Mark three of the states as missing.
data[['Vermont','Nevada','Idaho']] = np.nan
data

Ohio          0.114991
New York     -0.292558
Vermont            NaN
Florida      -0.812552
Oregon       -0.728233
Nevada             NaN
California    1.374858
Idaho              NaN
dtype: float64

In [92]:
# Mean of data within each region label.
data.groupby(group_key).mean()

East   -0.330040
West    0.323312
dtype: float64

In [94]:
def fill_mean(g):
    """Return ``g`` with its missing values replaced by the mean of ``g``."""
    group_mean = g.mean()
    return g.fillna(group_mean)
# Fill each region's missing values with that region's mean.
# group_keys=False keeps the state names as the result index; without it newer
# pandas versions prepend the group label (East/West) as an extra index level.
data.groupby(group_key, group_keys=False).apply(fill_mean)

Ohio          0.114991
New York     -0.292558
Vermont      -0.330040
Florida      -0.812552
Oregon       -0.728233
Nevada        0.323312
California    1.374858
Idaho         0.323312
dtype: float64

### 分组加权平均数和相关系数

In [105]:
# The first CSV column becomes the index and is parsed as dates.
close_px = pd.read_csv('./data/ch09/stock_px.csv',parse_dates=True,index_col=0)
# Last four rows.
close_px[-4:]

Unnamed: 0,AAPL,MSFT,XOM,SPX
2011-10-11,400.29,27.0,76.27,1195.54
2011-10-12,402.19,26.96,77.16,1207.25
2011-10-13,408.43,27.18,76.37,1203.66
2011-10-14,422.0,27.27,78.11,1224.58


In [108]:
# Row-over-row percent change of every column, with rows containing NA dropped.
rets = close_px.pct_change().dropna()
# Correlation of each column of x with its 'SPX' column.
spx_corr = lambda x: x.corrwith(x['SPX'])
# Group rows by the .year attribute of their index label.
by_year = rets.groupby(lambda x: x.year)
by_year.apply(spx_corr)

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003,0.541124,0.745174,0.661265,1.0
2004,0.374283,0.588531,0.557742,1.0
2005,0.46754,0.562374,0.63101,1.0
2006,0.428267,0.406126,0.518514,1.0
2007,0.508118,0.65877,0.786264,1.0
2008,0.681434,0.804626,0.828303,1.0
2009,0.707103,0.654902,0.797921,1.0
2010,0.710105,0.730118,0.839057,1.0
2011,0.691931,0.800996,0.859975,1.0


In [110]:
# Per-year correlation between the AAPL and MSFT columns.
by_year.apply(lambda g:g['AAPL'].corr(g['MSFT']))

2003    0.480868
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

## 透视表和交叉表

#### DataFrame 有一个pivot_table 方法，此外还有一个顶级的pandas.pivot_table函数

In [111]:
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587


In [114]:
# Name the numeric columns explicitly: the default mean aggregation cannot be
# applied to the remaining text columns (day, time).
tips.pivot_table(values=['size','tip','tip_pct','total_bill'],index=['sex','smoker'])

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,No,2.592593,2.773519,0.156921,18.105185
Female,Yes,2.242424,2.931515,0.18215,17.977879
Male,No,2.71134,3.113402,0.160669,19.791237
Male,Yes,2.5,3.051167,0.152771,22.2845


In [119]:
# margins=True adds a row and a column labelled All; here the All values are means
tips.pivot_table(['tip_pct','size'],index=['sex','day'],columns='smoker',margins=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,All,No,Yes,All
sex,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Female,Fri,2.5,2.0,2.111111,0.165296,0.209129,0.199388
Female,Sat,2.307692,2.2,2.25,0.147993,0.163817,0.15647
Female,Sun,3.071429,2.5,2.944444,0.16571,0.237075,0.181569
Female,Thur,2.48,2.428571,2.46875,0.155971,0.163073,0.157525
Male,Fri,2.0,2.125,2.1,0.138005,0.14473,0.143385
Male,Sat,2.65625,2.62963,2.644068,0.162132,0.139067,0.151577
Male,Sun,2.883721,2.6,2.810345,0.158291,0.173964,0.162344
Male,Thur,2.5,2.3,2.433333,0.165706,0.164417,0.165276
All,,2.668874,2.408602,2.569672,0.159328,0.163196,0.160803


#### 要使用其他的聚合函数，将其传给aggfunc即可

In [121]:
# aggfunc=len replaces the default aggregation with a row count per cell.
tips.pivot_table('tip_pct',index=['sex','smoker'],columns='day',aggfunc=len,margins=True)

Unnamed: 0_level_0,day,Fri,Sat,Sun,Thur,All
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,No,2.0,13.0,14.0,25.0,54.0
Female,Yes,7.0,15.0,4.0,7.0,33.0
Male,No,2.0,32.0,43.0,20.0,97.0
Male,Yes,8.0,27.0,15.0,10.0,60.0
All,,19.0,87.0,76.0,62.0,244.0


* values 待聚合的列的名称，默认聚合所有数值列
* index 用于分组的列名，出现在结果透视表的行
* columns 用于分组的列名，出现在结果透视表的列
* aggfunc 聚合函数，默认为mean
* fill_value 用于替换结果表中的缺失值
* margins 添加行/列小计和总计，默认为False

### 交叉表：crosstab
#### 是一种用于计算分组频率的特殊透视表

In [126]:
data

Ohio          0.114991
New York     -0.292558
Vermont            NaN
Florida      -0.812552
Oregon       -0.728233
Nevada             NaN
California    1.374858
Idaho              NaN
dtype: float64