In [1]:
import numpy as np
import pandas as pd

## 5.2.5 算术运算和数据对齐

In [62]:
# Float Series keyed by the labels a, c, d, e (there is deliberately no 'b').
ser1 = pd.Series({'a': 7.3, 'c': -2.5, 'd': 3.4, 'e': 1.5})
ser1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [63]:
# Second Series; it shares only the labels a, c and e with ser1.
ser2 = pd.Series({'a': 2.1, 'c': 3.6, 'e': -1.5, 'f': 4, 'g': 3.1})
ser2

a    2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [64]:
# Addition aligns on index labels: the result carries the union of both
# indexes, and labels present in only one Series ('d', 'f', 'g') become NaN.
ser1 + ser2

a    9.4
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

#### 在DataFrame中，数据对齐同时发生在行和列上

In [65]:
# 3x3 float frame: rows labelled by city, columns b/c/d.
df1 = pd.DataFrame(
    np.arange(9.0).reshape(3, 3),
    index=['北京', '上海', '广州'],
    columns=list('bcd'),
)
df1

Unnamed: 0,b,c,d
北京,0.0,1.0,2.0
上海,3.0,4.0,5.0
广州,6.0,7.0,8.0


In [66]:
# 4x3 float frame: one extra city row and columns b/d/e, so it only
# partially overlaps the 3x3 frame built in the previous cell.
df2 = pd.DataFrame(
    np.arange(12.0).reshape(4, 3),
    index=['北京', '上海', '广州', '深圳'],
    columns=list('bde'),
)
df2

Unnamed: 0,b,d,e
北京,0.0,1.0,2.0
上海,3.0,4.0,5.0
广州,6.0,7.0,8.0
深圳,9.0,10.0,11.0


In [67]:
# Alignment happens on rows and columns at once: the result has the union of
# both label sets, with NaN wherever either frame lacks the row or the column.
df1 + df2

Unnamed: 0,b,c,d,e
上海,6.0,,9.0,
北京,0.0,,3.0,
广州,12.0,,15.0,
深圳,,,,


如果两个DataFrame相加，而且没有相同的column和row，结果会全是null：

In [68]:
# Single-column frame 'A' with the default 0..1 row index.
df1 = pd.DataFrame([1, 2], columns=['A'])
df1

Unnamed: 0,A
0,1
1,2


In [69]:
# Single-column frame 'B' with the default 0..1 row index.
df2 = pd.DataFrame([3, 4], columns=['B'])
df2

Unnamed: 0,B
0,3
1,4


In [70]:
# The two frames share no column label, so every cell of the sum is NaN.
df1 + df2

Unnamed: 0,A,B
0,,
1,,


In [71]:
# Subtraction aligns labels the same way, so this is all NaN as well.
df1 - df2

Unnamed: 0,A,B
0,,
1,,


In [72]:
# 3x4 float frame with columns a..d and the default row index.
df1 = pd.DataFrame(
    np.arange(12.0).reshape(3, 4),
    columns=list('abcd'),
)
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [73]:
# 4x5 float frame with columns a..e; one cell is then blanked out so the
# frame contains a genuine missing value.
df2 = pd.DataFrame(
    np.arange(20.0).reshape(4, 5),
    columns=list('abcde'),
)
df2.loc[1, 'b'] = np.nan
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


#### 使用fill_value把上面那些缺失值按0处理后再相加

In [74]:
# With fill_value=0, a value that is missing on one side (an absent label or
# a NaN cell) is treated as 0 before adding, so the other side's value is kept.
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


#### 每一个算术方法（如 add、sub、div）都有一个以 r 开头的配对方法（如 radd、rsub、rdiv），意思是把两个参数的位置反转：

In [75]:
# Re-display df1 for reference before the next operation.
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [76]:
1 / df1  # equivalent to df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


在reindex（重建索引）的时候，也可以使用fill_value:

In [77]:
# Reindex onto columns a..e; the new column 'e' is filled with 666 rather than NaN.
df1.reindex(columns=list('abcde'), fill_value=666)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,666
1,4.0,5.0,6.0,7.0,666
2,8.0,9.0,10.0,11.0,666


### 5.2.5.3 DataFrame和Series之间的操作

先举个numpy的例子帮助理解，可以考虑成一个二维数组和它的一行：

In [78]:
# 3x4 float array holding 0..11 in row-major order.
arr = np.arange(12.0).reshape(3, 4)
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [79]:
# First row of arr: a 1-D array of length 4.
arr[0]

array([0., 1., 2., 3.])

In [80]:
# The 1-D row is subtracted from every row of the 2-D array (broadcasting).
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

可以看到，这个减法是用在了每一行上。这种操作叫broadcasting（广播），在Appendix A有更详细的解释。DataFrame和Series的操作也类似：

In [81]:
# 4x3 float frame: rows labelled by US state, columns b/d/e.
frame = pd.DataFrame(
    np.arange(12.0).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=list('bde'),
)
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [82]:
# Take the first row of frame (label 'Utah') as a Series whose index is the
# frame's column labels.
series = frame.iloc[0]
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

可以理解为series的index与dataframe的列匹配，broadcasting down the rows(向下按行广播):

In [83]:
# The Series index is matched against frame's columns, and the subtraction is
# then applied to every row.
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


如果某个索引值在DataFrame的column中或在Series的index中找不到，那么结果会被重新索引为两者的并集：

In [84]:
# Integer Series whose labels only partly overlap frame's columns:
# 'b' and 'e' match, 'f' does not.
series2 = pd.Series({'b': 10, 'e': 100, 'f': 1000})
series2

b      10
e     100
f    1000
dtype: int64

In [85]:
# Re-display frame for reference before the next operation.
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [86]:
# Same as frame.add(series2, axis='columns') or frame.add(series2, axis=1).
# Labels found on only one side ('d' and 'f') become all-NaN columns.
frame + series2

Unnamed: 0,b,d,e,f
Utah,10.0,,102.0,
Ohio,13.0,,105.0,
Texas,16.0,,108.0,
Oregon,19.0,,111.0,


In [87]:
# Re-display frame for reference before the next operation.
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [88]:
# Integer Series keyed by three of frame's four row labels ('Oregon' is absent).
series2 = pd.Series({'Utah': 10, 'Ohio': 100, 'Texas': 1000})
series2

Utah       10
Ohio      100
Texas    1000
dtype: int64

In [89]:
# axis='index' matches the Series index against frame's row labels; 'Oregon'
# has no entry in series2, so that row comes out as NaN.
frame.add(series2, axis='index')  # same as frame.add(series2, axis=0)

Unnamed: 0,b,d,e
Ohio,103.0,104.0,105.0
Oregon,,,
Texas,1006.0,1007.0,1008.0
Utah,10.0,11.0,12.0


如果想要按列广播、在行上进行匹配，必须要用到算术方法：

In [90]:
# Column 'd' as a Series indexed by frame's row labels.
series3 = frame['d']
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [91]:
# Re-display frame for reference before the next operation.
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [92]:
# Match series3 against the row labels and subtract it from every column.
frame.sub(series3, axis='index')  # without axis=, the default is axis='columns'

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


axis参数就是用来指定匹配哪个轴的。在这个例子里是匹配dataframe的row index（`axis='index'` 或 `axis=0`），然后再广播。