## 丢弃指定轴上的项

In [6]:
import pandas as pd
import numpy as np
obj = pd.Series(np.arange(5.),index = ['a', 'b','c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [8]:
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [9]:
obj.drop(['d','e'])

a    0.0
b    1.0
c    2.0
dtype: float64

* 对于DataFrame可以删除任意轴上的索引

In [17]:
data = pd.DataFrame(np.arange(16).reshape((4,4)),
                    index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                   columns = ['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [18]:
data.drop(['Colorado','Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


* 通过传递axis=1或者axis='columns'可以删除列的值：

In [19]:
data.drop('two',axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


* 以上操作都会返回一个新的对象，原数据结构并不变化；如果想就地修改原对象，可以传入参数inplace=True（注意：就地修改会直接丢弃被删除的数据）

In [20]:
data.drop('one',axis=1,inplace=True)
data

Unnamed: 0,two,three,four
Ohio,1,2,3
Colorado,5,6,7
Utah,9,10,11
New York,13,14,15


## 索引、选取、过滤
* Series索引和numpy数组的索引差不多，只不过Series的索引值不一定是整数

In [21]:
obj = pd.Series(np.arange(4), index = ['a','b','c','d'])
obj

a    0
b    1
c    2
d    3
dtype: int32

In [22]:
obj['b']

1

In [23]:
obj[2:4]

c    2
d    3
dtype: int32

In [26]:
obj[['a','c','d']]

a    0
c    2
d    3
dtype: int32

* 利用标签进行的切片运算与普通的Python切片不同，其末端是包含的

In [28]:
obj['a':'c']

a    0
b    1
c    2
dtype: int32

In [30]:
obj['a':'c'] = 5
obj

a    5
b    5
c    5
d    3
dtype: int32

### DataFrame
* 利用一个值或者序列对DataFrame进行索引其实就是获取一个或者多个列

In [31]:
data = pd.DataFrame(np.arange(16).reshape((4,4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four']
                   )
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [35]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [36]:
data[['three','two']]

Unnamed: 0,three,two
Ohio,2,1
Colorado,6,5
Utah,10,9
New York,14,13


In [37]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [38]:
data[data['three']>5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [39]:
# 利用布尔型DataFrame进行索引
data<5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [41]:
data[data<5]

Unnamed: 0,one,two,three,four
Ohio,0.0,1.0,2.0,3.0
Colorado,4.0,,,
Utah,,,,
New York,,,,


In [43]:
data[data<6]=0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,0,6,7
Utah,8,9,10,11
New York,12,13,14,15


## 使用loc和iloc进行选取
* 对于DataFrame的行的标签索引，pandas提供了特殊的索引运算符loc和iloc。它们可以让你用类似NumPy的标记，使用轴标签（loc）或整数索引（iloc），从DataFrame选择行和列的子集。

In [45]:
data.loc['Colorado',['two','three']]

two      0
three    6
Name: Colorado, dtype: int32

In [46]:
data.iloc[2,[3,0,1]]

four    11
one      8
two      9
Name: Utah, dtype: int32

* loc和iloc函数也适用于切片索引

In [47]:
data.loc[:'Utah', 'two']

Ohio        0
Colorado    0
Utah        9
Name: two, dtype: int32

In [48]:
data.iloc[:,:3][data.three>5]

Unnamed: 0,one,two,three
Colorado,0,0,6
Utah,8,9,10
New York,12,13,14


![name](https://upload-images.jianshu.io/upload_images/7178691-64354f2ab777bd8c.png)


## 整数索引

In [57]:
# 对于非整数索引不会产生歧义
ser = pd.Series(np.arange(3.0),index=['a', 'b', 'c'])
ser

a    0.0
b    1.0
c    2.0
dtype: float64

In [58]:
ser[-1]

2.0

* 为了进行统一，如果轴索引含有整数，数据选取总会使用标签（而不是位置）。为了更加准确，推荐使用loc(标签索引)和iloc(整数索引)

In [62]:
# 对于整数索引，会产生歧义：整数既可能被当作标签，也可能被当作位置
ser2 = pd.Series(np.arange(3.0))
ser2

0    0.0
1    1.0
2    2.0
dtype: float64

In [63]:
ser2.iloc[:1]

0    0.0
dtype: float64

In [65]:
ser2.loc[:1]

0    0.0
1    1.0
dtype: float64

In [66]:
ser2[:1]

0    0.0
dtype: float64

# 数据运算和数据对齐
* pandas的一个重要的功能是可以对不同的索引的对象进行算术运算。在将对象相加时，如果存在不同的索引对，则结果的索引就是该索引对的并集

In [67]:
s1 = pd.Series([7.3,-2.5,3.4,1.5], index=['a','b','c','d'])
s2 = pd.Series([-2.1,3.6,-1.5,4,3.1],index=['a','c','e','f','g'])
s1

a    7.3
b   -2.5
c    3.4
d    1.5
dtype: float64

In [68]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [69]:
s1 + s2

a    5.2
b    NaN
c    7.0
d    NaN
e    NaN
f    NaN
g    NaN
dtype: float64

* 对于DataFrame 对齐操作同时作用于行和列上

In [70]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [71]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [72]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [73]:
# DataFrame对象相加，没有共用的列或行标签，结果都会是空：
df1 = pd.DataFrame({'A':[1,2]})
df2 = pd.DataFrame({'B':[3,4]})
df1

Unnamed: 0,A
0,1
1,2


In [74]:
df2

Unnamed: 0,B
0,3
1,4


In [75]:
df1+df2

Unnamed: 0,A,B
0,,
1,,


# 在算术方法中填充值
* 在对具有不同索引的对象进行算术运算时，可能希望当一个对象中的某个轴标签在另一个对象中找不到时，填充一个特殊值（比如：0）

In [80]:
df1 = pd.DataFrame(np.arange(12).reshape((3,4)),columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20).reshape(4,5),columns=list('abcde'))
df2.loc[1,'b'] = np.nan
df1

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [81]:
df2

Unnamed: 0,a,b,c,d,e
0,0,1.0,2,3,4
1,5,,7,8,9
2,10,11.0,12,13,14
3,15,16.0,17,18,19


In [83]:
df1+df2 # 没有重叠的位置引入了NaN，但我们想对这些NaN赋值

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [84]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [85]:
1/df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [86]:
df1.rdiv(1) #相当于1/df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


* 表5-4列出了Series和DataFrame所支持的算术运算
![5-4](https://upload-images.jianshu.io/upload_images/7178691-16857a1021f98d1f.png)

* 对于Series和DataFrame重新索引时，也可以指定一个填充值

In [87]:
df1.reindex(columns=df2.columns,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,0
1,4,5,6,7,0
2,8,9,10,11,0


# DataFrame和Series之间的运算

In [88]:
arr = np.arange(12.).reshape((3,4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [89]:
arr[0]

array([0., 1., 2., 3.])

In [92]:
np.array(arr[0])

array([0., 1., 2., 3.])

In [93]:
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

* 上面是一个启发性的例子：当两个维数不一样的数组相减时（这里是从二维的arr中减去一维的arr[0]），每一行都会执行一次减法，这叫做广播（broadcasting）。DataFrame和Series之间的运算与此类似

In [94]:
frame = pd.DataFrame(np.arange(12.).reshape((4,3)),
                    columns=list('bde'),
                    index=['Utah','Ohio','Texas','Oregon'])
series = frame.iloc[0]
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [95]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [96]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


* 如果某个索引值在DataFrame的列或者Series的索引中找不到，则参与运算的两个对象就会被重新索引以形成并集：

In [98]:
series2 = pd.Series(range(3),index=['b','e','f'])
series2

b    0
e    1
f    2
dtype: int64

In [99]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


* 如果希望匹配行且在列上进行广播，则必须使用算术运算方法

In [101]:
series3 = frame['d']
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [102]:
frame.sub(series3,axis='index')

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0
