In [1]:
import numpy as np
import pandas as pd

## 5.2.3 索引，选择，过滤

### Series

In [27]:
# Build a float Series holding 0.0 through 3.0, labelled 'a' through 'd'.
ser = pd.Series(
    data=np.arange(4.),
    index=['a', 'b', 'c', 'd'],
)
ser

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [28]:
# Look up a single value by its index label.
ser['b']

1.0

In [29]:
# Select the second element by position. .iloc is used explicitly because a
# bare integer key on a string-labelled Series (ser[1]) only works through a
# deprecated positional fallback.
ser.iloc[1]

1.0

In [30]:
# A list of labels returns several values, in the order the labels are given.
ser[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [31]:
# Select the elements at positions 1 and 3. .iloc is used explicitly because a
# list of integers inside plain [] on a string-labelled Series relies on a
# deprecated positional fallback.
ser.iloc[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [32]:
# Positional slice: elements at positions 1 and 2 (the end position is excluded).
ser.iloc[1:3]

b    1.0
c    2.0
dtype: float64

In [33]:
# Slicing by label differs from ordinary Python slicing: the end label is included.
ser['a':'d']

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [34]:
# An element-wise comparison yields a boolean Series with the same index.
ser < 2

a     True
b     True
c    False
d    False
dtype: bool

In [35]:
# Use the boolean Series as a mask to keep only the values below 2.
ser[ser < 2]

a    0.0
b    1.0
dtype: float64

In [36]:
# Values selected by a label slice can be assigned to directly; both 'b' and
# 'c' are overwritten because the end label is included.
ser['b':'c'] = 555
ser

a      0.0
b    555.0
c    555.0
d      3.0
dtype: float64

### DataFrame

In [37]:
# Build a 4x4 integer frame (values 0..15, filled row by row) with state names
# as row labels and number words as column labels.
df = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
df

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [38]:
# A single column label inside [] returns that column as a Series.
df['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [39]:
# A list of column labels returns a DataFrame with those columns, in the order given.
df[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [40]:
# df[[1, 2]] and df[1] both fail: with plain [], integers are looked up as
# column labels, and this frame's columns are strings.
# df[[1, 2]]  ERROR
# df[1]  ERROR
# An integer slice inside [] is different: it selects rows by position.
df[1:3]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11


In [41]:
# .loc with a single row label returns that row as a Series.
df.loc['Ohio']

one      0
two      1
three    2
four     3
Name: Ohio, dtype: int32

In [42]:
# .loc with a list of row labels returns those rows as a DataFrame.
df.loc[['Ohio', 'Colorado']]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


DataFrame 的索引（indexing）有一些比较特别的方式，比如通过布尔数组：

In [43]:
# Comparing one column with a scalar gives a boolean Series, one flag per row.
df['three'] > 5

Ohio        False
Colorado     True
Utah         True
New York     True
Name: three, dtype: bool

In [44]:
# A boolean Series inside [] filters rows: only rows flagged True are kept.
df[df['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [45]:
# Comparing the whole frame with a scalar gives a boolean DataFrame of the same shape.
df < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [46]:
# Assign through a boolean DataFrame mask: every cell below 5 becomes 0.
# This modifies df in place, so every later cell sees the zeroed values.
df[df < 5] = 0
df

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


## 用loc和iloc来选择

In [47]:
# Display the current contents of df as the starting point for this section.
df

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [48]:
# Select a single row by position; the result is a Series named after the row label.
print(df.iloc[1])

one      0
two      5
three    6
four     7
Name: Colorado, dtype: int32


In [49]:
# Left commented out because it fails: .loc looks rows up by label, and the
# row labels of df are strings, so there is no label 1 (use df.iloc[1] instead).
# print(df.loc[1])

In [50]:
# Select several rows. First by a positional slice (printed together with the
# type of the result), then by an explicit list of row labels.
print(df.iloc[:2], type(df.iloc[:2]), sep='\n')
df.loc[['Ohio', 'Colorado']]

          one  two  three  four
Ohio        0    0      0     0
Colorado    0    5      6     7
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7


In [51]:
# Select several columns from a single row, three equivalent ways:
# by labels, by a list of positions, and by a positional slice.
print(df.loc['Colorado', ['two', 'three']], '\n')
print(df.iloc[1, [1, 2]], '\n')
print(df.iloc[1, 1:3], '\n')

two      5
three    6
Name: Colorado, dtype: int32 

two      5
three    6
Name: Colorado, dtype: int32 

two      5
three    6
Name: Colorado, dtype: int32 



In [52]:
# Select a single column.
print(df['two'])

Ohio         0
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32


In [53]:
# Select several columns.
print(df[['two', 'three']])

          two  three
Ohio        0      0
Colorado    5      6
Utah        9     10
New York   13     14


In [54]:
# Select several rows of a single column; the label slice includes 'Utah'.
print(df.loc[:'Utah', 'two'])

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32


In [55]:
# Display the full frame again for reference.
df

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [56]:
# Select several rows and several columns at once; both forms return a DataFrame.
# First with explicit position lists, then with positional slices.
print(type(df.iloc[[1, 2], [0, 2]]), '\n', df.iloc[[1, 2], [0, 2]], '\n')
print(type(df.iloc[:2, :3]), '\n', df.iloc[:2, :3])

<class 'pandas.core.frame.DataFrame'> 
           one  three
Colorado    0      6
Utah        8     10 

<class 'pandas.core.frame.DataFrame'> 
           one  two  three
Ohio        0    0      0
Colorado    0    5      6


In [57]:
# A single cell: row position 1, column position 1 gives the scalar value itself.
print(df.iloc[1, 1])

5


In [58]:
# Filtering: keep the rows where column 'three' exceeds 5, restricted to the
# first three columns. A single .loc call selects rows and columns together
# instead of indexing the result of another indexing operation, and the column
# is addressed by label rather than by attribute access.
df.loc[df['three'] > 5, df.columns[:3]]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [59]:
# Display df again: the filtering above returned a new object and left df unchanged.
df

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [60]:
# .ix used to offer similar selections, but it was deprecated and has since
# been removed from pandas (1.0), so use .loc / .iloc instead.
df.loc['Colorado', ['two', 'three']] 

two      5
three    6
Name: Colorado, dtype: int32

In [61]:
# Drop the example objects so they do not linger in the kernel namespace.
del ser
del df