# 4 Pandas的索引操作

In [1]:
import pandas as pd
import numpy as np

In [2]:

# Build a DataFrame from a dict whose values mix scalars, a Series, an ndarray
# and a list. The scalars ('A', 'B', 'F') are repeated on every row.
dict_data = dict(
    A=1,
    B=pd.Timestamp('20190926'),
    # Series with an explicit list of integer labels 0..3
    C=pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    D=np.arange(1, 5, dtype='int32'),
    E=["Python", "Java", "C++", "C"],
    F='wangdao',
)
df_obj2 = pd.DataFrame(dict_data)
# Inspect the row index of the resulting frame.
print(df_obj2.index)

Index([0, 1, 2, 3], dtype='int64')


In [3]:
# The values of an Index object are immutable. To see this, append the line
# below to the previous cell: the item assignment raises
# TypeError ("Index does not support mutable operations").
# df_obj2.index[0] = 2

In [3]:
# Bare last expression: the notebook renders the whole DataFrame as a table.
df_obj2

Unnamed: 0,A,B,C,D,E,F
0,1,2019-09-26,1.0,1,Python,wangdao
1,1,2019-09-26,1.0,2,Java,wangdao
2,1,2019-09-26,1.0,3,C++,wangdao
3,1,2019-09-26,1.0,4,C,wangdao


3 常见的Index种类
•Index，通用索引，可以是各种类型
•Int64Index，整数索引（pandas 2.0 起已移除，整数索引统一表示为 dtype 为 int64 的 Index，如上面输出的 Index([0, 1, 2, 3], dtype='int64')）
•MultiIndex，层级索引，难度较大
•DatetimeIndex，时间戳类型

In [4]:
# A Series holding 0..4 whose row labels are the letters a-e.
ser_obj = pd.Series(data=range(5), index=['a', 'b', 'c', 'd', 'e'])
print(ser_obj)
# Last expression of the cell: the notebook displays the Index object itself.
ser_obj.index

a    0
b    1
c    2
d    3
e    4
dtype: int64


Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [8]:
# Row lookup: an element can be fetched by its index label or by its
# integer position.
print(
    ser_obj['b'],      # by index label
    ser_obj.iloc[2],   # by integer position
    sep='\n',
)

1
2


In [9]:
# Slicing a Series.
print(
    ser_obj[1:3],       # integer slice: by position, start included / stop excluded
    ser_obj['b':'d'],   # label slice: both end labels are included
    sep='\n',
)

b    1
c    2
dtype: int64
b    1
c    2
d    3
dtype: int64


In [11]:
# Non-contiguous selection: pass a list of positions or a list of labels.
print(
    ser_obj.iloc[[0, 2, 4]],   # list of integer positions
    ser_obj[['a', 'e']],       # list of index labels
    sep='\n',
)

a    0
c    2
e    4
dtype: int64
a    0
e    4
dtype: int64


In [12]:
# Boolean indexing: a comparison yields a boolean Series that can be used as
# a mask to keep only the rows where it is True.
ser_bool = ser_obj.gt(2)
print(ser_obj, ser_bool, '-' * 50, ser_obj[ser_bool], sep='\n')

# Same selection with the condition written inline: keeps elements greater than 2.
print(ser_obj[ser_obj > 2])

a    0
b    1
c    2
d    3
e    4
dtype: int64
a    False
b    False
c    False
d     True
e     True
dtype: bool
--------------------------------------------------
d    3
e    4
dtype: int64
d    3
e    4
dtype: int64


## 4.4 DataFrame索引

In [13]:
import numpy as np
# 5 rows x 4 columns of standard-normal samples with column labels a-d.
# A seeded generator replaces the unseeded np.random.randn call so that the
# printed frame is identical on every Restart & Run All.
df_obj = pd.DataFrame(np.random.default_rng(13).standard_normal((5, 4)),
                      columns = ['a', 'b', 'c', 'd'])
print(df_obj.head())

          a         b         c         d
0 -1.072744  0.909464 -0.304105  0.613053
1  0.204402 -1.512798 -0.510262 -0.120146
2 -0.860123  0.336391 -0.499397 -0.224446
3 -0.825789  0.628703 -0.337707 -1.192812
4 -2.072761 -1.774720 -0.101769  0.580371


In [15]:
# Column selection with plain [].
print(
    df_obj['a'],      # a single label returns a Series
    '-' * 50,
    df_obj[['a']],    # a list of labels returns a DataFrame
    '-' * 50,
    sep='\n',
)


0   -1.072744
1    0.204402
2   -0.860123
3   -0.825789
4   -2.072761
Name: a, dtype: float64
--------------------------------------------------
          a
0 -1.072744
1  0.204402
2 -0.860123
3 -0.825789
4 -2.072761
--------------------------------------------------


In [16]:
# Print the type obtained when a column is selected with a list of labels.
print(type(df_obj[['a']])) # the result is a DataFrame

<class 'pandas.core.frame.DataFrame'>


1. loc 标签索引(通过索引标签值获取数据)

In [17]:
# Label-based indexing with .loc on a Series. Using .loc is the recommended
# spelling because it states explicitly that labels are meant.
print(
    ser_obj,
    ser_obj['b':'d'],       # plain [] with labels gives the same rows, but is not recommended
    ser_obj.loc['b':'d'],   # .loc: both end labels are included
    '-' * 50,
    sep='\n',
)


a    0
b    1
c    2
d    3
e    4
dtype: int64
b    1
c    2
d    3
dtype: int64
b    1
c    2
d    3
dtype: int64
--------------------------------------------------


In [18]:
# DataFrame with letter labels on both axes: rows a-e, columns a-d.
# A seeded generator replaces the unseeded np.random.randn call so that the
# printed values are identical on every Restart & Run All.
df_obj = pd.DataFrame(np.random.default_rng(18).standard_normal((5, 4)),
                      columns = list('abcd'),
                      index=list('abcde'))
print(df_obj)
print('-'*50)
print(df_obj['a'])  # plain [] selects the COLUMN labelled 'a' (not recommended: easy to confuse with the row)
print('-'*50)
print(df_obj.loc['a'])  # .loc selects the ROW labelled 'a'
print('-'*50)


          a         b         c         d
a -0.999098 -1.831602 -1.495320 -0.134011
b  1.100559  0.067801 -0.753872 -2.196088
c  1.239953  1.177998  2.279126 -0.778878
d  0.396392 -1.005760 -0.530320  0.958080
e  0.388883 -0.103025  0.526778 -0.412634
--------------------------------------------------
a   -0.999098
b    1.100559
c    1.239953
d    0.396392
e    0.388883
Name: a, dtype: float64
--------------------------------------------------
a   -0.999098
b   -1.831602
c   -1.495320
d   -0.134011
Name: a, dtype: float64
--------------------------------------------------


In [19]:
# .loc[rows, columns]: the first argument selects rows, the second selects
# columns; label slices include both ends.
# NOTE(review): the original note also claimed .loc/.iloc are faster than
# plain [] indexing -- not verified here.
print(
    df_obj.loc['a':'c', 'b':'d'],          # contiguous label ranges
    df_obj.loc[['a', 'c'], ['b', 'd']],    # non-contiguous labels
    df_obj.loc[['c'], ['b']],              # one cell, returned as a DataFrame
    df_obj.loc['c', 'b'],                  # one cell, returned as a single value
    sep='\n',
)

          b         c         d
a -1.831602 -1.495320 -0.134011
b  0.067801 -0.753872 -2.196088
c  1.177998  2.279126 -0.778878
          b         d
a -1.831602 -0.134011
c  1.177998 -0.778878
          b
c  1.177998
1.1779977057251914


## iloc 位置索引(推荐使用)

In [20]:
# Display the Series again as a reference for the positional examples.
ser_obj

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [7]:
# Display the DataFrame again as a reference for the positional examples.
df_obj

Unnamed: 0,a,b,c,d
a,0.722203,0.710413,-0.415365,-0.115376
b,-1.094396,0.902924,-1.357799,0.607512
c,-1.126615,-1.538308,-0.593946,-1.067405
d,-2.039717,-0.093051,1.797557,0.957792
e,1.311567,-0.856298,-0.460529,0.705077


In [21]:
# Positional slicing on a Series: start included, stop excluded.
print(
    ser_obj[1:3],        # plain [] with an integer slice
    ser_obj.iloc[1:3],   # explicit positional indexer, same rows
    '-' * 50,
    sep='\n',
)

b    1
c    2
dtype: int64
b    1
c    2
dtype: int64
--------------------------------------------------


In [22]:
# Positional indexing on a DataFrame with .iloc: slices include the start
# and exclude the stop.
print(
    df_obj,
    # First two rows and first two columns. Contrast with a label slice such
    # as df_obj.loc['a':'b', 'a':'b'], where both end labels are included.
    df_obj.iloc[:2, :2],
    df_obj.iloc[[0, 2], [0, 2]],   # non-contiguous positions
    df_obj.iloc[0, 0],             # a single value
    sep='\n',
)

          a         b         c         d
a -0.999098 -1.831602 -1.495320 -0.134011
b  1.100559  0.067801 -0.753872 -2.196088
c  1.239953  1.177998  2.279126 -0.778878
d  0.396392 -1.005760 -0.530320  0.958080
e  0.388883 -0.103025  0.526778 -0.412634
          a         b
a -0.999098 -1.831602
b  1.100559  0.067801
          a         c
a -0.999098 -1.495320
c  1.239953  2.279126
-0.9990977981235025


In [23]:
# iloc vs loc on a DataFrame created without explicit row/column labels.
# A seeded generator replaces the unseeded np.random.randn call so that the
# printed values are identical on every Restart & Run All.
df_obj2 = pd.DataFrame(np.random.default_rng(23).standard_normal((5, 4)))
print(df_obj2)
print('-'*50)
print(df_obj2.iloc[0:2]) # by position: start included, stop excluded -> 2 rows
print('-'*50)
print(df_obj2.loc[0:2]) # by label: both ends included -> 3 rows

          0         1         2         3
0 -0.121568 -0.131784  0.504275  0.314764
1  3.037141  0.790205 -0.088647 -2.677519
2 -0.298099  1.137497  0.866289  0.823224
3 -0.328559  0.773870 -0.721010  0.964544
4 -0.388196  1.634064  1.467206 -0.339671
--------------------------------------------------
          0         1         2         3
0 -0.121568 -0.131784  0.504275  0.314764
1  3.037141  0.790205 -0.088647 -2.677519
--------------------------------------------------
          0         1         2         3
0 -0.121568 -0.131784  0.504275  0.314764
1  3.037141  0.790205 -0.088647 -2.677519
2 -0.298099  1.137497  0.866289  0.823224


# 5. 对齐运算（不重要，放到最后讲解）

In [10]:
import pandas as pd

# Two Series with different lengths: s1 is labelled 0..9, s2 only 0..4.
s1 = pd.Series(data=range(10, 20), index=range(10))
s2 = pd.Series(data=range(20, 25), index=range(5))

# Series arithmetic aligns the operands on their index labels.
s3 = s1 + s2
# Labels found in only one operand produce missing values (NaN, i.e. np.nan).
print('s1+s2: ', s3, sep='\n')

s1+s2: 
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64


In [16]:
# Adding two 1-D ndarrays of different lengths: the second array has length 1.
a1 = np.arange(1, 6)
a2 = np.array([1])  # length 1
print(a2.shape, a1 + a2, sep='\n')

(1,)
[2 3 4 5 6]


In [18]:
print(
    np.isnan(s3[6]),             # check whether the value at label 6 of s3 is NaN
    '-' * 50,
    s2.add(s1, fill_value=0),    # unaligned entries are combined with the fill value
    s2.sub(s1, fill_value=0),
    sep='\n',
)

True
--------------------------------------------------
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    15.0
6    16.0
7    17.0
8    18.0
9    19.0
dtype: float64
0    10.0
1    10.0
2    10.0
3    10.0
4    10.0
5   -15.0
6   -16.0
7   -17.0
8   -18.0
9   -19.0
dtype: float64


In [19]:
# Alignment in DataFrame arithmetic: a 2x2 frame against a 3x3 frame.
import numpy as np
df1 = pd.DataFrame(np.ones(shape=(2, 2)), columns=list('ab'))
df2 = pd.DataFrame(np.ones(shape=(3, 3)), columns=list('abc'))
print(df1, df2, '-' * 50, df2.dtypes, sep='\n')
# Plain subtraction: cells missing from either frame come out as NaN.
print(df1 - df2)
# With fill_value, unaligned entries are combined with the fill value instead.
print(df2.sub(df1, fill_value=2))

     a    b
0  1.0  1.0
1  1.0  1.0
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0
--------------------------------------------------
a    float64
b    float64
c    float64
dtype: object
     a    b   c
0  0.0  0.0 NaN
1  0.0  0.0 NaN
2  NaN  NaN NaN
     a    b    c
0  0.0  0.0 -1.0
1  0.0  0.0 -1.0
2 -1.0 -1.0 -1.0
