# 4 Pandas的索引操作

In [1]:
import pandas as pd
import numpy as np

In [3]:

dict_data = {'A': 1,
             'B': pd.Timestamp('20190926'),
             'C': pd.Series(1, index=list(range(4)),dtype='float32'),
             'D': np.array([1,2,3,4],dtype='int32'),
             'E': ["Python","Java","C++","C"],
             'F': 'wangdao' }
df_obj2 = pd.DataFrame(dict_data)
print(df_obj2.index)
print(df_obj2)

Index([0, 1, 2, 3], dtype='int64')
   A          B    C  D       E        F
0  1 2019-09-26  1.0  1  Python  wangdao
1  1 2019-09-26  1.0  2    Java  wangdao
2  1 2019-09-26  1.0  3     C++  wangdao
3  1 2019-09-26  1.0  4       C  wangdao


In [4]:
# 索引对象的值不可变（上面代码增加）
# df_obj2.index[0] = 2

TypeError: Index does not support mutable operations

# 3 常见的Index种类
•Index，索引  可以是各种类型
•Int64Index，整数索引
•MultiIndex，层级索引，难度较大
•DatetimeIndex，时间戳类型

In [5]:
ser_obj = pd.Series(range(5), index = list("abcde"))
print(ser_obj)
ser_obj.index

a    0
b    1
c    2
d    3
e    4
dtype: int64


Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [6]:
# 行索引，不仅可以用索引名，可以用索引位置或来取
print(ser_obj['b']) #索引名
print(ser_obj[2]) #位置索引

1
2


  print(ser_obj[2]) #位置索引


In [7]:
# 切片索引
print(ser_obj.iloc[1:3])  #索引位置取数据，左闭右开
print(ser_obj.loc['b':'d'])  #记住索引名  左闭右闭

b    1
c    2
dtype: int64
b    1
c    2
d    3
dtype: int64


In [8]:
# 不连续索引
print(ser_obj.iloc[[0, 2, 4]])
print(ser_obj.loc[['a', 'e']])

a    0
c    2
e    4
dtype: int64
a    0
e    4
dtype: int64


In [9]:
# 布尔索引
ser_bool = ser_obj > 2 # 得到一个布尔型Series
print(ser_obj)
print(ser_bool)


a    0
b    1
c    2
d    3
e    4
dtype: int64
a    False
b    False
c    False
d     True
e     True
dtype: bool


In [10]:
print('-'*50)
print(ser_obj[ser_bool])

print(ser_obj[ser_obj > 2]) #取出大于2的元素

--------------------------------------------------
d    3
e    4
dtype: int64
d    3
e    4
dtype: int64


## 4.4 DataFrame索引

In [11]:
import numpy as np
df_obj = pd.DataFrame(np.random.randn(5,4),
                      columns = ['a', 'b', 'c', 'd'])
print(df_obj.head())

          a         b         c         d
0  2.734458  0.147175  0.837429  0.352799
1 -0.085030 -0.092779 -0.786356 -0.161821
2  2.650033  1.677206  1.161337  0.267197
3  0.176751  0.046350  2.075486  0.736642
4 -1.512489 -0.873439  0.634075  0.006068


In [12]:
# 列索引
print(df_obj['a']) # 返回Series类型
print('-'*50)
print(df_obj[['a']]) # 返回DataFrame类型
print('-'*50)
print(type(df_obj[['a']])) # 返回DataFrame类型

0    2.734458
1   -0.085030
2    2.650033
3    0.176751
4   -1.512489
Name: a, dtype: float64
--------------------------------------------------
          a
0  2.734458
1 -0.085030
2  2.650033
3  0.176751
4 -1.512489
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>


1. loc 标签索引(通过索引标签值获取数据)

In [13]:
# 标签索引 loc，建议使用loc，效率更高
# Series
print(ser_obj)
print(ser_obj['b':'d'])
print(ser_obj.loc['b':'d']) #前闭后闭
print('-'*50)

a    0
b    1
c    2
d    3
e    4
dtype: int64
b    1
c    2
d    3
dtype: int64
b    1
c    2
d    3
dtype: int64
--------------------------------------------------


In [14]:
# DataFrame
df_obj = pd.DataFrame(np.random.randn(5,4),
                      columns = list('abcd'),
                      index=list('abcde'))
print(df_obj)
print('-'*50)
print(df_obj['a'])  #建议不用,拿的是列
print('-'*50)
print(df_obj.loc['a'])  #拿的是行
print('-'*50)

          a         b         c         d
a  1.422436  0.087032 -1.441688 -1.091036
b  0.304637 -0.070072  0.698863  1.893275
c -0.892233  0.065410 -1.748221  0.694030
d  0.567903 -0.015578  0.864070  0.810692
e -0.607475 -0.379461 -1.347855  0.305613
--------------------------------------------------
a    1.422436
b    0.304637
c   -0.892233
d    0.567903
e   -0.607475
Name: a, dtype: float64
--------------------------------------------------
a    1.422436
b    0.087032
c   -1.441688
d   -1.091036
Name: a, dtype: float64
--------------------------------------------------


In [17]:
# 第一个参数索引行，第二个参数是列,loc或者iloc效率高于直接用取下标的方式，前闭后闭
print(df_obj)
print('-'*50)
print(df_obj.loc['a':'c', 'b':'d']) #连续索引
print('-'*50)
print(df_obj.loc[['a','c'], ['b','d']]) #不连续索引
print('-'*50)
print(df_obj.loc[['c'],['b']]) #取一个值,返回的是DataFrame类型
print('-'*50)
print(df_obj.loc['c','b'])  #取一个值
print('-'*50)

          a         b         c         d
a  1.422436  0.087032 -1.441688 -1.091036
b  0.304637 -0.070072  0.698863  1.893275
c -0.892233  0.065410 -1.748221  0.694030
d  0.567903 -0.015578  0.864070  0.810692
e -0.607475 -0.379461 -1.347855  0.305613
--------------------------------------------------
          b         c         d
a  0.087032 -1.441688 -1.091036
b -0.070072  0.698863  1.893275
c  0.065410 -1.748221  0.694030
--------------------------------------------------
          b         d
a  0.087032 -1.091036
c  0.065410  0.694030
--------------------------------------------------
         b
c  0.06541
--------------------------------------------------
0.06540997786943877
--------------------------------------------------


## iloc 位置索引(推荐使用)

In [18]:
ser_obj
print('-'*50)
# Series
print(ser_obj[1:3])
print('-'*50)
print(ser_obj.iloc[1:3]) # 前闭后开[)，效率高

--------------------------------------------------
b    1
c    2
dtype: int64
--------------------------------------------------
b    1
c    2
dtype: int64


In [19]:
df_obj

Unnamed: 0,a,b,c,d
a,1.422436,0.087032,-1.441688,-1.091036
b,0.304637,-0.070072,0.698863,1.893275
c,-0.892233,0.06541,-1.748221,0.69403
d,0.567903,-0.015578,0.86407,0.810692
e,-0.607475,-0.379461,-1.347855,0.305613


In [20]:
# DataFrame，iloc是前闭后开[)
print(df_obj)
print('-'*50)
print(df_obj.iloc[0:2, 0:2]) # 连续索引
print('-'*50)
print(df_obj.iloc[[0,2], [0,2]]) # 不连续索引
print('-'*50)
print(df_obj.iloc[0,0]) # 取一个值

          a         b         c         d
a  1.422436  0.087032 -1.441688 -1.091036
b  0.304637 -0.070072  0.698863  1.893275
c -0.892233  0.065410 -1.748221  0.694030
d  0.567903 -0.015578  0.864070  0.810692
e -0.607475 -0.379461 -1.347855  0.305613
--------------------------------------------------
          a         b
a  1.422436  0.087032
b  0.304637 -0.070072
--------------------------------------------------
          a         c
a  1.422436 -1.441688
c -0.892233 -1.748221
--------------------------------------------------
1.4224362934118255


In [21]:
#没有设置行和列索引的DataFrame，iloc和loc的区别
df_obj2 = pd.DataFrame(np.random.randn(5,4))
print(df_obj2)
print('-'*50)
print(df_obj2.iloc[0:2]) #左闭右开 2行
print('-'*50)
print(df_obj2.loc[0:2]) #左闭右闭 3行

          0         1         2         3
0  0.357819  0.752548  1.232526 -0.865646
1  0.172911  0.483154 -0.135426 -0.975568
2  1.666272  0.354764 -1.307799 -0.571219
3 -1.027573 -1.372882  1.037684  0.358516
4  0.038318 -0.186146 -0.642466  1.873084
--------------------------------------------------
          0         1         2         3
0  0.357819  0.752548  1.232526 -0.865646
1  0.172911  0.483154 -0.135426 -0.975568
--------------------------------------------------
          0         1         2         3
0  0.357819  0.752548  1.232526 -0.865646
1  0.172911  0.483154 -0.135426 -0.975568
2  1.666272  0.354764 -1.307799 -0.571219


# 5 对齐运算

In [23]:
# 导入 pandas 库
import pandas as pd

# 创建第一个 Series 对象 s1
# 数据范围是 10 到 19，索引范围是 0 到 9
s1 = pd.Series(range(10, 20), index=range(10))

# 创建第二个 Series 对象 s2
# 数据范围是 20 到 24，索引范围是 0 到 4
s2 = pd.Series(range(20, 25), index=range(5))

# 对 Series 进行对齐运算
print('s1 + s2: ')
s3 = s1 + s2  # 根据索引对齐并进行加法运算

# 输出结果
print(s3)  # 缺失的数据（无法对齐的索引）结果为 NaN，即 pandas 使用 np.nan 表示缺失值

s1 + s2: 
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64


In [25]:
#两个长度不同的一维ndarray相加
a1 = np.array([1,2,3,4,5])
a2 = np.array([1]) # 长度为1
print(a1)
print(a2)
print(a2.shape)
print(a1+a2)

[1 2 3 4 5]
[1]
(1,)
[2 3 4 5 6]


In [26]:
print(s2)
s1

0    20
1    21
2    22
3    23
4    24
dtype: int64


0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64

In [27]:
print(np.isnan(s3[6]))
print('-'*50)
print(s2.add(s1, fill_value = 0))  #未对齐的数据将和填充值做运算
print(s2.sub(s1, fill_value = 0))

True
--------------------------------------------------
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    15.0
6    16.0
7    17.0
8    18.0
9    19.0
dtype: float64
0    10.0
1    10.0
2    10.0
3    10.0
4    10.0
5   -15.0
6   -16.0
7   -17.0
8   -18.0
9   -19.0
dtype: float64


In [30]:
#df的对齐运算
import numpy as np
df1 = pd.DataFrame(np.ones((2,2)), columns = ['a', 'b'])
df2 = pd.DataFrame(np.ones((3,3)), columns = ['a', 'b', 'c'])
print(df1)
print(df2)
print('-'*50)
print(df2.dtypes)
print(df1-df2)
print(df2.sub(df1, fill_value = 2)) #未对齐的数据将和填充值做运算

     a    b
0  1.0  1.0
1  1.0  1.0
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0
--------------------------------------------------
a    float64
b    float64
c    float64
dtype: object
     a    b   c
0  0.0  0.0 NaN
1  0.0  0.0 NaN
2  NaN  NaN NaN
     a    b    c
0  0.0  0.0  0.0
1  0.0  0.0  0.0
2  0.0  0.0  0.0


# 总结：没对齐的元素，默认填充NaN，对齐运算时，fill_value参数可以指定填充值。