# Pandas 数据操作

In [8]:
import pandas as pd
import numpy as np

### 1. Series索引

In [3]:
# Build a five-element Series labelled 'a'..'e' and preview it.
ser_obj = pd.Series(data=range(5), index=list('abcde'))
print(ser_obj.head())

a    0
b    1
c    2
d    3
e    4
dtype: int64


In [4]:
# Row access: by index label, or by integer position.
# Positional access goes through .iloc: a bare integer inside [] on a
# label-indexed Series relies on a deprecated positional fallback.

print(ser_obj['a'])
print(ser_obj.iloc[0])

0
0


In [5]:
# Slicing: the positional slice stops before its end position,
# while the label slice includes its end label.

print(ser_obj.iloc[1:3])
print(ser_obj.loc['b':'d'])

b    1
c    2
dtype: int64
b    1
c    2
d    3
dtype: int64


In [6]:
# Non-contiguous selection: pick several entries at once.
# Integer positions go through .iloc; a list of integers inside [] on a
# label-indexed Series relies on a deprecated positional fallback.

print(ser_obj.iloc[[0, 2, 4]])
print(ser_obj[['a', 'e']])

a    0
c    2
e    4
dtype: int64
a    0
e    4
dtype: int64


In [7]:
# Boolean indexing: build a mask, then keep the entries where it is True.

ser_bool = ser_obj.gt(2)
print(ser_bool)
print(ser_obj[ser_bool])

# Same selection with the condition written inline.
print(ser_obj[ser_obj.gt(2)])

a    False
b    False
c    False
d     True
e     True
dtype: bool
d    3
e    4
dtype: int64
d    3
e    4
dtype: int64


### 2. DataFrame索引

####  列索引优先
- `df_obj['label']`
- 方括号中只放一个 label 时，优先按列索引查找

In [9]:

# 5x4 frame of standard-normal samples with columns 'a'..'d'.
df_obj = pd.DataFrame(data=np.random.randn(5, 4), columns=list('abcd'))
print(df_obj.head())

          a         b         c         d
0 -0.471783  0.483099  0.649799 -1.642552
1  0.627565 -1.240381  1.343361 -0.212816
2  0.536110 -1.771478 -0.647025 -1.142757
3 -1.005834 -0.891399 -0.371497 -0.829851
4  1.294929 -0.050468  0.626883  2.178929


In [15]:
# Column access
print('列索引')
col_a = df_obj['a']  # a single label yields a Series
print(col_a)


# Non-contiguous columns
print('不连续索引')
cols_a_c = df_obj[['a', 'c']]  # a list of labels yields a DataFrame
print(cols_a_c)


列索引
0   -0.471783
1    0.627565
2    0.536110
3   -1.005834
4    1.294929
Name: a, dtype: float64
不连续索引
          a         c
0 -0.471783  0.649799
1  0.627565  1.343361
2  0.536110 -0.647025
3 -1.005834 -0.371497
4  1.294929  0.626883


### 两种索引方式

In [11]:
# Show the Series and the DataFrame used by the loc / iloc examples.
print(ser_obj, df_obj, sep='\n')

a    0
b    1
c    2
d    3
e    4
dtype: int32
          a         b         c         d
0  2.449620  0.075808 -1.599173  0.387197
1  0.043107  0.332700 -1.067877 -0.637562
2  0.251890 -1.544794 -0.739113  0.610947
3  0.718981  0.956047 -0.054310 -0.430208
4  0.172346  0.583391 -0.179050 -0.468465


#### 1. 标签索引 .loc

In [24]:

# Series: a label slice gives the same result with or without .loc
print(ser_obj['b':'d'])
print(ser_obj.loc['b':'d'])

# DataFrame
print(df_obj.loc[:, 'a'])
print(df_obj.loc[:2, 'a'])  # column 'a', row labels up to and including 2

b    1
c    2
d    3
dtype: int64
b    1
c    2
d    3
dtype: int64
0   -0.471783
1    0.627565
2    0.536110
3   -1.005834
4    1.294929
Name: a, dtype: float64
0   -0.471783
1    0.627565
2    0.536110
Name: a, dtype: float64


#### 2. 位置索引 .iloc

In [25]:
# Series: print the whole Series for reference.
print(ser_obj)

a    0
b    1
c    2
d    3
e    4
dtype: int64


In [26]:
# DataFrame

# Positional slice: stops before position 2, unlike the label slice df_obj.loc[0:2, 'a']
print(df_obj.iloc[:2, 0])

0   -0.471783
1    0.627565
Name: a, dtype: float64


### 运算与对齐

In [31]:
# Series alignment: two Series whose indexes only partly overlap

s1 = pd.Series(data=range(10, 20), index=pd.RangeIndex(10))
s2 = pd.Series(data=range(20, 25), index=pd.RangeIndex(5))

# One print call; each item lands on its own line, with an empty line between the two Series.
print('s1: ', s1, '', 's2: ', s2, sep='\n')

s1: 
0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64

s2: 
0    20
1    21
2    22
3    23
4    24
dtype: int64


In [28]:
# Aligned Series arithmetic: labels present in only one operand yield NaN

s1.add(s2)

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64

In [29]:


# Two all-ones frames of different shapes for the alignment examples.
df1 = pd.DataFrame(data=np.ones((2, 2)), columns=list('ab'))
df2 = pd.DataFrame(data=np.ones((3, 3)), columns=list('abc'))

# One print call; each item lands on its own line, with an empty line between the two frames.
print('df1: ', df1, '', 'df2: ', df2, sep='\n')

df1: 
     a    b
0  1.0  1.0
1  1.0  1.0

df2: 
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0


In [30]:
# Aligned DataFrame arithmetic: cells missing from either operand yield NaN

df1.add(df2)

Unnamed: 0,a,b,c
0,2.0,2.0,
1,2.0,2.0,
2,,,


In [32]:
# Arithmetic with a substitute for unaligned entries: fill_value=

print(s1, s2, sep='\n')

s1.add(s2, fill_value=-1)

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64
0    20
1    21
2    22
3    23
4    24
dtype: int64


0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    14.0
6    15.0
7    16.0
8    17.0
9    18.0
dtype: float64

In [33]:
# Subtract with 2.0 standing in for cells missing from one operand.
df1.sub(df2, fill_value=2.0)

Unnamed: 0,a,b,c
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,1.0,1.0,1.0


In [34]:
# Filling NaN with fillna(): first produce a Series that contains NaN

s3 = s1.add(s2)
print(s3)

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64


In [35]:
# Replace every NaN with -1; s3 itself is left unchanged.
s3_filled = s3.fillna(value=-1)
print(s3_filled)

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    -1.0
6    -1.0
7    -1.0
8    -1.0
9    -1.0
dtype: float64


In [36]:
# Aligned sum of the two frames; cells missing from either operand are NaN.
df3 = df1.add(df2)
print(df3)

     a    b   c
0  2.0  2.0 NaN
1  2.0  2.0 NaN
2  NaN  NaN NaN


In [37]:
# Replace NaN with 100. Rebinding the name instead of using inplace=True
# avoids mutating the frame object that the previous cell displayed.
df3 = df3.fillna(100)
print(df3)

       a      b      c
0    2.0    2.0  100.0
1    2.0    2.0  100.0
2  100.0  100.0  100.0


### 函数应用

In [38]:
# NumPy ufuncs apply element-wise to a DataFrame

df = pd.DataFrame(data=np.random.randn(5, 4) - 1)

# Print the frame, then its element-wise absolute value.
print(df, np.abs(df), sep='\n')

          0         1         2         3
0 -0.481279  0.557730 -1.115213 -0.564530
1 -0.747977 -0.166946 -1.251016 -1.241585
2  0.714697 -1.846246 -0.892217 -0.982949
3 -0.323409 -0.190697 -0.651720 -1.212122
4 -0.389875 -1.957013 -1.194209 -0.092693
          0         1         2         3
0  0.481279  0.557730  1.115213  0.564530
1  0.747977  0.166946  1.251016  1.241585
2  0.714697  1.846246  0.892217  0.982949
3  0.323409  0.190697  0.651720  1.212122
4  0.389875  1.957013  1.194209  0.092693


In [39]:
# Use apply to run a function over the rows or columns of a frame

def f(x):
    """Return the maximum of ``x`` by calling ``x.max()``."""
    return x.max()

# With the default axis, the printed result has one entry per column.
print(df.apply(f))

0    0.714697
1    0.557730
2   -0.651720
3   -0.092693
dtype: float64


In [25]:
# Choose the axis: axis='columns' (same as axis=1) yields one result per row

row_max = df.apply(lambda row: row.max(), axis='columns')
print(row_max)

0    0.152272
1   -0.734763
2   -0.780086
3   -1.459251
4    0.262559
dtype: float64


In [40]:
# Apply a function to every individual element.
# DataFrame.applymap was deprecated in pandas 2.1 in favour of DataFrame.map,
# so prefer map and fall back to applymap on older pandas that lacks it.

def f2(x):
    """Format a number as a string with two decimal places."""
    return '%.2f' % x

elementwise = df.map if hasattr(df, 'map') else df.applymap
print(elementwise(f2))

       0      1      2      3
0  -0.48   0.56  -1.12  -0.56
1  -0.75  -0.17  -1.25  -1.24
2   0.71  -1.85  -0.89  -0.98
3  -0.32  -0.19  -0.65  -1.21
4  -0.39  -1.96  -1.19  -0.09


### 排序

In [41]:
# Values 10..14 under random integer labels drawn from 0..4 (repeats are possible).
s4 = pd.Series(data=range(10, 15), index=np.random.randint(0, 5, size=5))
print(s4)

1    10
1    11
4    12
1    13
1    14
dtype: int64


In [43]:
# Sort by index label with sort_index(); ascending=False requests descending order

s4.sort_index(ascending=False)

4    12
1    10
1    11
1    13
1    14
dtype: int64

In [45]:
# Random values under fixed row/column labels that deliberately contain duplicates.
# The labels used to be drawn at random, which made the later
# `df4.sort_values(by=2)` fail whenever column label 2 was absent or repeated.
df4 = pd.DataFrame(np.random.randn(3, 4),
                   index=[1, 2, 2],
                   columns=[0, 2, 0, 1])
print(df4)

          0         2         0         1
1 -0.195859 -0.942873 -1.106329 -0.886807
2  2.175211  1.483963  0.437963  1.199707
2 -0.805194  1.065535 -0.105788 -0.734886


In [46]:

# Sort the columns by their labels (axis='columns' is the same as axis=1).
df4.sort_index(axis='columns')

Unnamed: 0,0,0.1,1,2
1,-0.195859,-1.106329,-0.886807,-0.942873
2,2.175211,0.437963,1.199707,1.483963
2,-0.805194,-0.105788,-0.734886,1.065535


In [48]:
# Sort rows by the values of one column with sort_values(by=...)
# NOTE(review): presumably requires df4 to have exactly one column labelled 2 — confirm.

df4.sort_values(by=2)

Unnamed: 0,0,2,0.1,1
1,-0.195859,-0.942873,-1.106329,-0.886807
2,-0.805194,1.065535,-0.105788,-0.734886
2,2.175211,1.483963,0.437963,1.199707


### 处理缺失数据

In [49]:
# One fully populated random row followed by three rows containing NaN.
df_data = pd.DataFrame([
    np.random.randn(3),
    [1.0, np.nan, np.nan],
    [4.0, np.nan, np.nan],
    [1.0, np.nan, 2.0],
])
df_data.head()

Unnamed: 0,0,1,2
0,-1.139849,1.609428,1.230501
1,1.0,,
2,4.0,,
3,1.0,,2.0


In [50]:
# isna() (alias: isnull()) marks each missing cell with True

df_data.isna()

Unnamed: 0,0,1,2
0,False,False,False
1,False,True,True
2,False,True,True
3,False,True,False


In [51]:
# Number of missing cells in each column.
df_data.isna().sum()

0    0
1    3
2    2
dtype: int64

In [52]:
# dropna: discard every row that contains a missing value (axis=0 is the default)

df_data.dropna(axis=0)


Unnamed: 0,0,1,2
0,-1.139849,1.609428,1.230501


In [53]:
# Discard every column that contains a missing value (axis='columns' is the same as axis=1).
df_data.dropna(axis='columns')

Unnamed: 0,0
0,-1.139849
1,1.0
2,4.0
3,1.0


In [54]:
# fillna: replace every missing value with a constant

df_data.fillna(value=-100.0)

Unnamed: 0,0,1,2
0,-1.139849,1.609428,1.230501
1,1.0,-100.0,-100.0
2,4.0,-100.0,-100.0
3,1.0,-100.0,2.0
