## 四、缺失值处理

**1、发现缺失值**

In [1]:
import pandas as pd
import numpy as np

# 3x3 example frame with one missing entry per column. The values go through
# np.array first; because one of them is None, NumPy infers an object array,
# so every resulting column is reported with dtype object.
data = pd.DataFrame(
    data=np.array([
        [1, np.nan, 2],
        [np.nan, 3, 4],
        [5, 6, None],
    ]),
    columns=list("ABC"),
)
data

Unnamed: 0,A,B,C
0,1.0,,2.0
1,,3.0,4.0
2,5.0,6.0,


**注意：这里的数据先经过 `np.array` 构造，数组中只要混入 None、字符串等非数值对象，整个数组的 dtype 就会变为 object，DataFrame 的每一列也随之成为 object；object 类型比 int 和 float 更消耗资源**

In [2]:
# Show the dtype pandas assigned to each column of `data`.
data.dtypes

A    object
B    object
C    object
dtype: object

In [3]:
# Boolean mask with the same shape as `data`: True where a cell is missing.
# isna() is the same method as isnull() under its other name.
data.isna()

Unnamed: 0,A,B,C
0,False,True,False
1,True,False,False
2,False,False,True


In [4]:
# Inverse mask: True where a cell holds a value, False where it is missing.
# notna() is the same method as notnull() under its other name.
data.notna()

Unnamed: 0,A,B,C
0,True,False,True
1,False,True,True
2,True,True,False


**2、删除缺失值**

In [6]:
# 4x4 numeric example frame: columns A, B and C each contain one NaN,
# while column D and the last row are complete.
data = pd.DataFrame(
    data=np.array([
        [1, np.nan, 2, 3],
        [np.nan, 4, 5, 6],
        [7, 8, np.nan, 9],
        [10, 11, 12, 13],
    ]),
    columns=list("ABCD"),
)
data

Unnamed: 0,A,B,C,D
0,1.0,,2.0,3.0
1,,4.0,5.0,6.0
2,7.0,8.0,,9.0
3,10.0,11.0,12.0,13.0


**注意：np.nan是一种特殊的浮点数**

In [7]:
# Show the per-column dtypes of the rebuilt frame, which mixes numbers only with np.nan.
data.dtypes

A    float64
B    float64
C    float64
D    float64
dtype: object

**（1）删除整行**

In [8]:
# Drop every row that contains at least one NaN (the defaults, spelled out).
# Returns a new frame; `data` itself is left unchanged.
data.dropna(axis="index", how="any")

Unnamed: 0,A,B,C,D
3,10.0,11.0,12.0,13.0


**（2）删除整列**

In [9]:
# Drop every column that contains at least one NaN (axis=1 selects columns).
data.dropna(axis=1)

Unnamed: 0,D
0,3.0
1,6.0
2,9.0
3,13.0


In [10]:
# Make column D entirely missing so the how="all" / how="any" variants of
# dropna can be compared on it in the following cells.
data = data.assign(D=np.nan)
data

Unnamed: 0,A,B,C,D
0,1.0,,2.0,
1,,4.0,5.0,
2,7.0,8.0,,
3,10.0,11.0,12.0,


In [13]:
# Drop only the columns in which every value is NaN.
data.dropna(how="all", axis=1)

Unnamed: 0,A,B,C
0,1.0,,2.0
1,,4.0,5.0
2,7.0,8.0,
3,10.0,11.0,12.0


In [14]:
# Drop every column that has at least one NaN; in the current frame each
# column has one, so only the row index remains.
data.dropna(how="any", axis=1)

0
1
2
3


In [15]:
# Blank out the whole row labelled 3 so a fully-missing row exists for the
# row-wise how="all" example.
data.loc[3, :] = np.nan
data

Unnamed: 0,A,B,C,D
0,1.0,,2.0,
1,,4.0,5.0,
2,7.0,8.0,,
3,,,,


In [16]:
# Drop only the rows in which every value is NaN; partially filled rows stay.
data.dropna(axis="index", how="all")

Unnamed: 0,A,B,C,D
0,1.0,,2.0,
1,,4.0,5.0,
2,7.0,8.0,,


**3、填充缺失值**

In [18]:
# Rebuild the 4x4 numeric example frame (one NaN in each of columns A, B
# and C) as a fresh starting point for the fill examples.
data = pd.DataFrame(
    data=np.array([
        [1, np.nan, 2, 3],
        [np.nan, 4, 5, 6],
        [7, 8, np.nan, 9],
        [10, 11, 12, 13],
    ]),
    columns=list("ABCD"),
)
data

Unnamed: 0,A,B,C,D
0,1.0,,2.0,3.0
1,,4.0,5.0,6.0
2,7.0,8.0,,9.0
3,10.0,11.0,12.0,13.0


In [19]:
# Replace every NaN with the constant 5; returns a new frame.
data.fillna(5)

Unnamed: 0,A,B,C,D
0,1.0,5.0,2.0,3.0
1,5.0,4.0,5.0,6.0
2,7.0,8.0,5.0,9.0
3,10.0,11.0,12.0,13.0


* 用均值进行替换（先用各列的均值，再用全表的均值）

In [21]:
# Per-column means, indexed by column label; NaN entries are skipped when
# averaging (e.g. column A averages 1, 7 and 10).
fill = data.mean(axis=0)
fill

A    6.000000
B    7.666667
C    6.333333
D    7.750000
dtype: float64

In [22]:
# Fill each NaN with the entry of `fill` whose label matches the cell's column.
data.fillna(fill)

Unnamed: 0,A,B,C,D
0,1.0,7.666667,2.0,3.0
1,6.0,4.0,5.0,6.0
2,7.0,8.0,6.333333,9.0
3,10.0,11.0,12.0,13.0


In [27]:
# Mean over every non-missing cell of the whole frame (NaN entries are
# ignored). np.nanmean works directly on the underlying array, so no
# intermediate stacked Series is built and DataFrame.stack()'s legacy code
# path is not used.
fill = np.nanmean(data.to_numpy())
fill

7.0

In [28]:
# Fill every NaN with `fill`, which at this point holds the single mean
# computed over the whole frame.
data.fillna(fill)

Unnamed: 0,A,B,C,D
0,1.0,7.0,2.0,3.0
1,7.0,4.0,5.0,6.0
2,7.0,8.0,7.0,9.0
3,10.0,11.0,12.0,13.0
