In [1]:
import numpy as np
import pandas as pd

## 5.4.1 Series缺失数据

In [2]:
# Series of names with a single missing entry (NaN) at position 2.
ser = pd.Series(
    data=['tom', 'jerry', np.nan, 'java'],
)
ser

0      tom
1    jerry
2      NaN
3     java
dtype: object

#### isnull()

In [3]:
# Boolean mask: True where the element is missing.
ser.isnull()

0    False
1    False
2     True
3    False
dtype: bool

#### notnull()

In [4]:
# Boolean mask: True where the element is present (the inverse of isnull()).
ser.notnull()

0     True
1     True
2    False
3     True
dtype: bool

In [5]:
ser[0] = None  # None is also treated as a missing value, like NaN
ser

0     None
1    jerry
2      NaN
3     java
dtype: object

In [6]:
# The None stored at position 0 is reported as missing, just like the NaN at position 2.
ser.isnull()

0     True
1    False
2     True
3    False
dtype: bool

## 5.4.2 滤除缺失数据

In [7]:
# Numeric Series with missing values at positions 1 and 3.
ser2 = pd.Series(
    data=[1, np.nan, 3.5, np.nan, 7],
)
ser2

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

#### dropna() 默认不改变原始数据,设置参数inplace=True则直接修改源数据

In [8]:
ser2.dropna()  # equivalent to ser2[ser2.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [9]:
ser2[ser2.notnull()]  # equivalent to ser2.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [10]:
# Display ser2 again: the NaN entries are still present, so the filtering above did not modify it.
ser2

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [11]:
# inplace=True removes the NaN entries from ser2 itself instead of returning a new Series.
ser2.dropna(inplace=True)
ser2

0    1.0
2    3.5
4    7.0
dtype: float64

#### DataFrame缺失数据

In [12]:
# 4x3 frame with one complete row, two partially missing rows and one all-NaN row.
df1 = pd.DataFrame([
    [1., 6.5, 3],
    [1., np.nan, np.nan],
    [np.nan, np.nan, np.nan],
    [np.nan, 6.5, 3],
])
df1

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


#### 默认只返回整行都没有nan的行,可设置参数how='all'使返回不全为nan的行, 不修改源数据

In [13]:
df1.dropna()  # default axis=0 ('index'): drops every row that contains any NaN

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [14]:
df1.dropna(how='all')  # default axis=0 ('index'): drops only rows whose values are all NaN

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [15]:
# The original data is unchanged
df1

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [16]:
# 添加一列
df1[4] = np.nan
df1

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [17]:
df1.dropna(axis=1, how='all')  # axis=1 ('columns'): only columns that are entirely NaN are dropped

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [18]:
# 7x3 table of standard-normal samples drawn from a locally seeded generator,
# so that Restart & Run All reproduces the same values (a bare
# np.random.randn call yields different numbers on every run).
df2 = pd.DataFrame(np.random.RandomState(0).randn(7, 3))
df2

Unnamed: 0,0,1,2
0,0.873235,0.899682,-1.31989
1,0.011706,0.336962,-0.926263
2,-1.262002,0.671462,-0.360914
3,1.836234,-0.314116,1.838837
4,-1.489758,-0.475633,-0.212156
5,-0.226253,-0.43223,-1.021947
6,0.718567,-0.938712,0.902325


In [19]:
# Modify the data: overwrite the leading rows of each column with NaN
# (first 1 row of column 0, first 4 rows of column 1, first 2 rows of column 2).
df2.iloc[:1, 0] = np.nan
df2.iloc[:4, 1] = np.nan
df2.iloc[:2, 2] = np.nan
df2

Unnamed: 0,0,1,2
0,,,
1,0.011706,,
2,-1.262002,,-0.360914
3,1.836234,,1.838837
4,-1.489758,-0.475633,-0.212156
5,-0.226253,-0.43223,-1.021947
6,0.718567,-0.938712,0.902325


In [20]:
# df.dropna(thresh=n)
# A row is kept only if it still has at least n values once the NA entries are excluded.

In [21]:
# thresh=1: keep rows with at least one non-NA value (only the all-NaN row 0 is dropped).
df2.dropna(thresh=1)

Unnamed: 0,0,1,2
1,0.011706,,
2,-1.262002,,-0.360914
3,1.836234,,1.838837
4,-1.489758,-0.475633,-0.212156
5,-0.226253,-0.43223,-1.021947
6,0.718567,-0.938712,0.902325


In [22]:
# thresh=2: keep rows with at least two non-NA values (rows 0 and 1 are dropped).
df2.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-1.262002,,-0.360914
3,1.836234,,1.838837
4,-1.489758,-0.475633,-0.212156
5,-0.226253,-0.43223,-1.021947
6,0.718567,-0.938712,0.902325


In [23]:
# thresh=3: keep rows with at least three non-NA values (only the complete rows 4-6 remain).
df2.dropna(thresh=3)

Unnamed: 0,0,1,2
4,-1.489758,-0.475633,-0.212156
5,-0.226253,-0.43223,-1.021947
6,0.718567,-0.938712,0.902325


## 5.4.3 填充缺失数据

In [24]:
# Rebuild df2 as a 4x3 frame with complete, partially missing and all-NaN rows.
df2 = pd.DataFrame([
    [1., 6.5, 3],
    [1., np.nan, np.nan],
    [np.nan, np.nan, np.nan],
    [np.nan, 6.5, 3],
])
df2

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


### fillna() 默认也不修改原始数据的内容,当然可以设置为在原数据上修改

In [25]:
# Replace every NaN with the scalar 100; the result is a new DataFrame.
df2.fillna(100)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,100.0,100.0
2,100.0,100.0,100.0
3,100.0,6.5,3.0


In [26]:
# df2 itself still contains the NaN values.
df2

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [27]:
df2.fillna({0:0, 1:111, 2:222, 3:333})  # per-column fill values keyed by column label; key 3 matches no column of df2

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,111.0,222.0
2,0.0,111.0,222.0
3,0.0,6.5,3.0


In [28]:
# Still unchanged: the dict-based fillna above returned a new frame as well.
df2

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [29]:
# inplace=True modifies df2 directly. The return value of an in-place call is
# not reliable across pandas versions (current releases return None, only old
# ones returned the frame), so `res` is bound to the updated frame explicitly
# instead of to the call's result.
# Key 3 matches no column of df2 and therefore fills nothing.
df2.fillna({0:0, 1:111, 2:222, 3:333}, inplace=True)
res = df2
res

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,111.0,222.0
2,0.0,111.0,222.0
3,0.0,6.5,3.0


In [30]:
# df2 now holds the filled values, confirming that it was modified in place.
df2

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,111.0,222.0
2,0.0,111.0,222.0
3,0.0,6.5,3.0


### 使用与reindex相同的填充方式（前向填充 ffill）

In [31]:
# 4x3 table of standard-normal samples from a locally seeded generator, so the
# values are reproducible across runs (a bare np.random.randn call is not).
df = pd.DataFrame(np.random.RandomState(1).randn(4, 3))
# Blank out the tail of columns 1 and 2 so there are gaps to fill.
df.iloc[1:, 1] = np.nan
df.iloc[2:, 2] = np.nan
df

Unnamed: 0,0,1,2
0,-0.150496,0.005767,-0.496544
1,1.253928,,-0.024773
2,0.98228,,
3,-1.369085,,


In [32]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# has been deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,-0.150496,0.005767,-0.496544
1,1.253928,0.005767,-0.024773
2,0.98228,0.005767,-0.024773
3,-1.369085,0.005767,-0.024773


In [33]:
# Forward fill, but fill at most 2 consecutive NaN values per column; any
# further NaN in the same gap is left as is.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# has been deprecated since pandas 2.1.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-0.150496,0.005767,-0.496544
1,1.253928,0.005767,-0.024773
2,0.98228,0.005767,-0.024773
3,-1.369085,,-0.024773
