In [None]:
import numpy as np
import pandas as pd
from numpy import nan as NA
# Reference notes on the dropna parameters (the text below refers to
# DataFrame). Kept as an unused string expression: it is neither assigned
# nor displayed, so it has no effect when the cell runs.
'''
axis : {0 or 'index', 1 or 'columns'}, default 0
    Determine if rows or columns which contain missing values are
    removed.

    * 0, or 'index' : Drop rows which contain missing values.
    * 1, or 'columns' : Drop columns which contain missing value.

how : {'any', 'all'}, default 'any'
    Determine if row or column is removed from DataFrame, when we have
    at least one NA or all NA.

    * 'any' : If any NA values are present, drop that row or column.
    * 'all' : If all values are NA, drop that row or column.

thresh : int, optional
    Require that many non-NA values.
    
subset : array-like, optional
    Labels along other axis to consider, e.g. if you are dropping rows
    these would be a list of columns to include.
    
inplace : bool, default False
    If True, do operation inplace and return None.
'''
# Five-element Series whose entries at positions 1 and 3 are missing.
data = pd.Series(data=[1, NA, 3.5, NA, 7])

# dropna() returns a new Series without the NA entries; `data` is not modified.
data.dropna()

In [2]:
data[data.notna()]  # boolean-mask selection; gives the same result as data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [3]:
# Four rows: complete, two values missing, all values missing, one value missing.
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])
# how='any' is the default: every row containing at least one NA is dropped.
cleaned = data.dropna(how='any')

In [4]:
# Display the original frame: dropna() was called without inplace=True,
# so `data` still holds all four rows.
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [5]:
# Display the result of data.dropna(): only the row with no NA values remains.
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [6]:
data.dropna(axis='index', how='all')  # how='all': drop only the rows in which every value is NA

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [7]:
# axis=1 ('columns') with how='all': drop only the columns in which every
# value is NA. No column of `data` is entirely NA, so nothing is removed.
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [8]:
# Use a seeded local generator so the frame is identical on every
# Restart & Run All (the unseeded global RNG gave different values each run).
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.standard_normal((7, 3)))

# Inject missing values: the first four rows of column 1 and the first two
# rows of column 2. Column 0 stays complete.
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,-0.338764,,
1,-0.799665,,
2,-3.283618,,0.878065
3,0.535663,,1.240623
4,-0.209812,0.998628,1.012293
5,0.555809,-0.792304,-0.290616
6,-0.916538,0.025919,-0.196379


In [21]:
df.dropna(axis='index', thresh=2)  # keep only the rows that have at least 2 non-NA values

Unnamed: 0,0,1,2
2,-3.283618,,0.878065
3,0.535663,,1.240623
4,-0.209812,0.998628,1.012293
5,0.555809,-0.792304,-0.290616
6,-0.916538,0.025919,-0.196379


In [22]:
df.dropna(axis='columns', thresh=4)  # keep only the columns that have at least 4 non-NA values

Unnamed: 0,0,2
0,-0.338764,
1,-0.799665,
2,-3.283618,0.878065
3,0.535663,1.240623
4,-0.209812,1.012293
5,0.555809,-0.290616
6,-0.916538,-0.196379


In [23]:
# Only columns 0 and 2 are checked for NA when deciding which rows to drop;
# a row that is missing a value only in column 1 is kept.
df.dropna(axis='index', subset=[0, 2])

Unnamed: 0,0,1,2
2,-3.283618,,0.878065
3,0.535663,,1.240623
4,-0.209812,0.998628,1.012293
5,0.555809,-0.792304,-0.290616
6,-0.916538,0.025919,-0.196379
