識別數據表格中的遺漏值

In [14]:
import pandas as pd
from io import StringIO

# Small in-memory CSV sample. Two fields are left empty on purpose
# (column C in the second data row, column D in the third) so that
# pandas parses them as NaN.
csv_data = (
    'A,B,C,D\n'
    '1.0, 2.0, 3.0, 4.0\n'
    '5.0, 6.0,,8.0\n'
    '10.0, 11.0, 12.0,'
)
# StringIO wraps the string in a file-like object that read_csv can consume.
df = pd.read_csv(
    StringIO(csv_data)
)
df


Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [15]:
# Flag every NaN cell with isnull, then sum the boolean mask to get
# the number of missing values in each column.
pd.isnull(df).sum()

A    0
B    0
C    1
D    1
dtype: int64

In [16]:
# Removing samples or features that contain missing values.
# Here: drop every row that contains at least one missing value.
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [17]:
# Drop every column that contains at least one missing value.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [18]:
# Drop only the rows in which every value is NaN.
# (No such row exists here, so the whole DataFrame is returned.)
df.dropna(axis='index', how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [19]:
# Keep only the rows that have at least 4 non-NaN values;
# rows with fewer are dropped.
df.dropna(axis='index', thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [20]:
# Drop a row only when its NaN is in one of the listed columns (here: 'C');
# NaNs in other columns are ignored.
df.dropna(axis='index', subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


In [21]:
# Fill in missing values by mean imputation.
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement.
from sklearn.impute import SimpleImputer
# SimpleImputer always works column-wise (what the old Imputer did with
# axis=0): each NaN is replaced by the mean of its own feature column.
# It has no `axis` parameter, and `missing_values` defaults to NaN.
# Other choices for strategy: 'median', 'most_frequent'
imr = SimpleImputer(strategy='mean')
# fit learns the per-column statistics from the training data;
# transform then uses those learned statistics to fill in the NaN entries.
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])