In [1]:
## Dependencies: NumPy supplies the NaN marker, scikit-learn supplies SimpleImputer
import numpy as np
from sklearn.impute import SimpleImputer

## Data set whose missing entries (np.nan) will be filled in
x = [
    [1, 2, np.nan],
    [28, np.nan, 36],
    [np.nan, 58, 66],
]
print('Datasets: ', x)

## Training set: the imputers are fitted on this data, not on x
train_set = [
    [2, 6, 7],
    [9, 7, 3],
    [np.nan, 68, 60],
]
print('Training Set: ', train_set)

## Build and fit one imputer per strategy (fit() returns the estimator itself,
## so construction and training are chained into a single statement)

## Missing values are replaced by the column mean
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan).fit(train_set)

## Missing values are replaced by the column median
imp_median = SimpleImputer(strategy='median', missing_values=np.nan).fit(train_set)

## Missing values are replaced by the column mode (most frequent value)
imp_most_frequent = SimpleImputer(strategy='most_frequent', missing_values=np.nan).fit(train_set)

## Missing values are replaced by a user-chosen constant (100 here)
imp_constant = SimpleImputer(strategy='constant', fill_value=100, missing_values=np.nan).fit(train_set)

## Show x after imputation with each strategy (label on one line, array below it)
print('Mean:', imp_mean.transform(x), sep='\n')

print('Most Frequent:', imp_most_frequent.transform(x), sep='\n')

print('Median:', imp_median.transform(x), sep='\n')

print('Constant:', imp_constant.transform(x), sep='\n')


Datasets:  [[1, 2, nan], [28, nan, 36], [nan, 58, 66]]
Training Set:  [[2, 6, 7], [9, 7, 3], [nan, 68, 60]]
Mean:
[[ 1.          2.         23.33333333]
 [28.         27.         36.        ]
 [ 5.5        58.         66.        ]]
Most Frequent:
[[ 1.  2.  3.]
 [28.  6. 36.]
 [ 2. 58. 66.]]
Median:
[[ 1.   2.   7. ]
 [28.   7.  36. ]
 [ 5.5 58.  66. ]]
Constant:
[[  1.   2. 100.]
 [ 28. 100.  36.]
 [100.  58.  66.]]


In [2]:
## Inspect the fitted mean imputer: the per-column fill values it learned, and
## its missing-indicator attribute (the recorded output shows None for this imputer)
learned_fill_values = imp_mean.statistics_
missing_indicator = imp_mean.indicator_
print('Statistics: ', learned_fill_values)
print('Indicator: ', missing_indicator)

Statistics:  [ 5.5        27.         23.33333333]
Indicator:  None


In [35]:
## Dependencies: NumPy supplies the NaN marker, scikit-learn supplies SimpleImputer
import numpy as np
from sklearn.impute import SimpleImputer

## Data set whose missing entries (np.nan) will be filled in
x = [
    [1, 2, np.nan],
    [28, np.nan, 361],
    [np.nan, 58, 66],
]
print('Datasets: ', x)

## Training set used to learn the per-column means
train_set = [
    [2, 6, 7],
    [9, 7, 31],
    [np.nan, 68, 60],
]
print('Training Set: ', train_set)

## Mean imputer that also appends missing-value indicator output
## (add_indicator=True); fit() returns the estimator, so it is trained right away.
## In the recorded output a single extra 0/1 column appears after the three
## imputed columns.
imp_mean = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
    copy=True,
    add_indicator=True,
).fit(train_set)

## Show x after imputation (label on one line, array below it)
print('Mean:', imp_mean.transform(x), sep='\n')

Datasets:  [[1, 2, nan], [28, nan, 361], [nan, 58, 66]]
Training Set:  [[2, 6, 7], [9, 7, 31], [nan, 68, 60]]
Mean:
[[  1.           2.          32.66666667   0.        ]
 [ 28.          27.         361.           0.        ]
 [  5.5         58.          66.           1.        ]]


In [78]:
## Dependencies
import numpy as np
import pandas as pd

## Build the example frame column by column; column A holds two missing values
dataset = pd.DataFrame(
    {
        'A': [1, 8, np.nan, 2, np.nan, 1],
        'B': [2, 7, 28, 3, 6, 10],
        'C': [3, 9, 66, 8, 8, 28],
    }
)

dataset

Unnamed: 0,A,B,C
0,1.0,2,3
1,8.0,7,9
2,,28,66
3,2.0,3,8
4,,6,8
5,1.0,10,28


In [45]:
## Replace every missing entry of column A with the integer 100.
## NOTE(review): this overwrites column A of `dataset` in place. The recorded
## outputs of the other fill cells still show NaN-derived results, so the data
## set was presumably rebuilt before each of them -- confirm the intended run order.
dataset['A'] = dataset.A.fillna(value=100)

dataset

Unnamed: 0,A,B,C
0,1.0,2,3
1,8.0,7,9
2,100.0,28,66
3,2.0,3,8
4,100.0,6,8
5,1.0,10,28


In [58]:
## Replace every missing entry of column A with the string 'Missing Value'.
## NOTE(review): this overwrites column A of `dataset` in place and only has an
## effect while A still contains NaN -- presumably the data set is meant to be
## rebuilt before running this cell; confirm the intended run order.
dataset['A'] = dataset.A.fillna(value='Missing Value')

dataset

Unnamed: 0,A,B,C
0,1,2,3
1,8,7,9
2,Missing Value,28,66
3,2,3,8
4,Missing Value,6,8
5,1,10,28


In [60]:
## Replace missing entries of column A with the mean of column A
## (3.0 in the recorded output, i.e. the mean of the non-missing values).
## NOTE(review): overwrites column A of `dataset` in place; presumably expects a
## freshly built data set that still contains NaN -- confirm the run order.
column_a_mean = dataset['A'].mean()
dataset['A'] = dataset['A'].fillna(column_a_mean)

dataset

Unnamed: 0,A,B,C
0,1.0,2,3
1,8.0,7,9
2,3.0,28,66
3,2.0,3,8
4,3.0,6,8
5,1.0,10,28


In [62]:
## Replace missing entries of column A with the mode of column A.
## mode() returns a Series; [0] picks its first entry (1.0 in the recorded output).
## NOTE(review): overwrites column A of `dataset` in place; presumably expects a
## freshly built data set that still contains NaN -- confirm the run order.
column_a_mode = dataset['A'].mode()[0]
dataset['A'] = dataset['A'].fillna(column_a_mode)

dataset

Unnamed: 0,A,B,C
0,1.0,2,3
1,8.0,7,9
2,1.0,28,66
3,2.0,3,8
4,1.0,6,8
5,1.0,10,28


In [64]:
## Forward fill: replace each missing entry of column A with the previous row's value.
## Uses Series.ffill() because the `method` keyword of fillna (here 'pad', an alias
## of 'ffill') is deprecated in recent pandas releases.
## NOTE(review): overwrites column A of `dataset` in place; presumably expects a
## freshly built data set that still contains NaN -- confirm the run order.
dataset['A'] = dataset['A'].ffill()

dataset

Unnamed: 0,A,B,C
0,1.0,2,3
1,8.0,7,9
2,8.0,28,66
3,2.0,3,8
4,2.0,6,8
5,1.0,10,28


In [66]:
## Forward fill: replace each missing entry of column A with the previous row's value.
## Uses Series.ffill() because fillna(method='ffill') is deprecated in recent
## pandas releases.
## NOTE(review): overwrites column A of `dataset` in place; presumably expects a
## freshly built data set that still contains NaN -- confirm the run order.
dataset['A'] = dataset['A'].ffill()

dataset

Unnamed: 0,A,B,C
0,1.0,2,3
1,8.0,7,9
2,8.0,28,66
3,2.0,3,8
4,2.0,6,8
5,1.0,10,28


In [79]:
## Backward fill: replace each missing entry of column A with the next row's value.
## Uses Series.bfill() because fillna(method='bfill') is deprecated in recent
## pandas releases.
## NOTE(review): overwrites column A of `dataset` in place; the recorded output
## comes from a data set that still contained NaN in column A.
dataset['A'] = dataset['A'].bfill()

dataset

Unnamed: 0,A,B,C
0,1.0,2,3
1,8.0,7,9
2,2.0,28,66
3,2.0,3,8
4,1.0,6,8
5,1.0,10,28
