-
Notifications
You must be signed in to change notification settings - Fork 0
/
handl_missing_value.py
69 lines (51 loc) · 2.76 KB
/
handl_missing_value.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import pandas as pd
# Load data
melb_data = pd.read_csv('./data/melb_data.csv')
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
melb_target = melb_data.Price
melb_predictors = melb_data.drop(['Price'], axis=1)
# For the sake of keeping the example simple, we'll use only numeric predictors.
melb_numeric_predictors = melb_predictors.select_dtypes(exclude=['object'])
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(melb_numeric_predictors,
melb_target,
train_size=0.7,
test_size=0.3,
random_state=0)
def score_dataset(X_train, X_test, y_train, y_test):
model = RandomForestRegressor()
model.fit(X_train, y_train)
preds = model.predict(X_test)
return mean_absolute_error(y_test, preds)
# # delete columns with missing value
cols_with_missing = [col for col in X_train.columns
if X_train[col].isnull().any()]
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_test = X_test.drop(cols_with_missing, axis=1)
print("Mean Absolute Error from dropping columns with Missing Values:")
print(score_dataset(reduced_X_train, reduced_X_test, y_train, y_test))
# # replace missing value with mean value in column with missing value
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()
imputed_X_train = my_imputer.fit_transform(X_train) #first fit_transform,second transform,默认用均值填充缺失值
imputed_X_test = my_imputer.transform(X_test)
print("Mean Absolute Error from Imputation:")
print(score_dataset(imputed_X_train, imputed_X_test, y_train, y_test))
# 通过添加缺失值的标识列,但在这个例子中效果不太佳
imputed_X_train_plus = X_train.copy()
imputed_X_test_plus = X_test.copy()
cols_with_missing = (col for col in X_train.columns
if X_train[col].isnull().any())
for col in cols_with_missing:
imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isnull()
imputed_X_test_plus[col + '_was_missing'] = imputed_X_test_plus[col].isnull()
# Imputation
my_imputer = SimpleImputer()
imputed_X_train_plus = my_imputer.fit_transform(imputed_X_train_plus)
imputed_X_test_plus = my_imputer.transform(imputed_X_test_plus)
print("Mean Absolute Error from Imputation while Track What Was Imputed:")
print(score_dataset(imputed_X_train_plus, imputed_X_test_plus, y_train, y_test))