In [1]:
# In the event if there is missing value in the DataFrame, there are 3 ways to deal with it
# 1) Drop Columns with Missing Values
# 2) Imputation
# 3) Extension to Imputation

# This tutorial will use Melbourne Housing Data for demonostration

import pandas as pd

In [3]:
# Load Data
melb_data = pd.read_csv('./Housing Price ML Project/melb_data.csv/melb_data.csv')

In [26]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

melb_target = melb_data.Price
melb_predictors = melb_data.drop(['Price'], axis=1)

# set prediction target. Common convention, y is use to refer to prediction target
y = melb_target
# let x be the value of the predictors
x = melb_predictors

# To keep the example simple, use only numeric predictors.
melb_numeric_predictors = melb_predictors.select_dtypes(exclude=['object'])

x = melb_numeric_predictors

In [27]:
# split data into training and test set. sklearn has library to automatically to do that
from sklearn.model_selection import train_test_split
# The split is based on random number generator.
train_X, val_X, train_y, val_y = train_test_split(x, y, random_state = 0)

In [40]:
def score_dataset (reduced_X_train, reduced_X_test, train_y, val_y):
    forest_model= RandomForestRegressor()
    forest_model.fit(reduced_X_train, train_y)
    melb_preds = forest_model.predict(reduced_X_test)
    return mean_absolute_error(val_y, melb_preds)

In [45]:
# Get Model Score from Dropping Columns with Missing Values
cols_with_missing = [col for col in train_X.columns if train_X[col].isnull().any()]
reduced_X_train = train_X.drop(cols_with_missing, axis=1)
reduced_X_test = val_X.drop(cols_with_missing, axis=1)
print("Mean Absolute Error from dropping columns with missing value")
print(score_dataset(reduced_X_train, reduced_X_test, train_y, val_y))

Mean Absolute Error from dropping columns with missing value
349628.7885409872


In [46]:
# Get Model Score from Imputation
from sklearn.preprocessing import Imputer

my_imputer = Imputer()
imputed_X_train = my_imputer.fit_transform(train_X)
imputed_X_test = my_imputer.transform(val_X)
print("Mean Absolute Error from Imputation: ")
print(score_dataset(imputed_X_train, imputed_X_test, train_y, val_y))

Mean Absolute Error from Imputation: 
204648.46610132637


In [52]:
# Get Score from Imputation with Extra Columns Showing What Was Imputed

imputed_X_train_plus = train_X.copy()
imputed_X_test_plus = val_X.copy()

cols_with_missing = (col for col in train_X.columns if train_X[col].isnull().any())

for col in cols_with_missing:
    imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isnull()
    imputed_X_test_plus[col + '_was_missing'] = imputed_X_test_plus[col].isnull()
    
# Imputation
my_imputer = Imputer()
imputed_X_train_plus = my_imputer.fit_transform(imputed_X_train_plus)
imputed_X_test_plus = my_imputer.transform(imputed_X_test_plus)

print("Mean Absolute Error from Imputation while Track What was Imputed: ")
print(score_dataset(imputed_X_train_plus, imputed_X_test_plus, train_y, val_y))

Mean Absolute Error from Imputation while Track What was Imputed: 
202816.82635355514


In [None]:
# Reference: https://www.kaggle.com/dansbecker/handling-missing-values