In [1]:
import numpy as np
import pandas as pd

from sklearn.experimental import enable_iterative_imputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer

## Load data

You can download the Melbourne Housing dataset from:  
https://www.kaggle.com/dansbecker/melbourne-housing-snapshot/home

In [3]:
# For simplicity, we only use numerical features: object-dtype columns are
# discarded as soon as the CSV has been read.
df = pd.read_csv('melb_data.csv').select_dtypes(exclude=['object'])
df.head()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,2,1480000.0,2.5,3067.0,2.0,1.0,1.0,202.0,,,-37.7996,144.9984,4019.0
1,2,1035000.0,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,-37.8079,144.9934,4019.0
2,3,1465000.0,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,-37.8093,144.9944,4019.0
3,3,850000.0,2.5,3067.0,3.0,2.0,1.0,94.0,,,-37.7969,144.9969,4019.0
4,4,1600000.0,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,-37.8072,144.9941,4019.0


## Define auxiliary functions for missing-data imputation and scoring

In [5]:
# Function for comparing different approaches
def score_dataset(df, label='Price'):
    """Fit a small random forest on ``df`` and report its hold-out error.

    The frame is split 80/20 into train and test parts, a 10-tree
    ``RandomForestRegressor`` is fit on the training part, and the mean
    absolute error (MAE) of its predictions on the test part is printed and
    returned. Both the split seed and the forest seed are fixed, so the
    different missing-data treatments are compared under the same settings.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the target column named by ``label``. Every
        caller in this notebook passes a frame whose missing values have
        already been dropped or imputed.
    label : str, default 'Price'
        Name of the target column; all remaining columns are used as features.

    Returns
    -------
    float
        Mean absolute error on the test part.
    """
    X = df.drop(columns=[label])
    y = df[label]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=123
    )
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    err = mean_absolute_error(y_test, model.predict(X_test))
    print(f"Mean Absolute Error: {err}")
    return err
    
def impute_missing(df, label, imputer):
    """Return a copy of ``df`` whose feature columns were filled by ``imputer``.

    The target column ``label`` is set aside, ``imputer.fit_transform`` is run
    on the remaining columns, and the target is appended back as the last
    column. The imputer is fit on all rows of ``df``, i.e. before any
    train/test split performed downstream.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the ``label`` column.
    label : str
        Name of the target column, which is passed through untouched.
    imputer : object
        Anything exposing ``fit_transform(X)``.
        NOTE(review): assumes it returns one output column per input column;
        otherwise rebuilding the frame below raises.

    Returns
    -------
    pandas.DataFrame
        Imputed features followed by ``label``, on the same index as ``df``.
    """
    features = df.drop(columns=[label])
    filled = imputer.fit_transform(features)
    # Rebuild the frame on the ORIGINAL index. With a fresh RangeIndex the
    # concat below would align by label and, for any non-default index (e.g.
    # after dropna or filtering), pad both sides with NaN rows.
    features_filled = pd.DataFrame(
        filled, columns=features.columns, index=features.index
    )
    return pd.concat([features_filled, df[label]], axis=1)

## Demo of strategies for handling missing data

In [6]:
# Strategy name -> mean absolute error; filled in by the cells that follow.
scores = dict()

### 1. Listwise Deletion

In [7]:
# Listwise deletion: keep only the rows in which no column is missing.
df_treat1 = df.dropna(axis=0, how='any')
print("Deletion: Listwise Deletion")
scores['Listwise Deletion'] = score_dataset(df=df_treat1)

Deletion: Listwise Deletion
Mean Absolute Error: 185466.69209370425


### 2. Drop Columns

In [8]:
# Column deletion: discard every column that has at least one missing value.
df_treat2 = df.dropna(axis='columns', how='any')
print("Deletion: Drop Columns")
scores['Drop Columns'] = score_dataset(df=df_treat2)

Deletion: Drop Columns
Mean Absolute Error: 190224.76612841015


### 3. Impute Mean 

In [9]:
# Fill each gap with the mean of its column, then score the completed frame
# with its columns put back in the order of the raw data.
imp = SimpleImputer(strategy='mean')
df_treat3 = impute_missing(df=df, label='Price', imputer=imp)
print("Mean Imputation")
scores['Mean Imputation'] = score_dataset(df_treat3[df.columns])

Mean Imputation
Mean Absolute Error: 176065.0362665685


### 4. Impute Median

In [10]:
# Fill each gap with the median of its column, then score the completed frame
# with its columns put back in the order of the raw data.
imp = SimpleImputer(strategy='median')
df_treat4 = impute_missing(df=df, label='Price', imputer=imp)
print("Median Imputation")
scores['Median Imputation'] = score_dataset(df_treat4[df.columns])

Median Imputation
Mean Absolute Error: 176453.97260677468


### 5. Impute Most Frequent

In [11]:
# Fill each gap with the most frequent value of its column, then score the
# completed frame with its columns put back in the order of the raw data.
imp = SimpleImputer(strategy='most_frequent')
df_treat5 = impute_missing(df=df, label='Price', imputer=imp)
print("Most Frequent Imputation")
scores['Most Frequent Imputation'] = score_dataset(df_treat5[df.columns])

Most Frequent Imputation
Mean Absolute Error: 175345.81580756014


### 6. Machine Learning Model Imputation  -- KNN

In [12]:
# Fill the gaps with a KNNImputer left at its default settings, then score the
# completed frame with its columns put back in the order of the raw data.
imp = KNNImputer()
df_treat6 = impute_missing(df=df, label='Price', imputer=imp)
print("Machine Learning Model Imputation  -- KNN")
scores['KNN Imputation'] = score_dataset(df_treat6[df.columns])

Machine Learning Model Imputation  -- KNN
Mean Absolute Error: 182363.22547864506


### 7. Multiple Imputation -- MICE

In [13]:
# Fill the gaps with an IterativeImputer left at its default settings and
# score the completed frame as returned (target column last).
# NOTE(review): with default settings this appears to yield one completed
# dataset rather than several -- confirm that "multiple imputation" is the
# intended label.
imp = IterativeImputer()
df_treat7 = impute_missing(df=df, label='Price', imputer=imp)
print("Mutiple Imputation  -- MICE")
scores['MICE Imputation'] = score_dataset(df=df_treat7)

Mutiple Imputation  -- MICE
Mean Absolute Error: 179970.0197717231


## Comparison and Summary

In [14]:
# One row per strategy, one column holding its MAE; displayed with the lowest
# error first.
df_scores = pd.DataFrame.from_dict(
    data=scores,
    orient='index',
    columns=['Mean Absolute Error'],
)
df_scores.sort_values(by='Mean Absolute Error', ascending=True)

Unnamed: 0,Mean Absolute Error
Most Frequent Imputation,175345.815808
Mean Imputation,176065.036267
Median Imputation,176453.972607
MICE Imputation,179970.019772
KNN Imputation,182363.225479
Listwise Deletion,185466.692094
Drop Columns,190224.766128
