In [None]:
import pandas as pd
import numpy as npm
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

# Location of the Melbourne housing CSV on the author's machine.
DATA_PATH = "E:/Projects/Learning/ML/Melbourne Housing Data/melb_data.csv"

# Load the full dataset and preview the first rows as the cell output.
df = pd.read_csv(DATA_PATH)
df.head()

In [4]:
# Show the column labels of the loaded frame.
df.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [6]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(13580, 21)

In [None]:
# Target: the sale price column.
y = df["Price"]

# Predictors: every remaining column whose dtype is not `object`.
melb_predictors = df.drop(columns="Price")
X = melb_predictors.select_dtypes(exclude="object")

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train_X, valid_X, train_y, valid_y = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)

In [19]:
def score_dataset(train_X, valid_X, train_y, valid_y, n_estimators=10, random_state=1):
    """Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    train_X, valid_X
        Feature tables for the training and validation splits.
    train_y, valid_y
        Targets matching ``train_X`` and ``valid_X``.
    n_estimators : int, default 10
        Number of trees in the forest.
    random_state : int, default 1
        Seed passed to the forest so repeated calls give the same score.

    Returns
    -------
    float
        Mean absolute error of the predictions made for ``valid_X``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(train_X, train_y)
    prediction = model.predict(valid_X)
    # scikit-learn's signature is (y_true, y_pred). MAE is symmetric, so the
    # value is unchanged, but this order stays correct if the metric is ever
    # swapped for an asymmetric one.
    return mean_absolute_error(valid_y, prediction)

### Approach 1: Drop Columns with Missing Values

In [20]:
# Names of the training columns that contain at least one missing value.
missing_flags = train_X.isna().any()
cols_with_missing = missing_flags[missing_flags].index.tolist()

# Remove those columns from both splits so they keep identical features.
reduced_train_X = train_X.drop(columns=cols_with_missing)
reduced_valid_X = valid_X.drop(columns=cols_with_missing)

mae_drop = score_dataset(reduced_train_X, reduced_valid_X, train_y, valid_y)
print(f"MAE from dropping columns: {mae_drop}")

MAE from dropping columns: 187082.57548478153


### Approach 2: Imputation

In [33]:
from sklearn.impute import SimpleImputer

# Fill every missing value with the mean of its column; the means are learned
# from the training split only and reused for the validation split.
my_imputer = SimpleImputer(strategy='mean')

# The imputer hands back plain arrays by default, so restore both the column
# names and the original row index. Without the index the frames would get a
# fresh 0..n-1 index that no longer lines up by label with train_y / valid_y.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(train_X),
    columns=train_X.columns,
    index=train_X.index,
)
imputed_X_valid = pd.DataFrame(
    my_imputer.transform(valid_X),
    columns=valid_X.columns,
    index=valid_X.index,
)

print(f"MAE from imputation: {score_dataset(imputed_X_train, imputed_X_valid, train_y, valid_y)}")

MAE from imputation: 174917.69150711832


### Approach 3: an extension to imputation

In [46]:
# Work on copies so the indicator columns are not added to train_X / valid_X.
train_X_plus = train_X.copy()
valid_X_plus = valid_X.copy()

# Record where values were missing before imputation overwrites them.
for col in cols_with_missing:
    train_X_plus[col + '_was_missing'] = train_X_plus[col].isna()
    valid_X_plus[col + '_was_missing'] = valid_X_plus[col].isna()

# Fit on the training split only, then apply the same fill values to validation.
my_imputer_2 = SimpleImputer()

# The imputer hands back plain arrays by default, so restore both the column
# names and the original row index. Without the index the frames would get a
# fresh 0..n-1 index that no longer lines up by label with train_y / valid_y.
imputed_X_train_plus = pd.DataFrame(
    my_imputer_2.fit_transform(train_X_plus),
    columns=train_X_plus.columns,
    index=train_X_plus.index,
)
imputed_X_valid_plus = pd.DataFrame(
    my_imputer_2.transform(valid_X_plus),
    columns=valid_X_plus.columns,
    index=valid_X_plus.index,
)

print(f"MAE from Approach 3: {score_dataset(imputed_X_train_plus, imputed_X_valid_plus, train_y, valid_y)}")

MAE from Approach 3: 177346.30320324007
