<a href="https://colab.research.google.com/github/ethanpnguyen/ds4e/blob/main/notebooks/task3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## House Prices:

Can you leverage more columns in the dataset for better predictions?

## Initialize

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

## Load Data

In [3]:
# Read the house-price listings CSV into a DataFrame.
# NOTE(review): the absolute /content path presumably targets a Colab runtime — confirm before running elsewhere.
dfMlb = pd.read_csv(
    filepath_or_buffer='/content/house_prices.csv',
)

In [4]:
# Show each column's dtype so numeric features (int64/float64) can be told
# apart from object columns before selecting model inputs.
dfMlb.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

In [5]:
# Split the frame into model inputs and the prediction target:
# 'Price' is the target, every other column is a candidate feature.

y = dfMlb['Price']
X = dfMlb.drop(columns=['Price'])

In [6]:
# Select only numeric

# Keep just the int64/float64 columns (same dtypes as before), using pandas'
# built-in dtype filter instead of a hand-written comprehension.
Xnum = X.select_dtypes(include=['int64', 'float64'])
cols_num = list(Xnum.columns)

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible. (A mid-cell `Xnum.head()` was removed: only a cell's
# last expression is displayed, so it never rendered.)
Xnum_train, Xnum_test, y_train, y_test = train_test_split(
    Xnum, y, test_size=0.2, random_state=1
)

In [7]:
# Count missing values per numeric column in the training split to see which
# columns need dropping or imputing.
Xnum_train.isna().sum()

Rooms               0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                52
Landsize            0
BuildingArea     5193
YearBuilt        4312
Lattitude           0
Longtitude          0
Propertycount       0
dtype: int64

## Build Random Forest Model

In [8]:
from sklearn.ensemble import RandomForestRegressor

def get_random_forest_mae(X_trn, X_tst, y_trn, y_tst):
  """Fit a random forest on the training split and score it on the test split.

  Args:
    X_trn: Training features passed to ``fit``.
    X_tst: Test features passed to ``predict``.
    y_trn: Training target values.
    y_tst: True target values for the test features.

  Returns:
    The mean absolute error between ``y_tst`` and the model's predictions.
  """
  # Fixed random_state so repeated calls on the same data build the same forest.
  forest = RandomForestRegressor(random_state=1)
  predictions = forest.fit(X_trn, y_trn).predict(X_tst)
  return mean_absolute_error(y_tst, predictions)

In [9]:
## Approach 1: Drop columns with missing values

# A column is dropped from both splits if it has at least one NaN anywhere in
# Xnum, so the train and test frames keep identical, fully populated columns.
cols_num_null = Xnum.columns[Xnum.isna().any()].tolist()
Xnum_train_drpnull = Xnum_train.drop(columns=cols_num_null)
Xnum_test_drpnull = Xnum_test.drop(columns=cols_num_null)

print('MAE from Approach 1 (Drop columns with missing values):')
print(
    get_random_forest_mae(
        X_trn=Xnum_train_drpnull,
        X_tst=Xnum_test_drpnull,
        y_trn=y_train,
        y_tst=y_test,
    )
)

MAE from Approach 1 (Drop columns with missing values):
176556.1092096132


In [10]:
## Approach 2: Fill missing values with 0

# Apply the same constant fill to both splits; the originals are left untouched
# because fillna returns new frames.
Xnum_train_repnull, Xnum_test_repnull = (
    split.fillna(value=0) for split in (Xnum_train, Xnum_test)
)

print('MAE from Approach 2 (Fill missing values with 0):')
print(
    get_random_forest_mae(
        X_trn=Xnum_train_repnull,
        X_tst=Xnum_test_repnull,
        y_trn=y_train,
        y_tst=y_test,
    )
)

MAE from Approach 2 (Fill missing values with 0):
167656.98217318885


In [11]:
## Approach 3: Fill missing values with mean value

# Both splits are filled with the *training* column means, so no statistic
# computed from the test rows is used for imputation.
Xnum_train_repnull, Xnum_test_repnull = (
    split.fillna(value=Xnum_train.mean()) for split in (Xnum_train, Xnum_test)
)

print('MAE from Approach 3 (Fill missing values with mean):')
print(
    get_random_forest_mae(
        X_trn=Xnum_train_repnull,
        X_tst=Xnum_test_repnull,
        y_trn=y_train,
        y_tst=y_test,
    )
)

MAE from Approach 3 (Fill missing values with mean):
166170.5766405428


In [13]:
# Going forward, let us replace all missing numeric values with the column mean

# Compute the means here, from the training split only, rather than copying
# from Xnum_*_repnull: those names are assigned by both the fill-with-0 and the
# fill-with-mean cells, so their contents depend on which cell ran last.
train_col_means = Xnum_train[cols_num].mean()

# Rebind to the filled frames instead of assigning columns in place on the
# frames returned by train_test_split (avoids SettingWithCopyWarning). The test
# split is filled with the training means so no test statistics are used.
Xnum_train = Xnum_train.fillna(train_col_means)
Xnum_test = Xnum_test.fillna(train_col_means)