In [1]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import warnings
warnings.simplefilter("ignore")

In [2]:
# Load the raw housing data.
dfMlb = pd.read_csv('house_prices.csv')

# The target is kept as a single-column DataFrame; every other column is a feature.
y = dfMlb[['Price']]
X = dfMlb.drop(columns=['Price'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

### Build Random Forest Model

In [3]:
from sklearn.ensemble import RandomForestRegressor

def build_score_RandomForest(X_trn, X_tst, y_trn, y_tst):
    """Fit a random forest on the training split and score it on the test split.

    Parameters
    ----------
    X_trn, X_tst : feature tables used for fitting and for prediction.
    y_trn, y_tst : targets matching ``X_trn`` and ``X_tst`` respectively.

    Returns
    -------
    The mean absolute error between ``y_tst`` and the model's predictions
    for ``X_tst``.
    """
    # Fixed random_state so repeated calls on the same data give the same score.
    forest = RandomForestRegressor(random_state=1)
    forest.fit(X_trn, y_trn)
    return mean_absolute_error(y_tst, forest.predict(X_tst))

In [4]:
# First attempt on the raw splits; the object (string) columns are still present here.
build_score_RandomForest(X_train, X_test, y_train, y_test)

ValueError: could not convert string to float: 'Brighton'

## Numerical Features

In [5]:
# Inspect the dtype of every feature column to tell numeric from non-numeric ones.
X.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

In [6]:
# Names of the columns stored as int64 or float64, in their original order.
colsNum = [name for name, dtype in X.dtypes.items() if dtype in ('int64', 'float64')]

In [7]:
# Display the list of numeric column names.
colsNum

['Rooms',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'BuildingArea',
 'YearBuilt',
 'Lattitude',
 'Longtitude',
 'Propertycount']

In [8]:
# Numeric-only view of the features (all rows, just the numeric columns).
Xnum = X.loc[:, colsNum]

In [9]:
# Preview the first rows of the numeric-only feature table.
Xnum.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,2,2.5,3067.0,2.0,1.0,1.0,202.0,,,-37.7996,144.9984,4019.0
1,2,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,-37.8079,144.9934,4019.0
2,3,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,-37.8093,144.9944,4019.0
3,3,2.5,3067.0,3.0,2.0,1.0,94.0,,,-37.7969,144.9969,4019.0
4,4,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,-37.8072,144.9941,4019.0


In [10]:
# Split the numeric-only features with the same test_size and seed as the full-feature split.
Xnum_train, Xnum_test, y_train, y_test = train_test_split(
    Xnum, y, test_size=0.2, random_state=1
)

# Score on numeric columns only; missing values have not been handled yet.
build_score_RandomForest(Xnum_train, Xnum_test, y_train, y_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

## How do we deal with missing data?

#### Approach 1: remove columns that have any null values

In [11]:
# Numeric columns that contain at least one missing value.
cols_num_null = [col for col, has_null in Xnum.isnull().any().items() if has_null]

In [12]:
# Drop every column with missing values from both splits so they keep identical columns.
Xnum_train_drpnull = Xnum_train.drop(columns=cols_num_null)
Xnum_test_drpnull = Xnum_test.drop(columns=cols_num_null)

In [13]:
print('MAE from Approach 1. (Drop features with null values):')
build_score_RandomForest(
    X_trn=Xnum_train_drpnull,
    X_tst=Xnum_test_drpnull,
    y_trn=y_train,
    y_tst=y_test,
)

MAE from Approach 1. (Drop features with null values):


187082.57548478153

#### Approach 2: fill missing values by imputation

In [14]:
# Replace null with forward fill
# Use DataFrame.ffill(): fillna(method='ffill') is deprecated in current pandas, and the
# deprecation warning is hidden here because warnings are globally silenced.
# Note: forward fill copies the value from the previous row of each (shuffled) split, and a
# missing value in the very first row of a column stays missing.
Xnum_train_ffill = Xnum_train.ffill()
Xnum_test_ffill = Xnum_test.ffill()

print('MAE from Approach 2. (Replace values with forward fill):')
build_score_RandomForest(Xnum_train_ffill, Xnum_test_ffill, y_train, y_test)

MAE from Approach 2. (Replace values with forward fill):


184138.79064801178

In [15]:
# Replace null with 0 in both splits
Xnum_train_repnull, Xnum_test_repnull = (
    split.fillna(value=0) for split in (Xnum_train, Xnum_test)
)

print('MAE from Approach 2. (Replace values with 0):')
build_score_RandomForest(Xnum_train_repnull, Xnum_test_repnull, y_train, y_test)

MAE from Approach 2. (Replace values with 0):


176862.03270741284

In [16]:
# Replace null with the column mean.
# The means are taken from the training split only and reused for the test split,
# so no information from the test rows enters the imputation.
train_col_means = Xnum_train.mean()
Xnum_train_repnull_mean = Xnum_train.fillna(train_col_means)
Xnum_test_repnull_mean = Xnum_test.fillna(train_col_means)

print('MAE from Approach 2. (Replace values with mean):')
build_score_RandomForest(Xnum_train_repnull_mean, Xnum_test_repnull_mean, y_train, y_test)

MAE from Approach 2. (Replace values with mean):


174917.69150711832

In [17]:
# Going forward, use the mean-imputed numeric columns (means computed on the training split)
# in the full train/test feature tables.

# X_train / X_test come straight out of train_test_split. Take explicit copies before
# assigning columns so we never write into a frame pandas may treat as a slice of X
# (which can raise SettingWithCopyWarning, hidden here by the global warnings filter).
X_train = X_train.copy()
X_test = X_test.copy()

# Assignment aligns on the row index, which the numeric splits share with these frames
# because both splits used the same test_size and random_state.
X_train[colsNum] = Xnum_train_repnull_mean[colsNum]
X_test[colsNum] = Xnum_test_repnull_mean[colsNum]

In [18]:
# Preview the training features after the numeric columns were mean-imputed.
X_train.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
1041,Brighton,18 Rooding St,3,h,S,Buxton,3/09/2016,11.2,3186.0,3.0,1.0,2.0,366.0,156.0,1920.0,Bayside,-37.9038,145.0001,Southern Metropolitan,10579.0
1989,Coburg,11 Station St,3,h,S,Brad,23/04/2016,7.8,3058.0,3.0,1.0,0.0,238.0,131.0,1900.0,Moreland,-37.7539,144.9612,Northern Metropolitan,11204.0
10157,Brunswick,106 Evans St,3,h,S,Ray,27/05/2017,5.2,3056.0,3.0,1.0,1.0,439.0,153.363129,1964.948871,Moreland,-37.77047,144.97005,Northern Metropolitan,11918.0
1711,Carnegie,4/5 Anzac St,2,u,S,hockingstuart,4/06/2016,11.4,3163.0,2.0,1.0,2.0,0.0,100.0,1973.0,Glen Eira,-37.8863,145.066,Southern Metropolitan,7822.0
11565,Altona,64 Queen St,4,h,S,Greg,22/07/2017,11.0,3018.0,4.0,2.0,4.0,615.0,153.363129,1964.948871,Hobsons Bay,-37.87057,144.83623,Western Metropolitan,5301.0


## Non-numeric Features

In [19]:
# Look at the dtypes again, this time to pick out the non-numeric (object) columns.
X.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

In [20]:
# Names of the columns stored with the generic object dtype (strings in this dataset).
colsObj = [name for name, dtype in X.dtypes.items() if dtype == 'object']
colsObj

['Suburb',
 'Address',
 'Type',
 'Method',
 'SellerG',
 'Date',
 'CouncilArea',
 'Regionname']

#### We only want columns with fewer than 10 unique values; high-cardinality text columns (e.g. `Address`, `Suburb`) are not useful as categorical features here

In [23]:
# Label encoding (first attempt, over every object column)

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Initialize into new df's
Xle_train = X_train.copy()
# The test frame must start from X_test; copying X_train here would leave training rows
# in every column that the loop below does not overwrite.
Xle_test = X_test.copy()

# Fill out new df with transform.
# The encoder is re-fitted per column on the training values only, so any value that
# appears only in the test split makes transform() raise a ValueError.
for col in colsObj:
    Xle_train[col] = le.fit_transform(X_train[col])
    Xle_test[col] = le.transform(X_test[col])


ValueError: y contains previously unseen labels: 'Beaconsfield Upper'

In [24]:
# Count the distinct values in each object column to see which ones are low-cardinality.
X[colsObj].nunique()

Suburb           314
Address        13378
Type               3
Method             5
SellerG          268
Date              58
CouncilArea       33
Regionname         8
dtype: int64

In [26]:
# Keep only the object columns that have fewer than 10 distinct values.
colsCat = [
    col
    for col, dtype in X.dtypes.items()
    if dtype == 'object' and X[col].nunique() < 10
]
colsCat

['Type', 'Method', 'Regionname']

In [30]:
# Label encoding of the low-cardinality categorical columns

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Work on copies so the un-encoded splits stay available
Xle_train, Xle_test = X_train.copy(), X_test.copy()

# For each column: learn the classes from the training values, then apply the
# same mapping to both splits
for col in colsCat:
    le.fit(X_train[col])
    Xle_train[col] = le.transform(X_train[col])
    Xle_test[col] = le.transform(X_test[col])

In [33]:
print('MAE from Label Encoding Categorical columns:')
# Score on the numeric columns plus the label-encoded categorical columns.
build_score_RandomForest(
    Xle_train[colsNum + colsCat],
    Xle_test[colsNum + colsCat],
    y_train,
    y_test,
)

MAE from Label Encoding Categorical columns:


166446.634413353

In [40]:
# Build a gradient boosted tree model with default hyperparameters
from sklearn.ensemble import GradientBoostingRegressor

# fit() returns the fitted estimator, so construction and fitting can be chained.
mdlGbrMlb = GradientBoostingRegressor(random_state=1).fit(
    Xle_train[colsNum + colsCat], y_train
)
y_test_pred = mdlGbrMlb.predict(Xle_test[colsNum + colsCat])
mae = mean_absolute_error(y_test, y_test_pred)

mae

186379.17775285649

In [41]:
# Tune the gradient boosting model: many more trees, a small learning rate, deeper trees

mdlGbrMlb = GradientBoostingRegressor(
    random_state=1,
    n_estimators=5000,
    learning_rate=0.01,
    max_depth=5,
).fit(Xle_train[colsNum + colsCat], y_train)
y_test_pred = mdlGbrMlb.predict(Xle_test[colsNum + colsCat])
mae = mean_absolute_error(y_test, y_test_pred)

mae

152846.02138089956

## Build Extreme Gradient Boosting (XGBoost) model

In [42]:
# Requires the third-party `xgboost` package to be installed in the kernel's environment.
from xgboost import XGBRegressor

# Same hyperparameters as the tuned gradient boosting model, for a like-for-like comparison.
mdlXgbMlb = XGBRegressor(random_state=1, n_estimators=5000, learning_rate=0.01, max_depth=5)
mdlXgbMlb.fit(Xle_train[colsNum+colsCat], y_train)
# Predict with the XGBoost model that was just fitted; predicting with mdlGbrMlb here
# would only reproduce the gradient boosting score from the previous cell.
y_test_pred = mdlXgbMlb.predict(Xle_test[colsNum+colsCat])
mae = mean_absolute_error(y_test, y_test_pred)

mae

ModuleNotFoundError: No module named 'xgboost'