In [2]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [3]:
# Load data
dfMlb = pd.read_csv('house_prices.csv')

# Select Target
y = dfMlb.loc[:,['Price']]

# Select Features
X = dfMlb.loc[:,['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']]

# Split Features into Train and Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

## Build Decision Tree Model

In [3]:
# Define
from sklearn.tree import DecisionTreeRegressor
mdlDtrMlb = DecisionTreeRegressor(random_state=1, max_leaf_nodes=50)

# Fit
mdlDtrMlb.fit(X_train, y_train)

# Predict
y_test_pred = mdlDtrMlb.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_test_pred)

In [4]:
mae

265943.7314522826

In [5]:
# Create a function to calculate the mae
def get_mae_dt(mx_lf_nds, X_trn, y_trn, X_tst, y_tst):
    mdlDtrMlb = DecisionTreeRegressor(random_state=1, max_leaf_nodes=mx_lf_nds)
    mdlDtrMlb.fit(X_train, y_train)
    y_test_pred = mdlDtrMlb.predict(X_test)
    mae = mean_absolute_error(y_test, y_test_pred)
    return(mae)

In [6]:
get_mae_dt(50, X_train, y_train, X_test, y_test)

265943.7314522826

In [7]:
# Python3 code to iterate over a list 

   
# Using for loop 
for i in [5, 50, 500, 5000]: 
    mae = get_mae_dt(i, X_train, y_train, X_test, y_test)
    print('Max leaf nodes:', i, '\t\t Mean Absolute Error:', mae)

Max leaf nodes: 5 		 Mean Absolute Error: 359783.60063327465
Max leaf nodes: 50 		 Mean Absolute Error: 265943.7314522826
Max leaf nodes: 500 		 Mean Absolute Error: 221352.9439085965
Max leaf nodes: 5000 		 Mean Absolute Error: 238098.78862204644


## Build Random Forest Model

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Define
mdlRfsMlb = RandomForestRegressor(random_state=1, max_leaf_nodes=5000)
# Fit
mdlRfsMlb.fit(X_train, y_train)

# Predict
y_test_pred = mdlRfsMlb.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_test_pred)

  


In [9]:
print('MAE (Random Forest):', mae)

MAE (Random Forest): 190944.50214587816


In [10]:
print('MAE (%):', mae/np.average(y))

MAE (%): 0.1775098337818727


## Try to reduce MAE by changing max leaf nodes (optimal @ 5000)

In [11]:
# Create a function to calculate the mae
def get_mae_rf(mx_lf_nds, X_trn, y_trn, X_tst, y_tst):
    mdlRfsMlb = RandomForestRegressor(random_state=1, max_leaf_nodes=mx_lf_nds)
    mdlRfsMlb.fit(X_train, y_train)
    y_test_pred = mdlRfsMlb.predict(X_test)
    mae = mean_absolute_error(y_test, y_test_pred)
    return(mae)

In [12]:
# Python3 code to iterate over a list 

   
# Using for loop 
for i in [1000, 5000, 10000]: 
    mae = get_mae_rf(i, X_train, y_train, X_test, y_test)
    print('Max leaf nodes:', i, '\t\t Mean Absolute Error:', mae)

  after removing the cwd from sys.path.


Max leaf nodes: 1000 		 Mean Absolute Error: 192084.07750987387


  after removing the cwd from sys.path.


Max leaf nodes: 5000 		 Mean Absolute Error: 190944.50214587816


  after removing the cwd from sys.path.


Max leaf nodes: 10000 		 Mean Absolute Error: 190989.15444368467


In [13]:
mae/np.average(y)

0.1775513443876903

### Error is around 18% still, we want to get it to 15%

## Try finding optimal number of estimators

In [14]:
# Create a function to calculate the mae
def get_mae_rf_est(num, X_trn, y_trn, X_tst, y_tst):
    mdlRfsMlb = RandomForestRegressor(random_state=1, n_estimators=num)
    mdlRfsMlb.fit(X_train, y_train)
    y_test_pred = mdlRfsMlb.predict(X_test)
    mae = mean_absolute_error(y_test, y_test_pred)
    return(mae)

In [15]:
# Python3 code to iterate over a list 

   
# Using for loop 
for i in [1, 10, 100, 1000]: 
    mae = get_mae_rf_est(i, X_train, y_train, X_test, y_test)
    print('Number of estimators:', i, '\t\t Mean Absolute Error:', mae)

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


Number of estimators: 1 		 Mean Absolute Error: 255658.93348060874
Number of estimators: 10 		 Mean Absolute Error: 193409.95611105266


  after removing the cwd from sys.path.


Number of estimators: 100 		 Mean Absolute Error: 182072.2203871561


  after removing the cwd from sys.path.


Number of estimators: 1000 		 Mean Absolute Error: 181251.80437312275


In [16]:
print('MAE (%):', mae/np.average(y))

MAE (%): 0.168499104741739


### only a slight improvement to 17%

## I found a method online to adjust a bunch of parameters at once and evaluate:

In [17]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [21]:
# Use the random grid to search for best hyperparameters

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = mdlRfsMlb, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, random_state=1, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  4.0min finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=5000,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=10, n_jobs=None,
                                                   oob_score=False,
                                                   random_state=1...


In [22]:
rf_random.best_params_

{'n_estimators': 1200,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 20,
 'bootstrap': True}

In [23]:
# Predict
y_test_pred = rf_random.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_test_pred)

In [25]:
print('MAE (%):', mae/np.average(y))

MAE (%): 0.16875342850080072


### That took forever to run and actually diminished our results.

### Our model now takes 1200 decision tree regressors to create the random forest regressor and has a few rules, but we are still unable to reduce the error to 15%. I bet there is more we could do to the data before throwing models at it and seeing how it responds (i.e. cleaning or using non-numerical data to improve models)

In [30]:
dfMlb.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0
