In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error

In [2]:
melb = pd.read_csv('melb_data.csv')
melb.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000,S,Biggin,3/12/2016,2.5,3067,...,1,1.0,202,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019
1,Abbotsford,25 Bloomburg St,2,h,1035000,S,Biggin,4/2/2016,2.5,3067,...,1,0.0,156,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019
2,Abbotsford,5 Charles St,3,h,1465000,SP,Biggin,4/3/2017,2.5,3067,...,2,0.0,134,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019
3,Abbotsford,40 Federation La,3,h,850000,PI,Biggin,4/3/2017,2.5,3067,...,2,1.0,94,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019
4,Abbotsford,55a Park St,4,h,1600000,VB,Nelson,4/6/2016,2.5,3067,...,1,2.0,120,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019


In [3]:
# Separate the target from the features

y = melb.Price

X = melb.drop(['Price'], axis=1)

In [4]:
# A list of numerical columns with no missing entries

numerical_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64'] and X[col].isnull().sum() == 0]

numerical_cols

['Rooms',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Landsize',
 'Lattitude',
 'Longtitude',
 'Propertycount']

In [5]:
num_X = X[numerical_cols]

num_X.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
0,2,2.5,3067,2,1,202,-37.7996,144.9984,4019
1,2,2.5,3067,2,1,156,-37.8079,144.9934,4019
2,3,2.5,3067,3,2,134,-37.8093,144.9944,4019
3,3,2.5,3067,3,2,94,-37.7969,144.9969,4019
4,4,2.5,3067,3,1,120,-37.8072,144.9941,4019


In [6]:
# A list of categorical columns with low cardinality

categorical_cols = [col for col in X.columns if X[col].dtype == 'object' and X[col].nunique() < 10]

categorical_cols

['Type', 'Method', 'Regionname']

In [7]:
categ_X = X[categorical_cols]

categ_X.head()

Unnamed: 0,Type,Method,Regionname
0,h,S,Northern Metropolitan
1,h,S,Northern Metropolitan
2,h,SP,Northern Metropolitan
3,h,PI,Northern Metropolitan
4,h,VB,Northern Metropolitan


In [8]:
# The features to use
# Only the numerical and categorical columns selected

my_cols = numerical_cols + categorical_cols

new_X = X[my_cols]

new_X.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,Type,Method,Regionname
0,2,2.5,3067,2,1,202,-37.7996,144.9984,4019,h,S,Northern Metropolitan
1,2,2.5,3067,2,1,156,-37.8079,144.9934,4019,h,S,Northern Metropolitan
2,3,2.5,3067,3,2,134,-37.8093,144.9944,4019,h,SP,Northern Metropolitan
3,3,2.5,3067,3,2,94,-37.7969,144.9969,4019,h,PI,Northern Metropolitan
4,4,2.5,3067,3,1,120,-37.8072,144.9941,4019,h,VB,Northern Metropolitan


In [9]:
print('Types: ', X['Type'].unique())

Types:  ['h' 'u' 't']


In [10]:
print('Methods: ', X['Method'].unique())

Methods:  ['S' 'SP' 'PI' 'VB' 'SA']


In [11]:
print('Regions: ', X['Regionname'].unique())

Regions:  ['Northern Metropolitan' 'Western Metropolitan' 'Southern Metropolitan'
 'Eastern Metropolitan' 'South-Eastern Metropolitan' 'Eastern Victoria'
 'Northern Victoria' 'Western Victoria']


In [12]:
print('Regions: ', X['Regionname'].nunique())

Regions:  8


In [13]:
new_X[categorical_cols].nunique()

Type          3
Method        5
Regionname    8
dtype: int64

In [14]:
# Split the training and validation sets

X_train, X_valid, y_train, y_valid = train_test_split(new_X, y, test_size=0.2, random_state=11)

In [15]:
X_train.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,Type,Method,Regionname
12293,4,7.5,3040,4,1,478,-37.74424,144.89149,9264,h,S,Western Metropolitan
3698,3,7.5,3102,3,1,378,-37.7956,145.0525,2671,h,S,Southern Metropolitan
2912,5,13.0,3046,5,3,612,-37.7022,144.9054,8870,h,PI,Northern Metropolitan
4997,2,8.8,3072,2,1,266,-37.7423,145.0105,14577,h,S,Northern Metropolitan
12356,4,6.2,3039,4,2,465,-37.76306,144.92851,6232,h,S,Western Metropolitan


In [16]:
X_train.shape

(10864, 12)

In [17]:
X_valid.shape

(2716, 12)

In [18]:
# Apply the encoding to the categorical cols of both the training and validation sets

encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Dataframe

OH_enc_train = pd.DataFrame(encoder.fit_transform(X_train[categorical_cols]))
OH_enc_valid = pd.DataFrame(encoder.transform(X_valid[categorical_cols]))

In [19]:
OH_enc_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [20]:
# One Hot Encoding alters the index values
# Put them back from the original training and validation sets

OH_enc_train.index = X_train.index
OH_enc_valid.index = X_valid.index

OH_enc_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
12293,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3698,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2912,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4997,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
12356,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [21]:
# The encoding did not mess with the numerical columns

num_X_train = X_train[numerical_cols]
num_X_valid = X_valid[numerical_cols]

num_X_valid.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
11750,3,18.8,3170,3,2,246,-37.92829,145.19647,7113
3794,3,7.4,3144,3,3,283,-37.858,145.0351,4675
9429,3,13.8,3165,3,1,650,-37.91905,145.05597,10969
1614,3,7.8,3124,3,1,692,-37.8379,145.0849,8920
2559,4,3.5,3068,4,2,266,-37.7797,144.9861,6244


In [22]:
# Now concatenate the numerical columns with the encoded train and validation sets

new_X_train = pd.concat([num_X_train, OH_enc_train], axis=1)
new_X_valid = pd.concat([num_X_valid, OH_enc_valid], axis=1)


new_X_train.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,0,...,6,7,8,9,10,11,12,13,14,15
12293,4,7.5,3040,4,1,478,-37.74424,144.89149,9264,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3698,3,7.5,3102,3,1,378,-37.7956,145.0525,2671,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2912,5,13.0,3046,5,3,612,-37.7022,144.9054,8870,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4997,2,8.8,3072,2,1,266,-37.7423,145.0105,14577,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
12356,4,6.2,3039,4,2,465,-37.76306,144.92851,6232,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [23]:
new_X_valid.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,0,...,6,7,8,9,10,11,12,13,14,15
11750,3,18.8,3170,3,2,246,-37.92829,145.19647,7113,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3794,3,7.4,3144,3,3,283,-37.858,145.0351,4675,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9429,3,13.8,3165,3,1,650,-37.91905,145.05597,10969,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1614,3,7.8,3124,3,1,692,-37.8379,145.0849,8920,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2559,4,3.5,3068,4,2,266,-37.7797,144.9861,6244,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [24]:
y_train.head()

12293     900000
3698     1300000
2912      705000
4997      790000
12356    1450000
Name: Price, dtype: int64

In [25]:
# Now the model

model = RandomForestRegressor(random_state=0)

model.fit(new_X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [26]:
# Make predictions and evaluate the model

predictions = model.predict(new_X_valid)

mae = mean_absolute_error(predictions, y_valid)

mae

168039.183707483

In [27]:
# The function to determine the best value for n_estimators

def get_mae(estimators, new_X_train, new_X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=estimators, random_state=11)
    model.fit(new_X_train, y_train)
    predictions = model.predict(new_X_valid)
    mae = mean_absolute_error(predictions, y_valid)
    return mae

In [28]:
# The loop to try out different values 

estimators = [200, 370, 500, 750]

for estimators in estimators:
    the_mae = get_mae(estimators, new_X_train, new_X_valid, y_train, y_valid)
    print('The estimators used: {} \t\t\t\t The resulting MAE: {}'.format(estimators, the_mae))

The estimators used: 200 				 The resulting MAE: 168826.4503936987
The estimators used: 370 				 The resulting MAE: 168654.51564933857
The estimators used: 500 				 The resulting MAE: 168557.1723138018
The estimators used: 750 				 The resulting MAE: 168408.21283253853
