In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [2]:
# Load the dataset from melb_data.csv and preview the first five rows.
melb = pd.read_csv(filepath_or_buffer='melb_data.csv')
melb.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000,S,Biggin,3/12/2016,2.5,3067,...,1,1.0,202,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019
1,Abbotsford,25 Bloomburg St,2,h,1035000,S,Biggin,4/2/2016,2.5,3067,...,1,0.0,156,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019
2,Abbotsford,5 Charles St,3,h,1465000,SP,Biggin,4/3/2017,2.5,3067,...,2,0.0,134,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019
3,Abbotsford,40 Federation La,3,h,850000,PI,Biggin,4/3/2017,2.5,3067,...,2,1.0,94,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019
4,Abbotsford,55a Park St,4,h,1600000,VB,Nelson,4/6/2016,2.5,3067,...,1,2.0,120,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019


In [3]:
# Number of missing values in each column of the raw frame.
melb.isna().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [4]:
# Separate the prediction target (Price) from the candidate feature columns.
y = melb['Price']

X = melb.drop(columns=['Price'])

In [5]:
# Names of the int64/float64 feature columns that contain no missing values,
# in their original column order.
numeric_features = X.select_dtypes(include=['int64', 'float64'])
numerical_cols = [
    name for name in numeric_features.columns
    if not numeric_features[name].isnull().any()
]
numerical_cols

['Rooms',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Landsize',
 'Lattitude',
 'Longtitude',
 'Propertycount']

In [6]:
# Sub-frame of X holding only the object-dtype (string) columns.

obj_cols = X.select_dtypes(include=['object'])
obj_cols.head(n=5)

Unnamed: 0,Suburb,Address,Type,Method,SellerG,Date,CouncilArea,Regionname
0,Abbotsford,85 Turner St,h,S,Biggin,3/12/2016,Yarra,Northern Metropolitan
1,Abbotsford,25 Bloomburg St,h,S,Biggin,4/2/2016,Yarra,Northern Metropolitan
2,Abbotsford,5 Charles St,h,SP,Biggin,4/3/2017,Yarra,Northern Metropolitan
3,Abbotsford,40 Federation La,h,PI,Biggin,4/3/2017,Yarra,Northern Metropolitan
4,Abbotsford,55a Park St,h,VB,Nelson,4/6/2016,Yarra,Northern Metropolitan


In [7]:
# Distinct-value count per object column; used to pick out the
# low-cardinality columns worth encoding.
obj_cols.nunique(axis=0)

Suburb           314
Address        13378
Type               3
Method             5
SellerG          268
Date              58
CouncilArea       33
Regionname         8
dtype: int64

In [8]:
# Names of the object-dtype columns with fewer than 10 distinct values,
# in their original column order.
object_cardinality = X.select_dtypes(include=['object']).nunique()
categorical_cols = object_cardinality[object_cardinality < 10].index.tolist()
categorical_cols

['Type', 'Method', 'Regionname']

In [10]:
# Keep only the selected features: complete numeric columns first, then the
# low-cardinality categorical columns.
my_cols = [*numerical_cols, *categorical_cols]
X = X.loc[:, my_cols]

X.head(n=5)

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,Type,Method,Regionname
0,2,2.5,3067,2,1,202,-37.7996,144.9984,4019,h,S,Northern Metropolitan
1,2,2.5,3067,2,1,156,-37.8079,144.9934,4019,h,S,Northern Metropolitan
2,3,2.5,3067,3,2,134,-37.8093,144.9944,4019,h,SP,Northern Metropolitan
3,3,2.5,3067,3,2,94,-37.7969,144.9969,4019,h,PI,Northern Metropolitan
4,4,2.5,3067,3,1,120,-37.8072,144.9941,4019,h,VB,Northern Metropolitan


In [11]:
# Hold out 20% of the rows for validation before any encoding is fitted, so
# that the validation rows play no part in preprocessing. The fixed
# random_state makes the split repeatable.

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

In [12]:
# Preview the training features produced by the split.
X_train.head(n=5)

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,Type,Method,Regionname
12293,4,7.5,3040,4,1,478,-37.74424,144.89149,9264,h,S,Western Metropolitan
3698,3,7.5,3102,3,1,378,-37.7956,145.0525,2671,h,S,Southern Metropolitan
2912,5,13.0,3046,5,3,612,-37.7022,144.9054,8870,h,PI,Northern Metropolitan
4997,2,8.8,3072,2,1,266,-37.7423,145.0105,14577,h,S,Northern Metropolitan
12356,4,6.2,3039,4,2,465,-37.76306,144.92851,6232,h,S,Western Metropolitan


In [13]:
# Encode the low-cardinality categorical features as integer codes.
#
# OrdinalEncoder is scikit-learn's encoder for feature columns (LabelEncoder is
# intended for the target). It is fitted once on all categorical columns, so
# the learned categories of every column stay available afterwards in
# encoder.categories_ instead of being overwritten on each loop iteration.
# Categories are sorted per column, so the resulting codes match what a
# per-column LabelEncoder would produce.
encoder = OrdinalEncoder(dtype=int)

# Work on copies so the original string-valued frames are left untouched.
enc_X_train = X_train.copy()
enc_X_valid = X_valid.copy()

# Guard the empty case: fitting on a frame with zero columns raises.
if categorical_cols:
    # Learn the category -> code mapping from the training rows only, then
    # apply that same mapping to the validation rows.
    train_codes = encoder.fit_transform(X_train[categorical_cols])
    valid_codes = encoder.transform(X_valid[categorical_cols])

    # Assign column by column so each column is replaced outright and takes
    # the integer dtype of the codes.
    for position, col in enumerate(categorical_cols):
        enc_X_train[col] = train_codes[:, position]
        enc_X_valid[col] = valid_codes[:, position]

In [14]:
# The original training frame still holds the raw category strings; only the
# enc_X_* copies were encoded.
X_train.head(n=5)

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,Type,Method,Regionname
12293,4,7.5,3040,4,1,478,-37.74424,144.89149,9264,h,S,Western Metropolitan
3698,3,7.5,3102,3,1,378,-37.7956,145.0525,2671,h,S,Southern Metropolitan
2912,5,13.0,3046,5,3,612,-37.7022,144.9054,8870,h,PI,Northern Metropolitan
4997,2,8.8,3072,2,1,266,-37.7423,145.0105,14577,h,S,Northern Metropolitan
12356,4,6.2,3039,4,2,465,-37.76306,144.92851,6232,h,S,Western Metropolitan


In [15]:
# Preview the encoded training features (categorical columns are now codes).
enc_X_train.head(n=5)

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,Type,Method,Regionname
12293,4,7.5,3040,4,1,478,-37.74424,144.89149,9264,0,1,6
3698,3,7.5,3102,3,1,378,-37.7956,145.0525,2671,0,1,5
2912,5,13.0,3046,5,3,612,-37.7022,144.9054,8870,0,0,2
4997,2,8.8,3072,2,1,266,-37.7423,145.0105,14577,0,1,2
12356,4,6.2,3039,4,2,465,-37.76306,144.92851,6232,0,1,6


In [16]:
# Preview the encoded validation features.
enc_X_valid.head(n=5)

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,Type,Method,Regionname
11750,3,18.8,3170,3,2,246,-37.92829,145.19647,7113,0,1,4
3794,3,7.4,3144,3,3,283,-37.858,145.0351,4675,0,1,5
9429,3,13.8,3165,3,1,650,-37.91905,145.05597,10969,0,1,5
1614,3,7.8,3124,3,1,692,-37.8379,145.0849,8920,0,1,5
2559,4,3.5,3068,4,2,266,-37.7797,144.9861,6244,0,4,2


In [17]:
# Random forest with default hyperparameters; random_state fixes the seed so
# repeated runs build the same forest.
model = RandomForestRegressor(random_state=11)

# Fit on the encoded training features against the training prices.
model.fit(X=enc_X_train, y=y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=11, verbose=0, warm_start=False)

In [18]:
# Predict prices for the held-out validation rows.
predictions = model.predict(enc_X_valid)

# scikit-learn metrics take (y_true, y_pred) in that order. Plain MAE is
# symmetric, so the value is unaffected, but the documented order keeps the
# call correct if the metric is later swapped for an asymmetric one.
mae = mean_absolute_error(y_valid, predictions)

mae

170217.87742013816