In [1]:
# The environment with all the libraries
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the housing data from melb_data.csv and preview the first five rows
melb = pd.read_csv('melb_data.csv')

melb.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000,S,Biggin,3/12/2016,2.5,3067,...,1,1.0,202,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019
1,Abbotsford,25 Bloomburg St,2,h,1035000,S,Biggin,4/2/2016,2.5,3067,...,1,0.0,156,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019
2,Abbotsford,5 Charles St,3,h,1465000,SP,Biggin,4/3/2017,2.5,3067,...,2,0.0,134,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019
3,Abbotsford,40 Federation La,3,h,850000,PI,Biggin,4/3/2017,2.5,3067,...,2,1.0,94,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019
4,Abbotsford,55a Park St,4,h,1600000,VB,Nelson,4/6/2016,2.5,3067,...,1,2.0,120,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019


In [3]:
# Split the frame: Price is the prediction target, every other column is a feature
y = melb['Price']
X = melb.drop(columns=['Price'])

In [4]:
# Preview the feature frame now that Price has been removed
X.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,S,Biggin,3/12/2016,2.5,3067,2,1,1.0,202,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019
1,Abbotsford,25 Bloomburg St,2,h,S,Biggin,4/2/2016,2.5,3067,2,1,0.0,156,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019
2,Abbotsford,5 Charles St,3,h,SP,Biggin,4/3/2017,2.5,3067,3,2,0.0,134,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019
3,Abbotsford,40 Federation La,3,h,PI,Biggin,4/3/2017,2.5,3067,3,2,1.0,94,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019
4,Abbotsford,55a Park St,4,h,VB,Nelson,4/6/2016,2.5,3067,3,1,2.0,120,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019


In [5]:
# Count the missing entries in every feature column
X.isna().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

The numerical columns BuildingArea and YearBuilt have nearly half the data missing

In [6]:
# Keep only the non-object columns; the string-valued (categorical) ones are dropped
X = X.select_dtypes(exclude=['object'])
X.head(n=10)

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,2,2.5,3067,2,1,1.0,202,,,-37.7996,144.9984,4019
1,2,2.5,3067,2,1,0.0,156,79.0,1900.0,-37.8079,144.9934,4019
2,3,2.5,3067,3,2,0.0,134,150.0,1900.0,-37.8093,144.9944,4019
3,3,2.5,3067,3,2,1.0,94,,,-37.7969,144.9969,4019
4,4,2.5,3067,3,1,2.0,120,142.0,2014.0,-37.8072,144.9941,4019
5,2,2.5,3067,2,1,0.0,181,,,-37.8041,144.9953,4019
6,3,2.5,3067,4,2,0.0,245,210.0,1910.0,-37.8024,144.9993,4019
7,2,2.5,3067,2,1,2.0,256,107.0,1890.0,-37.806,144.9954,4019
8,1,2.5,3067,1,1,1.0,0,,,-37.8008,144.9973,4019
9,2,2.5,3067,3,1,2.0,220,75.0,1900.0,-37.801,144.9989,4019


In [7]:
# Set the imputer and the strategy.
# SimpleImputer's default strategy is 'mean'; 'constant' is chosen here instead,
# which (with no fill_value given) replaces missing numeric values with 0.
imputer = SimpleImputer(strategy='constant')

In [8]:
# Fill the gaps; the imputer hands back a bare array, so wrap it in a DataFrame
imputed_X = pd.DataFrame(data=imputer.fit_transform(X))
imputed_X.head(n=10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,2.0,2.5,3067.0,2.0,1.0,1.0,202.0,0.0,0.0,-37.7996,144.9984,4019.0
1,2.0,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,-37.8079,144.9934,4019.0
2,3.0,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,-37.8093,144.9944,4019.0
3,3.0,2.5,3067.0,3.0,2.0,1.0,94.0,0.0,0.0,-37.7969,144.9969,4019.0
4,4.0,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,-37.8072,144.9941,4019.0
5,2.0,2.5,3067.0,2.0,1.0,0.0,181.0,0.0,0.0,-37.8041,144.9953,4019.0
6,3.0,2.5,3067.0,4.0,2.0,0.0,245.0,210.0,1910.0,-37.8024,144.9993,4019.0
7,2.0,2.5,3067.0,2.0,1.0,2.0,256.0,107.0,1890.0,-37.806,144.9954,4019.0
8,1.0,2.5,3067.0,1.0,1.0,1.0,0.0,0.0,0.0,-37.8008,144.9973,4019.0
9,2.0,2.5,3067.0,3.0,1.0,2.0,220.0,75.0,1900.0,-37.801,144.9989,4019.0


Imputation removes the column names

In [9]:
# Note that the new dataframe has no column names (only positional integers),
# so restore them from the original feature frame
imputed_X.columns = X.columns

imputed_X.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,2.0,2.5,3067.0,2.0,1.0,1.0,202.0,0.0,0.0,-37.7996,144.9984,4019.0
1,2.0,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,-37.8079,144.9934,4019.0
2,3.0,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,-37.8093,144.9944,4019.0
3,3.0,2.5,3067.0,3.0,2.0,1.0,94.0,0.0,0.0,-37.7969,144.9969,4019.0
4,4.0,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,-37.8072,144.9941,4019.0


In [10]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    imputed_X,
    y,
    test_size=0.2,
    random_state=12,
)

In [11]:
# Build the regressor and fit it on the training split in one step
model = DecisionTreeRegressor(random_state=12).fit(X_train, y_train)
model

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=12, splitter='best')

In [12]:
# Make predictions and evaluate model performance
predictions = model.predict(X_valid)

# scikit-learn metrics take (y_true, y_pred) in that order. MAE is symmetric,
# so the score is unchanged, but the conventional order is kept so the call
# stays correct if an asymmetric metric is swapped in later.
mae = mean_absolute_error(y_valid, predictions)

mae

245182.45360824742


Note that the model did not improve performance.
In fact, it did slightly worse than model_one.

In [13]:
# Find the optimum maximum leaf nodes
# Define a function that returns a different mae depending on the maximum leaf nodes used
# Then use a for loop that loops through an array of different values for leaf nodes

# The function

def get_mae(max_leaf_nodes, X_train, X_valid, y_train, y_test):
    """Return the validation MAE of a decision tree capped at ``max_leaf_nodes``.

    Parameters
    ----------
    max_leaf_nodes : upper bound on the number of leaves, passed to the tree.
    X_train, y_train : features and targets the tree is fitted on.
    X_valid : features the fitted tree predicts on.
    y_test : true targets corresponding to the rows of ``X_valid``.

    Returns
    -------
    The mean absolute error between ``y_test`` and the predictions on ``X_valid``.
    """
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=11)
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)

    # Score against the y_test argument, not a notebook-level y_valid, so the
    # result depends only on what the caller passes in.
    # Metric arguments follow the (y_true, y_pred) convention.
    return mean_absolute_error(y_test, predictions)

In [14]:
# Sweep the candidate leaf-node limits and report the validation MAE for each one
for max_leaf_nodes in (170, 280, 350, 500, 750, 1000):
    the_mae = get_mae(max_leaf_nodes, X_train, X_valid, y_train, y_valid)
    print('The Max_leaf_nodes: {} \t\t\t\t  The mae: {}'.format(max_leaf_nodes, the_mae))

The Max_leaf_nodes: 170 				  The mae: 235243.05162078494
The Max_leaf_nodes: 280 				  The mae: 231647.14798345327
The Max_leaf_nodes: 350 				  The mae: 229774.7203162147
The Max_leaf_nodes: 500 				  The mae: 228763.88433276513
The Max_leaf_nodes: 750 				  The mae: 229740.80558345353
The Max_leaf_nodes: 1000 				  The mae: 232201.57150604881


### Note the model did not perform better than when the columns were dropped.