In [1]:
# Import the libraries

import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the housing records from melb_data.csv (expected in the working directory)
melb = pd.read_csv('melb_data.csv')

# Show the first five rows as a quick sanity check of the load
melb.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000,S,Biggin,3/12/2016,2.5,3067,...,1,1.0,202,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019
1,Abbotsford,25 Bloomburg St,2,h,1035000,S,Biggin,4/2/2016,2.5,3067,...,1,0.0,156,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019
2,Abbotsford,5 Charles St,3,h,1465000,SP,Biggin,4/3/2017,2.5,3067,...,2,0.0,134,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019
3,Abbotsford,40 Federation La,3,h,850000,PI,Biggin,4/3/2017,2.5,3067,...,2,1.0,94,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019
4,Abbotsford,55a Park St,4,h,1600000,VB,Nelson,4/6/2016,2.5,3067,...,1,2.0,120,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019


In [3]:
# Inspect each column's dtype and non-null count to spot
# text (object) columns and columns with missing entries
melb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
Suburb           13580 non-null object
Address          13580 non-null object
Rooms            13580 non-null int64
Type             13580 non-null object
Price            13580 non-null int64
Method           13580 non-null object
SellerG          13580 non-null object
Date             13580 non-null object
Distance         13580 non-null float64
Postcode         13580 non-null int64
Bedroom2         13580 non-null int64
Bathroom         13580 non-null int64
Car              13518 non-null float64
Landsize         13580 non-null int64
BuildingArea     7130 non-null float64
YearBuilt        8205 non-null float64
CouncilArea      12211 non-null object
Lattitude        13580 non-null float64
Longtitude       13580 non-null float64
Regionname       13580 non-null object
Propertycount    13580 non-null int64
dtypes: float64(6), int64(7), object(8)
memory usage: 2.2+ MB


In [4]:
# Count the missing entries in every column
melb.isna().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [5]:
# Features: every column except the sale price
X = melb.drop(columns=['Price'])

# Target: the sale price the model will learn to predict
y = melb['Price']

In [6]:
# Confirm that Price is no longer among the feature columns
X.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,S,Biggin,3/12/2016,2.5,3067,2,1,1.0,202,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019
1,Abbotsford,25 Bloomburg St,2,h,S,Biggin,4/2/2016,2.5,3067,2,1,0.0,156,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019
2,Abbotsford,5 Charles St,3,h,SP,Biggin,4/3/2017,2.5,3067,3,2,0.0,134,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019
3,Abbotsford,40 Federation La,3,h,PI,Biggin,4/3/2017,2.5,3067,3,2,1.0,94,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019
4,Abbotsford,55a Park St,4,h,VB,Nelson,4/6/2016,2.5,3067,3,1,2.0,120,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019


In [7]:
# (rows, columns) of the feature frame after removing the target
X.shape

(13580, 20)

In [8]:
# Peek at the first few target (Price) values
y.head()

0    1480000
1    1035000
2    1465000
3     850000
4    1600000
Name: Price, dtype: int64

In [9]:
# Preview the features: dtypes and non-null counts now that Price is removed
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 20 columns):
Suburb           13580 non-null object
Address          13580 non-null object
Rooms            13580 non-null int64
Type             13580 non-null object
Method           13580 non-null object
SellerG          13580 non-null object
Date             13580 non-null object
Distance         13580 non-null float64
Postcode         13580 non-null int64
Bedroom2         13580 non-null int64
Bathroom         13580 non-null int64
Car              13518 non-null float64
Landsize         13580 non-null int64
BuildingArea     7130 non-null float64
YearBuilt        8205 non-null float64
CouncilArea      12211 non-null object
Lattitude        13580 non-null float64
Longtitude       13580 non-null float64
Regionname       13580 non-null object
Propertycount    13580 non-null int64
dtypes: float64(6), int64(6), object(8)
memory usage: 2.1+ MB


In [10]:
# Keep only the non-object columns, i.e. drop the text/categorical features,
# which would need encoding before a scikit-learn tree could use them
X = X.select_dtypes(exclude='object')

X.head(n=5)

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,2,2.5,3067,2,1,1.0,202,,,-37.7996,144.9984,4019
1,2,2.5,3067,2,1,0.0,156,79.0,1900.0,-37.8079,144.9934,4019
2,3,2.5,3067,3,2,0.0,134,150.0,1900.0,-37.8093,144.9944,4019
3,3,2.5,3067,3,2,1.0,94,,,-37.7969,144.9969,4019
4,4,2.5,3067,3,1,2.0,120,142.0,2014.0,-37.8072,144.9941,4019


In [11]:
# Discard every feature column that contains at least one missing value
new_X = X.dropna(axis='columns')

new_X.head(n=5)

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
0,2,2.5,3067,2,1,202,-37.7996,144.9984,4019
1,2,2.5,3067,2,1,156,-37.8079,144.9934,4019
2,3,2.5,3067,3,2,134,-37.8093,144.9944,4019
3,3,2.5,3067,3,2,94,-37.7969,144.9969,4019
4,4,2.5,3067,3,1,120,-37.8072,144.9941,4019


In [12]:
# Sanity check before modelling: the remaining features are all numeric,
# and every column should report zero missing values
new_X.isna().sum()

Rooms            0
Distance         0
Postcode         0
Bedroom2         0
Bathroom         0
Landsize         0
Lattitude        0
Longtitude       0
Propertycount    0
dtype: int64

In [13]:
# Hold out 20% of the rows for validation before any fitting happens;
# the fixed random_state keeps the split reproducible across runs
X_train, X_valid, y_train, y_valid = train_test_split(
    new_X,
    y,
    test_size=0.2,
    random_state=0,
)

In [14]:
# Define the model to use: a decision tree regressor with default
# hyperparameters; only the seed is fixed so results are reproducible
model = DecisionTreeRegressor(random_state=0)

In [15]:
# Train the tree on the training split only
model.fit(X=X_train, y=y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=0, splitter='best')

In [16]:
# Predict prices for the held-out validation rows
predictions = model.predict(X=X_valid)

In [17]:
# Evaluate the model on the validation split with mean absolute error.
# scikit-learn metrics are declared as metric(y_true, y_pred), so the ground
# truth goes first. MAE is symmetric, so the value is unchanged, but keeping
# the documented order avoids wrong results if this line is reused with an
# asymmetric metric.
mae = mean_absolute_error(y_valid, predictions)

# Display the score (same units as Price)
mae

238902.49987727051

In [18]:
# Display the predicted prices for the validation rows
predictions

array([1400000.,  887000.,  595000., ..., 1100000., 1211000.,  975000.])

In [19]:
# First ten true prices from the validation split, for comparison
# with the predicted values shown in the previous cell
y_valid.head(10)

8505     2165000
5523      815000
12852     610000
4818     1245000
12812    1160000
2153     1250000
9903      715000
5105      812000
6107      750500
8876      610000
Name: Price, dtype: int64