In [19]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [8]:
# Path to the CSV, relative to the notebook's working directory
melbourne_file_path = './melb_data.csv'

# Load the CSV into a DataFrame
melbourne_data = pd.read_csv(filepath_or_buffer=melbourne_file_path)

# Display summary statistics (the last expression is rendered as the cell output)
melbourne_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Latitude,Longitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [9]:
# List every column label in the loaded frame, to pick the target and feature columns from
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Latitude',
       'Longitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [10]:
# Keep only rows with no missing value in any column.
# NOTE(review): this rebinds `melbourne_data`, so summaries displayed earlier describe
# the pre-filter frame; rows are dropped for gaps in ANY column, including columns
# that are not used as model features further down.
melbourne_data = melbourne_data.dropna(axis="index", how="any")

In [11]:
# The column the model learns to predict
y = melbourne_data["Price"]

In [12]:
# Columns used as model inputs
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Latitude', 'Longitude']

# Select just those columns (all rows) as the feature matrix
X = melbourne_data.loc[:, melbourne_features]

# Display summary statistics of the selected features
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Latitude,Longitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


In [13]:
# Preview the first five feature rows
X.head(n=5)

Unnamed: 0,Rooms,Bathroom,Landsize,Latitude,Longitude
1,2,1,156,-37.8079,144.9934
2,3,2,134,-37.8093,144.9944
4,4,1,120,-37.8072,144.9941
6,3,2,245,-37.8024,144.9993
7,2,1,256,-37.806,144.9954


In [23]:
# Split features and target into training and validation subsets.
# The fixed random_state makes the shuffle (and therefore the split) repeatable.
(train_X, val_X,
 train_y, val_y) = train_test_split(X, y, random_state=1)

In [24]:
# Build the baseline model: an unconstrained decision tree.
# random_state is pinned so that re-running the notebook grows the same tree and
# reports the same validation error; 0 matches the seed used by the other
# DecisionTreeRegressor instances in this notebook.
melbourne_model = DecisionTreeRegressor(random_state=0)

# Fit on the training split only; the validation split is held back for scoring
melbourne_model.fit(train_X, train_y)

DecisionTreeRegressor()

In [26]:
# Spot-check the model on the first five validation rows
sample_houses = val_X.head()

print("Predictions for the following 5 houses:")
print(sample_houses)
print("Predictions")
print(melbourne_model.predict(sample_houses))

Predictions for the following 5 houses:
      Rooms  Bathroom  Landsize  Latitude  Longitude
6048      3         3       221 -37.77080   144.8401
9186      4         2       528 -37.83539   145.0431
3991      3         2         0 -37.80950   144.9691
5829      3         2      1039 -37.86380   144.9820
3616      6         6      1334 -37.80290   145.0267
Predictions
[ 503000. 1650000.  760000.  920000. 3850000.]


In [27]:
# Predict a price for every row of the validation split
val_predictions = melbourne_model.predict(val_X)

# Mean absolute error between the actual and predicted validation prices
val_mae = mean_absolute_error(y_true=val_y, y_pred=val_predictions)
print(val_mae)

257305.37249838604


In [30]:
# Helper for comparing validation MAE across different max_leaf_nodes settings
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a decision tree capped at ``max_leaf_nodes`` and score it on validation data.

    Parameters
    ----------
    max_leaf_nodes : value forwarded to ``DecisionTreeRegressor(max_leaf_nodes=...)``.
    train_X, train_y : features and target the tree is fitted on.
    val_X, val_y : features and target the fitted tree is evaluated on.

    Returns
    -------
    The mean absolute error between ``val_y`` and the tree's predictions for ``val_X``.
    """
    # random_state=0 keeps the fitted tree identical from run to run
    tree = DecisionTreeRegressor(
        max_leaf_nodes=max_leaf_nodes,
        random_state=0,
    ).fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [44]:
# Sweep several tree sizes and remember the one with the lowest validation MAE.
# `leaf_node` / `lowest_mae` hold the winning setting and its score once the loop ends.
candidate_leaf_counts = [5, 10, 50, 500, 5000]
lowest_mae = float('inf')
leaf_node = 0

for max_leaf_nodes in candidate_leaf_counts:
    mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: {} \t\t Mean Absolute Error: {}".format(max_leaf_nodes, mae))
    # Strict comparison: on a tie the earlier (smaller) candidate is kept
    if mae < lowest_mae:
        lowest_mae, leaf_node = mae, max_leaf_nodes

print("\nMax leaf nodes: {} gives the lowest Mean Absolute Error: {}".format(leaf_node, lowest_mae))

Max leaf nodes: 5 		 Mean Absolute Error: 369673.0400167675
Max leaf nodes: 10 		 Mean Absolute Error: 320570.2079227018
Max leaf nodes: 50 		 Mean Absolute Error: 266644.21831092256
Max leaf nodes: 500 		 Mean Absolute Error: 243613.31456921576
Max leaf nodes: 5000 		 Mean Absolute Error: 256227.639767592

Max leaf nodes: 500 gives the lowest Mean Absolute Error: 243613.31456921576


In [45]:
# Final model: use the tree size chosen by the sweep above (`leaf_node`)
final_model = DecisionTreeRegressor(
    max_leaf_nodes=leaf_node,
    random_state=0,
)

# Refit on all rows of X and y (training and validation rows together)
final_model.fit(X, y)

DecisionTreeRegressor(max_leaf_nodes=500, random_state=0)