# Run the import statements

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import math

# Get familiar with the data

In [25]:
# Path to the Melbourne housing CSV (relative path, resolved against the working directory)
melbourne_file_path = 'input/melb_data.csv'
# Read the CSV into a pandas DataFrame
melbourne_data = pd.read_csv(melbourne_file_path)

# Display summary statistics (count, mean, std, min, quartiles, max) of the Melbourne data
melbourne_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [26]:
# List the name of every column available in the dataset
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

# Prepare the data to fit into a model

In [9]:
# Keep only the rows that have a value in every column.
# NOTE(review): this drops a row when ANY column is missing, including columns
# that are not used as features below (the describe() output earlier shows
# BuildingArea and YearBuilt with far fewer non-null values than the rest) --
# confirm that discarding those rows is intended.
melbourne_data = melbourne_data.dropna(axis=0)

# Prediction target: the Price column
y = melbourne_data["Price"]

# Columns the model uses as inputs
melbourne_features = [
    "Rooms",
    "Bathroom",
    "Landsize",
    "Lattitude",
    "Longtitude",
]
X = melbourne_data[melbourne_features]

# Summary statistics of the selected feature columns
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


# Fit data into model

In [13]:
# Define model. Specify a number for random_state to ensure same results each run
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit model
melbourne_model.fit(X, y)
# Compute the Mean Absolute Error. The predictions are made on the very same
# rows the model was fitted on, so the error is expected to be very small.
predicted_home_prices = melbourne_model.predict(X)
mean_absolute_error(y, predicted_home_prices)

0.0

# Use training and validation datasets

In [28]:
# Now evaluate on unseen data: split the dataset into a training set and a validation set

# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
# Fit the model on the training portion only (this re-fits the already existing
# melbourne_model object; no new estimator is created here)
melbourne_model.fit(train_X, train_y)

# Get predicted prices on the validation data and print the Mean Absolute Error
val_predictions = melbourne_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

273518.01872175594


# How to find the best parameter
## Finding the best max_leaf_nodes for our tree

In [29]:
# How to find the best parameter - finding the best max_leaf_nodes for our tree

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation Mean Absolute Error for a given ``max_leaf_nodes``.

    A new ``DecisionTreeRegressor`` (``random_state=0``) capped at
    ``max_leaf_nodes`` is fitted on ``train_X``/``train_y``; its predictions
    for ``val_X`` are scored against ``val_y`` with ``mean_absolute_error``.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

def find_best_max_leaf_node(compute_error_func, train_X, val_X, train_y, val_y, leaf_nodes=(5, 50, 500, 5000)):
    """Build a deferred search for the ``max_leaf_nodes`` value with the lowest error.

    Nothing is evaluated when this function is called: it returns a
    zero-argument callable, and the search runs when that callable is invoked.

    Parameters
    ----------
    compute_error_func : callable
        Called as ``compute_error_func(max_leaf_nodes, train_X, val_X, train_y, val_y)``
        for every candidate; must return a number comparable with ``<``.
    train_X, val_X, train_y, val_y
        Passed through unchanged to ``compute_error_func``.
    leaf_nodes : iterable of int, optional
        Candidate values, tried in iteration order. The default is an
        immutable tuple (not a list) so the shared default object can never be
        modified between calls.

    Returns
    -------
    callable
        ``inner()`` prints one line per candidate and returns the candidate
        with the lowest error (the first one wins on ties). It returns ``None``
        when no candidate has an error below ``math.inf``, e.g. when
        ``leaf_nodes`` is empty.
    """
    def inner():
        # Compare the error obtained with each candidate value of max_leaf_nodes
        lowest_error = math.inf
        best_max_leaf_node = None
        for max_leaf_nodes in leaf_nodes:
            my_mae = compute_error_func(max_leaf_nodes, train_X, val_X, train_y, val_y)
            print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))
            # Strict comparison: an equal error does not replace an earlier candidate
            if my_mae < lowest_error:
                lowest_error = my_mae
                best_max_leaf_node = max_leaf_nodes
        return best_max_leaf_node
    return inner


# Recreate the train/validation split; the fixed random_state gives the same split every run
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# 40 evenly spaced candidate values from 5 to 1000, converted to integers
possible_max_nodes = np.linspace(5, 1000, num=40, dtype=int)

# find_best_max_leaf_node returns a callable; invoke it right away to run the search
best_leaf_node = find_best_max_leaf_node(
    get_mae, train_X, val_X, train_y, val_y, possible_max_nodes
)()
print(f"The Max Leaf Node that produces the best result is: {best_leaf_node}")

        

Max leaf nodes: 5  		 Mean Absolute Error:  385696
Max leaf nodes: 30  		 Mean Absolute Error:  297673
Max leaf nodes: 56  		 Mean Absolute Error:  279566
Max leaf nodes: 81  		 Mean Absolute Error:  274477
Max leaf nodes: 107  		 Mean Absolute Error:  268692
Max leaf nodes: 132  		 Mean Absolute Error:  269872
Max leaf nodes: 158  		 Mean Absolute Error:  268185
Max leaf nodes: 183  		 Mean Absolute Error:  269206
Max leaf nodes: 209  		 Mean Absolute Error:  270066
Max leaf nodes: 234  		 Mean Absolute Error:  268873
Max leaf nodes: 260  		 Mean Absolute Error:  269784
Max leaf nodes: 285  		 Mean Absolute Error:  268436
Max leaf nodes: 311  		 Mean Absolute Error:  265494
Max leaf nodes: 336  		 Mean Absolute Error:  264717
Max leaf nodes: 362  		 Mean Absolute Error:  264182
Max leaf nodes: 387  		 Mean Absolute Error:  262787
Max leaf nodes: 413  		 Mean Absolute Error:  261155
Max leaf nodes: 438  		 Mean Absolute Error:  262008
Max leaf nodes: 464  		 Mean Absolute Error:  26148