In [2]:
# Data loading and baseline model (setup for the validation discussion below)
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

# Load data
melbourne_file_path = 'melb_data.csv'
melbourne_data = pd.read_csv(melbourne_file_path)
# Drop every row that has a missing value in ANY column (dropna with axis=0
# and the default how='any'), not only rows with a missing Price.
filtered_melbourne_data = melbourne_data.dropna(axis=0)
# Choose target and features
y = filtered_melbourne_data.Price
# 'Lattitude' and 'Longtitude' are spelled the way the dataset's columns are.
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea',
                        'YearBuilt', 'Lattitude', 'Longtitude']
X = filtered_melbourne_data[melbourne_features]

# Define model; random_state is fixed so that refitting yields the same tree
melbourne_model = DecisionTreeRegressor(random_state=0)
# Fit model on the full filtered data set
melbourne_model.fit(X, y)

DecisionTreeRegressor()

In [3]:
# With a fitted model in hand, score it using the mean absolute error (MAE).
from sklearn.metrics import mean_absolute_error

# Predict on the same rows the model was fitted on
predicted_home_prices = melbourne_model.predict(X)
mean_absolute_error(y_true=y, y_pred=predicted_home_prices)

434.71594577146544

The Problem with "In-Sample" Scores
The measure we just computed can be called an "in-sample" score. We used a single "sample" of houses for both building the model and evaluating it. Here's why this is bad.

Imagine that, in the large real estate market, door color is unrelated to home price.

However, in the sample of data you used to build the model, all homes with green doors were very expensive. The model's job is to find patterns that predict home prices, so it will see this pattern, and it will always predict high prices for homes with green doors.

Since this pattern was derived from the training data, the model will appear accurate in the training data.

But if this pattern doesn't hold when the model sees new data, the model would be very inaccurate when used in practice.

Since a model's practical value comes from making predictions on new data, we measure performance on data that wasn't used to build the model. The most straightforward way to do this is to exclude some data from the model-building process, and then use that data to test the model's accuracy on data it hasn't seen before. This data is called validation data.

In [7]:
# scikit-learn's train_test_split breaks the data into two pieces.
# One piece is training data used to fit the model; the other is validation
# data used to calculate mean_absolute_error.
from sklearn.model_selection import train_test_split

# Split both features and target into training and validation sets.
# The split is driven by a random number generator; supplying a numeric
# random_state guarantees the same split every time this cell runs.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)
print(train_X)
print(val_X)
print(train_y)
print(val_y)

# Define model. Tree construction is itself randomized, so the model is
# seeded as well; otherwise the validation error below could differ from run
# to run even though the split is fixed.
melbourne_model = DecisionTreeRegressor(random_state=0)
# Fit model on the training portion only
melbourne_model.fit(train_X, train_y)

# Get predicted prices on the held-out validation data and report the error
val_predictions = melbourne_model.predict(val_X)
print("MEAN ABSOLUTE ERROR : ",mean_absolute_error(val_y, val_predictions))


       Rooms  Bathroom  Landsize  BuildingArea  YearBuilt  Lattitude  \
10385      3       1.0     206.0         110.0     1980.0  -37.87107   
5805       2       1.0       0.0          73.0     2000.0  -37.85900   
8488       2       1.0    2701.0          79.0     2011.0  -37.81090   
6672       3       1.0     670.0         116.0     1940.0  -37.81340   
776        6       3.0     708.0         275.0     1988.0  -37.91810   
...      ...       ...       ...           ...        ...        ...   
9510       3       1.0     118.0         177.0     1890.0  -37.81351   
6023       5       2.0     661.0         133.0     1960.0  -37.76510   
2960       4       2.0     453.0         213.0     2007.0  -37.70160   
4729       2       1.0      90.0         106.0     2007.0  -37.83570   
4996       3       1.0     495.0         100.0     1950.0  -37.75210   

       Longtitude  
10385   145.04991  
5805    144.97670  
8488    144.86840  
6672    144.87450  
776     145.04400  
...           .