In [2]:
import pandas as pd

# Read the Melbourne housing data from the CSV file.
melbourne_file_path = 'melb_data.csv'
melbourne_data = pd.read_csv(melbourne_file_path)

# Keep only complete rows: a row is discarded when ANY of its columns is
# missing, not just when Price is missing.
filtered_melbourne_data = melbourne_data.dropna(axis="index", how="any")

# Prediction target: the Price column of the complete rows.
pricesColumnData = filtered_melbourne_data["Price"]

# Columns used as model inputs. 'Lattitude' and 'Longtitude' are spelled
# this way because that is how the columns are named in the data.
melbourne_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'BuildingArea',
    'YearBuilt',
    'Lattitude',
    'Longtitude',
]
melbourne_features_data = filtered_melbourne_data[melbourne_features]

# Show the row count, non-null counts and dtypes of the selected features.
melbourne_features_data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 6196 entries, 1 to 12212
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rooms         6196 non-null   int64  
 1   Bathroom      6196 non-null   float64
 2   Landsize      6196 non-null   float64
 3   BuildingArea  6196 non-null   float64
 4   YearBuilt     6196 non-null   float64
 5   Lattitude     6196 non-null   float64
 6   Longtitude    6196 non-null   float64
dtypes: float64(6), int64(1)
memory usage: 387.2 KB


In [2]:
from sklearn.tree import DecisionTreeRegressor

# Define model.
# A fixed random_state makes the fitted tree reproducible from run to run
# (scikit-learn randomly permutes the candidate features at each split),
# and matches the seed used by the other seeded tree in this notebook.
melbourne_model = DecisionTreeRegressor(random_state=0)

# Fit model on the full feature table, then predict on those same rows.
# NOTE: these are in-sample predictions, so any error computed from them
# describes how closely the tree reproduces data it has already seen, not
# how well it generalises to new homes.
melbourne_model.fit(melbourne_features_data, pricesColumnData)
predicted_home_prices = melbourne_model.predict(melbourne_features_data)

# Preview the first five predicted prices.
predicted_home_prices[:5]

array([1035000., 1465000., 1600000., 1876000., 1636000.])

In [3]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error between the actual prices and the model's predictions.
# The predictions were made on the same rows the model was fitted on, so
# this is an in-sample score.
mean_absolute_error(y_true=pricesColumnData, y_pred=predicted_home_prices)

434.71594577146544

In [4]:
# Divide the data into training data and validation data
from sklearn.model_selection import train_test_split

# Split both the features and the target into training and validation parts.
# The split is driven by a random number generator; supplying a numeric
# random_state guarantees the same split every time this cell is run.
(
    trainingDataColumns,
    validationDataColumns,
    trainingDataValues,
    validationDataValues,
) = train_test_split(melbourne_features_data, pricesColumnData, random_state=0)

# Define model.
# The tree is seeded too: a fixed split alone does not make the result
# repeatable, because an unseeded DecisionTreeRegressor can grow a different
# tree (and so report a different validation error) on each run.
melbourne_model = DecisionTreeRegressor(random_state=0)

# Fit model on the training part only.
melbourne_model.fit(trainingDataColumns, trainingDataValues)

# Predict prices for the held-out validation rows and report the
# mean absolute error against their actual prices.
val_predictions = melbourne_model.predict(validationDataColumns)
print(mean_absolute_error(validationDataValues, val_predictions))

259838.9354422208


In [5]:
def get_mean_absolute_error_for(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a size-limited decision tree and return its validation MAE.

    Parameters
    ----------
    max_leaf_nodes
        Passed to ``DecisionTreeRegressor(max_leaf_nodes=...)`` to cap the
        number of leaves in the tree.
    train_X, train_y
        Features and target the tree is fitted on.
    val_X, val_y
        Features the fitted tree predicts for, and the target values those
        predictions are scored against.

    Returns
    -------
    The value of ``mean_absolute_error(val_y, predictions)``.
    """
    # random_state=0 keeps the fitted tree identical across calls.
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

# Sweep a few tree sizes and print the validation error obtained with each.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mean_absolute_error_for(
        max_leaf_nodes,
        train_X=trainingDataColumns,
        val_X=validationDataColumns,
        train_y=trainingDataValues,
        val_y=validationDataValues,
    )
    # %d prints the error as a whole number (the fractional part is dropped).
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 5  		 Mean Absolute Error:  347380
Max leaf nodes: 50  		 Mean Absolute Error:  258171
Max leaf nodes: 500  		 Mean Absolute Error:  243495
Max leaf nodes: 5000  		 Mean Absolute Error:  255015
