# Explicação

In [1]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, train_y, val_X, val_y):
    """Train a size-limited decision tree and score it on validation data.

    A DecisionTreeRegressor restricted to ``max_leaf_nodes`` leaves (with
    ``random_state=0``) is fitted on ``train_X`` / ``train_y`` and then asked
    to predict ``val_X``.

    Returns:
        The mean absolute error between ``val_y`` and those predictions.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [6]:
# Data Loading Code Runs At This Point
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the Melbourne housing CSV (path is relative to this notebook)
melbourne_path = '../02_basic_data_exploration/melb_data.csv'
melbourne_data = pd.read_csv(melbourne_path)

# Keep only rows with no missing values, then preview the first few
updated_melbourne_data = melbourne_data.dropna(axis=0)
print(updated_melbourne_data.head())

# Predictor columns (names spelled as they appear in the data) and the target
features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude']
X = updated_melbourne_data[features]
y = updated_melbourne_data['Price']

# Hold out part of the rows for validation, for both features and target.
# NOTE(review): train_test_split(X, y) returns (X_train, X_test, y_train, y_test),
# so with this unpacking order `train_y` actually receives the validation
# FEATURES and `val_X` receives the training TARGETS. The get_mae call further
# down passes them in a compensating order, so the reported numbers are right,
# but the names are misleading. Rename here and at that call site together.
train_X, train_y, val_X, val_y = train_test_split(X, y, random_state=0)

       Suburb          Address  Rooms Type      Price Method SellerG  \
1  Abbotsford  25 Bloomburg St      2    h  1035000.0      S  Biggin   
2  Abbotsford     5 Charles St      3    h  1465000.0     SP  Biggin   
4  Abbotsford      55a Park St      4    h  1600000.0     VB  Nelson   
6  Abbotsford     124 Yarra St      3    h  1876000.0      S  Nelson   
7  Abbotsford    98 Charles St      2    h  1636000.0      S  Nelson   

        Date  Distance  Postcode  ...  Bathroom  Car  Landsize  BuildingArea  \
1  4/02/2016       2.5    3067.0  ...       1.0  0.0     156.0          79.0   
2  4/03/2017       2.5    3067.0  ...       2.0  0.0     134.0         150.0   
4  4/06/2016       2.5    3067.0  ...       1.0  2.0     120.0         142.0   
6  7/05/2016       2.5    3067.0  ...       2.0  0.0     245.0         210.0   
7  8/10/2016       2.5    3067.0  ...       1.0  2.0     256.0         107.0   

   YearBuilt  CouncilArea Lattitude  Longtitude             Regionname  \
1     1900.0

In [8]:
# Sweep several tree sizes and report the validation MAE for each one.
# NOTE(review): the split cell above unpacks train_test_split's result as
# (train_X, train_y, val_X, val_y), whereas sklearn returns
# (X_train, X_test, y_train, y_test). As a result `val_X` holds the training
# targets and `train_y` holds the validation features here; the keyword
# arguments below route each one to the get_mae parameter it really belongs to.
# Fix the names at the split and at this call together.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(
        max_leaf_nodes,
        train_X=train_X,
        train_y=val_X,
        val_X=train_y,
        val_y=val_y,
    )
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 5  		 Mean Absolute Error:  347380
Max leaf nodes: 50  		 Mean Absolute Error:  258171
Max leaf nodes: 500  		 Mean Absolute Error:  243495
Max leaf nodes: 5000  		 Mean Absolute Error:  254983
