# Initial Setup

In [28]:
# Code used to load data
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

# Path of the file to read
iowa_file_path = 'datasets/home_data/train.csv'

# Read the CSV into a DataFrame; SalePrice is the prediction target.
home_data = pd.read_csv(iowa_file_path)
y = home_data.SalePrice

# Columns used as predictors by every model in this notebook.
feature_columns = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[feature_columns]

# Specify Model
# random_state is pinned (as for every other model in this notebook) so that
# re-running the cell rebuilds the same tree instead of a randomly varying one.
iowa_model = DecisionTreeRegressor(random_state=1)
# Fit Model
iowa_model.fit(X, y)

# Sanity check only: these predictions are made on rows the model was fitted
# on, so they say nothing about accuracy on unseen homes.
print("First in-sample predictions:", iowa_model.predict(X.head()))
print("Actual target values for those homes:", y.head().tolist())

First in-sample predictions: [208500. 181500. 223500. 140000. 250000.]
Actual target values for those homes: [208500, 181500, 223500, 140000, 250000]


In [45]:
# Import the train_test_split function
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. random_state fixes the shuffle so
# the same rows land in each partition on every run.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Decision Tree Regression

In [46]:
# Specify the model
# random_state=1 keeps the fitted tree identical across re-runs.
iowa_model = DecisionTreeRegressor(random_state=1)

# Fit iowa_model with the training data.
# Only the training partition is used, so the validation rows stay unseen
# and can be used to score the model afterwards.
iowa_model.fit(train_X, train_y)

In [39]:
# Predict with all validation observations
# (val_X is the held-out feature partition produced by train_test_split).
val_predictions = iowa_model.predict(val_X)

In [40]:
# First five validation predictions, wrapped in a DataFrame so they print as
# a table (the single unnamed column is labelled 0).
print(pd.DataFrame(val_predictions).head(n=5))
# First five actual sale prices from the validation target, for comparison.
print(val_y.head(n=5))

          0
0  335000.0
1  141500.0
2  129000.0
3  207500.0
4  112000.0
529    200624
491    133000
459    110000
279    192000
655     88000
Name: SalePrice, dtype: int64


In [41]:
from sklearn.metrics import mean_absolute_error
# Average absolute gap between actual and predicted sale prices on the
# validation partition.
val_mae = mean_absolute_error(y_true=val_y, y_pred=val_predictions)

# Report the error rounded to whole units with thousands separators.
print("Validation MAE: {:,.0f}".format(val_mae))

Validation MAE: 34,064


In [47]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error of a size-limited tree.

    A DecisionTreeRegressor capped at ``max_leaf_nodes`` leaves (with
    ``random_state=1``) is fitted on ``train_X``/``train_y``. Its predictions
    for ``val_X`` are then scored against ``val_y`` with mean_absolute_error,
    and that score is returned.
    """
    tree = DecisionTreeRegressor(random_state=1, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [48]:
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
scores = {}
# Score one tree per candidate size on the validation partition, recording
# each error under its max_leaf_nodes value.
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    scores[max_leaf_nodes] = my_mae
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" % (max_leaf_nodes, my_mae))

# The best tree size is the candidate with the lowest validation error
# (it will be either 5, 25, 50, 100, 250 or 500).
best_tree_size = min(scores, key=scores.get)

Max leaf nodes: 5  		 Mean Absolute Error:  35436
Max leaf nodes: 25  		 Mean Absolute Error:  29324
Max leaf nodes: 50  		 Mean Absolute Error:  28003
Max leaf nodes: 100  		 Mean Absolute Error:  27935
Max leaf nodes: 250  		 Mean Absolute Error:  29320
Max leaf nodes: 500  		 Mean Absolute Error:  29789


In [49]:
# Report the max_leaf_nodes candidate that gave the lowest validation MAE.
print('Optimum Tree Size: ', best_tree_size)

Optimum Tree Size:  100


In [50]:
# Using best value for max_leaf_nodes
# best_tree_size comes from the sweep over candidate_max_leaf_nodes, so this
# cell stays correct if the candidates or the split change (the value was
# previously hard-coded as 100).
iowa_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)
iowa_model.fit(train_X, train_y)
val_predictions = iowa_model.predict(val_X)
# Arguments are passed as (actual, predicted), matching every other
# mean_absolute_error call in this notebook.
val_mae = mean_absolute_error(val_y, val_predictions)
print("Validation MAE for best value of max_leaf_nodes: {:,.0f}".format(val_mae))

Validation MAE for best value of max_leaf_nodes: 27,936


# Random Forest Regression

In [51]:
from sklearn.ensemble import RandomForestRegressor

# Build the random forest with random_state=1 and train it on the training
# partition in one step (fit returns the fitted estimator itself).
rf_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)

# Predict sale prices for the held-out validation rows.
rf_pred = rf_model.predict(val_X)

# Mean absolute error of the random forest on the validation data.
rf_val_mae = mean_absolute_error(y_true=val_y, y_pred=rf_pred)

print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))


Validation MAE for Random Forest Model: 22255.863997064578
