In [3]:
# Load the Iowa housing data set into a pandas DataFrame
import pandas as pd

# Relative path: resolved against the notebook's working directory
iowa_file_path = "IowaHouse.csv"
home_data = pd.read_csv(iowa_file_path)

In [4]:
# Show every column label so the target and the candidate features can be picked by name
home_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [5]:
# Prediction target: the SalePrice column of the housing data
y = home_data["SalePrice"]

In [6]:
# Predictor columns used by every model in this notebook
feature_names = [
    "LotArea",
    "YearBuilt",
    "1stFlrSF",
    "2ndFlrSF",
    "FullBath",
    "BedroomAbvGr",
    "TotRmsAbvGrd",
]

# Feature matrix: only the selected predictor columns, in the order listed above
X = home_data[feature_names]

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for each selected feature
X.describe()

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,10516.828082,1971.267808,1162.626712,346.992466,1.565068,2.866438,6.517808
std,9981.264932,30.202904,386.587738,436.528436,0.550916,0.815778,1.625393
min,1300.0,1872.0,334.0,0.0,0.0,0.0,2.0
25%,7553.5,1954.0,882.0,0.0,1.0,2.0,5.0
50%,9478.5,1973.0,1087.0,0.0,2.0,3.0,6.0
75%,11601.5,2000.0,1391.25,728.0,2.0,3.0,7.0
max,215245.0,2010.0,4692.0,2065.0,3.0,8.0,14.0


In [8]:
# Preview the first rows of the feature matrix
X.head()

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
0,8450,2003,856,854,2,3,8
1,9600,1976,1262,0,2,3,6
2,11250,2001,920,866,2,3,6
3,9550,1915,961,756,1,3,7
4,14260,2000,1145,1053,2,4,9


In [9]:
from sklearn.tree import DecisionTreeRegressor as DTR

# A fixed random_state keeps the fitted tree reproducible from run to run
iowa_model = DTR(random_state=1)

# Train on every row; the fitted estimator is the value this cell displays
iowa_model.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

In [10]:
# In-sample predictions: X is the same data iowa_model was fitted on
predictions = iowa_model.predict(X)
print(predictions)

[208500. 181500. 223500. ... 266500. 142125. 147500.]


In [13]:
# Compare predictions with the true prices for the first five homes.
# These rows were part of the data the model was fitted on, so a close match
# here says nothing about how the model performs on unseen homes.
print("First in-sample predictions:", iowa_model.predict(X.head()))
print("Actual target values for those homes:", y.head().tolist())

First in-sample predictions: [208500. 181500. 223500. 140000. 250000.]
Actual target values for those homes: [208500, 181500, 223500, 140000, 250000]


In [14]:
from sklearn.model_selection import train_test_split as tts

# Hold out part of the data for validation; random_state pins the shuffle so
# the same rows land in each split on every run
train_X, val_X, train_y, val_y = tts(X, y, random_state=1)

In [16]:
# Fresh, unfitted tree. This rebinds iowa_model, replacing the earlier model fitted on all rows.
iowa_model = DTR(random_state=1)

In [17]:
# Fit on the training split only, so the validation rows stay unseen by the model
iowa_model.fit(train_X, train_y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

In [18]:
# Out-of-sample predictions for the held-out validation rows
val_predictions = iowa_model.predict(val_X)

In [19]:
# Print the first few validation predictions
print(iowa_model.predict(val_X.head()))
# Print the actual prices of the same validation homes for comparison
print(val_y.head().tolist())

[186500. 184000. 130000.  92000. 164500.]
[231500, 179500, 122000, 84500, 142000]


In [20]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error on the held-out validation split (arguments: true values, then predictions)
val_mae = mean_absolute_error(val_y, val_predictions)
print(val_mae)

29652.931506849316


In [22]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y, random_state=0):
    """Return the validation MAE of a decision tree capped at ``max_leaf_nodes`` leaves.

    Args:
        max_leaf_nodes: Upper bound on the number of leaves in the tree.
        train_X: Training features.
        val_X: Validation features.
        train_y: Training targets.
        val_y: Validation targets.
        random_state: Seed passed to the regressor. Defaults to 0, the value
            this helper previously hard-coded, so existing calls give the same
            result. Pass 1 to match the seed used by the other models in this
            notebook.

    Returns:
        Mean absolute error of the fitted tree's predictions on ``val_X``.
    """
    model = DTR(max_leaf_nodes=max_leaf_nodes, random_state=random_state)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    return mean_absolute_error(val_y, preds_val)

In [23]:
# Sweep tree sizes from very small to very large and report the validation MAE of each.
# The %d conversion truncates the MAE to a whole number for display.
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" % (max_leaf_nodes, my_mae))

Max leaf nodes: 5  		 Mean Absolute Error:  35044
Max leaf nodes: 25  		 Mean Absolute Error:  29016
Max leaf nodes: 50  		 Mean Absolute Error:  27405
Max leaf nodes: 100  		 Mean Absolute Error:  27282
Max leaf nodes: 250  		 Mean Absolute Error:  27893
Max leaf nodes: 500  		 Mean Absolute Error:  29454


In [25]:
# Chosen max_leaf_nodes: 100, the candidate size with the lowest validation MAE in the sweep above
best_tree_size = 100

In [28]:
# Final decision tree, capped at the chosen number of leaves
final_model = DTR(max_leaf_nodes=best_tree_size, random_state=1)

# With the size chosen, train on all rows rather than only the training split
final_model.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=100, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')


In [32]:
# Predict with final_model, the model fitted in the preceding cell, rather than
# the older iowa_model that only saw the training split.
# NOTE: X contains the rows final_model was trained on, so despite the variable
# name these are in-sample predictions, not validation predictions.
val_predictions = final_model.predict(X)

In [36]:
# Print the final model's predictions for the first ten homes
print(final_model.predict(X.head(10)))
# Compare with the actual prices of the same ten homes (taken from the full data set, not the validation split)
print(y.head(10).tolist())

[209133.65384615 146415.0075188  209133.65384615 143297.46666667
 270325.         142034.21052632 286617.28571429 179327.65
 132913.33333333 130629.        ]
[208500, 181500, 223500, 140000, 250000, 143000, 307000, 200000, 129900, 118000]


Random Forest Regression

In [39]:
from sklearn.ensemble import RandomForestRegressor as RF

# Baseline random forest with default hyperparameters; random_state=1 keeps
# the fitted forest reproducible from run to run
rf_model = RF(random_state=1)
rf_model.fit(train_X, train_y)
rf_val_predictions = rf_model.predict(val_X)
# scikit-learn's signature is (y_true, y_pred): true values first, the same
# order used for the decision-tree MAE earlier in the notebook. MAE is
# symmetric, so the reported score does not change.
rf_val_mae = mean_absolute_error(val_y, rf_val_predictions)

print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))

Validation MAE for Random Forest Model: 22762.42931506849




In [42]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a random forest whose trees are capped at ``max_leaf_nodes`` leaves.

    NOTE: this definition reuses the name of the decision-tree ``get_mae``
    defined earlier in the notebook and replaces it from this cell onward.

    Args:
        max_leaf_nodes: Upper bound on the number of leaves in each tree.
        train_X: Training features.
        val_X: Validation features.
        train_y: Training targets.
        val_y: Validation targets.

    Returns:
        Mean absolute error of the fitted forest's predictions on ``val_X``.
    """
    forest = RF(max_leaf_nodes=max_leaf_nodes, random_state=1)
    forest.fit(train_X, train_y)
    forest_val_predictions = forest.predict(val_X)
    # scikit-learn's signature is (y_true, y_pred); MAE is symmetric, so the
    # score is the same as with the arguments reversed.
    return mean_absolute_error(val_y, forest_val_predictions)

In [43]:
# Repeat the size sweep with get_mae as currently defined and report the validation MAE of each.
# The %d conversion truncates the MAE to a whole number for display.
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" % (max_leaf_nodes, my_mae))



Max leaf nodes: 5  		 Mean Absolute Error:  31713
Max leaf nodes: 25  		 Mean Absolute Error:  24820
Max leaf nodes: 50  		 Mean Absolute Error:  23603
Max leaf nodes: 100  		 Mean Absolute Error:  22838




Max leaf nodes: 250  		 Mean Absolute Error:  22646
Max leaf nodes: 500  		 Mean Absolute Error:  22691


In [44]:
# Chosen max_leaf_nodes: 250, the candidate size with the lowest validation MAE in the random forest sweep above
best_tree_size = 250

In [45]:
# Final random forest, each tree capped at the chosen number of leaves.
# This rebinds final_model, replacing the decision tree stored under that name.
final_model = RF(max_leaf_nodes=best_tree_size, random_state=1)

# With the size chosen, train on all rows rather than only the training split
final_model.fit(X, y)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=250,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [46]:
# Predict with final_model, the random forest fitted in the preceding cell,
# rather than the older iowa_model decision tree.
# NOTE: X contains the rows final_model was trained on, so despite the variable
# name these are in-sample predictions, not validation predictions.
val_predictions = final_model.predict(X)

In [47]:
# Print the final model's predictions for the first ten homes
print(final_model.predict(X.head(10)))
# Compare with the actual prices of the same ten homes (taken from the full data set, not the validation split)
print(y.head(10).tolist())

[200525.20886367 163921.30426439 218459.38017135 149226.67539683
 278288.29134199 137179.91340654 280350.82888889 198422.76590077
 144747.59949495 113510.90911603]
[208500, 181500, 223500, 140000, 250000, 143000, 307000, 200000, 129900, 118000]
