In [4]:
# get and prepare data
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from load_housing_data import load_housing_data
from split_train_test import split_train_test_strat
from preprocess_data import create_pipeline
housing_full = load_housing_data() # get data
strat_train_set, strat_test_set = split_train_test_strat(housing_full, 0.2, remove_cat=True) #split data
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
full_pipeline = create_pipeline(housing)
housing_prepared = full_pipeline.fit_transform(housing) # preprocess data

In [None]:
# linear regression model
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
lin_reg.predict(some_data_prepared)
print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))
# get error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print("RMSE:",lin_rmse)

In [None]:
# decision tree model
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print("RMSE:",tree_rmse)

In [None]:
# evaluate with cross-validation
# decision tree model
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
    scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
print("### Decision Tree Model ###")
print("Scores:", tree_rmse_scores)
print("Mean:", tree_rmse_scores.mean())
print("Standard deviation:", tree_rmse_scores.std())

# linear regression model
print("")
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
    scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
print("### Linear Regression Model ###")
print("Scores:", lin_rmse_scores)
print("Mean:", lin_rmse_scores.mean())
print("Standard deviation:", lin_rmse_scores.std())

In [8]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
    scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
print("### Random Forest Model ###")
print("RMSE:",forest_rmse)
print("Scores:", forest_rmse_scores)
print("Mean:", forest_rmse_scores.mean())
print("Standard deviation:", forest_rmse_scores.std())

### Random Forest Model ###
RMSE: 18696.805459159288
Scores: [49186.19171334 47478.38283589 49693.18864527 52487.66495968
 49721.62461591 53367.78057844 49087.90122396 48089.31449078
 53219.46187941 50693.61539868]
Mean: 50302.51263413594
Standard deviation: 1978.292889242589


In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
    scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)
print(grid_search.best_params_)
print(grid_search.best_estimator_)
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

{'max_features': 6, 'n_estimators': 30}
RandomForestRegressor(max_features=6, n_estimators=30)
65439.981694325106 {'max_features': 2, 'n_estimators': 3}
55565.02373837062 {'max_features': 2, 'n_estimators': 10}
53002.712554244965 {'max_features': 2, 'n_estimators': 30}
61248.122592698106 {'max_features': 4, 'n_estimators': 3}
53099.59596269557 {'max_features': 4, 'n_estimators': 10}
50214.92965573183 {'max_features': 4, 'n_estimators': 30}
58821.449005313436 {'max_features': 6, 'n_estimators': 3}
52445.97986151492 {'max_features': 6, 'n_estimators': 10}
50046.663965280204 {'max_features': 6, 'n_estimators': 30}
58772.185314120354 {'max_features': 8, 'n_estimators': 3}
52147.32762326264 {'max_features': 8, 'n_estimators': 10}
50091.12542642831 {'max_features': 8, 'n_estimators': 30}
62621.0615429922 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54232.378064182696 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59998.25156193834 {'bootstrap': False, 'max_features

In [11]:
# attribute importance
feature_importances = grid_search.best_estimator_.feature_importances_
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
num_attribs = list(housing.drop("ocean_proximity", axis=1))
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

[(0.3626678059577339, 'median_income'),
 (0.14711232471098867, 'INLAND'),
 (0.1132967657016451, 'pop_per_hhold'),
 (0.07231957988658588, 'bedrooms_per_room'),
 (0.07147764394143201, 'longitude'),
 (0.06535715696214901, 'latitude'),
 (0.045846725342648366, 'rooms_per_hhold'),
 (0.042250379584616256, 'housing_median_age'),
 (0.01748409825120641, 'population'),
 (0.016831263172775238, 'total_rooms'),
 (0.016633239768317307, 'households'),
 (0.016602148542185773, 'total_bedrooms'),
 (0.005126108119813327, '<1H OCEAN'),
 (0.005024119107436379, 'NEAR OCEAN'),
 (0.0018839723526366844, 'NEAR BAY'),
 (8.666859782967772e-05, 'ISLAND')]

In [16]:
# evauate in test set
final_model = grid_search.best_estimator_
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print("RMSE:",final_rmse)
# 95% confident interal
from scipy import stats
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
conf95 = np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
    loc=squared_errors.mean(), scale=stats.sem(squared_errors)))
print("95% confident interval:", list(conf95))

RMSE: 48167.09792720746
95% confident interval: [46113.96735506444, 50136.22104062501]
