In [None]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

## Preparing the data

This part was already covered in the previous notebooks; please refer to them for more details.

In [None]:
# Load the raw housing dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs
# where the CSV lives at exactly this location; consider a configurable data directory.
housing = pd.read_csv("/home/jupyter/hands-on-ml/data/housing.csv")

In [None]:
# Bucket median_income into five labelled categories (1-5); the last bin is
# open-ended (np.inf) so every value above 6.0 falls into category 5.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=income_bin_edges,
    labels=income_bin_labels,
)

In [None]:
# Single stratified 80/20 split keyed on income_cat; random_state pins the shuffle
# so the same rows land in each subset on every run.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_data = housing.loc[train_index]
    # Fixed: this name was misspelled `strart_test_data`, which left
    # `strat_test_data` (used by the following cells) undefined.
    strat_test_data = housing.loc[test_index]

In [None]:
# Remove the helper income_cat column from both subsets, mutating each frame in place.
for set_ in (strat_train_data, strat_test_data):
    set_.drop(columns=["income_cat"], inplace=True)

In [None]:
# Split the training subset into the target (copied) and the predictors
# (everything except median_house_value).
housing_labels = strat_train_data["median_house_value"].copy()
housing = strat_train_data.drop(columns=["median_house_value"])

In [None]:
# Imputer that replaces missing values with each column's median.
# NOTE(review): num_pipeline builds its own SimpleImputer, and this standalone
# instance is not referenced again in the cells shown here.
imputer = SimpleImputer(strategy="median")

In [None]:
# Predictors without ocean_proximity; these are the columns fed to the numeric pipeline.
housing_num = housing.drop(columns=["ocean_proximity"])

In [None]:
# Column positions, within the 2-D array passed to the transformer, of the
# attributes used to build the ratio features.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features derived from existing columns of a 2-D array.

    Rooms-per-household and population-per-household are always appended;
    bedrooms-per-room is appended only when ``add_bedrooms_per_room`` is true.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        Whether to also append the bedrooms-per-room ratio.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` is indexed as ``X[:, i]``, so it must support 2-D positional
        slicing (e.g. a NumPy array).
        """
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]

        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)

        # np.c_ turns each 1-D ratio into a column and concatenates column-wise.
        return np.c_[(X, *extra_columns)]

In [None]:
# Numeric preprocessing, applied in order: median imputation, ratio-feature
# creation, then standardization.
num_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

In [None]:
# Fit the numeric pipeline on the numeric training columns and keep the transformed result.
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
# Column names routed to each branch of the full preprocessing pipeline.
cat_attribs = ["ocean_proximity"]
num_attribs = housing_num.columns.tolist()

In [None]:
# Route numeric columns through num_pipeline and one-hot encode the categorical
# column; the outputs are concatenated in the order listed.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ]
)

In [None]:
# Fit the full preprocessing pipeline on the training predictors and transform them.
housing_prepared = full_pipeline.fit_transform(housing)

## Training and Evaluating on the Training Set

In [None]:
# Baseline model: ordinary linear regression fitted on the prepared training set.
lin_reg = LinearRegression()
lin_reg.fit(X=housing_prepared, y=housing_labels)

Let's try it out on a few instances from the training set

In [None]:
# Take the first five training rows and their labels, then run the rows through
# the already-fitted preprocessing pipeline (transform only, no refit).
some_data = housing.head(5)
some_labels = housing_labels.head(5)
some_data_prepared = full_pipeline.transform(some_data)

In [None]:
# Show the linear model's predictions for the sample rows next to the true labels.
print("Predictions: ", lin_reg.predict(some_data_prepared))
print("Labels: ", list(some_labels))

It works, although the predictions are not exactly accurate. Let's measure this regression model's RMSE on the whole training set using Scikit-Learn's mean_squared_error() function.

In [None]:
# RMSE of the linear model on the data it was trained on.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

This is better than nothing, but clearly not a great score. This is an example of a model underfitting the training data. Let's try a DecisionTreeRegressor. This is a powerful model, capable of finding complex nonlinear relationships in the data.

In [None]:
# Decision tree fitted on the prepared training set.
# random_state is pinned so the fitted tree, and every score derived from it,
# is reproducible across Restart & Run All.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

Let's evaluate it on the training set:

In [None]:
# RMSE of the decision tree on the data it was trained on.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)

Could this model really be absolutely perfect? It is possible, but it is much more likely that the model has badly overfit the data. We need to know how the model performs on unseen data but, remember, don't touch the test set!!! Let's use cross-validation instead.

## Better Evaluation Using Cross-Validation

In [None]:
# 10-fold cross-validation of the decision tree; the scorer returns negated MSE
# values, one per fold.
scores = cross_val_score(
    tree_reg,
    X=housing_prepared,
    y=housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)

In [None]:
# Flip the sign of the negated MSE scores, then take the root to get per-fold RMSE.
tree_rmse_scores = np.sqrt(np.negative(scores))

Scikit-Learn's cross-validation features expect a utility function (greater is better) rather than a cost function (lower is better), so the scoring function is actually the opposite of the MSE (i.e., a negative value), which is why the preceding code computes -scores before calculating the square root.

In [None]:
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must expose ``.mean()`` and ``.std()`` (e.g. a NumPy array).
    """
    summary = (
        ("Scores: ", scores),
        ("Mean: ", scores.mean()),
        ("sd: ", scores.std()),
    )
    for label, value in summary:
        print(label, value)

In [None]:
# Per-fold RMSE, mean and standard deviation for the decision tree.
display_scores(tree_rmse_scores)

Now the Decision Tree does not look as good as it did earlier. In fact, it seems to perform worse than the Linear Regression model. Let's compute the same scores for the Linear Regression model.

In [None]:
# Same 10-fold cross-validation for the linear model, converted to per-fold RMSE.
lin_scores = cross_val_score(
    lin_reg,
    X=housing_prepared,
    y=housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
lin_rmse_scores = np.sqrt(np.negative(lin_scores))
display_scores(lin_rmse_scores)

Thanks to cross-validation, we can conclude that the Decision Tree model performs worse than Linear Regression.

Let's try one last model now: the RandomForestRegressor. 

In [None]:
# 10-fold cross-validation of a random forest (negated MSE per fold).
# random_state is pinned so the reported scores are reproducible across runs.
forest_reg = RandomForestRegressor(random_state=42)
random_forest_scores = cross_val_score(
    forest_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)

In [None]:
# Convert the forest's negated MSE scores to per-fold RMSE and summarize them.
forest_rmse = np.sqrt(np.negative(random_forest_scores))
display_scores(forest_rmse)

Random Forest looks very promising!!!

We should save every model we experiment with so that we can come back to any model we want. We have to make sure we save both the hyperparameters and the trained parameters, as well as the cross-validation scores and perhaps the actual predictions as well.

We can easily save Scikit-Learn models by using Python's pickle module or by using the joblib library, which is more efficient at serializing large NumPy arrays.

In [None]:
# cross_val_score fits clones of the estimator, so forest_reg itself is still
# untrained at this point. Fit it on the full training set first; otherwise the
# pickle would contain only hyperparameters and no trained parameters.
forest_reg.fit(housing_prepared, housing_labels)
joblib.dump(forest_reg, "forest_reg.pkl")

## Grid search

We are going to use Scikit-Learn's GridSearchCV to search for the best values of the hyperparameters. All we need to do is tell it which hyperparameters we want it to experiment with and what values to try out, and it will use cross-validation to evaluate all the possible combinations of hyperparameter values. For example, the following code searches for the best combination of hyperparameter values for the RandomForestRegressor.

In [None]:
# Two sub-grids: 3 x 4 = 12 combinations with default settings, then
# 2 x 3 = 6 combinations with bootstrap disabled.
param_grid = [
    dict(n_estimators=[3, 10, 30], max_features=[2, 4, 6, 8]),
    dict(bootstrap=[False], n_estimators=[3, 10], max_features=[2, 3, 4]),
]

In [None]:
# Grid search over param_grid, scored by negated MSE; train scores are kept too.
# random_state is pinned on the base estimator so the selected hyperparameters
# and reported scores are reproducible across runs.
forest_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search.fit(housing_prepared, housing_labels)

Getting the best combination of parameters

In [None]:
# Hyperparameter combination selected by the grid search.
grid_search.best_params_

We can also get the best estimator directly:

In [None]:
# Estimator configured with the selected hyperparameters.
grid_search.best_estimator_

If GridSearchCV is initialized with refit=True (which is the default), then once it finds the best estimator using cross-validation, it retrains it on the whole training set. This is usually a good idea, since feeding it more data will likely improve its performance.

And of course the evaluation scores are also available

In [None]:
# Print the RMSE (root of the negated mean test score) for every combination tried.
cvres = grid_search.cv_results_
for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    print(np.sqrt(-mean_score), params)

## Analyze the Best Models and Their Errors

You will often gain good insights into the problem by inspecting the best models. For example, the RandomForestRegressor can indicate the relative importance of each attribute for making accurate predictions.

In [None]:
# Importance scores exposed by the best estimator found by the grid search;
# the bare name on the last line displays them as the cell output.
feature_importance = grid_search.best_estimator_.feature_importances_
feature_importance

Let's display these importance scores next to their corresponding attribute names

In [None]:
# Rebuild the column names in the order the pipeline emits them: numeric
# columns, the three ratio features, then the one-hot categories.
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = [*num_attribs, *extra_attribs, *cat_one_hot_attribs]
# Pair each score with its name and list the highest scores first.
sorted(zip(feature_importance, attributes), reverse=True)

With this information, we may want to try dropping some of the less useful features.

## Evaluate the system on the Test Set

In [None]:
# Final model: the best estimator found by the grid search.
final_model = grid_search.best_estimator_

# Split the held-out test subset into target and predictors.
y_test = strat_test_data["median_house_value"].copy()
X_test = strat_test_data.drop(columns=["median_house_value"])

# transform only: the preprocessing pipeline must not be refitted on test data.
X_test_prepared = full_pipeline.transform(X_test)

In [None]:
# RMSE of the final model on the held-out test set.
final_prediction = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_true=y_test, y_pred=final_prediction)
final_rmse = np.sqrt(final_mse)

In [None]:
# Display the test-set RMSE as the cell output.
final_rmse