**Chapter 2 – End-to-end Machine Learning project**

# Setup

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5), "Python 3.5 or newer is required"

# Scikit-Learn ≥0.20 is required.
# Compare the numeric (major, minor) components rather than the raw version
# string: string comparison is lexicographic, so e.g. "0.9" >= "0.20" would
# wrongly pass.
import re
import sklearn
assert tuple(
    int(part) for part in re.findall(r"\d+", sklearn.__version__)[:2]
) >= (0, 20), "Scikit-Learn 0.20 or newer is required"

# Common imports
import numpy as np
import os

## Get the Data

In [2]:
import os

# Directory (relative to the working directory) that holds the housing CSV files.
HOUSING_PATH = os.path.join(*("datasets", "housing"))
# Directory (relative to the working directory) that holds the persisted .pkl files.
BEST_MODEL_PATH = os.path.join(os.curdir, "best_model")

In [3]:
import pandas as pd

def load_test_data(housing_path=HOUSING_PATH):
    """Read ``test_set.csv`` from ``housing_path`` and return it as a DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory containing ``test_set.csv``; defaults to ``HOUSING_PATH``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(os.path.join(housing_path, "test_set.csv"))

In [4]:
# Read test_set.csv from the default HOUSING_PATH directory into a DataFrame.
test_set = load_test_data()

## Custom Transformers

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positional indices of the columns that CombinedAttributesAdder divides.
# NOTE(review): the "- 1" shifts every index down by one relative to
# test_set's own column order -- presumably the array passed to the
# transformer lacks one leading column of test_set; confirm against the
# persisted pipeline.
col_names = "total_rooms", "total_bedrooms", "population", "households"
rooms_ix, bedrooms_ix, population_ix, households_ix = (
    test_set.columns.get_loc(name) - 1 for name in col_names)

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends ratio columns to an array.

    The input is indexed as ``X[:, i]`` using the module-level ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``households_ix`` positions.
    Appended columns, in order: rooms per household, population per
    household and, when ``add_bedrooms_per_room`` is true, bedrooms per room.
    """

    def __init__(self, add_bedrooms_per_room=True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio columns appended on the right."""
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        derived = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            derived.append(X[:, bedrooms_ix] / rooms)
        # np.c_[a, b, ...] receives its operands as a tuple, so indexing with
        # an explicit tuple is the same call.
        return np.c_[tuple([X] + derived)]

## Model persistence using joblib

### Utilities

In [6]:
def display_metric_scores(metric, scores):
    """Print a metric name, its scores, and the scores' mean and standard deviation.

    Parameters
    ----------
    metric : object
        Label printed after ``"Metric:"``.
    scores : object
        Must provide ``mean()`` and ``std()`` methods; printed as-is after
        ``"Scores:"``.
    """
    def _report_rows():
        # Generator so each value is computed only when its line is about to
        # be printed, one line at a time.
        yield "Metric:", metric
        yield "Scores:", scores
        yield "Mean:", scores.mean()
        yield "Standard deviation:", scores.std()

    for label, value in _report_rows():
        print(label, value)

In [7]:
import joblib

# SECURITY: joblib.load unpickles arbitrary Python objects, so only load
# artifacts that come from a trusted source.
# NOTE(review): presumably the pickled pipeline references
# CombinedAttributesAdder, which would be why that class is defined earlier
# in this notebook -- confirm.
# Paths are built with os.path.join, matching how BEST_MODEL_PATH itself and
# the CSV path are built, instead of hand-concatenating "/".
data_pipeline = joblib.load(os.path.join(BEST_MODEL_PATH, "data_pipeline.pkl"))
model_loaded = joblib.load(os.path.join(BEST_MODEL_PATH, "best_model.pkl"))

  from pandas import MultiIndex, Int64Index


## Test the model

In [8]:
from sklearn.metrics import mean_squared_error

# Separate the target column from the remaining columns.
y_test = test_set["median_house_value"].copy()
X_test = test_set.drop(columns="median_house_value")

# Run the loaded preprocessing pipeline, then predict with the loaded model.
X_test_prepared = data_pipeline.transform(X_test)
final_predictions_loaded = model_loaded.predict(X_test_prepared)

# RMSE is the square root of the mean squared error.
final_mse_loaded = mean_squared_error(y_test, final_predictions_loaded)
final_rmse_loaded = np.sqrt(final_mse_loaded)

print("RMSE:", final_rmse_loaded)

RMSE: 44072.60145502183
