### Imports

In [None]:
%load_ext nb_black

In [None]:
import os
import tarfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from six.moves import urllib
from sklearn import svm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import (
    Imputer,
    LabelEncoder,
    OneHotEncoder,
    LabelBinarizer,
    StandardScaler,
)

In [None]:
# Remote location of the dataset archive and the local folder it is unpacked into.
# HOUSING_PATH is used both as the URL sub-path and as the local directory name.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = "".join([DOWNLOAD_ROOT, HOUSING_PATH, "/housing.tgz"])

### Function Definitions

In [None]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the tarball to download.
    housing_path : str
        Local directory that receives ``housing.tgz`` and the extracted
        files. It is created when it does not exist yet.
    """
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(housing_path, exist_ok=True)

    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)

    # The context manager closes the archive even when extraction raises.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only point housing_url at a source you trust.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)


def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory that contains ``housing.csv``.
    """
    # os.path.join keeps path handling consistent with fetch_housing_data().
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))


"""
prepare_test_set

Split the dataset into training and test sets using a specific ratio
"""

"""
np.random.seed(42)
def prepare_test_set(data, ratio=0.3):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * ratio)
    test_indices = shuffled_indices[:test_set_size]
    training_indices = shuffled_indices[test_set_size:]

    return data.iloc[training_indices], data.iloc[test_indices]
"""

In [None]:
# Download and unpack the archive, then read the extracted CSV into a DataFrame.
fetch_housing_data()
housing_data = load_housing_data()

# Preview the first rows as the cell output.
housing_data.head()

In [None]:
# Column dtypes and non-null counts: a quick way to spot columns with missing values.
housing_data.info()

In [None]:
# Summary statistics (count, mean, spread, quartiles) of the numeric columns.
housing_data.describe()

In [None]:
# One histogram per numeric column, 50 bins each, on a large canvas.
housing_data.hist(bins=50, figsize=(20, 15))

#### Simple data split

In [None]:
# Purely random 80/20 split; the fixed seed keeps it repeatable across runs.
simple_split = train_test_split(housing_data, test_size=0.2, random_state=42)
housing_data_train, housing_data_test = simple_split

split_report = "Training set size: {}, Test set size: {}".format(
    len(housing_data_train), len(housing_data_test)
)
print(split_report)

* A simple `train_test_split()` may not work well for data if there's an unbalanced distribution for any important attribute
* In this case, `median_income` is one such important attribute, which has income groups that vary in size

* Therefore, `median_income` is divided into bins of width 1.5 (chosen by looking at the histogram) and a new income category is created for the data; its groups are more evenly distributed, which lowers the associated sampling bias

In [None]:
# Distribution of median_income before it is binned into categories.
housing_data["median_income"].hist()

In [None]:
# Bin edges are 1.5 apart, with an open-ended last bin that collects every
# income above 6.0; each bin is labelled 1 to 5.
income_bin_edges = [0.0, 1.5, 3.0, 4.5, 6.0, np.inf]
income_bin_labels = [1, 2, 3, 4, 5]

housing_data["income_cat"] = pd.cut(
    housing_data["median_income"], bins=income_bin_edges, labels=income_bin_labels
)

In [None]:
# Size of each income category created by the binning step.
housing_data["income_cat"].hist()

#### Stratified Shuffle Split

In [None]:
# A single split is all that is needed: the loop that consumes this splitter
# keeps only the last (train, test) pair, so any extra split is wasted work.
stratSplit = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [None]:
# split() yields positional row indices, hence .iloc. If the splitter produces
# several splits, the last one is the one that is kept.
for train_index, test_index in stratSplit.split(
    housing_data, housing_data["income_cat"]
):
    # income_cat exists only to drive the stratification. Dropping it here keeps
    # this helper column from silently becoming a model feature downstream.
    housing_data_train = housing_data.iloc[train_index].drop(columns=["income_cat"])
    housing_data_test = housing_data.iloc[test_index].drop(columns=["income_cat"])

In [None]:
# Marker size scales with the population column; marker colour encodes
# median_house_value through the "jet" colormap.
marker_sizes = housing_data["population"] / 100
value_cmap = plt.get_cmap("jet")

housing_data.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    s=marker_sizes,
    c="median_house_value",
    colormap=value_cmap,
    colorbar=True,
    alpha=0.4,
    label="Population",
    legend=True,
    title="Population Density heatmap",
)

In [None]:
# Select the numeric columns explicitly: the frame also holds non-numeric
# columns (e.g. ocean_proximity), and recent pandas versions raise on those
# instead of silently skipping them in corr().
numeric_housing_data = housing_data.select_dtypes(include=[np.number])
corr_matrix = numeric_housing_data.corr()

# Correlation of every numeric attribute with the target, strongest first.
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Pairwise scatter plots between the target and three selected attributes.
scatter_columns = [
    "median_house_value",
    "median_income",
    "total_bedrooms",
    "housing_median_age",
]

pd.plotting.scatter_matrix(housing_data[scatter_columns], figsize=(12, 8))

#### Imputer

In [None]:
# Separate the features from the target column.
X_train_raw = housing_data_train.drop("median_house_value", axis=1)
y_train = housing_data_train["median_house_value"].copy()

median_imputer = Imputer(strategy="median")

# A median only makes sense for numeric columns, so the categorical
# ocean_proximity column is set aside before fitting.
X_train_numeric = X_train_raw.drop("ocean_proximity", axis=1)
median_imputer.fit(X_train_numeric)

print("Median values to be imputed: {}".format(median_imputer.statistics_))

# transform() returns a bare array. Restore the column labels AND the original
# row index so the rows stay aligned with y_train and X_train_raw; without the
# index the new frame would get a fresh 0..n-1 index.
X_train = pd.DataFrame(
    median_imputer.transform(X_train_numeric),
    columns=X_train_numeric.columns,
    index=X_train_numeric.index,
)

In [None]:
# Check the non-null counts of the imputed frame.
X_train.info()

- Scikit-Learn’s API is remarkably well designed. The main design principles are:

    - **Consistency**. All objects share a consistent and simple interface:
    
        - **Estimators**. Any object that can estimate some parameters based on a dataset is called an estimator (e.g., an imputer is an estimator). The estimation itself is performed by the `fit()` method, and it takes only a dataset as a parameter (or two for supervised learning algorithms; the second dataset contains the labels). Any other parameter needed to guide the estimation process is considered a hyperparameter (such as an imputer’s `strategy`), and it must be set as an instance variable (generally via a constructor parameter).
        
        - **Transformers**. Some estimators (such as an imputer) can also transform a dataset; these are called transformers. Once again, the API is quite simple: the transformation is performed by the `transform()` method with the dataset to transform as a parameter. It returns the transformed dataset. This transformation generally relies on the learned parameters, as is the case for an imputer. All transformers also have a convenience method called `fit_transform()` that is equivalent to calling `fit()` and then `transform()` (but sometimes `fit_transform()` is optimized and runs much faster).
        
        - **Predictors**. Finally, some estimators are capable of making predictions given a dataset; they are called predictors. For example, the `LinearRegression` model in the previous chapter was a predictor: it predicted life satisfaction given a country’s GDP per capita. A predictor has a `predict()` method that takes a dataset of new instances and returns a dataset of corresponding predictions. It also has a `score()` method that measures the quality of the predictions given a test set (and the corresponding labels in the case of supervised learning algorithms).
        
    - **Inspection**. All the estimator’s hyperparameters are accessible directly via public instance variables (e.g., `imputer.strategy`), and all the estimator’s learned parameters are also accessible via public instance variables with an underscore suffix (e.g., `imputer.statistics_`).
    
    - **Nonproliferation of classes**. Datasets are represented as `NumPy` arrays or `SciPy` sparse matrices, instead of homemade classes. Hyperparameters are just regular Python strings or numbers.
    
    - **Composition**. Existing building blocks are reused as much as possible. For example, it is easy to create a Pipeline estimator from an arbitrary sequence of transformers followed by a final estimator.
    
    - **Sensible defaults**. Scikit-Learn provides reasonable default values for most parameters, making it easy to create a baseline working system quickly.

#### Encoding Categorical Attributes
- LabelEncoder
- OneHotEncoder
    - OneHot accepts LabelEncoded attribute(s) as input
    - OneHot by default returns a SciPy sparse matrix; use `.toarray()` to convert it to a dense NumPy array
- LabelBinarizer
    - Combines above two and the *binarized* form is returned as dense Numpy by default. By setting `sparse_output=True`, the returned array can be made sparse



In [None]:
# Turn each ocean_proximity value into an integer code.
X_train_cat = X_train_raw["ocean_proximity"]

encoder = LabelEncoder()
X_train_cat_encoded = encoder.fit_transform(X_train_cat)

# The categories the encoder learned, shown as the cell output.
encoder.classes_

In [None]:
# The encoder expects a 2-D input, so the 1-D array of integer codes is
# reshaped into a single column first.
onehot = OneHotEncoder()
encoded_column = X_train_cat_encoded.reshape(-1, 1)
X_train_cat_onehot = onehot.fit_transform(encoded_column)

X_train_cat_onehot
# Dense view of the encoded matrix, shown as the cell output.
X_train_cat_onehot.toarray()

In [None]:
# Go from the raw category column to its binarized form in a single step.
binarizer = LabelBinarizer()
X_train_cat_binarized = binarizer.fit_transform(X_train_cat)

X_train_cat_binarized

#### Custom Transformers

In [None]:
# Positional indices of the source columns in the array handed to transform().
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
household_ix = 6


class CombinedHousingAttributesAdder(BaseEstimator, TransformerMixin):
    """Append per-household (and optionally per-room) ratio columns to an array.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default False
        When true, a bedrooms-per-room column is appended after the two
        per-household columns.
    """

    def __init__(self, add_bedrooms_per_room=False):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` is indexed as a 2-D array with the module-level ``*_ix``
        column positions.
        """
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]

        # Order of the appended columns: rooms/household, population/household,
        # then (optionally) bedrooms/room.
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)

        return np.c_[tuple([X] + extra_columns)]


# Try the transformer on the raw training values, including the optional
# bedrooms-per-room column.
attribute_adder = CombinedHousingAttributesAdder(add_bedrooms_per_room=True)

raw_training_values = X_train_raw.values
X_train_with_extra_attr = attribute_adder.transform(raw_training_values)

X_train_with_extra_attr

In [None]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick the given columns out of a DataFrame and hand back their values.

    Parameters
    ----------
    attributeNames : list of str
        Column labels to select in ``transform``.
    """

    def __init__(self, attributeNames):
        self.attributeNames = attributeNames

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return the ``.values`` of the selected columns of ``X``."""
        selected = X[self.attributeNames]
        return selected.values

#### Transformation pipeline (with Feature scaling)

In [None]:
categorical_attribs = ["ocean_proximity"]
numerical_attribs = list(X_train_numeric)

# Numeric branch: select columns -> fill gaps with the median -> add ratio
# attributes -> standardise.
numeric_steps = [
    ("selector", DataFrameSelector(numerical_attribs)),
    ("imputer", Imputer(strategy="median")),
    ("adder", CombinedHousingAttributesAdder(add_bedrooms_per_room=True)),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(numeric_steps)

# Categorical branch: select the column -> one-hot encode it.
categorical_steps = [
    ("selector", DataFrameSelector(categorical_attribs)),
    ("one_hot_encoder", OneHotEncoder()),
]
cat_pipeline = Pipeline(categorical_steps)

# Run both branches on the same input and join their outputs column-wise.
full_pipeline = FeatureUnion(
    transformer_list=[("num_pipeline", num_pipeline), ("cat_pipeline", cat_pipeline)]
)

In [None]:
# Split each set into its feature frame (X) and an independent copy of the
# target series (Y).
target_column = "median_house_value"

housing_data_train_X = housing_data_train.drop(columns=[target_column])
housing_data_train_Y = housing_data_train[target_column].copy()

housing_data_test_X = housing_data_test.drop(columns=[target_column])
housing_data_test_Y = housing_data_test[target_column].copy()

In [None]:
# The preprocessing is fitted on the training features only.
X_train_processed = full_pipeline.fit_transform(housing_data_train_X)
# transform(), not fit_transform(): the test set must be imputed, scaled and
# encoded with the statistics learned from the training set. Re-fitting here
# would leak test-set information and could yield a different feature layout.
X_test_processed = full_pipeline.transform(housing_data_test_X)

model_linear_regression = LinearRegression()
model_linear_regression.fit(X_train_processed, housing_data_train_Y)

predictions_linear_regression = model_linear_regression.predict(X_test_processed)

# Arguments follow scikit-learn's (y_true, y_pred) order.
rmse_linear_regression = np.sqrt(
    mean_squared_error(housing_data_test_Y, predictions_linear_regression)
)

In [None]:
# Test-set RMSE of the linear regression model, shown as the cell output.
rmse_linear_regression

In [None]:
# Seeded so the forest, its test RMSE and the cross-validation scores come out
# the same on every run.
model_rf = RandomForestRegressor(random_state=42)
model_rf.fit(X_train_processed, housing_data_train_Y)

predictions_rf = model_rf.predict(X_test_processed)

# Arguments follow scikit-learn's (y_true, y_pred) order.
rmse_rf = np.sqrt(mean_squared_error(housing_data_test_Y, predictions_rf))

rf_scores = cross_val_score(
    model_rf,
    X_train_processed,
    housing_data_train_Y,
    cv=10,
    scoring="neg_mean_squared_error",
)
# rmse_rf holds the single test-set RMSE; the cell output below is the RMSE of
# each of the 10 folds (the scores are negated MSEs, hence the sign flip).
np.sqrt(-rf_scores)

#### Hyperparameter fine-tuning 
- GridSearchCV
- RandomizedSearchCV (for larger search spaces)

In [None]:
# Two sub-grids: the first uses the estimator's default bootstrap setting,
# the second turns bootstrapping off.
param_grid = [
    {"n_estimators": [3, 10, 30], "max_features": [2, 4, 6, 8]},
    {"n_estimators": [3, 10], "max_features": [2, 3, 4], "bootstrap": [False]},
]
# Seeded so that score differences between candidates come from the
# hyperparameters rather than from run-to-run randomness of the forest.
model_rf_cv = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(
    model_rf_cv, param_grid, cv=5, scoring="neg_mean_squared_error"
)
grid_search.fit(X_train_processed, housing_data_train_Y)

print("Best Params: {}".format(grid_search.best_params_))
print("Best Estimator: {}".format(grid_search.best_estimator_))

In [None]:
param_dist = {
    "n_estimators": [3, 10, 30],
    "max_features": [2, 4, 6, 8],
    "bootstrap": [True, False],
}

# random_state fixes which parameter settings get sampled, so the same
# candidates are evaluated on every run.
randomized_search = RandomizedSearchCV(
    model_rf_cv,
    param_distributions=param_dist,
    scoring="neg_mean_squared_error",
    cv=5,
    random_state=42,
)

randomized_search.fit(X_train_processed, housing_data_train_Y)

print("Best Params: {}".format(randomized_search.best_params_))
print("Best Estimator: {}".format(randomized_search.best_estimator_))

In [None]:
# best_score_ is a negated MSE (scoring="neg_mean_squared_error"), so flip the
# sign before taking the root.
grid_rf_rmse = np.sqrt(-grid_search.best_score_)
random_rf_rmse = np.sqrt(-randomized_search.best_score_)

print("Grid Search based best score: {}".format(grid_search.best_score_))
print("Randomized Search based best score: {}".format(randomized_search.best_score_))

print("Grid Search RMSE: {}".format(grid_rf_rmse))
print("Randomized Search RMSE: {}".format(random_rf_rmse))

In [None]:
# NOTE(review): the randomized-search winner is used unconditionally, even if
# the grid search scored better. Confirm that this is intended.
final_model_rf = randomized_search.best_estimator_
final_predictions_rf = final_model_rf.predict(X_test_processed)

final_mse_rf = mean_squared_error(housing_data_test_Y, final_predictions_rf)
final_rmse_rf = np.sqrt(final_mse_rf)
print("Final RMSE for Random Forest Model: {}".format(final_rmse_rf))

### Trials

In [None]:
model_svr = svm.SVR()
svr_param_dist = {
    "kernel": ["linear", "rbf"],
    "C": [0.5, 1],
    "epsilon": [0.1],
}

# Every value here is a plain list, so the search space is finite. Cap n_iter
# at its size: the default n_iter=10 exceeds the number of combinations, which
# older scikit-learn releases reject and newer ones only warn about.
n_svr_candidates = int(np.prod([len(values) for values in svr_param_dist.values()]))

randomized_svr = RandomizedSearchCV(
    model_svr,
    param_distributions=svr_param_dist,
    n_iter=n_svr_candidates,
    scoring="neg_mean_squared_error",
    cv=5,
    random_state=42,  # fixes the order in which candidates are drawn
)

randomized_svr.fit(X_train_processed, housing_data_train_Y)

In [None]:
# best_score_ is a negated MSE, so flip the sign before taking the root.
svr_best_neg_mse = randomized_svr.best_score_
random_svr_rmse = np.sqrt(-svr_best_neg_mse)

print("Randomized Search RMSE for SVR: {}".format(random_svr_rmse))
print("Randomized SVR best estimator: {}".format(randomized_svr.best_estimator_))

In [None]:
# Score the best SVR found by the search on the held-out test features.
final_model_svr = randomized_svr.best_estimator_
final_predictions_svr = final_model_svr.predict(X_test_processed)

final_mse_svr = mean_squared_error(housing_data_test_Y, final_predictions_svr)
final_rmse_svr = np.sqrt(final_mse_svr)
print("Final RMSE for SVR Model: {}".format(final_rmse_svr))