# Select and Train a Model

In [None]:
import pathlib
import requests
import tarfile

import numpy as np
import pandas as pd
from sklearn import model_selection


def download_data(url, data_dir, timeout=60):
    """Download ``url`` and save the payload as ``data_dir / "housing.tgz"``.

    Parameters
    ----------
    url : str
        Address of the archive to fetch.
    data_dir : pathlib.Path
        Existing directory in which ``housing.tgz`` is written.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    # Fetch and validate *before* opening the target file, so a failed request
    # never leaves an empty archive or a saved HTML error page behind.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    with open(data_dir / "housing.tgz", "wb") as f:
        f.write(response.content)


def extract_data(data_dir):
    """Unpack ``data_dir / "housing.tgz"`` into ``data_dir``.

    Parameters
    ----------
    data_dir : pathlib.Path
        Directory that holds ``housing.tgz`` and receives the extracted files.
    """
    with tarfile.open(data_dir / "housing.tgz") as tgz:
        # The archive comes from the network, so prefer the "data" extraction
        # filter, which rejects members that would escape the target
        # directory. Older Python releases lack extraction filters, hence the
        # feature check and the fallback to the plain call.
        if hasattr(tarfile, "data_filter"):
            tgz.extractall(path=data_dir, filter="data")
        else:
            tgz.extractall(path=data_dir)


# Load the data: fetch and unpack the archive, then read the CSV it contains.
url = "https://github.com/ageron/data/raw/main/housing.tgz"
data_dir = pathlib.Path("./sample_data")
data_dir.mkdir(parents=True, exist_ok=True)

download_data(url, data_dir)
extract_data(data_dir)
housing_df = pd.read_csv(data_dir / "housing" / "housing.csv")

# Stratified sampling to match the income distribution: bucket median_income
# into five categories that are used only as the stratification key.
housing_df["income_cat"] = pd.cut(
    housing_df["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[0, 1, 2, 3, 4]
)

train_df, test_df = model_selection.train_test_split(
    housing_df,
    test_size=0.2,
    stratify=housing_df["income_cat"],
    random_state=42
)

# Remove the helper column by rebinding instead of `inplace=True`: mutating
# the frames returned by the split in place is discouraged and can raise
# SettingWithCopyWarning on some pandas versions.
train_df = train_df.drop(columns="income_cat")
test_df = test_df.drop(columns="income_cat")

# Split off the features and the target.
train_features_df = train_df.drop(columns="median_house_value")
train_targets = train_df["median_house_value"]

In [None]:
from sklearn import base, cluster, compose, impute, metrics, pipeline, preprocessing


class ClusterSimilarity(base.BaseEstimator, base.TransformerMixin):
    """Transformer that maps samples to RBF similarities with k-means centres.

    ``fit`` runs k-means on the input; ``transform`` returns the RBF-kernel
    similarity between every sample and every fitted cluster centre.

    Parameters
    ----------
    n_clusters : int, default=10
        Number of k-means clusters, and therefore of output features.
    gamma : float, default=1.0
        ``gamma`` passed to the RBF kernel.
    random_state : int or None, default=None
        Seed forwarded to ``KMeans``.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        # Hyperparameters are stored as-is; nothing is fitted here.
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit k-means on ``X`` (``y`` is unused) and return ``self``."""
        self.kmeans_ = cluster.KMeans(
            n_clusters=self.n_clusters,
            n_init=10,
            random_state=self.random_state,
        ).fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return the RBF similarity of each row of ``X`` to each centre."""
        centers = self.kmeans_.cluster_centers_
        return metrics.pairwise.rbf_kernel(X, Y=centers, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one output name per cluster; ``names`` is ignored."""
        n_features_out = self.n_clusters
        return [f"cluster_{i:02d}_similarity" for i in range(n_features_out)]


def column_ratio(df):
    """Return the element-wise ratio of the first column of ``df`` to its second.

    Columns are selected by position through ``.iloc``, so ``df`` must be a
    pandas-style two-dimensional object.
    """
    numerator = df.iloc[:, 0]
    denominator = df.iloc[:, 1]
    return numerator / denominator


def ratio_name(function_transformer, feature_names_in):
    """Feature-name callback that always reports a single output, ``"ratio"``.

    Both arguments are accepted to satisfy the callback signature and are
    otherwise ignored.
    """
    feature_names_out = ["ratio"]
    return feature_names_out


def make_ratio_pipeline():
    """Build a new pipeline: median imputation, column ratio, standard scaling.

    Every call creates a separate pipeline object, configured to emit pandas
    output and to report progress (``verbose=True``).
    """
    steps = (
        impute.SimpleImputer(strategy="median"),
        preprocessing.FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        preprocessing.StandardScaler(),
    )
    ratio_pipeline = pipeline.make_pipeline(*steps, verbose=True)
    # set_output configures the pipeline in place (and returns it).
    ratio_pipeline.set_output(transform="pandas")
    return ratio_pipeline


# Log pipeline: impute medians, take the natural log (np.exp is registered as
# the inverse), then standardize.
log_transform_pipeline = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="median"),
    preprocessing.FunctionTransformer(
        np.log,
        inverse_func=np.exp,
        feature_names_out="one-to-one",
    ),
    preprocessing.StandardScaler(),
)
log_transform_pipeline.set_output(transform="pandas")

# Geographic features: similarity to each of 10 k-means cluster centres.
cluster_similarity = ClusterSimilarity(n_clusters=10, gamma=1.0, random_state=42)
cluster_similarity.set_output(transform="pandas")

# Categorical pipeline: fill gaps with the most frequent value, then one-hot
# encode into a dense array, ignoring categories unseen during fit.
categorical_pipeline = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="most_frequent"),
    preprocessing.OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
)
categorical_pipeline.set_output(transform="pandas")

# Fallback for every column not claimed below: median imputation + scaling.
default_numeric_pipeline = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="median"),
    preprocessing.StandardScaler(),
    verbose=True,
)
default_numeric_pipeline.set_output(transform="pandas")

# Full preprocessing: route each group of columns to its own transformer.
preprocessing_pipeline = compose.ColumnTransformer(
    transformers=[
        ("bedrooms", make_ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", make_ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", make_ratio_pipeline(), ["population", "households"]),
        (
            "log",
            log_transform_pipeline,
            ["total_bedrooms", "total_rooms", "population", "households", "median_income"],
        ),
        ("geo", cluster_similarity, ["latitude", "longitude"]),
        (
            "categorical",
            categorical_pipeline,
            compose.make_column_selector(dtype_include=object),
        ),
    ],
    n_jobs=-1,
    remainder=default_numeric_pipeline,
    verbose=True,
)
preprocessing_pipeline.set_output(transform="pandas")


In [None]:
# Bare expression as the cell's last line; presumably there to make the
# notebook render the ColumnTransformer.
preprocessing_pipeline

## Training and Evaluating on the Training Set

In [None]:
from sklearn import linear_model


# Chain the preprocessing step with a linear regression model.
linear_regression_pipeline = (
    pipeline.make_pipeline(
        preprocessing_pipeline,
        linear_model.LinearRegression(),
    )
)

In [None]:
# Bare expression as the cell's last line; presumably there to make the
# notebook render the pipeline.
linear_regression_pipeline

In [None]:
# Fit preprocessing + regressor on the training data. The return value is
# bound to `_`, presumably to keep the notebook from echoing the estimator.
_ = linear_regression_pipeline.fit(train_features_df, train_targets)

Let's use the fitted pipeline to make predictions on the training set:

In [None]:
# Predict on the full training feature set (not a subset of it).
linear_regression_pipeline.predict(train_features_df)

Compare the predictions against the actual values by computing the RMSE on the training set:

In [None]:
from sklearn import metrics

# Score the linear model on the same data it was trained on.
train_predictions = linear_regression_pipeline.predict(train_features_df)

# `squared=False` is deprecated in recent scikit-learn releases; the square
# root of the MSE gives the same RMSE on every version.
linear_regression_rmse = np.sqrt(
    metrics.mean_squared_error(
        train_targets,
        train_predictions
    )
)

linear_regression_rmse

In [None]:
from sklearn import tree


# Preprocessing followed by a decision tree regressor with a fixed seed.
# NOTE(review): this reuses the same `preprocessing_pipeline` object as the
# other model pipelines, so fitting one presumably refits the preprocessing
# seen by the others - confirm that is acceptable.
decision_tree_pipeline = pipeline.make_pipeline(
    preprocessing_pipeline,
    tree.DecisionTreeRegressor(random_state=42),
)

In [None]:
# Bare expression as the cell's last line; presumably there to make the
# notebook render the pipeline.
decision_tree_pipeline

In [None]:
# Fit preprocessing + decision tree on the training data. The return value is
# bound to `_`, presumably to keep the notebook from echoing the estimator.
_ = decision_tree_pipeline.fit(train_features_df, train_targets)

In [None]:
# Score the decision tree on the same data it was trained on.
train_predictions = decision_tree_pipeline.predict(train_features_df)

# `squared=False` is deprecated in recent scikit-learn releases; the square
# root of the MSE gives the same RMSE on every version.
decision_tree_rmse = np.sqrt(
    metrics.mean_squared_error(
        train_targets,
        train_predictions
    )
)

decision_tree_rmse

## Better Evaluation Using Cross-Validation

In [None]:
# 5-fold cross-validation of the decision tree pipeline. The scorer returns
# the negated RMSE, one value per fold.
decision_tree_scores = model_selection.cross_val_score(
    estimator=decision_tree_pipeline,
    X=train_features_df,
    y=train_targets,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
)

In [None]:
# Flip the sign of the negated scores to get per-fold RMSE values. This
# rebinds `decision_tree_rmse`, which earlier held the scalar training RMSE.
decision_tree_rmse = pd.Series(data=-decision_tree_scores, name="rmse")

In [None]:
# Summary statistics of the per-fold RMSE values.
decision_tree_rmse.describe()

In [None]:
# 5-fold cross-validation of the linear regression pipeline. The scorer
# returns the negated RMSE, one value per fold.
linear_regression_scores = model_selection.cross_val_score(
    estimator=linear_regression_pipeline,
    X=train_features_df,
    y=train_targets,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
)

In [None]:
# Flip the sign of the negated scores to get per-fold RMSE values, then
# summarize them. This rebinds `linear_regression_rmse`, which earlier held
# the scalar training RMSE.
linear_regression_rmse = pd.Series(data=-linear_regression_scores, name="rmse")
linear_regression_rmse.describe()

In [None]:
from sklearn import ensemble

# Preprocessing followed by a random forest regressor with a fixed seed.
random_forest_pipeline = pipeline.make_pipeline(
    preprocessing_pipeline,
    ensemble.RandomForestRegressor(random_state=42),
    verbose=True,
)

# 5-fold cross-validation; the scorer returns the negated RMSE per fold.
random_forest_scores = model_selection.cross_val_score(
    estimator=random_forest_pipeline,
    X=train_features_df,
    y=train_targets,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
)

In [None]:
# Flip the sign of the negated scores to get per-fold RMSE values, then
# summarize them.
random_forest_rmse = pd.Series(data=-random_forest_scores, name="rmse")
random_forest_rmse.describe()

Let's compare this RMSE measured using cross-validation (the "validation error") with the RMSE measured on the training set (the "training error"):

In [None]:
# Fit on the full training set, then score on that same data to obtain the
# training error.
_ = random_forest_pipeline.fit(train_features_df, train_targets)

predictions = random_forest_pipeline.predict(train_features_df)

# `squared=False` is deprecated in recent scikit-learn releases; the square
# root of the MSE gives the same RMSE on every version.
# NOTE: this rebinds `random_forest_rmse`, replacing the per-fold
# cross-validation Series assigned in the earlier cell.
random_forest_rmse = np.sqrt(
    metrics.mean_squared_error(
        train_targets,
        predictions
    )
)

random_forest_rmse

The training error is much lower than the validation error, which usually means that the model has overfit the training set. Another possible explanation is a mismatch between the training data and the validation data, but that is not the case here, since both came from the same dataset that we shuffled and split in two parts.