# Prepare the Data for Machine Learning Algorithms

## Download the Data

In [None]:
import pathlib
import requests
import tarfile

import numpy as np
import pandas as pd
from sklearn import model_selection


def download_data(url, data_dir, timeout=60):
    """Download the archive at ``url`` and store it as ``data_dir / "housing.tgz"``.

    Parameters
    ----------
    url : str
        Address of the archive to fetch.
    data_dir : pathlib.Path
        Directory that receives ``housing.tgz``; it must support the ``/``
        operator and already exist.
    timeout : float, default 60
        Seconds to wait for the server before ``requests`` gives up.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    # Fetch before touching the disk, so a failed request never creates or
    # truncates the archive file.
    response = requests.get(url, timeout=timeout)
    # Without this check an error page would be saved as "housing.tgz".
    response.raise_for_status()
    with open(data_dir / "housing.tgz", 'wb') as f:
        f.write(response.content)


def extract_data(data_dir):
    """Unpack ``data_dir / "housing.tgz"`` into ``data_dir``.

    Parameters
    ----------
    data_dir : pathlib.Path
        Directory that contains ``housing.tgz`` and receives its contents.
    """
    with tarfile.open(data_dir / "housing.tgz") as tgz:
        if hasattr(tarfile, "data_filter"):
            # The "data" filter refuses members that would land outside
            # data_dir (absolute paths, "..", unsafe links).
            tgz.extractall(path=data_dir, filter="data")
        else:
            # Interpreters without extraction filters: plain extraction.
            tgz.extractall(path=data_dir)


# load the data
url = "https://github.com/ageron/data/raw/main/housing.tgz"
data_dir = pathlib.Path("./sample_data")
data_dir.mkdir(parents=True, exist_ok=True)

download_data(url, data_dir)
extract_data(data_dir)
housing_df = pd.read_csv(data_dir / "housing" / "housing.csv")

# stratified sampling to match the income distribution:
# bucket median_income into five categories and stratify the split on them
housing_df["income_cat"] = pd.cut(
    housing_df["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[0, 1, 2, 3, 4]
)

train_df, test_df = model_selection.train_test_split(
    housing_df,
    test_size=0.2,
    stratify=housing_df.loc[:, "income_cat"],
    random_state=42
)

# income_cat was only needed to drive the stratification. Rebind the names
# instead of dropping in place: the split frames are selections of
# housing_df, and in-place mutation of them can raise SettingWithCopyWarning.
train_df = train_df.drop(columns="income_cat")
test_df = test_df.drop(columns="income_cat")

In [None]:
# pull out the target column, then keep every other column as a feature
train_targets = train_df.loc[:, "median_house_value"]
train_features_df = train_df.drop(columns=["median_house_value"])

In [None]:
train_features_df.info()

## Data Cleaning

In [None]:
from sklearn import impute


# Median imputer configured to return DataFrames from transform().
imputer = impute.SimpleImputer(strategy="median")
imputer.set_output(transform="pandas")

In [None]:
imputer

Separate out the numerical attributes before using the `"median"` strategy, since the median cannot be computed on text attributes like `ocean_proximity`:

In [None]:
# Keep only the numeric columns of the training features.
numeric_features_df = train_features_df.select_dtypes(include=np.number)
# Bind the fit() result to a throwaway name so the cell shows no output.
_ = imputer.fit(numeric_features_df)

In [None]:
imputer.statistics_

Check that this is the same as manually computing the median of each attribute:

In [None]:
numeric_features_df.median()

Transform the training set:

In [None]:
# Apply the fitted imputer to the same numeric training features it was fit on.
imputed_numeric_features_df = imputer.transform(numeric_features_df)

In [None]:
imputer.feature_names_in_

In [None]:
imputed_numeric_features_df.info()

Now let's detect some outliers with an isolation forest (a prediction of `1` marks an inlier, `-1` an outlier):

In [None]:
from sklearn import ensemble


# Fit an isolation forest on the imputed numeric features and label every row.
isolation_forest = ensemble.IsolationForest(random_state=42)
# Keep the labels under the name the commented-out filtering code expects.
outlier_pred = isolation_forest.fit_predict(imputed_numeric_features_df)
# Leave the labels as the last expression so the cell still displays them.
outlier_pred

If you wanted to drop outliers, you would run the following code:

In [None]:
#outlier_pred = isolation_forest.predict(imputed_numeric_features_df)
#train_features_df = train_features_df.iloc[outlier_pred == 1]
#train_targets = train_targets.iloc[outlier_pred == 1]

## Handling Text and Categorical Attributes

Now let's preprocess the categorical input feature, `ocean_proximity`:

In [None]:
train_df.loc[:, ["ocean_proximity"]]

In [None]:
from sklearn import preprocessing


# Encode ocean_proximity with an ordinal encoder that returns a DataFrame.
ordinal_encoder = preprocessing.OrdinalEncoder()
ordinal_encoder.set_output(transform="pandas")
ordinal_encoded_ocean_proximity = ordinal_encoder.fit_transform(
    train_df[["ocean_proximity"]]
)

In [None]:
ordinal_encoded_ocean_proximity

In [None]:
ordinal_encoder.categories_

In [None]:
# One-hot encode ocean_proximity; dense output is needed for a DataFrame result.
one_hot_encoder = preprocessing.OneHotEncoder(sparse_output=False)
one_hot_encoder.set_output(transform="pandas")
one_hot_encoded_ocean_proximity = one_hot_encoder.fit_transform(
    train_df[["ocean_proximity"]]
)

In [None]:
one_hot_encoded_ocean_proximity

In [None]:
one_hot_encoder.categories_

## Feature Scaling

In [None]:
# Min-max scaler with feature_range=(-1, 1) and DataFrame output.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
min_max_scaler.set_output(transform="pandas")
min_max_scaled_numeric_features_df = min_max_scaler.fit_transform(
    numeric_features_df
)

In [None]:
min_max_scaled_numeric_features_df.describe()

In [None]:
# Standard scaler with DataFrame output, fitted on the numeric features.
standard_scaler = preprocessing.StandardScaler()
standard_scaler.set_output(transform="pandas")
standard_scaled_numeric_features_df = standard_scaler.fit_transform(
    numeric_features_df
)

In [None]:
standard_scaled_numeric_features_df.describe()

In [None]:
import matplotlib.pyplot as plt


# Two side-by-side histograms sharing the y axis.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(8, 3), sharey=True)

# Left panel: population as recorded.
train_df["population"].hist(ax=axs[0], bins=50)
axs[0].set_xlabel("Population")
axs[0].set_ylabel("Number of districts")

# Right panel: natural log of the same column.
np.log(train_df["population"]).hist(ax=axs[1], bins=50)
axs[1].set_xlabel("Log of population")

plt.show()

In [None]:
# Wrap np.log as a transformer, with np.exp registered as its inverse.
log_transformer = preprocessing.FunctionTransformer(func=np.log, inverse_func=np.exp)
log_population = log_transformer.fit_transform(train_df[["population"]])

In [None]:
log_population

What if we replace each value of `median_income` with its quantile?

In [None]:
# Histogram of median_income; bind the result to _ so the cell shows only the plot.
_ = train_df["median_income"].hist(bins=50)

In [None]:
# Quantile transformer (100 quantiles, uniform output) returning a DataFrame.
quantile_transformer = preprocessing.QuantileTransformer(
    n_quantiles=100,
    output_distribution="uniform",
)
quantile_transformer.set_output(transform="pandas")
quantiled_median_income = quantile_transformer.fit_transform(
    train_df[["median_income"]]
)

In [None]:
# Histogram of the quantile-transformed median_income; binding to _ hides the returned value.
_ = quantiled_median_income.hist(bins=50)

## Custom Transformers

In [None]:
from sklearn import base, cluster, metrics


class ClusterSimilarity(base.BaseEstimator, base.TransformerMixin):
    """Score every sample against the centers of a fitted k-means model.

    ``fit`` runs k-means on the input; ``transform`` returns the RBF-kernel
    value between each sample and each fitted cluster center.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        # Hyperparameters are stored as given; nothing is computed here.
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit k-means on ``X`` (optionally weighted) and return ``self``.

        ``y`` is accepted but not used.
        """
        self.kmeans_ = cluster.KMeans(
            n_clusters=self.n_clusters,
            n_init=10,
            random_state=self.random_state,
        ).fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return the RBF kernel between ``X`` and the fitted cluster centers."""
        centers = self.kmeans_.cluster_centers_
        return metrics.pairwise.rbf_kernel(X, Y=centers, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one ``cluster_NN_similarity`` name per configured cluster.

        ``names`` is accepted but not used.
        """
        feature_names = []
        for cluster_idx in range(self.n_clusters):
            feature_names.append(f"cluster_{cluster_idx:02d}_similarity")
        return feature_names

In [None]:
# Ten clusters on the coordinates, weighted by the median house value.
cluster_similarity = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)
cluster_similarity.set_output(transform="pandas")
similarities = cluster_similarity.fit_transform(
    train_df[["latitude", "longitude"]],
    sample_weight=train_targets,
)

In [None]:
similarities

## Transformation Pipelines

Now let's build a pipeline to preprocess the numerical attributes:

In [None]:
from sklearn import pipeline


# Explicitly named steps: median imputation followed by standardization.
numeric_pipeline = pipeline.Pipeline(
    steps=[
        ("simple_impute", impute.SimpleImputer(strategy="median")),
        ("standard_scaler", preprocessing.StandardScaler()),
    ],
    verbose=True,
)
numeric_pipeline.set_output(transform="pandas")

In [None]:
# Same two steps via make_pipeline, which picks the step names itself.
numeric_pipeline = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="median"),
    preprocessing.StandardScaler(),
    verbose=True,
)
numeric_pipeline.set_output(transform="pandas")

In [None]:
numeric_pipeline

In [None]:
# Fit the pipeline on the numeric training features and transform them in one call.
prepared_numeric_features_df = numeric_pipeline.fit_transform(numeric_features_df)

In [None]:
prepared_numeric_features_df

In [None]:
numeric_pipeline.steps

In [None]:
numeric_pipeline[1]

In [None]:
numeric_pipeline[:-1]

In [None]:
# "simpleimputer" is the name make_pipeline generates (lowercased class name),
# so this lookup needs the make_pipeline version of numeric_pipeline; the
# explicit Pipeline version named this step "simple_impute" instead.
numeric_pipeline.named_steps["simpleimputer"]

In [None]:
# Shows the <step>__<parameter> syntax; the imputer was already created with
# strategy="median", so the value itself does not change.
numeric_pipeline.set_params(simpleimputer__strategy="median")

In [None]:
from sklearn import compose


# Columns routed to the numeric pipeline.
numeric_features = [
    "longitude", "latitude",
    "housing_median_age",
    "total_rooms", "total_bedrooms",
    "population", "households",
    "median_income",
]

# Columns routed to the categorical pipeline.
categorical_features = ["ocean_proximity"]

# Most-frequent imputation, then dense one-hot encoding that ignores unseen categories.
categorical_pipeline = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="most_frequent"),
    preprocessing.OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
)
categorical_pipeline.set_output(transform="pandas")

# Route the listed columns to their pipelines; unlisted columns are dropped.
preprocessing_pipeline = compose.ColumnTransformer(
    transformers=[
        ("numeric_pipeline", numeric_pipeline, numeric_features),
        ("categorical_pipeline", categorical_pipeline, categorical_features),
    ],
    remainder="drop",
    n_jobs=-1,
    verbose=True,
    verbose_feature_names_out=False,
)
preprocessing_pipeline.set_output(transform="pandas")

In [None]:
# Same transformer, but the columns are selected by dtype instead of by name.
preprocessing_pipeline = compose.make_column_transformer(
    (numeric_pipeline, compose.make_column_selector(dtype_include=np.number)),
    (categorical_pipeline, compose.make_column_selector(dtype_include=object)),
    remainder="drop",
    n_jobs=-1,
    verbose=True,
    verbose_feature_names_out=False,
)
preprocessing_pipeline.set_output(transform="pandas")

In [None]:
preprocessing_pipeline

In [None]:
# Fit the column transformer on the training features and transform them in one call.
prepared_features_df = preprocessing_pipeline.fit_transform(train_features_df)

In [None]:
prepared_features_df

In [None]:
def column_ratio(df):
    """Return the first column of ``df`` divided element-wise by its second column.

    Columns are picked by position, so ``df`` must offer ``.iloc`` and hold
    at least two columns.
    """
    numerator = df.iloc[:, 0]
    denominator = df.iloc[:, 1]
    return numerator / denominator


def ratio_name(function_transformer, feature_names_in):
    """Return the output feature names for the ratio transformer.

    Both arguments are ignored; the single output column is always "ratio".
    """
    feature_names_out = ["ratio"]
    return feature_names_out


def make_ratio_pipeline():
    """Build a new pipeline: median imputation, column ratio, then standardization.

    The pipeline is configured to return DataFrames from ``transform``.
    """
    steps = (
        impute.SimpleImputer(strategy="median"),
        preprocessing.FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        preprocessing.StandardScaler(),
    )
    return pipeline.make_pipeline(*steps, verbose=True).set_output(transform="pandas")


# Median imputation, natural log (np.exp as inverse), then standardization.
log_transform_pipeline = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="median"),
    preprocessing.FunctionTransformer(
        func=np.log,
        inverse_func=np.exp,
        feature_names_out="one-to-one",
    ),
    preprocessing.StandardScaler(),
)
log_transform_pipeline.set_output(transform="pandas")

# New, unfitted cluster-similarity transformer for the column transformer.
cluster_similarity = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)
cluster_similarity.set_output(transform="pandas")

# Fallback pipeline for columns not claimed by a named transformer.
default_numeric_pipeline = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="median"),
    preprocessing.StandardScaler(),
    verbose=True,
)
default_numeric_pipeline.set_output(transform="pandas")

# Full preprocessing: three ratio features, log-scaled counts, geographic
# cluster similarities and one-hot categories; every remaining column goes
# through default_numeric_pipeline.
preprocessing_pipeline = compose.ColumnTransformer(
    transformers=[
        ("bedrooms", make_ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", make_ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", make_ratio_pipeline(), ["population", "households"]),
        (
            "log",
            log_transform_pipeline,
            ["total_bedrooms", "total_rooms", "population", "households", "median_income"],
        ),
        ("geo", cluster_similarity, ["latitude", "longitude"]),
        (
            "categorical",
            categorical_pipeline,
            compose.make_column_selector(dtype_include=object),
        ),
    ],
    n_jobs=-1,
    remainder=default_numeric_pipeline,
    verbose=True,
)
preprocessing_pipeline.set_output(transform="pandas")

In [None]:
preprocessing_pipeline

In [None]:
# Fit the full preprocessing transformer on the training features and transform them.
prepared_features_df = preprocessing_pipeline.fit_transform(train_features_df)

In [None]:
prepared_features_df