In [38]:
# Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

import warnings
warnings.filterwarnings("ignore")

### Load Data

In [39]:
# Read the housing CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/housing.csv')

### Create Test Set

In [40]:
# Bucket median income into five categories so the split can be stratified on them.
df['income_cat'] = pd.cut(df['median_income'],
                          bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                          labels=[1, 2, 3, 4, 5])
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42,
                                       stratify=df['income_cat'])
# 'income_cat' was only needed for stratification. Rebind each split to a new
# frame instead of dropping in place: both splits are derived from df, and
# in-place edits on derived frames are what SettingWithCopyWarning guards against
# (warnings are silenced at the top of this notebook, so it would go unnoticed).
train_set = train_set.drop(columns='income_cat')
test_set = test_set.drop(columns='income_cat')

### Separate train_set features and labels

In [41]:
# Labels: an independent copy of the target column.
housing_tr_labels = train_set["median_house_value"].copy()
# Features: every column except the target.
housing_tr = train_set.drop(columns="median_house_value")

### Impute Missing Values

In [42]:
def impute_missing_values(data):
    """Return a copy of ``data`` with missing values filled column by column.

    Only columns that contain at least one null are touched. Numeric columns
    are filled with the column median; every other column is filled with its
    most frequent value. Each imputer is fitted on the very column it fills.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``data``.
    """
    # Work on a copy so the caller's frame is never changed behind its back.
    imputed = data.copy()
    # Names of the columns that contain at least one null.
    columns_with_null_values = imputed.columns[imputed.isnull().any()].tolist()
    for col in columns_with_null_values:
        # Branch on "is numeric" rather than "is object": a median cannot be
        # computed for string or categorical columns either.
        if pd.api.types.is_numeric_dtype(imputed[col]):
            imputer = SimpleImputer(strategy='median')
        else:
            imputer = SimpleImputer(strategy='most_frequent')
        # fit_transform expects 2-D input, hence the double brackets.
        imputed[[col]] = imputer.fit_transform(imputed[[col]])
    return imputed


# Fill the missing values in the training features.
housing_tr = impute_missing_values(data=housing_tr)

### Remove Outliers

In [43]:
def separate_numerical_categorical(data):
    """Split ``data`` into its numeric columns and its object-dtype columns.

    Returns a ``(numeric_frame, object_frame)`` tuple; both are produced by
    ``DataFrame.select_dtypes`` on ``data``.
    """
    return (
        data.select_dtypes(include=[np.number]),
        data.select_dtypes(include=["object"]),
    )


# Fit the outlier detector on the numeric columns only.
housing_tr_num, _ = separate_numerical_categorical(housing_tr)

# fit_predict returns one label per row; rows labelled 1 are the ones kept below.
isolation_forest = IsolationForest(random_state=42)
outlier_pred = isolation_forest.fit_predict(housing_tr_num)

# Use one shared positional mask so features and labels stay row-aligned.
is_inlier = outlier_pred == 1
housing_tr = housing_tr.iloc[is_inlier]
housing_tr_labels = housing_tr_labels.iloc[is_inlier]

### Create Preprocessing Pipeline

* Create a custom transformer that measures each sample's similarity to K-Means cluster centers

In [44]:
class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Transformer that maps samples to RBF similarities with K-Means centers.

    ``fit`` runs K-Means on the input; ``transform`` returns, for every sample,
    its RBF-kernel similarity to each fitted cluster center (one output column
    per cluster).

    Parameters
    ----------
    n_clusters : int, default=10
        Number of clusters passed to ``KMeans``.
    gamma : float, default=1.0
        ``gamma`` passed to ``rbf_kernel``.
    random_state : default=None
        Forwarded unchanged to ``KMeans``.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit K-Means on ``X`` (``y`` is ignored) and return ``self``."""
        self.kmeans_ = KMeans(
            n_clusters=self.n_clusters,
            random_state=self.random_state,
        )
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self  # fit must return the estimator itself

    def transform(self, X):
        """Return the RBF similarity of each row of ``X`` to each fitted center."""
        centers = self.kmeans_.cluster_centers_
        return rbf_kernel(X, centers, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one output name per cluster; ``names`` is not used."""
        feature_names = []
        for i in range(self.n_clusters):
            feature_names.append(f"Cluster {i} similarity")
        return feature_names

* Create Pipeline

In [45]:
def column_ratio(X):
    """Divide column 0 of ``X`` by column 1, keeping the result 2-D (one column)."""
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator


def ratio_name(function_transformer, feature_names_in):
    """Name the single output column "ratio"; both arguments are ignored."""
    output_names = ["ratio"]
    return output_names


def ratio_pipeline():
    """Build a new pipeline: median imputation, column ratio, standardization.

    Every call returns a fresh, unfitted pipeline, so it can be reused for
    several column pairs.
    """
    impute_step = SimpleImputer(strategy="median")
    ratio_step = FunctionTransformer(column_ratio, feature_names_out=ratio_name)
    scale_step = StandardScaler()
    return make_pipeline(impute_step, ratio_step, scale_step)


# Median-impute, take the natural logarithm, then standardize.
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(func=np.log, feature_names_out="one-to-one"),
    StandardScaler(),
)

# Median-impute, take the square root (inverse: square), then standardize.
sqrt_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(
        func=np.sqrt,
        inverse_func=np.square,
        validate=True,
        feature_names_out="one-to-one",
    ),
    StandardScaler(),
)

# Similarity of each sample to 10 K-Means cluster centers.
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)

# Applied to any numeric column not listed explicitly below.
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Fill missing categories with the mode, then one-hot encode; categories
# unseen during fit are ignored at transform time.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# (name, transformer, columns) triples; the name becomes the output-column prefix.
column_steps = [
    ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
    ("room_per_house", ratio_pipeline(), ["total_rooms", "households"]),
    ("people_per_house", ratio_pipeline(), ["population", "households"]),
    ("log", log_pipeline, ["population", "median_income"]),
    ("sqrt", sqrt_pipeline, ["total_bedrooms", "total_rooms", "households"]),
    ("geo", cluster_simil, ["latitude", "longitude"]),
    ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
]

preprocessing = ColumnTransformer(
    transformers=column_steps,
    remainder=default_num_pipeline,  # one column remaining: housing_median_age
)

In [46]:
# Fit the whole preprocessing ColumnTransformer on the training features and
# transform them in the same call.
housing_tr_prepared = preprocessing.fit_transform(housing_tr)
# Last expression of the cell, so the notebook displays (rows, output columns).
housing_tr_prepared.shape

(14396, 24)

In [47]:
# Show the output column names; each one is prefixed with the name of the
# ColumnTransformer step that produced it (e.g. "log__", "geo__", "remainder__").
preprocessing.get_feature_names_out()

array(['bedrooms__ratio', 'room_per_house__ratio',
       'people_per_house__ratio', 'log__population', 'log__median_income',
       'sqrt__total_bedrooms', 'sqrt__total_rooms', 'sqrt__households',
       'geo__Cluster 0 similarity', 'geo__Cluster 1 similarity',
       'geo__Cluster 2 similarity', 'geo__Cluster 3 similarity',
       'geo__Cluster 4 similarity', 'geo__Cluster 5 similarity',
       'geo__Cluster 6 similarity', 'geo__Cluster 7 similarity',
       'geo__Cluster 8 similarity', 'geo__Cluster 9 similarity',
       'cat__ocean_proximity_<1H OCEAN', 'cat__ocean_proximity_INLAND',
       'cat__ocean_proximity_ISLAND', 'cat__ocean_proximity_NEAR BAY',
       'cat__ocean_proximity_NEAR OCEAN', 'remainder__housing_median_age'],
      dtype=object)