# Transform Data

* In this notebook we'll create custom transformers and pipelines to transform the raw `training` data into transformed data for ML training.
* We'll use the same pipeline to transform the data for prediction as well. 

## Import Libraries

In [121]:
## import the necessary libraries
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, FunctionTransformer
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from scipy.signal import find_peaks
from scipy.stats import gaussian_kde
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans



## Load Training Data

In [122]:
# Folder (relative to this notebook) that holds the processed housing splits.
processed_data_path = Path("..") / "data" / "processed" / "housing"

In [123]:
## load the training split from the processed-data folder
data = pd.read_csv(processed_data_path / "train_set.csv")

## Split Features & Labels

In [124]:
## before building the pipeline, separate the predictors from the target column
df_features = data.drop(columns="median_house_value")
df_labels = data.loc[:, "median_house_value"].copy()

## Numeric Transformations

### Fill in missing values
* Using `SimpleImputer` with `median` strategy to fill in all the missing values

In [125]:
# Median imputation for numeric columns.
# The step is named "impute_numeric" (snake_case, no spaces) so that it describes
# what it does and stays consistent with the step names of the other pipelines.
numeric_impute_pipeline = Pipeline([
    ("impute_numeric", SimpleImputer(strategy="median")),
])

In [126]:
## uncomment the code below to test the pipeline
# df = pd.DataFrame({'a': [1,np.nan,3], 'b': [10, 30, 40], 'c': [10, 10, 10]})
# numeric_impute_pipeline.fit_transform(df)


### Feature Engineering 

#### Ratios
* `rooms_per_house` - `total_rooms` to `households` ratio
* `bedroom_room_ratio` - `total_bedrooms` to `total_rooms` ratio
* `people_per_house` - `population` to `households` ratio
* We'll create a simple `Function Transformer` that calculates the ratio between first two columns
* `Hyperparameters` : N/A

In [127]:
def calculate_ratio_feature_names(function_transformer, feature_names_in):
    """Return the output column name(s) produced by the ratio transformer.

    Both arguments are accepted so the function can be used as a
    ``feature_names_out`` callable, but neither is inspected: the single
    output column is always called ``"ratio"``.
    """
    output_names = ["ratio"]
    return output_names

def calculate_ratio(df):
    """Calculate the ratio of the first two columns.

    Parameters
    ----------
    df : array-like with at least two columns
        A NumPy array or a pandas ``DataFrame``. The input is coerced with
        ``np.asarray`` so positional slicing works for both; a ``DataFrame``
        does not support ``df[:, [0]]`` directly.

    Returns
    -------
    numpy.ndarray
        A 2-D array with a single column holding ``column 0 / column 1``.

    Notes
    -----
    Zero denominators are not special-cased; the result follows NumPy's
    division rules for them.
    """
    values = np.asarray(df)
    # Index with lists so both operands keep a 2-D, single-column shape.
    numerator = values[:, [0]]
    denominator = values[:, [1]]
    return numerator / denominator

# Wrap the ratio helper as a transformer; the companion callable supplies the output column name.
feature_ratio_transformer = FunctionTransformer(
    func=calculate_ratio,
    feature_names_out=calculate_ratio_feature_names,
)

In [128]:
# ## uncomment the code to quickly test to validate the transformer
# df = pd.DataFrame({'a': [1, 2, 3], 'b': [10, 30, 40], 'c': [10, 10, 10]})
# ## creating a column transfomer
# test_transformer = ColumnTransformer([
#     ("test", feature_ratio_transformer, ["a", "b"])
# ])
# pd.DataFrame(test_transformer.fit_transform(df), columns=test_transformer.get_feature_names_out())

#### Handling Multimodal Data
* `housing_median_age` is a multi modal distribution, so we'll calculate similarity to all the modes.
* We can either use a static list of modes to calculate the similarity, in which case a simple `Function Transformer` is enough,
* or we can detect the modes dynamically with a Gaussian KDE every time the transformer is fitted, in which case we need a `Transformer Class` with `fit` and `transform`.
* `Hyperparameters` : `gamma`

In [129]:
class MultimodalTransformer(BaseEstimator, TransformerMixin):
    """RBF similarity of one feature to each mode of its estimated density.

    ``fit`` estimates the density of the first column with a Gaussian KDE,
    evaluates it on a 1000-point grid spanning the observed range and keeps
    the grid positions of the local maxima as modes. ``transform`` returns one
    column per mode holding the RBF-kernel similarity between the first column
    of ``X`` and that mode.

    Parameters
    ----------
    n_clusters : int, default=5
        Stored but not used by ``fit`` or ``transform``; the number of modes
        is determined by the data. Kept so existing callers keep working.
    gamma : float, default=0.1
        ``gamma`` passed to ``rbf_kernel``.
    """

    def __init__(self, n_clusters=5, gamma=0.1):
        self.n_clusters = n_clusters
        self.gamma = gamma

    def fit(self, X, y=None):
        """Locate the density modes of the first column of ``X``."""
        checked_X = check_array(X)
        self.X_ = checked_X
        self.feature_data_ = self.X_[:, 0]
        self.kde_ = gaussian_kde(self.feature_data_)
        self.x_grid_ = np.linspace(self.feature_data_.min(), self.feature_data_.max(), 1000)
        self.kde_values_ = self.kde_.evaluate(self.x_grid_)
        peaks, _ = find_peaks(self.kde_values_)
        if peaks.size == 0:
            # No interior local maximum was found (e.g. the density is highest
            # at an edge of the grid). Fall back to the grid point with the
            # highest density so transform never emits zero columns, which
            # downstream steps such as a scaler cannot handle.
            peaks = np.array([np.argmax(self.kde_values_)])
        self.peaks_ = peaks
        self.peaks_values_ = self.x_grid_[self.peaks_]
        return self

    def transform(self, X):
        """Return an ``(n_samples, n_modes)`` array of RBF similarities."""
        check_is_fitted(self, ['X_', 'feature_data_', 'kde_', 'x_grid_', 'kde_values_', 'peaks_', 'peaks_values_'])
        checked_X = check_array(X)
        feature_data = checked_X[:, 0].reshape(-1, 1)
        # One kernel call against all modes replaces the per-mode loop.
        mode_positions = np.asarray(self.peaks_values_).reshape(-1, 1)
        return rbf_kernel(feature_data, mode_positions, gamma=self.gamma)

    def get_feature_names_out(self, input_features=None):
        """Return one name per mode, in the column order used by ``transform``."""
        check_is_fitted(self, ['peaks_values_'])
        names = [f"similarity_to_peak_{round(peak)}" for peak in self.peaks_values_]
        if len(set(names)) != len(names):
            # Rounding to an integer collapsed two modes onto the same name
            # (typical for small-scale data); include the position index and a
            # more precise value so every name is unique.
            names = [
                f"similarity_to_peak_{i}_at_{peak:.6g}"
                for i, peak in enumerate(self.peaks_values_)
            ]
        return names

In [130]:
## uncomment the code below to test the transformer with mock data
# lets test the transformer with mock data
# df = pd.DataFrame({'a': [1, 2, 2, 2, 3,1 ,4,7,7,8,9,5, 5, 5,5,10, 10, 10,10, 10,10, 10,10, 10,10]})

# cluster_similarity_transformer = MultimodalTransformer(
#     n_clusters=5, gamma=0.1)
# # cluster_similarity_transformer.fit_transform(df)

# pd.DataFrame(cluster_similarity_transformer.fit_transform(df), columns=cluster_similarity_transformer.get_feature_names_out())


#### Location Cluster Similarity
* Since we are dealing with location data, we can find similarity between different locations
* One approach is to identify different clusters, and then calculate the similarity between every datapoint and each cluster center.
* We'll need to create a transformer for that.


In [131]:
class ClusterSimilarityTransformer(BaseEstimator, TransformerMixin):
    """RBF similarity of each sample to the cluster centers found by KMeans.

    Parameters
    ----------
    n_clusters : int, default=10
        Number of clusters passed to ``KMeans``.
    gamma : float, default=1.0
        ``gamma`` passed to ``rbf_kernel``.
    random_state : int or None, default=None
        Forwarded to ``KMeans``.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit KMeans on ``X`` (optionally weighted) and keep the fitted model."""
        self.kmeans_ = KMeans(n_clusters=self.n_clusters, n_init=10,
                              random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self  # always return self!

    def transform(self, X):
        """Return an ``(n_samples, n_centers)`` array of RBF similarities."""
        check_is_fitted(self, ['kmeans_'])
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, input_features=None):
        """Return one name per output column.

        Once fitted, the count comes from the fitted cluster centers so the
        names always match the width of ``transform``'s output, even if
        ``n_clusters`` was changed via ``set_params`` after fitting. Before
        fitting, the ``n_clusters`` hyperparameter is used.
        """
        fitted_kmeans = getattr(self, "kmeans_", None)
        if fitted_kmeans is None:
            n_outputs = self.n_clusters
        else:
            n_outputs = fitted_kmeans.cluster_centers_.shape[0]
        return [f"similarity_to_cluster_{i}" for i in range(n_outputs)]

### Dropping Outliers
* Custom Class Transformer since we'll need to fit the data using `IsolationForest`
* `Hyperparameters` : `remove_outliers`, a boolean hyperparameter that decides whether to drop outliers or not

In [132]:
class OutlierRemover(BaseEstimator, TransformerMixin):
    """Drop the rows that a fitted ``IsolationForest`` flags as outliers.

    Parameters
    ----------
    remove_outliers : bool, default=True
        When ``False``, ``transform`` returns ``X`` unchanged.

    Notes
    -----
    ``transform`` can return fewer rows than it receives, so the labels are
    not filtered along with the features when this is used inside a pipeline.
    """

    def __init__(self, remove_outliers=True):
        self.remove_outliers = remove_outliers

    def fit(self, X, y=None):
        """Fit the isolation forest and record its predictions for ``X``."""
        check_array(X)
        self.iforest_ = IsolationForest(random_state=42)
        # Kept for inspection; transform scores whatever X it receives itself.
        self.outlier_prediction_ = self.iforest_.fit_predict(X)
        return self

    def transform(self, X):
        """Return ``X`` without the rows predicted to be outliers."""
        check_is_fitted(self, ['iforest_', 'outlier_prediction_'])
        check_array(X)
        if not self.remove_outliers:
            return X
        # Score the rows passed in rather than reusing the training-time mask,
        # which only lines up with the exact data seen in fit.
        inlier_mask = self.iforest_.predict(X) == 1
        if hasattr(X, "iloc"):
            return X.iloc[inlier_mask]
        return np.asarray(X)[inlier_mask]


In [133]:
## uncomment the code below to test the transformer with mock data
# lets test the transformer with mock data
# df = pd.DataFrame({'a': [1, 2, 2, 2, 3,1 ,4,7,7,8,9,5, 5, 5,5,10, 10, 10,10, 10,10, 10,10, 10,10]})
# outlier_remover = OutlierRemover()
# outlier_remover.fit_transform(df)


* After thinking about this transformer, I realized that since it reduces the number of rows, it could cause problems in the pipeline's downstream transformers. 
* One option is to remove the outliers separately from the pipeline; a second one is to use a feature column to inform downstream transformers, but that might complicate the code. 
* For now I'll handle the outliers separately from the pipeline. 

### Transform Heavy Tailed Features
* Transform heavy tailed features using logarithm 
* Simple Function Transformer that applies `np.log1p`, with `np.expm1` as the inverse transformation.


In [134]:

def heavy_tail_distribution(df):
    """Apply ``log(1 + x)`` element-wise to tame a heavy-tailed distribution."""
    log_scaled = np.log1p(df)
    return log_scaled

# log1p going forward, expm1 to undo it; output columns keep their input names.
heavy_tail_transformer = FunctionTransformer(
    func=heavy_tail_distribution,
    inverse_func=np.expm1,
    feature_names_out="one-to-one",
)


### Scaling
* Scale all numeric features. 
* Custom Class Transformer since we need to fit and transform

In [135]:
def standard_scaler(df):
    """Scale features using a freshly fitted StandardScaler.

    This helper is stateless: every call fits a new ``StandardScaler`` on
    ``df`` and returns the scaled values. It is therefore only suitable for
    one-off scaling of a single dataset, not for scaling prediction data
    consistently with training data.
    """
    return StandardScaler().fit_transform(df)

# Use the estimator itself as the pipeline step. Wrapping ``standard_scaler`` in a
# FunctionTransformer would refit on every ``transform`` call, so new data would be
# scaled with its own mean/variance instead of the statistics learned in ``fit``.
standard_scaler_transformer = StandardScaler()

## Categorical Transformations

* Convert `ocean_proximity` to one hot encoding
* Custom Class Transformer because we'll need to fit/transform using OneHotEncoder

In [136]:
# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown="ignore" encodes a category that was not seen during fit as all
# zeros instead of raising, so the fitted pipeline can be reused on prediction data.
cat_pipeline = Pipeline([
    ("impute_categories", SimpleImputer(strategy="most_frequent")),
    ("encode_categories", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

In [137]:
## display the raw `ocean_proximity` column of the training features
df_features["ocean_proximity"]

0          NEAR BAY
1         <1H OCEAN
2            INLAND
3            INLAND
4        NEAR OCEAN
            ...    
16507     <1H OCEAN
16508        INLAND
16509    NEAR OCEAN
16510     <1H OCEAN
16511    NEAR OCEAN
Name: ocean_proximity, Length: 16512, dtype: object

In [138]:
## sanity check: fit the categorical pipeline on the object-dtype columns and display the encoded array
cat_pipeline.fit_transform(df_features.select_dtypes(include=object))

array([[0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.]])

In [139]:
## column names generated by the fitted categorical pipeline
cat_pipeline.get_feature_names_out()

array(['ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'], dtype=object)

In [140]:
## show the encoded categories as a DataFrame labelled with the generated column names
pd.DataFrame(cat_pipeline.fit_transform(df_features.select_dtypes(include=object)), columns=cat_pipeline.get_feature_names_out())

Unnamed: 0,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...
16507,1.0,0.0,0.0,0.0,0.0
16508,0.0,1.0,0.0,0.0,0.0
16509,0.0,0.0,0.0,0.0,1.0
16510,1.0,0.0,0.0,0.0,0.0


## Combining the Transformers

We need the following pipelines in a column transformer
* Ratio Pipeline : Simple Imputer (median) + Ratio Function Transformer + Scaling
* Similarity Pipeline : Simple Imputer (median) + Multimodal Similarity (Housing Median Age) + Scaling
* Similarity Pipeline : Simple Imputer (most frequent) + Cluster Similarity (Lat, Long) + Scaling
* Log Pipeline : Simple Imputer (median) + Log + Scaling
* Category Pipeline : Simple Imputer (most frequent) + Onehot Encoding

In [None]:
# Ratio features: median-impute, divide the first column by the second, then standardise.
ratio_pipeline = Pipeline(steps=[
    ("impute_ratios", SimpleImputer(strategy="median")),
    ("calculate_ratios", feature_ratio_transformer),
    ("standard_scaler", StandardScaler()),
])

# Mode-similarity features: median-impute, compute similarity to each density peak, then standardise.
multimodal_similarity_pipeline = Pipeline(steps=[
    ("impute_multimodal", SimpleImputer(strategy="median")),
    (
        "calculate_multimodal",
        MultimodalTransformer(n_clusters=5, gamma=0.1),
    ),
    ("standard_scaler", StandardScaler()),
])

## latitude/longitude: impute with the most frequent value, then compute
## similarity to each KMeans cluster center and standardise
cluster_similarity_pipeline = Pipeline(steps=[
    ("impute_cluster", SimpleImputer(strategy="most_frequent")),
    (
        "calculate_cluster",
        ClusterSimilarityTransformer(n_clusters=10, gamma=1, random_state=42),
    ),
    ("standard_scaler", StandardScaler()),
])

# Heavy-tailed features: median-impute, apply the log transform, then standardise.
log_pipeline = Pipeline(steps=[
    ("impute_log", SimpleImputer(strategy="median")),
    ("log_transform", heavy_tail_transformer),
    ("standard_scaler", StandardScaler()),
])

## categorical pipeline, repeated here so the whole preprocessing definition sits in one cell
## handle_unknown="ignore" encodes a category not seen during fit as all zeros instead of
## raising, so the fitted pipeline can be reused on prediction data
cat_pipeline = Pipeline([
    ("impute_categories", SimpleImputer(strategy="most_frequent")),
    ("encode_categories", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

## assemble the full preprocessing step: each entry is (name, pipeline, input columns)
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        # ratio features (first column divided by second)
        ("bedrooms_per_room", ratio_pipeline, ["total_bedrooms", "total_rooms"]),
        ("rooms_per_household", ratio_pipeline, ["total_rooms", "households"]),
        ("population_per_household", ratio_pipeline, ["population", "households"]),
        # similarity features
        ("multimodal_similarity", multimodal_similarity_pipeline, ["housing_median_age"]),
        ("cluster_similarity", cluster_similarity_pipeline, ["latitude", "longitude"]),
        # log-transformed features
        (
            "log_pipeline",
            log_pipeline,
            ["total_bedrooms", "total_rooms", "population", "households", "median_income"],
        ),
        # one-hot encoded category
        ("categorical", cat_pipeline, ["ocean_proximity"]),
    ]
)


In [142]:
## fit the full preprocessing pipeline on the training features and display the output shape
preprocessed_data = preprocessing_pipeline.fit_transform(df_features)
preprocessed_data.shape


(16512, 27)

In [143]:
## output column names, each prefixed with the name of the ColumnTransformer entry that produced it
preprocessing_pipeline.get_feature_names_out()

array(['bedrooms_per_room__ratio', 'rooms_per_household__ratio',
       'population_per_household__ratio',
       'multimodal_similarity__similarity_to_peak_17',
       'multimodal_similarity__similarity_to_peak_26',
       'multimodal_similarity__similarity_to_peak_35',
       'multimodal_similarity__similarity_to_peak_52',
       'cluster_similarity__similarity_to_cluster_0',
       'cluster_similarity__similarity_to_cluster_1',
       'cluster_similarity__similarity_to_cluster_2',
       'cluster_similarity__similarity_to_cluster_3',
       'cluster_similarity__similarity_to_cluster_4',
       'cluster_similarity__similarity_to_cluster_5',
       'cluster_similarity__similarity_to_cluster_6',
       'cluster_similarity__similarity_to_cluster_7',
       'cluster_similarity__similarity_to_cluster_8',
       'cluster_similarity__similarity_to_cluster_9',
       'log_pipeline__total_bedrooms', 'log_pipeline__total_rooms',
       'log_pipeline__population', 'log_pipeline__households',
  

Lessons Learnt:
* Transformers + Pipelines can be tricky, especially with column names and expected datatypes. To build more resilient and production-ready pipelines we'll need to add a lot of checks and validation.
* Naming is equally important, the column names after preprocessing are not so ideal. 