# Transform Data

* In this notebook we'll create custom transformers and pipelines to transform the raw `training` data into transformed data for ML training.
* We'll use the same pipeline to transform the data for prediction as well. 

## Import Libraries

In [1]:
## import the necessary libraries
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from scipy.signal import find_peaks
from scipy.stats import gaussian_kde
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.pipeline import Pipeline




## Load Training Data

In [2]:
processed_data_path = Path("..", "data", "processed", "housing")

In [3]:
## read data
data = pd.read_csv(Path(processed_data_path, "train_set.csv"))

## Split Features & Labels

In [4]:
## before we create the pipeline lets split he training data into features and labels
df_features = data.drop("median_house_value", axis=1)
df_labels = data["median_house_value"].copy()

## Numeric Transformations

### Fill in missing values
* Using `SimpleImputer` with `median` strategy to fill in all the missing values

In [5]:
numeric_impute_pipeline = Pipeline([
    ("impute categories", SimpleImputer(strategy="median")),
])

In [6]:
## uncomment the code below to test the pipeline
# df = pd.DataFrame({'a': [1,np.nan,3], 'b': [10, 30, 40], 'c': [10, 10, 10]})
# numeric_impute_pipeline.fit_transform(df)


### Feature Engineering 

#### Ratios
* `rooms_per_house` - `total_rooms` to `households` ratio
* `bedroom_room_ratio` - `total_bedrooms` to `total_rooms` ration
* `people_per_house` - `population` to `households` ratio
* We'll create a simple `Function Transformer` that calculates the ratio between first two columns
* `Hyperparameters` : N/A

In [7]:
def calculate_ratio_feature_names(function_transformer, feature_names_in):
    feature_name = f"{feature_names_in[0]}_to_{feature_names_in[1]}_ratio"
    return [feature_name]  # feature names out

## a simple transformer function to find the ratios of first two columns
def calculate_ratio(df):
    ## check if first two columns are numbers
    if not df.iloc[:,[0,1]].map(np.isreal).all().all():
        raise ValueError("Columns are not numbers")
    ## create column name from first two columns
    feature_names = df.iloc[:,[0,1]].columns
    col_name = calculate_ratio_feature_names(calculate_ratio, feature_names)
    ## calculate the ratio
    result = pd.DataFrame(df.iloc[:,0] / df.iloc[:,1])
    result.columns = col_name
    return result

feature_ratio_transformer = FunctionTransformer(calculate_ratio, feature_names_out=calculate_ratio_feature_names)

In [8]:
# ## uncomment the code to quickly test to validate the transformer
# df = pd.DataFrame({'a': [1, 2, 3], 'b': [10, 30, 40], 'c': [10, 10, 10]})
# ## creating a column transfomer
# test_transformer = ColumnTransformer([
#     ("test", feature_ratio_transformer, ["a", "b"])
# ])
# pd.DataFrame(test_transformer.fit_transform(df), columns=test_transformer.get_feature_names_out())

#### Cluster Similarity
* `housing_median_age` is a multi modal distribution, so we'll calculate similarity to all the modes.
* We can either use a static list of modes to calculate the similarity in that case we can use simple `Function Transformer`
* We can use dynamic list of modes using gaussian KDE everytime, in this case we can use `Transformer Class` to `fit` and `transform`
* `Hyperparameters` : `gamma`

In [72]:
class ClusterSimilarityTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=5, gamma=0.1):
        self.n_clusters = n_clusters
        self.gamma = gamma

    def fit(self, X, y=None):
        # Check if the input array X is valid
        checked_X = check_array(X)

        # Store the checked array in the instance variable
        self.X_ = checked_X

        # Extract the first column of the input data for KDE
        # assuming that the first column is the feature data
        self.feature_data_ = self.X_[:, 0]
        # Perform Kernel Density Estimation on the feature data
        self.kde_ = gaussian_kde(self.feature_data_)

        # Create a grid of 1000 points between the minimum and maximum of the feature data
        self.x_grid_ = np.linspace(
            self.feature_data_.min(), self.feature_data_.max(), 1000)

        # Evaluate the KDE on the grid points
        self.kde_values_ = self.kde_.evaluate(self.x_grid_)
        # Find peaks in the KDE values
        self.peaks_, _ = find_peaks(self.kde_values_)
        # peak values
        self.peaks_values_ = self.x_grid_[self.peaks_]
        # Return the fitted instance
        return self

    def transform(self, X):
        check_is_fitted(self, ['X_', 'feature_data_', 'kde_',
                        'x_grid_', 'kde_values_', 'peaks_'])
        checked_X = check_array(X)
        feature_data = checked_X[:, 0].reshape(-1, 1)  # Use the first column

         # Create an array to store similarity scores for each peak
        similarity_matrix = np.zeros((feature_data.shape[0], len(self.peaks_)))

        # Calculate similarity to each peak and store in the matrix
        for i, peak in enumerate(self.peaks_):
            # Calculate similarity of all samples to the current peak
            peak_value = self.x_grid_[peak]
            similarity_matrix[:, i] = rbf_kernel(feature_data, [[peak_value]], gamma=self.gamma).flatten()
        
        return similarity_matrix

    def get_feature_names_out(self, input_features=None):
        return [f"similarity_to_{round(peak)}" for peak in self.peaks_values_]

In [93]:
## uncomment the code below to test the transformer with mock data
# lets test the transformer with mock data
# df = pd.DataFrame({'a': [1, 2, 2, 2, 3,1 ,4,7,7,8,9,5, 5, 5,5,10, 10, 10,10, 10,10, 10,10, 10,10]})

# cluster_similarity_transformer = ClusterSimilarityTransformer(
#     n_clusters=5, gamma=0.1)
# # cluster_similarity_transformer.fit_transform(df)

# pd.DataFrame(cluster_similarity_transformer.fit_transform(df), columns=cluster_similarity_transformer.get_feature_names_out())


### Dropping Outliers
* Custom Class Transformer since we'll need to fit the data using `IsolationForest`
* `Hyperparameters` : `drop_outlier` a boolean hyper parameter to decide whether to drop outlier or not
### Transform Heavy Tailed Features
* Transform heavy tailed features using logarithm 
* Simple Function Transformer to find `np.log` and `np.exp` as inverse transformation.
### Scaling
* Scale all numeric features. 
* Custom Class Transformer since we need to fit and transform

### Categorical Transformations

* Convert `ocean_proximity` to one hot encoding
* Custom Class Transformer cause we'll need to fit/trasnform using OneHotEncoder

## Transfomer To Fill Missing Values

* After researching a bit I realized we can just use `SimpleImputer` directly into the pipeline without the need to creating a class. 
* So skipping this step. 

## Handling Categorical Values

* Even this is similar to missing values transformer
* We can directly use the `OneHotEncoder` and `SimpleImputer` in the pipeline

In [12]:
cat_pipeline = Pipeline([
    ("impute categories", SimpleImputer(strategy="median")),
])

In [13]:
df_features["ocean_proximity"]

0          NEAR BAY
1         <1H OCEAN
2            INLAND
3            INLAND
4        NEAR OCEAN
            ...    
16507     <1H OCEAN
16508        INLAND
16509    NEAR OCEAN
16510     <1H OCEAN
16511    NEAR OCEAN
Name: ocean_proximity, Length: 16512, dtype: object

In [14]:
## testing to see if pipelines worked
cat_pipeline.fit_transform(df_features.select_dtypes(include=object))

ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: 'NEAR BAY'

In [None]:
cat_pipeline.get_feature_names_out()

array(['ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'], dtype=object)

In [None]:
pd.DataFrame(cat_pipeline.fit_transform(df_features.select_dtypes(include=object)), columns=cat_pipeline.get_feature_names_out())

Unnamed: 0,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...
16507,1.0,0.0,0.0,0.0,0.0
16508,0.0,1.0,0.0,0.0,0.0
16509,0.0,0.0,0.0,0.0,1.0
16510,1.0,0.0,0.0,0.0,0.0
