# Feature Engineering Notebook
## Objectives

## Inputs
- outputs/datasets/cleaned/TrainSetCleaned.csv
- outputs/datasets/cleaned/TestSetCleaned.csv

## Outputs
- 

## Conclusions
- 

---

# Change working directory
We need to change the working directory from its current folder to its parent folder

In [1]:
import os

current_path = os.getcwd()
os.chdir(os.path.dirname(current_path))
current_path = os.getcwd()
current_path

'/workspace/CI_PP5'

# Load Cleaned Data
## Train Set

In [2]:
import pandas as pd
TrainSet = pd.read_csv("outputs/datasets/cleaned/TrainSetCleaned.csv")
TrainSet.head()

Unnamed: 0,1stFlrSF,GarageArea,GarageYrBlt,GrLivArea,KitchenQual,OverallQual,TotalBsmtSF,YearBuilt,YearRemodAdd,SalePrice
0,1828,774,2007.0,1828,3,9,1822,2007,2007,314813
1,894,308,1962.0,894,2,5,894,1962,1962,109500
2,964,432,1921.0,964,2,5,876,1921,2006,163500
3,1689,857,2002.0,1689,3,8,1568,2002,2002,271000
4,1541,843,2001.0,1541,3,7,1541,2001,2002,205000


## Test Set

In [3]:

TestSet = pd.read_csv("outputs/datasets/cleaned/TestSetCleaned.csv")
TestSet.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,BedroomAbvGr,BsmtExposure,BsmtFinSF1,BsmtFinType1,BsmtUnfSF,GarageArea,GarageFinish,GarageYrBlt,...,LotArea,LotFrontage,MasVnrArea,OpenPorchSF,OverallCond,OverallQual,TotalBsmtSF,YearBuilt,YearRemodAdd,SalePrice
0,2515,0.0,4.0,0,1219,3.0,816,484,0.0,1975.0,...,32668,69.0,0.0,0,3,6,2035,1957,1975,200624
1,958,620.0,3.0,0,403,4.0,238,240,1.0,1941.0,...,9490,79.0,0.0,0,7,6,806,1941,1950,133000
2,979,224.0,3.0,0,185,2.0,524,352,1.0,1950.0,...,7015,69.0,161.0,0,4,5,709,1950,1950,110000
3,1156,866.0,4.0,0,392,4.0,768,505,3.0,1977.0,...,10005,83.0,299.0,117,5,7,1160,1977,1977,192000
4,525,0.0,3.0,0,0,1.0,525,264,1.0,1971.0,...,1680,21.0,381.0,0,5,6,525,1971,1971,88000


# Data Exploration

In [None]:
from pandas_profiling import ProfileReport
pandas_report = ProfileReport(df=TrainSet, minimal=True)
pandas_report.to_notebook_iframe()

---

# Feature Engineering
- Custom Function from the Code Institute Walkthrough Project 2

In [4]:
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import warnings
from feature_engine import transformation as vt
from feature_engine.outliers import Winsorizer
from feature_engine.encoding import OrdinalEncoder
sns.set(style="whitegrid")
warnings.filterwarnings('ignore')


def FeatureEngineeringAnalysis(df, analysis_type=None):
    """
    - Used for quick feature engineering on numerical and categorical variables
    to decide which transformation can better transform the distribution shape
    - Once transformed, use a reporting tool, like ydata-profiling, to evaluate
    distributions
    """
    check_missing_values(df)
    allowed_types = ['numerical', 'ordinal_encoder', 'outlier_winsorizer']
    check_user_entry_on_analysis_type(analysis_type, allowed_types)
    list_column_transformers = define_list_column_transformers(analysis_type)

    # Loop in each variable and engineer the data according to the analysis
    # type
    df_feat_eng = pd.DataFrame([], index=[0])
    for column in df.columns:
        # Create additional columns (column_method) to apply the methods
        df_feat_eng = pd.concat([df_feat_eng, df[column]], axis=1)
        for method in list_column_transformers:
            df_feat_eng[f"{column}_{method}"] = df[column]

        # Apply transformers in respective column_transformers
        df_feat_eng, list_applied_transformers = apply_transformers(
            analysis_type, df_feat_eng, column)

        # For each variable, assess how the transformations perform
        transformer_evaluation(
            df, column, list_applied_transformers, analysis_type, df_feat_eng)

    return df_feat_eng


def check_user_entry_on_analysis_type(analysis_type, allowed_types):
    """ Check analysis type """
    if analysis_type is None:
        raise SystemExit(
            f"You should pass analysis_type parameter as one of the following "
            f"options: {allowed_types}")
    if analysis_type not in allowed_types:
        raise SystemExit(
            f"analysis_type argument should be one of these options: "
            f"{allowed_types}")


def check_missing_values(df):
    """ Check for missing values """
    if df.isna().sum().sum() != 0:
        raise SystemExit(
            f"There is a missing value in your dataset. Please handle that "
            f"before getting into feature engineering.")


def define_list_column_transformers(analysis_type):
    """ Set suffix columns according to analysis_type"""
    if analysis_type == 'numerical':
        list_column_transformers = [
            "log_e", "log_10", "reciprocal", "power", "box_cox", "yeo_johnson"]

    elif analysis_type == 'ordinal_encoder':
        list_column_transformers = ["ordinal_encoder"]

    elif analysis_type == 'outlier_winsorizer':
        list_column_transformers = ['iqr']

    return list_column_transformers


def apply_transformers(analysis_type, df_feat_eng, column):
    """ Apply transformers """
    for col in df_feat_eng.select_dtypes(include='category').columns:
        df_feat_eng[col] = df_feat_eng[col].astype('object')

    if analysis_type == 'numerical':
        df_feat_eng, list_applied_transformers = FeatEng_Numerical(
            df_feat_eng, column)

    elif analysis_type == 'outlier_winsorizer':
        df_feat_eng, list_applied_transformers = FeatEng_OutlierWinsorizer(
            df_feat_eng, column)

    elif analysis_type == 'ordinal_encoder':
        df_feat_eng, list_applied_transformers = FeatEng_CategoricalEncoder(
            df_feat_eng, column)

    return df_feat_eng, list_applied_transformers


def transformer_evaluation(df, column, list_applied_transformers,
                           analysis_type, df_feat_eng):
    """ For each variable, assess how the transformations perform """
    print(f"* Variable Analyzed: {column}")
    print(f"* Applied transformation: {list_applied_transformers}")
    print(f"* Skewness: {df[column].skew().round(2)} | Kurtosis: "
          f"{df[column].kurtosis().round(2)}\n")

    for col in [column] + list_applied_transformers:

        if analysis_type != 'ordinal_encoder':
            DiagnosticPlots_Numerical(df_feat_eng, col)

        else:
            if col == column:
                DiagnosticPlots_Categories(df_feat_eng, col)
            else:
                DiagnosticPlots_Numerical(df_feat_eng, col)

        print("\n")


def DiagnosticPlots_Categories(df_feat_eng, col):
    """ Create plots for categorical transformer """
    plt.figure(figsize=(4, 3))
    sns.countplot(data=df_feat_eng, x=col, palette=[
                  '#432371'], order=df_feat_eng[col].value_counts().index)
    plt.xticks(rotation=90)
    plt.suptitle(f"{col}", fontsize=30, y=1.05)
    plt.show()
    print("\n")


def DiagnosticPlots_Numerical(df, variable):
    """ Create plots for numerical transformer """
    fig, axes = plt.subplots(1, 3, figsize=(12, 4))
    sns.histplot(data=df, x=variable, kde=True, element="step", ax=axes[0])
    stats.probplot(df[variable], dist="norm", plot=axes[1])
    sns.boxplot(x=df[variable], ax=axes[2])

    axes[0].set_title('Histogram')
    axes[1].set_title('QQ Plot')
    axes[2].set_title('Boxplot')
    fig.suptitle(f"{variable}", fontsize=30, y=1.05)
    plt.tight_layout()
    plt.show()


def FeatEng_CategoricalEncoder(df_feat_eng, column):
    list_methods_worked = []
    try:
        encoder = OrdinalEncoder(encoding_method='arbitrary', variables=[
                                 f"{column}_ordinal_encoder"])
        df_feat_eng = encoder.fit_transform(df_feat_eng)
        list_methods_worked.append(f"{column}_ordinal_encoder")

    except Exception:
        df_feat_eng.drop([f"{column}_ordinal_encoder"], axis=1, inplace=True)

    return df_feat_eng, list_methods_worked


def FeatEng_OutlierWinsorizer(df_feat_eng, column):
    list_methods_worked = []

    # Winsorizer iqr
    try:
        disc = Winsorizer(
            capping_method='iqr', tail='both', fold=1.5,
            variables=[f"{column}_iqr"])
        df_feat_eng = disc.fit_transform(df_feat_eng)
        list_methods_worked.append(f"{column}_iqr")
    except Exception:
        df_feat_eng.drop([f"{column}_iqr"], axis=1, inplace=True)

    return df_feat_eng, list_methods_worked


def FeatEng_Numerical(df_feat_eng, column):
    list_methods_worked = []

    # LogTransformer base e
    try:
        lt = vt.LogTransformer(variables=[f"{column}_log_e"])
        df_feat_eng = lt.fit_transform(df_feat_eng)
        list_methods_worked.append(f"{column}_log_e")
    except Exception:
        df_feat_eng.drop([f"{column}_log_e"], axis=1, inplace=True)

    # LogTransformer base 10
    try:
        lt = vt.LogTransformer(variables=[f"{column}_log_10"], base='10')
        df_feat_eng = lt.fit_transform(df_feat_eng)
        list_methods_worked.append(f"{column}_log_10")
    except Exception:
        df_feat_eng.drop([f"{column}_log_10"], axis=1, inplace=True)

    # ReciprocalTransformer
    try:
        rt = vt.ReciprocalTransformer(variables=[f"{column}_reciprocal"])
        df_feat_eng = rt.fit_transform(df_feat_eng)
        list_methods_worked.append(f"{column}_reciprocal")
    except Exception:
        df_feat_eng.drop([f"{column}_reciprocal"], axis=1, inplace=True)

    # PowerTransformer
    try:
        pt = vt.PowerTransformer(variables=[f"{column}_power"])
        df_feat_eng = pt.fit_transform(df_feat_eng)
        list_methods_worked.append(f"{column}_power")
    except Exception:
        df_feat_eng.drop([f"{column}_power"], axis=1, inplace=True)

    # BoxCoxTransformer
    try:
        bct = vt.BoxCoxTransformer(variables=[f"{column}_box_cox"])
        df_feat_eng = bct.fit_transform(df_feat_eng)
        list_methods_worked.append(f"{column}_box_cox")
    except Exception:
        df_feat_eng.drop([f"{column}_box_cox"], axis=1, inplace=True)

    # YeoJohnsonTransformer
    try:
        yjt = vt.YeoJohnsonTransformer(variables=[f"{column}_yeo_johnson"])
        df_feat_eng = yjt.fit_transform(df_feat_eng)
        list_methods_worked.append(f"{column}_yeo_johnson")
    except Exception:
        df_feat_eng.drop([f"{column}_yeo_johnson"], axis=1, inplace=True)

    return df_feat_eng, list_methods_worked

## Numerical Transformation

In [None]:
df_engineering = TrainSet.copy()
df_engineering.head()
%matplotlib inline
df_engineering = FeatureEngineeringAnalysis(
    df=df_engineering, analysis_type='numerical')

### Comparison
Seems like the results are skewed, so we need to use some other transformation. Let's explore which transformation yields the best results.

In [5]:
import scipy.stats as stats
from sklearn.pipeline import Pipeline

df_engineering = TrainSet.copy()
df_all_transfomers = df_engineering.drop(
    columns=['GarageArea', 'GarageYrBlt', 'TotalBsmtSF'])

def print_results(df):
    for col in df:
        skewness = df[col].skew()
        kurtosis = df[col].kurt()
        print(f'{col} - Skewness: {skewness}, Kurtosis: {kurtosis}')

#### Log

In [6]:
pipeline = Pipeline([
      ('log', vt.LogTransformer())
  ])

df_transformed = pipeline.fit_transform(df_all_transfomers)

print_results(df_transformed)

1stFlrSF - Skewness: -0.0033940831286170814, Kurtosis: 0.014686824240931262
GrLivArea - Skewness: -0.06827974061853226, Kurtosis: 0.17394728553530392
KitchenQual - Skewness: -0.45707412257729374, Kurtosis: 0.981299246303601
OverallQual - Skewness: -1.0895718566109704, Kurtosis: 5.307639739058256
YearBuilt - Skewness: -0.6412427272051278, Kurtosis: -0.366126488333435
YearRemodAdd - Skewness: -0.49881781879784226, Kurtosis: -1.27932044643099
SalePrice - Skewness: 0.030237620059670654, Kurtosis: 0.8427224177523174


#### Power

In [7]:
pipeline = Pipeline([
      ('power', vt.PowerTransformer())
  ])

df_transformed = pipeline.fit_transform(df_engineering)

print_results(df_transformed)

1stFlrSF - Skewness: 0.46309915819724773, Kurtosis: 0.2961737796979871
GarageArea - Skewness: -1.6093471440627554, Kurtosis: 4.008551742559867
GarageYrBlt - Skewness: -0.6842502586141582, Kurtosis: -0.2513785348703732
GrLivArea - Skewness: 0.469996793236193, Kurtosis: 0.5953215942036492
KitchenQual - Skewness: 0.03175894393166758, Kurtosis: -0.02710380481401531
OverallQual - Skewness: -0.29558343385175034, Kurtosis: 1.0381261862705289
TotalBsmtSF - Skewness: -1.4759676656207195, Kurtosis: 6.055496698779596
YearBuilt - Skewness: -0.6271094837568743, Kurtosis: -0.39805175852509356
YearRemodAdd - Skewness: -0.49511108812979787, Kurtosis: -1.2836540055554648
SalePrice - Skewness: 0.8512237087688703, Kurtosis: 1.6748275901451803


#### Box-Cox

In [8]:
pipeline = Pipeline([
      ('bct', vt.PowerTransformer())
  ])

df_transformed = pipeline.fit_transform(df_engineering)

print_results(df_transformed)

1stFlrSF - Skewness: 0.46309915819724773, Kurtosis: 0.2961737796979871
GarageArea - Skewness: -1.6093471440627554, Kurtosis: 4.008551742559867
GarageYrBlt - Skewness: -0.6842502586141582, Kurtosis: -0.2513785348703732
GrLivArea - Skewness: 0.469996793236193, Kurtosis: 0.5953215942036492
KitchenQual - Skewness: 0.03175894393166758, Kurtosis: -0.02710380481401531
OverallQual - Skewness: -0.29558343385175034, Kurtosis: 1.0381261862705289
TotalBsmtSF - Skewness: -1.4759676656207195, Kurtosis: 6.055496698779596
YearBuilt - Skewness: -0.6271094837568743, Kurtosis: -0.39805175852509356
YearRemodAdd - Skewness: -0.49511108812979787, Kurtosis: -1.2836540055554648
SalePrice - Skewness: 0.8512237087688703, Kurtosis: 1.6748275901451803


#### Yeo-Johnson

In [9]:
pipeline = Pipeline([
      ('yj', vt.PowerTransformer())
  ])

df_transformed = pipeline.fit_transform(df_engineering)

print_results(df_transformed)

1stFlrSF - Skewness: 0.46309915819724773, Kurtosis: 0.2961737796979871
GarageArea - Skewness: -1.6093471440627554, Kurtosis: 4.008551742559867
GarageYrBlt - Skewness: -0.6842502586141582, Kurtosis: -0.2513785348703732
GrLivArea - Skewness: 0.469996793236193, Kurtosis: 0.5953215942036492
KitchenQual - Skewness: 0.03175894393166758, Kurtosis: -0.02710380481401531
OverallQual - Skewness: -0.29558343385175034, Kurtosis: 1.0381261862705289
TotalBsmtSF - Skewness: -1.4759676656207195, Kurtosis: 6.055496698779596
YearBuilt - Skewness: -0.6271094837568743, Kurtosis: -0.39805175852509356
YearRemodAdd - Skewness: -0.49511108812979787, Kurtosis: -1.2836540055554648
SalePrice - Skewness: 0.8512237087688703, Kurtosis: 1.6748275901451803


### Implementing Transformations
Based on the results of the different transformations.
I've opted to use these transformations:
- 1stFlrSF : Log
- GrLivArea: Log
- KitchenQual: Yeo-Johnson
- OverallQual: Yeo-Johnson
- YearBuilt: Yeo-Johnson
- YearRemodAdd: Yeo-Johnson

In [10]:
from sklearn.preprocessing import PowerTransformer, FunctionTransformer
from scipy.stats import boxcox_normmax
import numpy as np

transformers = {
    '1stFlrSF': FunctionTransformer(np.log1p, validate=True),
    'GrLivArea': FunctionTransformer(np.log1p, validate=True),
    'KitchenQual': PowerTransformer(method='yeo-johnson', standardize=True),
    'OverallQual': PowerTransformer(method='yeo-johnson', standardize=True),
    'YearBuilt': PowerTransformer(method='yeo-johnson', standardize=True),
    'YearRemodAdd': PowerTransformer(method='yeo-johnson', standardize=True)
}

for feature, transformer in transformers.items():
    TrainSet[feature] = transformer.fit_transform(TrainSet[[feature]])
    TestSet[feature] = transformer.transform(TestSet[[feature]])

## Winsoriser


In [11]:
df_engineering = TrainSet.copy()

print_results(df_engineering)

1stFlrSF - Skewness: -0.0024576178568033193, Kurtosis: 0.013805767337972785
GarageArea - Skewness: 0.1743109161644365, Kurtosis: 0.8293437119659983
GarageYrBlt - Skewness: -0.6724667403046555, Kurtosis: -0.27790902874977785
GrLivArea - Skewness: -0.06743909950588259, Kurtosis: 0.1728823696400532
KitchenQual - Skewness: -4.727914421778203e-05, Kurtosis: -0.03464860436914341
OverallQual - Skewness: 0.023258692938383306, Kurtosis: 0.27943955459651937
TotalBsmtSF - Skewness: 0.5914851324202735, Kurtosis: 2.1657171267323623
YearBuilt - Skewness: -0.13338005255024799, Kurtosis: -1.1879043745582991
YearRemodAdd - Skewness: -0.2206970248777962, Kurtosis: -1.5166647791498673
SalePrice - Skewness: 1.7501064882951105, Kurtosis: 5.6902630514780945


In [12]:
quartiles = df_engineering[df_engineering.columns].quantile([0.25, 0.75])
quartiles

Unnamed: 0,1stFlrSF,GarageArea,GarageYrBlt,GrLivArea,KitchenQual,OverallQual,TotalBsmtSF,YearBuilt,YearRemodAdd,SalePrice
0.25,6.785588,336.0,1962.0,7.038782,-0.731764,-0.785748,798.75,-0.702933,-0.998599,130000.0
0.75,7.237239,576.0,2001.0,7.489691,0.78966,0.678885,1276.25,1.015993,0.988735,215000.0


Based on these results, we will winsorize these features:
- TotaltBsmtSF
- GarageArea
- GarageYrBlt

In [13]:
windsoriser = Winsorizer(capping_method='iqr', tail='both', fold=1.5,
                         variables=['TotalBsmtSF', 'GarageArea', 'GarageYrBlt'])

TrainSet = windsoriser.fit_transform(TrainSet)
TestSet = windsoriser.fit_transform(TestSet)

## Smart Correlated Selection

In [21]:
from feature_engine.selection import SmartCorrelatedSelection
corr_sel = SmartCorrelatedSelection(variables=None, method="spearman", threshold=0.8, selection_method="variance")

corr_sel.fit_transform(df_engineering)
corr_sel.correlated_feature_sets_

[{'1stFlrSF', 'TotalBsmtSF'},
 {'GarageYrBlt', 'YearBuilt'},
 {'OverallQual', 'SalePrice'}]

In [22]:
corr_sel.features_to_drop_

['1stFlrSF', 'OverallQual', 'YearBuilt']

---

# Conclusions