# Feature Engineering Notebook

## Objectives

- Engineer features for Regression and Decision Tree models

## Inputs

- outputs/datasets/cleaned/TrainSetCleaned.csv
- outputs/datasets/cleaned/TestSetCleaned.csv

## Outputs

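- Generate a list of variables to engineer
- outputs/datasets/cleaned/TrainSet_FeatureEngineered.csv
- outputs/datasets/cleaned/TestSet_FeatureEngineered.csv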

---

## Change working directory
Change current working directory to its parent

In [None]:
import os 
# Record the directory the kernel is currently running in;
# the bare `cwd` on the last line displays it as the cell output.
cwd = os.getcwd()
cwd

In [None]:
# Move the working directory to the parent of the previously recorded `cwd`
# (presumably so the relative "outputs/..." paths used later resolve from the project root).
# NOTE: run this once per kernel session. The following cell refreshes `cwd`,
# so re-running this cell afterwards climbs one more directory level.
os.chdir(os.path.dirname(cwd))
print("You set a new current working directory")

In [None]:
# Re-read the working directory after the change and display it for confirmation.
cwd = os.getcwd()
cwd

---

## Load cleaned data

In [61]:
import pandas as pd
# Load the cleaned training set from a path relative to the current working directory
# and preview its first three rows.
train_set_path = "outputs/datasets/cleaned/TrainSetCleaned.csv"
TrainSet = pd.read_csv(train_set_path)
TrainSet.head(3)

Unnamed: 0,1stFlrSF,2ndFlrSF,BedroomAbvGr,BsmtExposure,BsmtFinSF1,BsmtFinType1,BsmtUnfSF,GarageArea,GarageFinish,GrLivArea,...,SalePrice,HouseAge,RemodAge,GarageAge,TotalSF,AboveGradeSF,IsRemodeled,Has2ndFlr,HasPorch,HasDeck
0,1828,0.0,3.0,Av,48,Missing,1774,774,Unf,1828,...,314813,18,18,18.0,3650.0,1828.0,0,0,0,0
1,894,0.0,2.0,No,0,Unf,894,308,Missing,894,...,109500,63,63,63.0,1788.0,894.0,0,0,0,0
2,964,0.0,2.0,No,713,ALQ,163,432,Unf,964,...,163500,104,19,104.0,1840.0,964.0,1,0,0,0


In [62]:
# Load the cleaned test set from a path relative to the current working directory
# and preview its first three rows.
test_set_path = "outputs/datasets/cleaned/TestSetCleaned.csv"
TestSet = pd.read_csv(test_set_path)
TestSet.head(3)

Unnamed: 0,1stFlrSF,2ndFlrSF,BedroomAbvGr,BsmtExposure,BsmtFinSF1,BsmtFinType1,BsmtUnfSF,GarageArea,GarageFinish,GrLivArea,...,SalePrice,HouseAge,RemodAge,GarageAge,TotalSF,AboveGradeSF,IsRemodeled,Has2ndFlr,HasPorch,HasDeck
0,2515,0.0,4.0,No,1219,Rec,816,484,Missing,2515,...,200624,68,50,50.0,4550.0,2515.0,1,0,0,0
1,958,620.0,3.0,No,403,BLQ,238,240,Unf,1578,...,133000,84,75,84.0,2384.0,1578.0,1,1,0,0
2,979,224.0,3.0,No,185,LwQ,524,352,Unf,1203,...,110000,75,75,75.0,1912.0,1203.0,0,1,0,0


---

## Feature Engineering

Defining variables

OverallCond and OverallQual are also categorical variables, but they are already numerically encoded.

In [63]:
# Categorical columns that are integer-encoded with manual mappings later in this notebook.
ordinal_vars = ['BsmtExposure', 'BsmtFinType1', 'GarageFinish', 'KitchenQual']
# Numeric columns that are explored for transformations and for winsorising/discretising below.
numeric_vars = [
    '1stFlrSF', '2ndFlrSF', 'BedroomAbvGr', 'BsmtFinSF1', 'BsmtUnfSF',
    'GarageArea', 'GrLivArea', 'LotArea', 'LotFrontage', 'MasVnrArea',
    'OpenPorchSF', 'TotalBsmtSF', 'HouseAge', 'RemodAge', 'GarageAge',
    'TotalSF', 'AboveGradeSF'
]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from feature_engine import transformation as vt
import pandas as pd

def NumericFEAnalysis(df, var):
    """Plot how one numeric column looks under several candidate transformations.

    For the untransformed column and for each candidate transformer, one row of
    three panels is drawn: a histogram with KDE, a normal Q-Q plot and a boxplot.
    A transformer that raises on this column is reported with ``print`` and its
    row is replaced by a "not applicable" placeholder instead of empty axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to analyse.
    var : str
        Name of the column in ``df``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()`` and then closed.
    """
    df_temp = pd.DataFrame({var: df[var].copy()})

    transformers = {
        'Original': None,
        'Log (base e)': vt.LogTransformer(variables=[var]),
        'Log (base 10)': vt.LogTransformer(variables=[var], base='10'),
        'Reciprocal': vt.ReciprocalTransformer(variables=[var]),
        'Power': vt.PowerTransformer(variables=[var]),
        'BoxCox': vt.BoxCoxTransformer(variables=[var]),
        'YeoJohnson': vt.YeoJohnsonTransformer(variables=[var])
    }

    n_transforms = len(transformers)
    # squeeze=False always returns a 2-D array of axes, so axs[i][j] is valid
    # for any number of rows without special-casing a single row.
    fig, axs = plt.subplots(
        n_transforms, 3, figsize=(15, 5*n_transforms), squeeze=False
    )

    for i, (name, transformer) in enumerate(transformers.items()):
        if transformer is None:
            data_trans = df_temp[var].dropna()
        else:
            try:
                data_trans = transformer.fit_transform(df_temp[[var]])[var].dropna()
            except Exception as e:
                # Some transformers reject certain columns (presumably e.g. log-type
                # transforms on non-positive values). Keep going, but make the
                # skipped row explicit rather than leaving three blank axes.
                print(f"Transformer {name} failed on {var}: {e}")
                for ax in axs[i]:
                    ax.set_axis_off()
                axs[i][1].text(
                    0.5, 0.5, f'{var} - {name}: not applicable',
                    ha='center', va='center', transform=axs[i][1].transAxes
                )
                continue

        # Histogram
        sns.histplot(data_trans, kde=True, ax=axs[i][0])
        axs[i][0].set_title(f'{var} - {name} Histogram')

        # Q-Q plot
        stats.probplot(data_trans, dist="norm", plot=axs[i][1])
        axs[i][1].set_title(f'{var} - {name} Q-Q Plot')

        # Boxplot
        sns.boxplot(x=data_trans, ax=axs[i][2])
        axs[i][2].set_title(f'{var} - {name} Boxplot')

    plt.tight_layout()
    plt.show()
    # This function is called once per variable; release the figure so large
    # figures do not accumulate on backends that keep them open after show().
    plt.close(fig)


# Draw the transformation comparison grid for every numeric variable of the train set.
for num_var in numeric_vars:
    NumericFEAnalysis(TrainSet, num_var)

Conclusions
- 1stFlrSF, GrLivArea, TotalSF and AboveGradeSF should be numerically transformed; details are in the summary sheet.
- Binary flags (IsRemodeled, Has2ndFlr, HasPorch, HasDeck) don't require encoding.
- Categorical variables (KitchenQual, BsmtExposure, BsmtFinType1, GarageFinish) require ordinal encoding.

### Determine whether to handle outliers or discretise numeric variables

In [64]:
import pandas as pd
import numpy as np

def check_winsor_discretisation(df, numeric_vars, skew_threshold=1.0):
    """Build a per-variable table of winsorising / discretising recommendations.

    For each column, missing values are dropped and the following are computed:
    skewness, the upper whisker ``Q3 + 1.5 * IQR`` and the maximum.

    * Winsorising is recommended when the skewness exceeds ``skew_threshold``
      or the maximum exceeds three times the upper whisker.
    * Discretising is recommended when the skewness exceeds ``skew_threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to assess.
    numeric_vars : iterable of str
        Column names to assess, reported in the given order.
    skew_threshold : float, default 1.0
        Skewness above which a column counts as highly skewed.

    Returns
    -------
    pandas.DataFrame
        One row per variable with the columns ``Variable``, ``Skewness``
        (rounded to 2 decimals), ``Max``, ``Should Winsorize``,
        ``Reason Winsorize``, ``Should Discretize`` and ``Reason Discretize``.
    """

    def _assess(var):
        # Evaluate a single column and return its summary row.
        values = df[var].dropna()
        skewness = values.skew()

        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        iqr = third_quartile - first_quartile
        upper_whisker = third_quartile + 1.5 * iqr
        max_value = values.max()

        is_skewed = skewness > skew_threshold

        # Start from the "no action" verdicts and upgrade them when a rule fires.
        winsor_flag = "No"
        reason_winsor = "Skewness and outliers not extreme"
        if is_skewed or max_value > upper_whisker * 3:
            winsor_flag = "Yes"
            reason_winsor = f"High skewness ({skewness:.2f}) or extreme max ({max_value:.0f} > {upper_whisker * 3:.0f})"

        disc_flag = "No"
        reason_disc = "Distribution roughly symmetric"
        if is_skewed:
            disc_flag = "Yes"
            reason_disc = f"High skewness ({skewness:.2f})"

        return {
            "Variable": var,
            "Skewness": round(skewness, 2),
            "Max": max_value,
            "Should Winsorize": winsor_flag,
            "Reason Winsorize": reason_winsor,
            "Should Discretize": disc_flag,
            "Reason Discretize": reason_disc
        }

    return pd.DataFrame([_assess(var) for var in numeric_vars])


In [None]:
# Apply the skewness/outlier heuristics to every numeric variable of the train set;
# the bare `summary_df` on the last line displays the resulting table.
summary_df = check_winsor_discretisation(TrainSet, numeric_vars)
summary_df

### Creating pipeline 

In [65]:
from sklearn.pipeline import Pipeline
from feature_engine.transformation import LogTransformer, PowerTransformer
from feature_engine.outliers import Winsorizer
from feature_engine.discretisation import EqualWidthDiscretiser 

# OrdinalEncoder is not used because the order of the categories matters;
# the manual mappings below fix the integer rank of every category explicitly.

TrainSet_copy = TrainSet.copy()

mapping_KitchenQual = {'Ex':4, 'Gd':3, 'TA':2, 'Fa':1}
mapping_BsmtExposure = {'Gd':5, 'Av':4, 'Mn':3, 'No':2, 'Missing':1}
mapping_BsmtFinType1 = {'GLQ':7, 'ALQ':6, 'BLQ':5, 'Rec':4, 'LwQ':3, 'Unf':2, 'Missing':1}
mapping_GarageFinish = {'Fin':4, 'RFn':3, 'Unf':2, 'Missing':1}

# Source column -> mapping; every column listed here gets a "<column>_enc" twin
# and the original text column is dropped afterwards.
ordinal_mappings = {
    'KitchenQual': mapping_KitchenQual,
    'BsmtExposure': mapping_BsmtExposure,
    'BsmtFinType1': mapping_BsmtFinType1,
    'GarageFinish': mapping_GarageFinish,
}

for col, mapping in ordinal_mappings.items():
    # Series.map silently turns any category absent from the mapping into NaN,
    # so check first and fail loudly instead of writing hidden missing values.
    unmapped = set(TrainSet_copy[col].dropna().unique()) - set(mapping)
    if unmapped:
        raise ValueError(
            f"Unmapped categories in TrainSet['{col}']: {sorted(map(str, unmapped))}"
        )
    TrainSet_copy[f'{col}_enc'] = TrainSet_copy[col].map(mapping)

TrainSet_copy = TrainSet_copy.drop(columns=list(ordinal_mappings))

In [66]:
# Encode the ordinal categorical columns of the test set with the same explicit
# category ranks that are used for the train set.
TestSet_copy = TestSet.copy()

mapping_KitchenQual = {'Ex':4, 'Gd':3, 'TA':2, 'Fa':1}
mapping_BsmtExposure = {'Gd':5, 'Av':4, 'Mn':3, 'No':2, 'Missing':1}
mapping_BsmtFinType1 = {'GLQ':7, 'ALQ':6, 'BLQ':5, 'Rec':4, 'LwQ':3, 'Unf':2, 'Missing':1}
mapping_GarageFinish = {'Fin':4, 'RFn':3, 'Unf':2, 'Missing':1}

# Source column -> mapping; every column listed here gets a "<column>_enc" twin
# and the original text column is dropped afterwards.
ordinal_mappings = {
    'KitchenQual': mapping_KitchenQual,
    'BsmtExposure': mapping_BsmtExposure,
    'BsmtFinType1': mapping_BsmtFinType1,
    'GarageFinish': mapping_GarageFinish,
}

for col, mapping in ordinal_mappings.items():
    # Series.map silently turns any category absent from the mapping into NaN,
    # so check first and fail loudly instead of writing hidden missing values.
    unmapped = set(TestSet_copy[col].dropna().unique()) - set(mapping)
    if unmapped:
        raise ValueError(
            f"Unmapped categories in TestSet['{col}']: {sorted(map(str, unmapped))}"
        )
    TestSet_copy[f'{col}_enc'] = TestSet_copy[col].map(mapping)

TestSet_copy = TestSet_copy.drop(columns=list(ordinal_mappings))

In [68]:
# Columns handed to the Winsorizer (the same list is reused by the discretiser below).
winsor_vars = ['GrLivArea', 'LotArea', 'LotFrontage', 'MasVnrArea', 'OpenPorchSF', 'AboveGradeSF']
# Columns handed to the LogTransformer.
log_vars = ['1stFlrSF', 'GrLivArea', 'AboveGradeSF']
# Columns handed to the PowerTransformer.
power_vars = ['TotalSF']

# IQR-based capping on both tails with fold=1.5.
winsorizer = Winsorizer(
    capping_method='iqr',
    tail='both',
    fold=1.5,
    variables=winsor_vars
)

# Both transformers are created with their library-default settings.
log_transformer = LogTransformer(variables=log_vars)
power_transformer = PowerTransformer(variables=power_vars)

# Equal-width binning into 5 bins, applied to every column in winsor_vars.
# NOTE(review): 'GrLivArea' and 'AboveGradeSF' are in both winsor_vars and log_vars, so they are
# capped, log-transformed and then binned - confirm that binning the log-transformed values is intended.
discretiser = EqualWidthDiscretiser(bins=5, variables=winsor_vars)

# Steps are applied in the order listed.
numerical_pipeline = Pipeline([
    ('winsorizer', winsorizer),
    ('log_transformer', log_transformer),
    ('power_transformer', power_transformer),
    ('discretiser', discretiser)
])

# Fit on the training copy only; the test copy is transformed with the fitted pipeline.
numerical_pipeline.fit(TrainSet_copy)

TrainSet_transformed = numerical_pipeline.transform(TrainSet_copy)
TestSet_transformed = numerical_pipeline.transform(TestSet_copy)


In [69]:
# Inspect column dtypes and non-null counts of the transformed training set.
TrainSet_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1168 entries, 0 to 1167
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   1stFlrSF          1168 non-null   float64
 1   2ndFlrSF          1168 non-null   float64
 2   BedroomAbvGr      1168 non-null   float64
 3   BsmtFinSF1        1168 non-null   int64  
 4   BsmtUnfSF         1168 non-null   int64  
 5   GarageArea        1168 non-null   int64  
 6   GrLivArea         1168 non-null   int64  
 7   LotArea           1168 non-null   int64  
 8   LotFrontage       1168 non-null   int64  
 9   MasVnrArea        1168 non-null   int64  
 10  OpenPorchSF       1168 non-null   int64  
 11  OverallCond       1168 non-null   int64  
 12  OverallQual       1168 non-null   int64  
 13  TotalBsmtSF       1168 non-null   int64  
 14  SalePrice         1168 non-null   int64  
 15  HouseAge          1168 non-null   int64  
 16  RemodAge          1168 non-null   int64  


---

## Saving outputs

In [70]:
from pathlib import Path

# Write both engineered sets as CSV files (without the index) into the output folder,
# creating the folder first when it does not exist yet.
out_dir = Path("outputs/datasets/cleaned")
out_dir.mkdir(parents=True, exist_ok=True)

engineered_sets = {
    "TrainSet_FeatureEngineered.csv": TrainSet_transformed,
    "TestSet_FeatureEngineered.csv": TestSet_transformed,
}
for file_name, frame in engineered_sets.items():
    frame.to_csv(out_dir / file_name, index=False)