# Data Cleaning and Preprocessing Notebook

From our earlier analysis in the EDA notebook, we've identified that some features in our dataset have missing values.

Let's start by checking the percentage of missing values in each feature.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Locations of the raw train/test CSV files (relative paths)
TRAIN_CSV_PATH = '../data_details/train.csv'
TEST_CSV_PATH = '../data_details/test.csv'

# Read both splits into DataFrames
train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)

In [2]:
# Share of missing entries per column of the training set, as a percentage
missing_values = train_df.isna().mean().mul(100)

# Keep only the columns that actually have gaps and list the worst ones first
missing_values.loc[lambda pct: pct > 0].sort_values(ascending=False)


PoolQC          99.520548
MiscFeature     96.301370
Alley           93.767123
Fence           80.753425
FireplaceQu     47.260274
LotFrontage     17.739726
GarageType       5.547945
GarageYrBlt      5.547945
GarageFinish     5.547945
GarageQual       5.547945
GarageCond       5.547945
BsmtExposure     2.602740
BsmtFinType2     2.602740
BsmtFinType1     2.534247
BsmtCond         2.534247
BsmtQual         2.534247
MasVnrArea       0.547945
MasVnrType       0.547945
Electrical       0.068493
dtype: float64

Features like PoolQC, MiscFeature, Alley, Fence, and FireplaceQu are missing more than 45% of their values.

Based on the high percentage of missing values we can assume that the absence of a value might indicate the absence of the feature itself. For instance, a NaN (or missing value) in PoolQC likely means that the house doesn't have a pool. Similarly, a NaN in GarageType likely indicates that there is no garage.

So, I choose to handle these cases by replacing the missing values with 'None' or 0, indicating that the house lacks these features.

For features like LotFrontage, which represents the linear feet of street connected to the property, we can use imputation. 

In the housing market, lot frontage tends to be similar for houses within the same neighborhood of a city.

So I decide to impute missing values in LotFrontage with the median LotFrontage of the neighborhood. 

The feature Electrical has just one missing value, and so we can replace it with the mode of the column without introducing much bias.



In [3]:
# NOTE: every fill below assigns the result back to the column.
# `train_df[col].fillna(..., inplace=True)` is chained assignment through an
# inplace method: it raises a FutureWarning in recent pandas 2.x releases and
# leaves train_df unchanged once Copy-on-Write is enabled.

# Filling missing values for features where NaN means the feature doesn't exist
for feature in ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
                'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtCond', 'BsmtQual',
                'MasVnrType']:
    train_df[feature] = train_df[feature].fillna('None')

# Filling missing values for features where NaN means zero
for feature in ['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1',
                'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath',
                'BsmtHalfBath', 'MasVnrArea']:
    train_df[feature] = train_df[feature].fillna(0)

# Fill missing LotFrontage with the median LotFrontage of the house's neighborhood.
# The overall median is the fallback for a neighborhood with no observed
# LotFrontage at all, whose group median would itself be NaN.
overall_lot_frontage_median = train_df['LotFrontage'].median()
neighborhood_lot_frontage_median = train_df.groupby('Neighborhood')['LotFrontage'].transform('median')
train_df['LotFrontage'] = (
    train_df['LotFrontage']
    .fillna(neighborhood_lot_frontage_median)
    .fillna(overall_lot_frontage_median)
)

# For Electrical, we'll replace it with the most common value
train_df['Electrical'] = train_df['Electrical'].fillna(train_df['Electrical'].mode()[0])

# Checking if there are any missing values left
train_df.isnull().sum().sum()


0

### Handle missing values in the test set as well

I'll apply the same imputation steps to the test set.

In [4]:
# Share of missing entries per column of the test set, as a percentage
missing_values = test_df.isna().mean().mul(100)

# Keep only the columns that actually have gaps and list the worst ones first
missing_values.loc[lambda pct: pct > 0].sort_values(ascending=False)


PoolQC          99.794380
MiscFeature     96.504455
Alley           92.666210
Fence           80.123372
FireplaceQu     50.034270
LotFrontage     15.558602
GarageCond       5.346127
GarageYrBlt      5.346127
GarageQual       5.346127
GarageFinish     5.346127
GarageType       5.209047
BsmtCond         3.084304
BsmtExposure     3.015764
BsmtQual         3.015764
BsmtFinType1     2.878684
BsmtFinType2     2.878684
MasVnrType       1.096642
MasVnrArea       1.028101
MSZoning         0.274160
BsmtFullBath     0.137080
BsmtHalfBath     0.137080
Functional       0.137080
Utilities        0.137080
GarageCars       0.068540
GarageArea       0.068540
TotalBsmtSF      0.068540
KitchenQual      0.068540
BsmtUnfSF        0.068540
BsmtFinSF2       0.068540
BsmtFinSF1       0.068540
Exterior2nd      0.068540
Exterior1st      0.068540
SaleType         0.068540
dtype: float64

In [5]:
# NOTE: every fill below assigns the result back to the column.
# `test_df[col].fillna(..., inplace=True)` is chained assignment through an
# inplace method: it raises a FutureWarning in recent pandas 2.x releases and
# leaves test_df unchanged once Copy-on-Write is enabled.

# Filling missing values for features where NaN means the feature doesn't exist
for feature in ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
                'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtCond', 'BsmtQual',
                'MasVnrType']:
    test_df[feature] = test_df[feature].fillna('None')

# Filling missing values for features where NaN means zero
for feature in ['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1',
                'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath',
                'BsmtHalfBath', 'MasVnrArea']:
    test_df[feature] = test_df[feature].fillna(0)

# Fill missing LotFrontage with the median LotFrontage of the house's neighborhood.
# The overall median is the fallback for a neighborhood with no observed
# LotFrontage at all, whose group median would itself be NaN.
overall_lot_frontage_median = test_df['LotFrontage'].median()
neighborhood_lot_frontage_median = test_df.groupby('Neighborhood')['LotFrontage'].transform('median')
test_df['LotFrontage'] = (
    test_df['LotFrontage']
    .fillna(neighborhood_lot_frontage_median)
    .fillna(overall_lot_frontage_median)
)

# For Electrical, we'll replace it with the most common value
test_df['Electrical'] = test_df['Electrical'].fillna(test_df['Electrical'].mode()[0])

# Checking if there are any missing values left
test_df.isnull().sum().sum()


12

In [6]:
# Per-column count of the missing values still present in the test set
test_df.isna().sum()

Id               0
MSSubClass       0
MSZoning         4
LotFrontage      0
LotArea          0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         1
SaleCondition    0
Length: 80, dtype: int64

In [7]:
# Fill every categorical (object-dtype) column that still has gaps with its mode.
# MSZoning and SaleType are not the only ones left: the missing-value report for
# the test set also lists Utilities, Functional, KitchenQual, Exterior1st and
# Exterior2nd. Left as NaN, those rows would later be one-hot encoded as all
# zeros, i.e. silently treated as the dropped reference category.
object_columns = test_df.select_dtypes(include=['object']).columns
columns_with_gaps = [col for col in object_columns if test_df[col].isnull().any()]

for feature in columns_with_gaps:
    mode_value = test_df[feature].mode()[0]
    # Assign back rather than fillna(inplace=True) on a column selection, which
    # is chained assignment and does not update test_df under Copy-on-Write.
    test_df[feature] = test_df[feature].fillna(mode_value)

In [8]:
# Re-check the per-column missing counts of the test set after the mode fill
test_df.isna().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
Length: 80, dtype: int64

In [9]:
# Next step: find the categorical columns that need to be encoded.
# Columns stored with the object dtype are the ones treated as categorical.
categorical_features = train_df.select_dtypes(include='object').columns

# Show the selected column names
categorical_features


Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

We will apply one-hot encoding to these categorical variables. This method creates a binary column for each category and is suitable for nominal variables, where no ordinal relationship exists between the categories. Some variables might seem ordinal (e.g., 'ExterQual', 'BsmtQual'), but since we don't have a specific order provided, we'll treat them as nominal.



In [10]:
# Applying one-hot encoding to categorical variables.
# Both frames are first cast to a categorical dtype whose levels come from the
# training data. Encoding the two frames independently lets drop_first=True drop
# a different reference level in each frame whenever the test set lacks the
# training set's first category; after alignment the affected test rows would
# then be encoded as a different category than the same value in train.
category_dtypes = {
    feature: pd.CategoricalDtype(train_df[feature].astype('category').cat.categories)
    for feature in categorical_features
}
train_data_encoded = pd.get_dummies(train_df.astype(category_dtypes), columns=categorical_features, drop_first=True)
# Test values never seen in training become NaN under the training levels and
# therefore encode as all zeros for that feature.
test_data_encoded = pd.get_dummies(test_df.astype(category_dtypes), columns=categorical_features, drop_first=True)

# Aligning the training and test datasets to have the same columns.
# With shared levels the dummy columns already match; the left join only adds
# columns that exist in the training frame alone (presumably just the SalePrice
# target — confirm against the raw files).
train_data_encoded, test_data_encoded = train_data_encoded.align(test_data_encoded, join='left', axis=1)

# Filling the columns added by the alignment with 0 so the test frame has no NaN
test_data_encoded = test_data_encoded.fillna(0)

# Checking the shape of the encoded datasets
train_data_encoded.shape, test_data_encoded.shape

((1460, 261), (1459, 261))

The one-hot encoding has been successfully applied to the categorical variables, resulting in 261 columns in both the training and test datasets.



## Feature engineering

It's time to create new features. One deliberate choice first: I decided to create the new features before scaling, so that feature engineering works on the encoded data rather than on already-scaled data.

Why?

By using the raw (encoded) data, we can create features that have interpretable values. For instance, the OverallQual_GrLivArea interaction term directly multiplies the quality rating with the living area, providing a clear interpretation of the interaction.
After feature engineering, we then scale the data so that all features, including the new ones, have consistent scales.

If we used the scaled datasets for feature engineering, the new features would be based on standardized values, which might be less interpretable.

I thought it was better to perform feature engineering before scaling at all so that any new features created have a clear and direct interpretation. 

Then after engineering, the data, including the new features, can be scaled to prepare it for modeling.

So, I moved the scaling and normalization step below feature engineering to follow a logical sequence: preprocessing -> feature engineering -> scaling -> modeling.

In [14]:
# Column groups used to build the engineered features (chosen from EDA and
# domain knowledge). The dataset has many area-related columns, so they are
# collected in lists that can be aggregated later.

# Porch-related features in the dataset
porch_features = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']

# Area-related features in the dataset; the porch columns sit between the
# interior/basement areas and the pool area
area_features = [
    "LotArea", "GrLivArea", "GarageArea", "1stFlrSF", "2ndFlrSF",
    "TotalBsmtSF", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
    *porch_features,
    "PoolArea",
]


In [15]:

def _add_engineered_features(frame):
    """Add the engineered feature columns to ``frame`` in place.

    Reads the notebook-level ``area_features`` and ``porch_features`` lists and
    writes seven new columns: TotalArea, AgeAtSale, TotalBath, TotalPorchSF,
    HasPool, Remodeled and OverallQual_GrLivArea.
    """
    # Total Area: row-wise sum of every area-related column
    frame['TotalArea'] = frame[area_features].sum(axis=1)

    # Age of House at Sale
    frame['AgeAtSale'] = frame['YrSold'] - frame['YearBuilt']

    # Total Bathrooms: half baths count as 0.5
    above_ground_baths = frame['FullBath'] + 0.5*frame['HalfBath']
    frame['TotalBath'] = above_ground_baths + frame['BsmtFullBath'] + 0.5*frame['BsmtHalfBath']

    # Total Porch Area
    frame['TotalPorchSF'] = frame[porch_features].sum(axis=1)

    # Presence of Pool: 1 when the pool area is positive, otherwise 0
    frame['HasPool'] = frame['PoolArea'].map(lambda pool_area: int(pool_area > 0))

    # Remodeling Indicator: 1 when the remodel year differs from the build year
    frame['Remodeled'] = frame['YearBuilt'].ne(frame['YearRemodAdd']).astype(int)

    # Interaction feature between the overall quality and living area
    frame['OverallQual_GrLivArea'] = frame['OverallQual'] * frame['GrLivArea']


# Apply the same feature definitions to the training and the test data
for encoded_frame in (train_data_encoded, test_data_encoded):
    _add_engineered_features(encoded_frame)

# Displaying the head of the encoded training data after feature creation
train_data_encoded.head()


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,TotalArea,AgeAtSale,TotalBath,TotalPorchSF,HasPool,Remodeled,OverallQual_GrLivArea
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,1,0,14237.0,5,3.5,61,0,0,11970
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,1,0,15122.0,31,2.5,298,0,0,7572
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,1,0,17040.0,7,3.5,42,0,1,12502
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,14905.0,91,2.0,307,0,1,12019
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,1,0,21918.0,8,3.5,276,0,0,17584


## Normalize Numerical Variables


In the context of our Ames Housing dataset, with its high-dimensional input space and our new features, some preprocessing steps such as feature scaling become essential. The dataset contains a mix of numerical features, each with varying scales. Algorithms that rely on distances or gradients, such as Linear Regression, Support Vector Machines, or those with regularization, can be heavily influenced by the magnitude of features. By ensuring that all features operate on a consistent scale, we eliminate any undue influence a larger scale feature might exert, thereby ensuring a balanced and fair contribution from all features during the modeling process. Furthermore, when it comes to optimization techniques like gradient descent, scaling aids in achieving faster and more stable convergence. As our objective includes experimenting with multiple regression algorithms and neural networks, and given the necessity of feature extraction highlighted in the brief, scaling acts as a foundational step. Not only does it improve the performance and stability of many algorithms, but it also aligns with best practices, ensuring that our models' results are both reliable and interpretable.

### Why do we exclude the target variable from scaling?

When we scale features, our objective is to ensure that the predictors have a consistent influence on the model, preventing any one feature from exerting undue influence solely due to its scale. However, the target variable, 'SalePrice' in our dataset, is what we're trying to predict, and its scale is inherently meaningful. By altering its scale, we risk losing the direct interpretability of our model's predictions.

TL;DR - Keeping 'SalePrice' in its original scale ensures that our model predictions are immediately relevant and interpretable, aligning with the objective of building a model that provides direct insights into house prices.

In [16]:
from sklearn.preprocessing import StandardScaler

# Separating the target variable from the predictors.
# The predictors go into a new frame instead of dropping SalePrice in place, so
# re-running this cell does not fail with a KeyError on an already-removed column.
y_train = train_data_encoded['SalePrice']
X_train = train_data_encoded.drop(columns='SalePrice')

# Scaling only the features shared by the training predictors and the test data.
# The intersection is taken from X_train so the target can never be selected.
common_features = X_train.columns.intersection(test_data_encoded.columns)

# Scaling the numerical variables for common features: the scaler is fitted on
# the training data only and the same transformation is applied to the test data.
# NOTE(review): 'Id' is among the common features and is standardized like a
# real predictor — confirm whether it should be excluded before modeling.
scaler = StandardScaler()
train_data_scaled = scaler.fit_transform(X_train[common_features])
test_data_scaled = scaler.transform(test_data_encoded[common_features])

# Checking the first few rows to ensure scaling has been applied
pd.DataFrame(train_data_scaled, columns=common_features).head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,TotalArea,AgeAtSale,TotalBath,TotalPorchSF,HasPool,Remodeled,OverallQual_GrLivArea
0,-1.730865,0.073375,-0.231877,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.514104,0.575425,...,-0.117851,0.467651,-0.305995,-0.150528,-1.043259,1.642256,-0.768375,-0.069409,-0.95446,0.442827
1,-1.728492,-0.872563,0.437043,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57075,1.171992,...,-0.117851,0.467651,-0.305995,-0.067924,-0.183465,0.368581,0.745011,-0.069409,-0.95446,-0.405394
2,-1.72612,0.073375,-0.098093,0.07348,0.651479,-0.5172,0.984752,0.830215,0.325915,0.092907,...,-0.117851,0.467651,-0.305995,0.111099,-0.977121,1.642256,-0.889702,-0.069409,1.047712,0.545431
3,-1.723747,0.309859,-0.45485,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57075,-0.499274,...,-0.117851,-2.138345,-0.305995,-0.088178,1.800676,-0.268257,0.802481,-0.069409,1.047712,0.452277
4,-1.721374,0.073375,0.615421,0.375148,1.374795,-0.5172,0.951632,0.733308,1.366489,0.463568,...,-0.117851,0.467651,-0.305995,0.566402,-0.944052,1.642256,0.604528,-0.069409,-0.95446,1.525572


In [18]:
# Wrap the scaled arrays in DataFrames so the CSV files carry the feature names
train_scaled_frame = pd.DataFrame(train_data_scaled, columns=common_features)
test_scaled_frame = pd.DataFrame(test_data_scaled, columns=common_features)

# Write the scaled training data, the scaled test data and the target variable,
# each without the index column
train_scaled_frame.to_csv("../data_details/train_data_scaled.csv", index=False)
test_scaled_frame.to_csv("../data_details/test_data_scaled.csv", index=False)
y_train.to_csv("../data_details/y_train.csv", index=False)
