# Data Preprocessing for Ames Iowa Housing Dataset

## Goals
- Clean missing values
- Encode categorical variables
- Transform quantitative variables

## Imports and Data Loading

In [104]:
import pandas as pd
import numpy as np
import scipy.stats as stats

In [105]:
# Load the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [106]:
# Preview the first five training rows to sanity-check the load.
train.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [107]:
# Preview the first five test rows to sanity-check the load.
test.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


## Fill Missing Values

Missing values are initially filled with the mode/median, 0, or 'None', depending on which is most appropriate for each column.

In [108]:
def _impute_shared_columns(frame, train_raw):
    """
    Fill, in place, the columns that are imputed the same way for train and test.

    Every statistic (mode / median) is read from `train_raw`, the training data
    *before* any imputation. Reading them from the already-imputed training
    frame would let the 'None' placeholder become the most frequent value of
    sparse columns such as PoolQC, so a test house that has a pool but no
    recorded quality would be labelled 'None'.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to impute; its columns are overwritten.
    train_raw : pandas.DataFrame
        Un-imputed training data used as the source of fill statistics.
    """
    frame['Electrical'] = frame['Electrical'].fillna(train_raw['Electrical'].mode()[0])
    for col in ('Alley', 'MiscFeature', 'Fence'):
        frame[col] = frame[col].fillna('None')
    frame['LotFrontage'] = frame['LotFrontage'].fillna(0)

    # MasVnrArea has to be filled first: masonry_check compares it to a
    # threshold to decide how a missing MasVnrType is filled.
    frame['MasVnrArea'] = frame['MasVnrArea'].fillna(0)
    frame['MasVnrType'] = frame.apply(lambda row: masonry_check(row, train_raw), axis=1)

    # (check_col, target_col): a missing target becomes 'None' when check_col
    # is 0 (feature absent) and the training mode otherwise.
    conditional_fills = [
        ('PoolArea', 'PoolQC'),
        ('Fireplaces', 'FireplaceQu'),
        ('TotalBsmtSF', 'BsmtCond'),
        ('TotalBsmtSF', 'BsmtQual'),
        ('TotalBsmtSF', 'BsmtExposure'),
        ('BsmtFinSF1', 'BsmtFinType1'),
        # NOTE(review): BsmtFinType2 is keyed on BsmtFinSF1, exactly as in the
        # original code; confirm whether BsmtFinSF2 or TotalBsmtSF was intended.
        ('BsmtFinSF1', 'BsmtFinType2'),
        ('GarageArea', 'GarageQual'),
        ('GarageArea', 'GarageCond'),
        ('GarageArea', 'GarageFinish'),
        ('GarageArea', 'GarageType'),
    ]
    for check_col, target_col in conditional_fills:
        frame[target_col] = frame.apply(
            lambda row: validity_check(row, check_col, target_col, train_raw), axis=1
        )

    # GarageYrBlt is numeric, so it is kept numeric in both frames: the training
    # median when a garage exists but the year is missing, 0 when there is no
    # garage. (Previously train received the string 'None' here while test
    # received 0 for every missing year.)
    garage_year = frame['GarageYrBlt']
    needs_median = garage_year.isna() & (frame['GarageArea'] != 0)
    frame['GarageYrBlt'] = garage_year.mask(
        needs_median, train_raw['GarageYrBlt'].median()
    ).fillna(0)


def preprocess_data(train, test):
    """
    Return imputed copies of the train and test frames with the 'Id' column dropped.

    The inputs are not modified. All fill statistics come from the raw `train`
    frame so that train and test are imputed with identical values.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Raw frames as loaded from CSV; both must contain an 'Id' column and the
        columns imputed below.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        `(train_processed, test_processed)`.
    """
    # Copy the train and test DataFrames to avoid changing the original data
    train_processed = train.copy().drop(columns=['Id'])
    test_processed = test.copy().drop(columns=['Id'])

    # Additional NaN values present in test. These run before the shared step
    # because the area columns act as "feature exists" flags there: a NaN area
    # would compare != 0 and be treated as an existing garage/basement.
    zero_fill_cols = (
        'GarageArea', 'GarageCars',
        'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
        'BsmtFullBath', 'BsmtHalfBath',
    )
    for col in zero_fill_cols:
        test_processed[col] = test_processed[col].fillna(0)

    mode_fill_cols = (
        'Exterior1st', 'Exterior2nd', 'KitchenQual', 'SaleType',
        'Utilities', 'Functional', 'MSZoning',
    )
    for col in mode_fill_cols:
        test_processed[col] = test_processed[col].fillna(train[col].mode()[0])  # Use train mode

    # Same imputation rules for both frames, always driven by raw train statistics
    _impute_shared_columns(train_processed, train)
    _impute_shared_columns(test_processed, train)

    return train_processed, test_processed

def validity_check(row, check_col, target_col, train, median=False):
    """
    Impute one possibly-missing value of `row[target_col]`.

    `row[check_col]` indicates whether the underlying feature exists
    (e.g. GarageArea for the garage columns):

    - target present: returned unchanged.
    - target missing and check_col != 0: the feature exists, so fill with the
      median of `train[target_col]` when `median=True`, otherwise its mode.
    - target missing and check_col == 0: the feature is absent, so return a
      placeholder: 'None' for categorical targets, or 0 when `median=True`
      so that a numeric column is not polluted with a string.

    Parameters
    ----------
    row : pandas.Series
        One row of the frame being imputed.
    check_col, target_col : str
        Column names read from `row` (and `target_col` from `train`).
    train : pandas.DataFrame
        Frame supplying the fill statistic.
    median : bool, default False
        Treat the target as numeric and use the median instead of the mode.
    """
    current = row[target_col]
    if not pd.isna(current):
        return current

    # A NaN check value compares != 0 and is therefore treated as "feature exists".
    if row[check_col] != 0:
        source = train[target_col]
        return source.median() if median else source.mode()[0]

    return 0 if median else 'None'

def masonry_check(row, train):
    """
    Resolve the masonry veneer type for one row.

    An existing MasVnrType is returned as-is. A missing one becomes 'None'
    when the row's MasVnrArea is at most 10, and otherwise the most frequent
    MasVnrType in `train`.
    """
    veneer_type = row['MasVnrType']
    if not pd.isna(veneer_type):
        return veneer_type
    # Keep the `<= 10` form: a NaN area compares False and falls through to the mode.
    return 'None' if row['MasVnrArea'] <= 10 else train['MasVnrType'].mode()[0]

# Build the imputed train/test frames that the following cells inspect and encode.
train_processed, test_processed = preprocess_data(train, test)

In [109]:
# Number of columns that still contain at least one NaN, as (train, test).
tuple(frame.isna().any().sum() for frame in (train_processed, test_processed))

(0, 0)

### Notes
- I am unsure about the way I have processed missing values for *LotFrontage*
    - Statistical testing showed a correlation between *LotArea* and *LotFrontage*
    - Statistical testing showed a statistically significant difference in mean *LotFrontage* between *Neighborhood* groups
    - For now I have assumed that a missing lot frontage means no direct access to public roads, but it is not possible to confirm or deny this without accessing additional geographic information
    - It would be useful to test model performance with different ways of filling null values
- Many more columns in the test dataset had missing values. No special processing was done for these; depending on the column's type, a 0 or the mode was used to fill the missing values.

## Addressing Outliers
Target variable has many outliers which may affect a model's ability to accurately predict across the range of values. These shall be removed from the training set.

## Addressing Skew
In the EDA it was discovered that all features including target were highly skewed. For some models this greatly hinders the ability to make predictions. Features with high skew will be transformed to make them more normally distributed

In [115]:
# Inspect the most expensive sales: rows priced above 65% of the maximum SalePrice.
train_processed.loc[train_processed['SalePrice'].gt(0.65 * train_processed['SalePrice'].max())]

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
178,20,RL,63.0,17423,Pave,,IR1,Lvl,AllPub,CulDSac,...,0,,,,0,7,2009,New,Partial,501837
440,20,RL,105.0,15431,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2009,WD,Normal,555000
691,60,RL,104.0,21535,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,1,2007,WD,Normal,755000
769,60,RL,47.0,53504,Pave,,IR2,HLS,AllPub,CulDSac,...,0,,,,0,6,2010,WD,Normal,538000
803,60,RL,107.0,13891,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,1,2009,New,Partial,582933
898,20,RL,100.0,12919,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,3,2010,New,Partial,611657
1046,60,RL,85.0,16056,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,7,2006,New,Partial,556581
1169,60,RL,118.0,35760,Pave,,IR1,Lvl,AllPub,CulDSac,...,0,,,,0,7,2006,WD,Normal,625000
1182,60,RL,160.0,15623,Pave,,IR1,Lvl,AllPub,Corner,...,555,Ex,MnPrv,,0,7,2007,WD,Abnorml,745000


## Ordinal Encoded Features
Based on analysis of the data description file provided with the dataset, many categorical features have an inherent order and can be appropriately label encoded. The features are as follows:
- OverallQual
- OverallCond
- ExterQual
- ExterCond
- BsmtQual (references height of basement so might not be appropriate)
- BsmtCond
- HeatingQC
- KitchenQual
- FireplaceQu (references type of fireplace so might not be appropriate)
- GarageQual
- GarageCond
- PoolQC

## Label Encoded Features
Some features are binary; these can be appropriately label encoded.
- Street -> isPaved (Gravel, Paved)
- CentralAir -> isCentralAir (Yes, No)

## OHE Features
The rest of the categorical features were one hot encoded

In [99]:
def encode_cat_features(train, test):
    """
    Encode the categorical columns of the train and test frames.

    Steps, applied to copies so the caller's frames are left untouched and the
    calling cell can be re-run:

    1. Quality/condition columns are mapped to the ordinal scale
       None=0 < Po < Fa < TA < Gd < Ex=5.
    2. 'Street' and 'CentralAir' are replaced by the 0/1 flags 'isPaved' and
       'isCentralAir'.
    3. Every remaining object-dtype column of `train` is one-hot encoded with
       the first level dropped. The levels are taken from `train` and applied
       to both frames, so the dropped baseline is the same in each; a test
       value never seen in `train` encodes as all zeros.
    4. `test` is aligned to the columns of `train`; columns it lacks are added
       and filled with 0.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Imputed frames. `test` must contain every ordinal column that is
        present in `train`, plus 'Street' and 'CentralAir'.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Integer-typed encoded `(train, test)` with identical columns.
    """
    qual_cond_mapping = {
        'None': 0,
        "Po": 1,
        "Fa": 2,
        "TA": 3,
        "Gd": 4,
        "Ex": 5
    }

    # "PoolQC" is the dataset's column name; the former spelling "PoolQc" never
    # matched, so the column silently fell through to one-hot encoding.
    ordinal_features = [
        "ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "HeatingQC",
        "KitchenQual", "FireplaceQu", "GarageQual", "GarageCond", "PoolQC"
    ]

    # source column -> (flag column, value that means 1)
    binary_flags = {
        'Street': ('isPaved', 'Pave'),
        'CentralAir': ('isCentralAir', 'Y'),
    }

    # Never mutate the caller's frames.
    train = train.copy()
    test = test.copy()

    for col in ordinal_features:
        if col in train.columns:
            train[col] = train[col].map(qual_cond_mapping)
            test[col] = test[col].map(qual_cond_mapping)

    for source_col, (flag_col, positive) in binary_flags.items():
        train[flag_col] = (train[source_col] == positive).astype(int)
        test[flag_col] = (test[source_col] == positive).astype(int)
    train = train.drop(columns=list(binary_flags))
    test = test.drop(columns=list(binary_flags))

    cat_features = train.select_dtypes(include=['O']).columns.difference(ordinal_features)

    # Pin both frames to the levels observed in train. Encoding each frame on
    # its own lets drop_first remove a different baseline in test whenever test
    # lacks train's first level, which zeroes out a genuine category.
    for col in cat_features:
        levels = pd.Categorical(train[col]).categories
        train[col] = pd.Categorical(train[col], categories=levels)
        test[col] = pd.Categorical(test[col], categories=levels)

    train = pd.get_dummies(train, columns=list(cat_features), drop_first=True).astype(int)
    test = pd.get_dummies(test, columns=list(cat_features), drop_first=True).astype(int)

    # Left-align so test carries exactly train's columns (e.g. SalePrice as 0).
    train, test = train.align(test, join='left', axis=1, fill_value=0)

    return train, test

In [100]:
# Encode the imputed frames; `t` is the encoded train set and `tt` the encoded test set.
t, tt = encode_cat_features(train_processed, test_processed)