In [1]:
from pathlib import Path

import pandas as pd

# Load the raw training data; the path is resolved relative to the working directory.
raw_csv_path = '../data/raw/train.csv'
df = pd.read_csv(raw_csv_path)

In [2]:
# Display the column labels of the raw frame to see which features are available.
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [3]:
# Columns to remove, grouped by the reason for removing them.
identifier_cols = ['Id']
sparse_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType']
collinear_cols = ['GarageCars', 'TotRmsAbvGrd', 'GarageYrBlt']

# df.drop returns a new frame, so the raw `df` is left untouched.
processedDF = df.drop(columns=identifier_cols + sparse_cols + collinear_cols)

Dropping:

`Id`, because it is meaningless to any analysis.

`PoolQC`, `MiscFeature`, `Alley`, `Fence`, and `MasVnrType`, because of their large number of NaNs.

`GarageCars`, `TotRmsAbvGrd`, and `GarageYrBlt`, because of their high collinearity with `GarageArea`, `GrLivArea`, and `YearBuilt`, respectively.

# Checking for nulls and filling missing values (categorical columns with a placeholder label, numeric columns with the median)

In [4]:
# Count the missing values in each column and show the largest counts first.
null_counts = processedDF.isna().sum()
null_counts.sort_values(ascending=False)

FireplaceQu     690
LotFrontage     259
GarageCond       81
GarageType       81
GarageFinish     81
               ... 
BsmtFinSF2        0
BsmtUnfSF         0
TotalBsmtSF       0
MSZoning          0
SalePrice         0
Length: 72, dtype: int64

In [5]:
# Label written into categorical columns in place of NaN. It must not be one of
# the tokens that pandas.read_csv parses as missing by default ("N/A", "NA",
# "None", "null", ...); otherwise the filled values turn back into NaN as soon
# as the exported CSV is reloaded with default settings.
MISSING_LABEL = "Missing"

cat_cols = processedDF.select_dtypes(include=["object", "category"]).columns.tolist()
# cast to object first so fillna can insert a label that is not an existing category
processedDF[cat_cols] = processedDF[cat_cols].astype("object").fillna(MISSING_LABEL)

# converts boolean columns to integers
bool_cols = processedDF.select_dtypes(include="bool").columns
processedDF[bool_cols] = processedDF[bool_cols].astype(int)

# fills numeric missing values with the median of the respective column
for col in ["LotFrontage", "MasVnrArea"]:
    processedDF[col] = processedDF[col].fillna(processedDF[col].median())

In [6]:
# Verify the imputation: fail loudly if any column still contains missing
# values instead of relying on a visual check of the (truncated) display.
remaining_nulls = processedDF.isna().sum()
assert remaining_nulls.sum() == 0, (
    f"columns still containing NaN: {remaining_nulls[remaining_nulls > 0].to_dict()}"
)
remaining_nulls.sort_values(ascending=False)

MSSubClass     0
MSZoning       0
Fireplaces     0
Functional     0
KitchenQual    0
              ..
MasVnrArea     0
Exterior2nd    0
Exterior1st    0
RoofMatl       0
SalePrice      0
Length: 72, dtype: int64

In [7]:
# Write the cleaned frame to CSV without the row index.
output_path = Path('../data/filtered/processedData.csv')
# to_csv does not create missing directories, so make sure the target folder exists
output_path.parent.mkdir(parents=True, exist_ok=True)
processedDF.to_csv(output_path, index=False)