### Data Preparation - Handling Missing Values

Dataset: Housing_train.csv

Importing libraries:

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


%matplotlib inline

In [None]:
# Pandas display settings: show up to 100 rows / 100 columns and
# render floats with two decimal places
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas/numpy.
import warnings
warnings.filterwarnings(action="ignore")

Load the dataset:

In [None]:
# Read the training data, using the 'Id' column as the row index
df = pd.read_csv(
    filepath_or_buffer="../datasets/Housing_train.csv",
    index_col="Id",
)

Handle missing data

In [None]:
# Summarise missing data per column

# Number of missing values in each column, largest first
total = df.isna().sum().sort_values(ascending=False)

# Share of missing values in each column, largest first
# (a 0-1 fraction; it is not multiplied by 100)
percent = df.isna().mean().sort_values(ascending=False)

# Place both series side by side; rows are aligned on the column-name index
missing_data = pd.concat(objs=[total, percent], keys=['Total', 'Percent'], axis='columns')

missing_data.head(20)

What should we do with "PoolQC"?

In [None]:
# Frequency of each non-missing PoolQC value
df['PoolQC'].value_counts()

In [None]:
# Work on a filled copy (df itself is untouched): missing PoolQC becomes the string 'None'
df_temp = df.fillna(value={'PoolQC': 'None'})

In [None]:
# Mean SalePrice for each PoolQC level, including the filled-in 'None' level
df_temp['SalePrice'].groupby(df_temp['PoolQC']).mean()

Keep PoolQC and the rows that have a PoolQC value, as it informs price. Make the change permanent on the DataFrame.

In [None]:
# Apply the PoolQC fill to the main frame: missing values become the string 'None'
df['PoolQC'] = df['PoolQC'].fillna('None')

What should we do with "MiscFeature"? Assume all missing values mean there are no misc features for the property.

In [None]:
# Missing MiscFeature is replaced by the string 'None'; every other column is left as is
df = df.assign(MiscFeature=df['MiscFeature'].fillna('None'))

Inspect if the rows that are missing for FireplaceQu are the ones without Fireplaces

In [None]:
# Scratch copy of df in which missing FireplaceQu is labelled 'None' (df is not modified)
df_temp = df.assign(FireplaceQu=df['FireplaceQu'].fillna('None'))

In [None]:
# Frequency of each FireplaceQu level in the scratch copy, 'None' included
df_temp['FireplaceQu'].value_counts()

In [None]:
# Cross-tabulate the fireplace count (rows) against fireplace quality (columns)
pd.crosstab(index=df_temp['Fireplaces'], columns=df_temp['FireplaceQu'])

In [None]:
# Apply the FireplaceQu fill to the main frame: missing values become the string 'None'
df['FireplaceQu'] = df['FireplaceQu'].fillna('None')

In [None]:
# This cell used to repeat the FireplaceQu fill from the cell above, which changes
# nothing the second time. Check the result of that fill instead.
assert df['FireplaceQu'].notna().all(), "FireplaceQu still contains missing values"

Assume we know Alley and Fence don't inform price, so drop the columns

In [None]:
# Remove the Alley and Fence columns from df in place
df.drop(['Alley', 'Fence'], axis='columns', inplace=True)

In [None]:
# Display the (rows, columns) shape of df at this point in the notebook
df.shape

Fill in with imputed value (mean) for LotFrontage

In [None]:
# Summary statistics for LotFrontage (count excludes missing values)
df['LotFrontage'].describe()

In [None]:
# Mean of the non-missing LotFrontage values, used as the imputation value
LF_mean = df['LotFrontage'].mean()

In [None]:
# Replace missing LotFrontage values with the column mean.
# The filled column is assigned back explicitly: calling fillna(..., inplace=True) on
# df.LotFrontage is chained assignment, which does not update df when pandas
# Copy-on-Write is active (and raises a FutureWarning on pandas 2.x).
df['LotFrontage'] = df['LotFrontage'].fillna(LF_mean)

Fill in LotFrontage with imputed value based on LotArea

Let's try using some factor of LotArea

In [None]:
# Candidate predictor: 45% of LotArea.
# Note: multiplying by a positive constant does not change the Pearson correlation,
# so this equals the correlation of LotFrontage with LotArea itself.
df_temp['PercentLotArea'] = df_temp['LotArea'].mul(.45)
df_temp['LotFrontage'].corr(other=df_temp['PercentLotArea'])

Let's try to see if using the square root of Lot Area would yield higher correlation

In [None]:
# Candidate predictor: square root of LotArea, then its correlation with LotFrontage
df_temp['SqrtLotArea'] = df_temp['LotArea'].pipe(np.sqrt)
df_temp['LotFrontage'].corr(other=df_temp['SqrtLotArea'])

In [None]:
# Add the square root of LotArea to the main frame as a new column.
# NOTE(review): no later cell in this section uses it to fill LotFrontage - confirm intent.
df['SqrtLotArea'] = df['LotArea'].pipe(np.sqrt)

Check Basement Features

Do the blank categorical features correspond to 0 in the numerical features?

In [None]:
# Basement-related columns that are inspected and filled in the following cells
basement_cols = [
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'BsmtFinSF1',
    'BsmtFinSF2',
]

In [None]:
# Basement columns for the rows where BsmtQual is missing
df_temp.loc[df_temp['BsmtQual'].isna(), basement_cols]

So in the cases where the categorical variables are NaN, the numerical ones are 0. Which means there's no basement, so the categorical ones should also be set to "None".

In [None]:
# Fill missing basement values: 0 for numeric columns, the string 'None' for all others.
# The dtype is tested with pandas because np.object was removed in NumPy 1.24
# and now raises AttributeError.
for col in basement_cols:
    fill_value = 0 if pd.api.types.is_numeric_dtype(df[col]) else 'None'
    df[col] = df[col].fillna(fill_value)

Check Garage Features

Note: Not all Garage Features have the same number of missing rows

In [None]:
# Distribution of SalePrice for each GarageCars value
sns.violinplot(data=df, x="GarageCars", y="SalePrice")

Having no garage appears to inform SalePrice, so fill in "None" for the categorical garage features and 0 for the numeric ones

In [None]:
# Garage-related columns with missing values to fill
garage_cols = ['GarageType', 'GarageQual', 'GarageCond', 'GarageYrBlt', 'GarageFinish']

# Fill missing garage values: 0 for numeric columns, the string 'None' for all others.
# The dtype is tested with pandas because np.object was removed in NumPy 1.24
# and now raises AttributeError.
# NOTE(review): if GarageYrBlt is numeric it receives 0, which is a sentinel rather
# than a real year - confirm this is intended.
for col in garage_cols:
    fill_value = 0 if pd.api.types.is_numeric_dtype(df[col]) else 'None'
    df[col] = df[col].fillna(fill_value)

How to handle Electrical with just one row missing?

In [None]:
# Option 1: impute the missing Electrical value on a copy of df.
# 'SBrkr' is hard-coded - presumably the most frequent value; verify with value_counts().
df_temp = df.assign(Electrical=df["Electrical"].fillna('SBrkr'))

In [None]:
# Option 2 (the one applied to df): drop the rows where Electrical is missing.
# With a single column in `subset`, how='all' and the default how='any' select the same rows.
df.dropna(subset=["Electrical"], inplace=True)

In [None]:
# Display the (rows, columns) shape of df after the Electrical rows were dropped
df.shape

Handle the remaining columns with null values

In [None]:
# Five columns with the most missing values still remaining
df.isna().sum().sort_values(ascending=False).iloc[:5]

In [None]:
# Drop the rows where either masonry-veneer column is missing; the notebook expects
# fewer than 10 such rows.
# NOTE(review): pandas >= 2.0 parses the literal text "None" as NaN in read_csv by default.
# If MasVnrType uses "None" as a category, this drops far more rows - confirm the count first.
df.dropna(axis='index', how='any', subset=["MasVnrType", "MasVnrArea"], inplace=True)

In [None]:
# Final check: the five largest per-column missing counts should all be 0
df.isna().sum().sort_values(ascending=False).iloc[:5]