In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the raw Ames housing data from the project-relative data folder.
ames_csv_path = 'data/Ames_Housing_Price_Data.csv'
housing = pd.read_csv(ames_csv_path)

### My Columns

In [None]:
# Columns analysed in this notebook: the target plus bathroom, kitchen, room,
# fireplace, garage, driveway and porch/deck features.
my_columns = [
    'SalePrice',
    'BsmtFullBath',
    'BsmtHalfBath',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'KitchenQual',
    'TotRmsAbvGrd',
    'Functional',
    'Fireplaces',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageCars',
    'GarageArea',
    'GarageQual',
    'GarageCond',
    'PavedDrive',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
]

# Take an explicit copy: later cells add and modify columns on this frame, and
# doing that on a bare selection triggers SettingWithCopyWarning in classic pandas.
housing = housing[my_columns].copy()

In [None]:
# Peek at ten randomly chosen rows of the selected columns.
housing.sample(n=10)

### Data Cleanup: fillna

In [4]:
# A missing garage value is treated as "no garage", so replace NaN with an
# explicit marker per column.
garage_fill_values = {
    # Quality/condition ratings and size measures: 0 stands for "no garage"
    'GarageQual': 0,
    'GarageCond': 0,
    'GarageCars': 0,
    'GarageArea': 0,
    # Categorical descriptors: explicit 'None' category
    'GarageType': 'None',
    'GarageFinish': 'None',
}

# Fill on the frame and reassign. Calling fillna(inplace=True) on a selected
# column is chained assignment: it warns in pandas 2.x and does not update the
# frame once copy-on-write is enabled.
housing = housing.fillna(garage_fill_values)

In [7]:
# Per-column flag: True where the column still contains at least one missing value.
# NOTE(review): the recorded output below already lists 'TotalBath', so it was
# produced after the bathroom cell ran; re-run top to bottom to refresh it.
housing.isna().any()

SalePrice        False
FullBath         False
BedroomAbvGr     False
KitchenAbvGr     False
KitchenQual      False
TotRmsAbvGrd     False
Functional       False
Fireplaces       False
FireplaceQu       True
GarageType       False
GarageFinish     False
GarageCars       False
GarageArea       False
GarageQual       False
GarageCond       False
PavedDrive       False
WoodDeckSF       False
OpenPorchSF      False
EnclosedPorch    False
3SsnPorch        False
ScreenPorch      False
TotalBath        False
dtype: bool

### Data Cleanup: Combine Bathrooms to create 'TotalBath' feature

In [6]:
# Missing basement bathroom counts are filled with 0.
# Reassign instead of using a chained inplace fillna, which does not update the
# frame under pandas copy-on-write.
housing = housing.fillna({'BsmtFullBath': 0, 'BsmtHalfBath': 0})

# Combine all bathroom counts into one feature; every half bath counts as 0.5.
# (The basement half-bath weight was previously 5 instead of 0.5, which
# inflated TotalBath for houses with a basement half bath.)
HALF_BATH_WEIGHT = 0.5
housing = housing.assign(
    TotalBath=(
        housing['BsmtFullBath']
        + housing['BsmtHalfBath'] * HALF_BATH_WEIGHT
        + housing['FullBath']
        + housing['HalfBath'] * HALF_BATH_WEIGHT
    )
)

# Drop the individual counts that are now folded into TotalBath
# ('FullBath' is intentionally kept as its own feature).
housing = housing.drop(columns=['BsmtFullBath', 'BsmtHalfBath', 'HalfBath'])

- I kept the 'FullBath' feature because it had a 0.53 correlation with 'SalePrice'

### Data Cleanup: Ordinal and Categorical Features

In [8]:
# Ordinal features: encode the ratings as integers so their order is preserved.
quality_scale = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
paved_scale = {'N': 1, 'P': 2, 'Y': 3}
ordinal_maps = {
    'KitchenQual': quality_scale,
    'GarageQual': quality_scale,
    'GarageCond': quality_scale,
    'PavedDrive': paved_scale,
}

# Assign the encoded columns back to the frame. replace(inplace=True) on an
# attribute-accessed column is chained assignment and does not reliably update
# the frame. replace() leaves values outside the map untouched (e.g. a 0 that
# already marks a missing garage); infer_objects() then restores a numeric
# dtype when every value ended up numeric.
housing = housing.assign(**{
    col: housing[col].replace(scale).infer_objects()
    for col, scale in ordinal_maps.items()
})

# Categorical features: one-hot encode, using each column's most common level
# (its mode) as the base case by dropping that indicator column.
dummies = ['Functional', 'GarageType', 'GarageFinish']
base_case_columns = [f'{col}__{housing[col].mode()[0]}' for col in dummies]

# One get_dummies call replaces the source columns and appends the indicators
# in the order given by `dummies`, so no concat inside a loop is needed.
housing = (
    pd.get_dummies(housing, columns=dummies, prefix_sep='__')
    .drop(columns=base_case_columns)
)

### Multicollinearity in Garages

High correlation in:
- GarageCars | GarageArea
- GarageQual | GarageCond

In [9]:
# Pairwise correlations between the numeric garage features, printed as
# "<left> | <right> <correlation>" in the same order as before.
garage_pairs = [
    ('GarageCars', 'GarageArea'),
    ('GarageCars', 'GarageQual'),
    ('GarageCars', 'GarageCond'),
    ('GarageArea', 'GarageQual'),
    ('GarageArea', 'GarageCond'),
    ('GarageQual', 'GarageCond'),
]
for left, right in garage_pairs:
    print(f'{left} | {right}', housing[left].corr(housing[right]))

GarageCars | GarageArea 0.8898279487178717
GarageCars | GarageQual 0.553947524952688
GarageCars | GarageCond 0.5424153436558793
GarageArea | GarageQual 0.545929045147731
GarageArea | GarageCond 0.5308307101908939
GarageQual | GarageCond 0.9409098825951635


- Dropping GarageArea/GarageCond because they are less correlated than their counterparts GarageCars/GarageQual

In [10]:
# Remove one feature from each highly correlated garage pair.
collinear_garage_cols = ['GarageArea', 'GarageCond']
housing = housing.drop(columns=collinear_garage_cols)

### Removing Features

- 3 features were dropped.
    - 'BsmtFullBath' - .29 correlation, seems unnecessary
    - 'BsmtHalfBath' - -.03 correlation is too low
    - 'FireplaceQu' - over 1200 NA's - probably not contributing much

In [12]:
# 'FireplaceQu' is mostly missing, so remove it from the feature set.
housing = housing.drop(columns=['FireplaceQu'])

### Checking Correlations to 'SalePrice'

In [None]:
# Keep only the numeric columns for the correlation analysis.
num_vars = housing.select_dtypes(include=np.number)

In [None]:
# Correlation matrix of the numeric features, then each feature's correlation
# with 'SalePrice' ranked from highest to lowest.
correlation = num_vars.corr()
saleprice_ranking = correlation['SalePrice'].sort_values(ascending=False)
print(saleprice_ranking, '\n')

In [None]:
# Final check: True for any column that still contains missing values.
housing.isna().any()