In [410]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [424]:
df = pd.read_csv('train.csv')

In [412]:
vars_ = pd.read_csv('vars.csv')

In [425]:
print(vars_[vars_['type'] == 'ordinal']['name'].tolist())
print(vars_[vars_['type'] == 'nominal']['name'].tolist())
print(vars_[vars_['type'] == 'binary']['name'].tolist())
print(vars_[vars_['type'] == 'continuous']['name'].tolist())
print(vars_[vars_['type'] == 'discrete']['name'].tolist())
print(vars_[vars_['type'] == 'date']['name'].tolist())

['PavedDrive', 'Functional', 'OverallQual', 'KitchenQual', 'GarageCond', 'HeatingQC']
['MSSubClass', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Electrical', 'SaleCondition', 'MSZoning']
['CentralAir', 'Basement', 'Fireplaces', 'Garage', 'Reno', 'OneFamHome', 'PosFeat', 'Norm', 'TwoFloors']
['LotFrontage', 'LotArea', 'GrLivArea', 'WoodDeckSF', 'Total.Porch', 'PoolArea']
['Total.HalfBath', 'BedroomAbvGr', 'Kitchen', 'TotRmsAbvGrd', 'Total.FullBath', 'GarageCars']
['YearBuilt', 'YrSold']


## Imputing NaN im Numerical Features

In [426]:
#Imputing NA's with 0, as 0 represent no street connected to the lot
df[['LotFrontage']] = df[['LotFrontage']].fillna(0)

## Imputing NaN - Nominal Features

In [427]:
#Electrical
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp.fit(df[['Electrical']])
df[['Electrical']] = pd.DataFrame(imp.transform(X), columns = ['Electrical'])

#GarageType (impute + convert to binary)
df[['GarageType']] = df[['GarageType']].fillna('N')
df['Garage'] = (df['GarageType'] != 'N').astype(int)

## Rearranging Quantitative features

In [428]:
#Total porch area
df['Total.Porch']= df['OpenPorchSF']+ df['EnclosedPorch'] + df['ScreenPorch']

#Total rooms
df['TotRmsAbvGrd'] = df['TotRmsAbvGrd'] - df['BedroomAbvGr']

#Total Baths
df['Total.HalfBath'] = df['BsmtFullBath'] + df['FullBath']
df['Total.FullBath'] = df['BsmtHalfBath'] + df['HalfBath']

# subtracting num of bedrooms from total number of rooms as bedrooms are represented in a separate feature
df['Total.RmsAbvGrd'] = df['TotRmsAbvGrd'] - df['BedroomAbvGr']

## Converting Nominal Features

In [429]:
#Rename one 'MSZoning' value for dummification
df.loc[(df['MSZoning'] == 'C (all)'), 'MSZoning'] = 'C'

#Converting numerical to categorical
df = df.replace({'MSSubClass': {20: 'SC20', 30: 'SC30', 40: 'SC40', 45: 'SC45', 50: 'SC50', 60: 'SC60', 70: 'SC70', 
                                75: 'SC75', 80: 'SC80', 85: 'SC85', 90: 'CS90', 120: 'SC120', 150: 'SC150', 160: 'SC160',
                                180: 'SC180', 190: 'SC190'} })

## Converting Y/N Features to Binary

In [430]:
#convert CentralAir to boolean
df['CentralAir'] = (df['CentralAir'] != 'N').astype(int)

#convert Fireplaces to boolean
df.loc[(df['Fireplaces'] > 0), 'Fireplaces'] = 1

# Create a binary feature 'Basement' that indicates if the home has a basement
df['Basement'] = [1 if row > 0 else 0 for row in df['TotalBsmtSF']]

# YearRemodAdd
# if the year listed in both columns is the same, there has not been a renovation
# Turn into binary variable "Reno"
Year_temp = df.YearRemodAdd - df.YearBuilt
df['Reno'] = [0 if x == 0 else 1 for x in Year_temp]

# Conditions 1 + 2
# Only "Positive Feature" really seems to matter here -> binarize
PosFeat1 = [1 if 'Pos' in x else 0 for x in df['Condition1']]
PosFeat1 = pd.Series(PosFeat1)
PosFeat2 = [1 if 'Pos' in x else 0 for x in df['Condition2']]
PosFeat2 = pd.Series(PosFeat2)
temp = pd.DataFrame()
temp['PosFeat1'] = PosFeat1
temp['PosFeat2'] = PosFeat2
temp['PosFeat'] = temp.PosFeat1 + temp.PosFeat2
# binarize and put into df
df['PosFeat'] = [1 if x > 0 else 0 for x in temp['PosFeat']]
# Just in Case:
# Also binarize "Norm" col
# if Cond1 = norm, so does Cond2, so we only need to look @ one col
df['Norm'] = [1 if 'Norm' in x else 0 for x in df['Condition1']]

df['TwoFloors'] = [1 if "2" in x else 0 for x in df['HouseStyle']]
# 1 = the house has 2 floors (or more)
# 0 = the house has 1 floor

df['OneFamHome'] = [1 if row == "1Fam" else 0 for row in df['BldgType']]

## Converting Ordinal Features

In [431]:
quality_scale = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}

df = df.replace({'HeatingQC': quality_scale,
                 'KitchenQual': quality_scale,
                 'PavedDrive': {'N': 0, 'Y': 2, 'P': 1},
                 'Functional': {'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4, 
                                'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8}
})

## Converting Date Features

In [432]:
df['YearBuilt'] = pd.DatetimeIndex(df['YearBuilt']).date
df['YrSold'] = pd.DatetimeIndex(df['YrSold']).date

## Selected Features

In [433]:
ordinal_features = ['PavedDrive', 'Functional', 'OverallQual', 'KitchenQual', 'HeatingQC']
nominal_features = ['MSSubClass', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 
                    'Neighborhood', 'Electrical', 'SaleCondition', 'MSZoning']
binary_features = ['CentralAir', 'Basement', 'Fireplaces', 'Garage', 'Reno', 'OneFamHome', 
                   'PosFeat', 'Norm', 'TwoFloors']
contin_features = ['LotFrontage', 'LotArea', 'GrLivArea', 'WoodDeckSF', 'Total.Porch', 'PoolArea']
discrete_features = ['Total.HalfBath', 'Total.FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'GarageCars']
date_features = ['YearBuilt', 'YrSold']

## Checking Selected Variables (if in df and data types)

In [434]:
combined_list = ordinal_features+nominal_features+binary_features+contin_features+discrete_features+date_features
print('Number of selected features: ', len(combined_list))
for i in sorted(combined_list):
    print(i, type(df[i][0]))

Number of selected features:  36
Basement <class 'numpy.int64'>
BedroomAbvGr <class 'numpy.int64'>
CentralAir <class 'numpy.int32'>
Electrical <class 'str'>
Fireplaces <class 'numpy.int64'>
Functional <class 'numpy.int64'>
Garage <class 'numpy.int32'>
GarageCars <class 'numpy.int64'>
GrLivArea <class 'numpy.int64'>
HeatingQC <class 'numpy.int64'>
KitchenQual <class 'numpy.int64'>
LandContour <class 'str'>
LandSlope <class 'str'>
LotArea <class 'numpy.int64'>
LotConfig <class 'str'>
LotFrontage <class 'numpy.float64'>
MSSubClass <class 'str'>
MSZoning <class 'str'>
Neighborhood <class 'str'>
Norm <class 'numpy.int64'>
OneFamHome <class 'numpy.int64'>
OverallQual <class 'numpy.int64'>
PavedDrive <class 'numpy.int64'>
PoolArea <class 'numpy.int64'>
PosFeat <class 'numpy.int64'>
Reno <class 'numpy.int64'>
SaleCondition <class 'str'>
TotRmsAbvGrd <class 'numpy.int64'>
Total.FullBath <class 'numpy.int64'>
Total.HalfBath <class 'numpy.int64'>
Total.Porch <class 'numpy.int64'>
TwoFloors <class

## Checking for Missingness in Selected Variables

In [435]:
for i in combined_list:
    print(i, df[i].isna().sum())

PavedDrive 0
Functional 0
OverallQual 0
KitchenQual 0
HeatingQC 0
MSSubClass 0
LandContour 0
Utilities 0
LotConfig 0
LandSlope 0
Neighborhood 0
Electrical 0
SaleCondition 0
MSZoning 0
CentralAir 0
Basement 0
Fireplaces 0
Garage 0
Reno 0
OneFamHome 0
PosFeat 0
Norm 0
TwoFloors 0
LotFrontage 0
LotArea 0
GrLivArea 0
WoodDeckSF 0
Total.Porch 0
PoolArea 0
Total.HalfBath 0
Total.FullBath 0
BedroomAbvGr 0
TotRmsAbvGrd 0
GarageCars 0
YearBuilt 0
YrSold 0


## Checking Correlation

In [436]:
import matplotlib.pyplot as plt

corr = df[combined_list].corr()
corr.style.background_gradient(cmap='coolwarm', axis=None)

Unnamed: 0,PavedDrive,Functional,OverallQual,KitchenQual,HeatingQC,CentralAir,Basement,Fireplaces,Garage,Reno,OneFamHome,PosFeat,Norm,TwoFloors,LotFrontage,LotArea,GrLivArea,WoodDeckSF,Total.Porch,PoolArea,Total.HalfBath,Total.FullBath,BedroomAbvGr,TotRmsAbvGrd,GarageCars
PavedDrive,1.0,0.084285,0.229393,0.213023,0.157502,0.337516,0.137638,0.154082,0.327581,-0.21517,0.00923,0.040516,0.10928,-0.003734,0.016838,0.015134,0.080195,0.114857,-0.074655,0.019903,0.179088,0.115811,-0.032227,0.016593,0.284598
Functional,0.084285,1.0,0.145436,0.107003,0.058352,0.091402,0.079304,0.00787,0.063584,-0.104535,-0.005462,0.003218,-0.028782,0.036173,0.00456,-0.025128,-0.068202,0.008046,-0.026301,0.016283,0.026341,0.051389,-0.033789,-0.037999,0.086819
OverallQual,0.229393,0.145436,1.0,0.673331,0.457083,0.272038,0.210177,0.438143,0.261968,-0.080468,0.035872,0.069424,0.099223,0.283999,0.176561,0.105806,0.593007,0.238923,0.16731,0.065166,0.493116,0.230918,0.101676,0.4974,0.600671
KitchenQual,0.213023,0.107003,0.673331,1.0,0.504228,0.25783,0.124338,0.291603,0.213937,-0.012616,0.03285,0.042705,0.124077,0.167593,0.127478,0.067864,0.420563,0.221091,0.121448,0.062354,0.441739,0.150996,-0.018012,0.391552,0.50981
HeatingQC,0.157502,0.058352,0.457083,0.504228,1.0,0.306294,0.128914,0.169052,0.127154,0.014207,0.053658,-0.041995,0.091466,0.134382,0.103589,0.003581,0.254644,0.13086,0.033838,-0.047629,0.310337,0.087204,-0.026869,0.235723,0.325347
CentralAir,0.337516,0.091402,0.272038,0.25783,0.306294,1.0,0.204832,0.200814,0.251471,-0.17632,0.085294,-0.003606,0.064502,-0.010784,-0.011683,0.049755,0.093666,0.145973,-0.049093,0.018122,0.163512,0.13959,0.007872,0.04038,0.233726
Basement,0.137638,0.079304,0.210177,0.124338,0.128914,0.204832,1.0,0.091777,0.094187,0.031747,0.116622,0.022548,0.037157,0.072621,0.01271,0.019733,0.069417,0.090401,0.076028,0.011077,0.081262,0.128228,0.005655,0.014534,0.078075
Fireplaces,0.154082,0.00787,0.438143,0.291603,0.169052,0.200814,0.091777,1.0,0.202086,0.035521,0.124285,0.112363,0.005903,0.151115,0.04672,0.182827,0.445299,0.221505,0.171601,0.065026,0.274431,0.217703,0.105721,0.36645,0.349501
Garage,0.327581,0.063584,0.261968,0.213937,0.127154,0.251471,0.094187,0.202086,1.0,-0.104156,0.11048,0.03389,0.042676,0.062605,0.017052,0.074015,0.151015,0.105785,0.023633,0.016648,0.115397,0.106287,-0.006672,0.121084,0.573287
Reno,-0.21517,-0.104535,-0.080468,-0.012616,0.014207,-0.17632,0.031747,0.035521,-0.104156,1.0,0.153226,0.00652,-0.090368,-0.018244,0.041838,0.004274,0.079804,-0.040115,0.109132,-0.025956,-0.101805,-0.068158,0.006657,0.088825,-0.13936
