#### Imports

In [41]:
import pandas as pd

In [42]:
data = pd.read_csv('../data/train.csv', index_col='Id')
data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


##### Handling missing values

In [49]:
print("; ".join(data.isnull().mean().round(4).mul(100).sort_values(ascending=False).to_string().split("\n")))

MasVnrType       0.55; MasVnrArea       0.55; Electrical       0.07; MSSubClass       0.00; FullBath         0.00; KitchenQual      0.00; KitchenAbvGr     0.00; BedroomAbvGr     0.00; HalfBath         0.00; BsmtHalfBath     0.00; Heating          0.00; BsmtFullBath     0.00; GrLivArea        0.00; LowQualFinSF     0.00; 2ndFlrSF         0.00; 1stFlrSF         0.00; CentralAir       0.00; TotRmsAbvGrd     0.00; Functional       0.00; Fireplaces       0.00; ScreenPorch      0.00; SaleCondition    0.00; SaleType         0.00; YrSold           0.00; MoSold           0.00; MiscVal          0.00; PoolArea         0.00; 3SsnPorch        0.00; GarageYrBlt      0.00; EnclosedPorch    0.00; OpenPorchSF      0.00; WoodDeckSF       0.00; PavedDrive       0.00; GarageArea       0.00; GarageCars       0.00; HeatingQC        0.00; TotalBsmtSF      0.00; MSZoning         0.00; YearBuilt        0.00; LotArea          0.00; Street           0.00; LotShape         0.00; LandContour      0.00; Utilities  

PoolQC, MiscFeature, Alley, Fence, FireplaceQu, LotFrontage have a high percentage of missing values so we're dropping them.
GarageCond, GarageType, GarageFinish, GarageQual have connected missing values - they're missing when there's no garage on the property *and* are extremely correlated with GarageCars, GarageArea, so we can safely drop them instead of filling them.

In [44]:
to_drop = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage', 'GarageFinish', 'GarageQual', 'GarageCond', 'GarageType']
data.drop(labels=to_drop, axis=1, inplace=True)

When the property is missing a basement there's no entry in the data for the basement columns. We're just going to fill them with 'No Basement'. I assume it's the same for the masonry type and area since their missing values are correlated and we're going to fill them with 'None' and 0. The year when the Garage is built is probably missing when there's no garage so we're filling it with the year the house was built instead.

In [48]:
data['BsmtFinType2'] = data['BsmtFinType2'].fillna('No Basement')
data['BsmtExposure'] = data['BsmtExposure'].fillna('No Basement')
data['BsmtCond'] = data['BsmtCond'].fillna('No Basement')
data['BsmtFinType1'] = data['BsmtFinType1'].fillna('No Basement')
data['BsmtQual'] = data['BsmtQual'].fillna('No Basement')
data['']
data.loc[data['GarageYrBlt'].isna(), 'GarageYrBlt'] = data['YearBuilt'][data['GarageYrBlt'].isna()]

##### Separating the numerical and categorical cols

In [None]:
numerical = [col for col in data.columns if data[col].dtype in ('Int64', 'Float64')]
categorical = [col for col in data.columns if data[col].dtype not in ('Int64', 'Float64')]
print((numerical, categorical))

(['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice'], ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition'])


  numerical = [col for col in data.columns if data[col].dtype in ('Int64', 'Float64')]
  categorical = [col for col in data.columns if data[col].dtype not in ('Int64', 'Float64')]
