In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the training set from disk into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal


## Preprocessing

### Drop Missing Values

In [3]:
# Largest number of missing entries a column may contain and still be kept;
# columns with MORE than this many NaNs are removed.
MAX_NA_PER_COLUMN = 250

# `thresh` is expressed as the minimum count of non-missing cells required,
# so translate the NA budget into that form (using the row count before any drop).
min_present = len(df) - MAX_NA_PER_COLUMN

# First discard the sparse columns, then discard every row that still has a gap
df = (
    df
    .dropna(axis="columns", thresh=min_present)
    .dropna(axis="index", how="any")
)
df.shape

(1338, 74)

### Feature Scaling

In [4]:
# Standardize all numeric features (zero mean, unit variance per column)
from sklearn.preprocessing import StandardScaler

# Find all columns with numeric types
numList = df.select_dtypes(include="number").columns
print(numList)

# NOTE(review): 'Id' is picked up here and standardized together with the real
# features; it looks like a running row identifier - confirm whether it should
# be excluded from scaling.

# Fit a single scaler across every numeric column and keep the fitted object.
# StandardScaler works column-wise, so the result matches scaling each column
# separately, but the retained `scaler` can later be reused via
# scaler.transform(...) on held-out data or undone via scaler.inverse_transform(...).
scaler = StandardScaler()
if len(numList) > 0:  # fitting on zero columns would raise; nothing to scale then
    df[numList] = scaler.fit_transform(df[numList])

df.head()

Index(['Id', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')


Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,-1.731951,0.093701,RL,-0.218363,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0.202294,-0.349612,-0.118688,-0.283203,-0.071771,-0.084535,-1.605319,0.146084,WD,Normal
1,-1.729579,-0.876298,RL,-0.107067,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,-0.731311,-0.349612,-0.118688,-0.283203,-0.071771,-0.084535,-0.493561,-0.605686,WD,Normal
2,-1.727207,0.093701,RL,0.05262,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,-0.088501,-0.349612,-0.118688,-0.283203,-0.071771,-0.084535,0.988784,0.146084,WD,Normal
3,-1.724836,0.336201,RL,-0.111906,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,-0.195636,4.122511,-0.118688,-0.283203,-0.071771,-0.084535,-1.605319,-1.357457,WD,Abnorml
4,-1.722464,0.093701,RL,0.343926,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0.554309,-0.349612,-0.118688,-0.283203,-0.071771,-0.084535,2.100542,0.146084,WD,Normal


### Label Encoding

In [6]:
# LabelEncoder stays imported in case later cells rely on the name;
# OrdinalEncoder is what this cell uses for the feature columns.
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# Find all columns with data type object
# NOTE(review): newer pandas releases may load text as a dedicated string dtype
# instead of object - confirm this selector still finds the text columns there.
objList = df.select_dtypes(include="object").columns
print(objList)

# Integer-encode every object column in one pass.
# OrdinalEncoder is scikit-learn's encoder for feature columns (LabelEncoder is
# documented for target labels). Each column's sorted unique values are numbered
# 0..k-1, which is the same numbering LabelEncoder assigns, and the fitted
# `encoder` is kept so the mapping can be inspected (encoder.categories_),
# reapplied to other data, or reversed with encoder.inverse_transform(...).
#
# Once this cell has run, the text columns hold integers, so running it again
# finds no object columns (objList prints empty) and leaves df unchanged.
encoder = OrdinalEncoder(dtype=np.int64)
if len(objList) > 0:  # fitting on zero columns would raise; nothing to encode then
    df[objList] = encoder.fit_transform(df[objList].astype(str))

df.head()

Index([], dtype='object')


Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,-1.731951,0.093701,3,-0.218363,1,3,3,0,4,0,...,0.202294,-0.349612,-0.118688,-0.283203,-0.071771,-0.084535,-1.605319,0.146084,8,4
1,-1.729579,-0.876298,3,-0.107067,1,3,3,0,2,0,...,-0.731311,-0.349612,-0.118688,-0.283203,-0.071771,-0.084535,-0.493561,-0.605686,8,4
2,-1.727207,0.093701,3,0.05262,1,0,3,0,4,0,...,-0.088501,-0.349612,-0.118688,-0.283203,-0.071771,-0.084535,0.988784,0.146084,8,4
3,-1.724836,0.336201,3,-0.111906,1,0,3,0,0,0,...,-0.195636,4.122511,-0.118688,-0.283203,-0.071771,-0.084535,-1.605319,-1.357457,8,0
4,-1.722464,0.093701,3,0.343926,1,0,3,0,2,0,...,0.554309,-0.349612,-0.118688,-0.283203,-0.071771,-0.084535,2.100542,0.146084,8,4
