In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

In [2]:
# load training dataset
train_df = pd.read_csv('dataset/kaggle-house-price/train.csv')
# separate predictor variables and target variable
y = train_df['SalePrice']
X_raw = train_df.drop('SalePrice', axis=1)
# shape of training dataset
# A single pre-formatted argument prints the same text under both the
# Python 2 print statement and the Python 3 print function.
print("shape: {} \n".format(X_raw.shape))
# null values

def null_values_per_col(df):
    """
    Count the null values in each column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.Series
        Null counts indexed by column name, in the frame's column order,
        restricted to the columns which have at least 1 null value.
        Empty when no column contains a null.
    """
    # Vectorised count: avoids Series.iteritems() (removed in pandas 2.0)
    # and growing a dtype-less pd.Series() one label at a time.
    null_counts = df.isnull().sum()
    return null_counts[null_counts > 0]

# Parenthesised single string: identical output under the Python 2 print
# statement and the Python 3 print function.
print("null values:")
# Last expression of the cell, so the per-column null counts are displayed.
null_values_per_col(X_raw)

shape: (1460, 80) 

null values:


LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

**Fill null values with 'NA' wherever possible.**

In [3]:
# Categorical columns whose nulls are replaced by the literal label 'NA'.
NA_cols = ['Alley', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
           'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu', 'BsmtFinType2',
           'BsmtFinType1', 'BsmtExposure', 'BsmtCond', 'BsmtQual']

# fill with 'NA'
# Assign the filled columns back instead of calling fillna(inplace=True) on
# an X_raw[col] selection: under pandas Copy-on-Write the selection is a
# separate object, so the in-place fill would leave X_raw unchanged.
X_raw[NA_cols] = X_raw[NA_cols].fillna('NA')

# remaining columns with null values
null_values_per_col(X_raw)

LotFrontage    259
MasVnrType       8
MasVnrArea       8
Electrical       1
GarageYrBlt     81
dtype: int64

**Fill the rest of the null values with appropriate values.**

In [4]:
# Each filled column is assigned back rather than filled with inplace=True on
# an X_raw[...] selection: under pandas Copy-on-Write the selection is a
# separate object, so the in-place fill would leave X_raw unchanged.

# Electrical: use the most frequent value of the column.
X_raw['Electrical'] = X_raw['Electrical'].fillna(X_raw['Electrical'].mode()[0])
# MasVnrType / MasVnrArea: fill with the label 'None' and an area of 0.
X_raw['MasVnrType'] = X_raw['MasVnrType'].fillna('None')
X_raw['MasVnrArea'] = X_raw['MasVnrArea'].fillna(0)
# LotFrontage: use the mean of the non-null values.
X_raw['LotFrontage'] = X_raw['LotFrontage'].fillna(X_raw['LotFrontage'].mean())

# remaining columns with null values
null_values_per_col(X_raw)

GarageYrBlt    81
dtype: int64

**Encode categories using LabelEncoder**

In [5]:
# Encode categories
# Work on the string-typed (object dtype) columns only.
only_strings = X_raw.select_dtypes(include=['object'])

# Fit one encoder per column and keep it. Reusing a single LabelEncoder via
# fit_transform refits it on every column and retains only the last column's
# classes, so the mappings could not be re-applied to other data or inverted.
label_encoders = {col: LabelEncoder().fit(only_strings[col])
                  for col in only_strings.columns}

# Replace each string column by its integer codes, then write them back.
only_strings = only_strings.apply(
    lambda column: label_encoders[column.name].transform(column))
X_raw[only_strings.columns.values] = only_strings
X_raw.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,3,65.0,8450,1,1,3,3,0,...,0,0,3,4,1,0,2,2008,8,4
1,2,20,3,80.0,9600,1,1,3,3,0,...,0,0,3,4,1,0,5,2007,8,4
2,3,60,3,68.0,11250,1,1,0,3,0,...,0,0,3,4,1,0,9,2008,8,4
3,4,70,3,60.0,9550,1,1,0,3,0,...,0,0,3,4,1,0,2,2006,8,0
4,5,60,3,84.0,14260,1,1,0,3,0,...,0,0,3,4,1,0,12,2008,8,4
