**House Prices: Advanced Regression Techniques

This project is a compeition in Kaggle with initially 79 features for the residential homes in Ames, Iowa. The goal of this challenge is to predict the final price of each home.
As usual, data exploration, feature engineering are applied, then several machine learning models are built to predict the house price in the test set.

In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

%matplotlib inline

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

Functions which are used in this project. They can also be reused in other projects.

In [71]:
# Returns a concatenated dataframe of training and test set on axis 0
def concat_df(train_data, test_data):
    return pd.concat([train_data, test_data], sort=True).reset_index(drop=True)

# Returns divided dfs of training and test set
def divide_df(all_data, sizeof_first):
    return all_data.loc[:sizeof_first-1,:], all_data.loc[sizeof_first:,:]    

# Onehot encoding function
def onehot_features(data, cols):
    multicols = data[cols]
    multicols = multicols.astype(str)
    one_hot = pd.get_dummies(multicols)
    return one_hot

# Feature scaling function
def scaling(data,cols):
    scaler = StandardScaler()
    scaled_data = pd.DataFrame(scaler.fit_transform(data), cols)
    return scaled_data

# Map categorical strings in the features to numbers
def map_category(series, mapping):
    digit_data = series.map(mapping)
    return digit_data

# Calculate the probability mass function for statistics
def pmf(series, b_normal):
    a = series.value_counts()
    if b_normal:
        a = a/a.sum()
    return a

Data Exploration

In [72]:
print(df_train.shape)
print(df_test.shape)

(1460, 81)
(1459, 80)


In [73]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [74]:
df_train.describe(include='all')

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
count,1460.0,1460.0,1460,1201.0,1460.0,1460,91,1460,1460,1460,...,1460.0,7,281,54,1460.0,1460.0,1460.0,1460,1460,1460.0
unique,,,5,,,2,2,4,4,2,...,,3,4,4,,,,9,6,
top,,,RL,,,Pave,Grvl,Reg,Lvl,AllPub,...,,Gd,MnPrv,Shed,,,,WD,Normal,
freq,,,1151,,,1454,50,925,1311,1459,...,,3,157,49,,,,1267,1198,
mean,730.5,56.89726,,70.049958,10516.828082,,,,,,...,2.758904,,,,43.489041,6.321918,2007.815753,,,180921.19589
std,421.610009,42.300571,,24.284752,9981.264932,,,,,,...,40.177307,,,,496.123024,2.703626,1.328095,,,79442.502883
min,1.0,20.0,,21.0,1300.0,,,,,,...,0.0,,,,0.0,1.0,2006.0,,,34900.0
25%,365.75,20.0,,59.0,7553.5,,,,,,...,0.0,,,,0.0,5.0,2007.0,,,129975.0
50%,730.5,50.0,,69.0,9478.5,,,,,,...,0.0,,,,0.0,6.0,2008.0,,,163000.0
75%,1095.25,70.0,,80.0,11601.5,,,,,,...,0.0,,,,0.0,8.0,2009.0,,,214000.0


In [75]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [76]:
# Extract the house prices to a new series
y_train = df_train['SalePrice']
df_train.drop('SalePrice', axis=1, inplace=True)

# Remove Id columns from both training and test set
df_train.drop('Id', axis=1, inplace=True)
df_test.drop('Id', axis=1, inplace=True)

# Check the data shape now
print(df_train.shape)
print(df_test.shape)
print(y_train.shape)

(1460, 79)
(1459, 79)
(1460,)


Check the null values in training set

In [77]:
df_nu = df_train.isnull().sum()

# Only list the features with null values for later processing
print(df_nu[df_nu > 0])

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


Check the null values in test set

In [78]:
df_nu = df_test.isnull().sum()

# Only list the features with null values for later processing
print(df_nu[df_nu > 0])

MSZoning           4
LotFrontage      227
Alley           1352
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        16
MasVnrArea        15
BsmtQual          44
BsmtCond          45
BsmtExposure      44
BsmtFinType1      42
BsmtFinSF1         1
BsmtFinType2      42
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu      730
GarageType        76
GarageYrBlt       78
GarageFinish      78
GarageCars         1
GarageArea         1
GarageQual        78
GarageCond        78
PoolQC          1456
Fence           1169
MiscFeature     1408
SaleType           1
dtype: int64


There are 4 features 'Alley','PoolQC','Fence','MiscFeature' with almost all null values. Merge training and test data first then remove the 4 features.

In [79]:
# Merge training and test set to process feature engineering together
df_all = concat_df(df_train, df_test)
print(df_all.shape)

(2919, 79)


In [80]:
df_all.drop(['Alley','PoolQC','Fence','MiscFeature'], axis = 1, inplace= True)

Initially fill in the null values for all features with null values above.

In [81]:
df_all['MSZoning'].fillna('RL', inplace=True) # test set only
df_all['LotFrontage'].fillna(df_all['LotFrontage'].mean(), inplace =True)       
df_all['Utilities'].fillna('AllPub', inplace=True) # test set only
df_all['Exterior1st'].fillna('VinylSd', inplace=True) # test set only
df_all['Exterior2nd'].fillna('VinylSd', inplace=True) # test set only
df_all['MasVnrType'].fillna('None', inplace=True) 
df_all['MasVnrArea'].fillna('0', inplace=True)

df_all['BsmtQual'].fillna('TA', inplace=True)
df_all['BsmtCond'].fillna('TA', inplace=True)
df_all['BsmtExposure'].fillna('No', inplace=True)
df_all['BsmtFinType1'].fillna('GLQ', inplace=True)
df_all['BsmtFinType2'].fillna('Unf', inplace=True)
df_all['BsmtFinSF1'].fillna('0', inplace=True) # test set only
df_all['BsmtFinSF2'].fillna('0', inplace=True) # test set only
df_all['BsmtUnfSF'].fillna(df_all['BsmtUnfSF'].median(), inplace=True) # test set only
df_all['TotalBsmtSF'].fillna(df_all['BsmtUnfSF'].median(), inplace=True) # test set only
df_all['BsmtFullBath'].fillna('0', inplace=True) # test set only
df_all['BsmtHalfBath'].fillna('0', inplace=True) # test set only

df_all['Electrical'].fillna('SBrkr', inplace=True) # training set only
df_all['KitchenQual'].fillna('TA', inplace=True) # test set only
df_all['Functional'].fillna('Typ', inplace=True) # test set only
df_all['FireplaceQu'].fillna('TA', inplace=True)
       
df_all['GarageType'].fillna('Attchd', inplace=True)
df_all['GarageYrBlt'].fillna(df_all['GarageYrBlt'].mean(), inplace=True)
df_all['GarageFinish'].fillna('RFn', inplace=True)
df_all['GarageCars'].fillna('2', inplace=True) #test set only
df_all['GarageArea'].fillna(df_all['GarageArea'].mean(), inplace=True) #test set only
df_all['GarageQual'].fillna('TA', inplace=True)
df_all['GarageCond'].fillna('TA', inplace=True)
 
df_all['SaleType'].fillna('WD', inplace=True) #test set only

# Make sure there is no null items
df_all.isnull().sum()

1stFlrSF         0
2ndFlrSF         0
3SsnPorch        0
BedroomAbvGr     0
BldgType         0
BsmtCond         0
BsmtExposure     0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtFinType1     0
BsmtFinType2     0
BsmtFullBath     0
BsmtHalfBath     0
BsmtQual         0
BsmtUnfSF        0
CentralAir       0
Condition1       0
Condition2       0
Electrical       0
EnclosedPorch    0
ExterCond        0
ExterQual        0
Exterior1st      0
Exterior2nd      0
FireplaceQu      0
Fireplaces       0
Foundation       0
FullBath         0
Functional       0
GarageArea       0
                ..
LotArea          0
LotConfig        0
LotFrontage      0
LotShape         0
LowQualFinSF     0
MSSubClass       0
MSZoning         0
MasVnrArea       0
MasVnrType       0
MiscVal          0
MoSold           0
Neighborhood     0
OpenPorchSF      0
OverallCond      0
OverallQual      0
PavedDrive       0
PoolArea         0
RoofMatl         0
RoofStyle        0
SaleCondition    0
SaleType         0
ScreenPorch 

Now, handle several types of features in different ways:
1. Do one-hot encoding for categorical and norminal data
2. For categorical and ordinal data, map all values to numbers
3. Do normalization and scaling finally for all features - including both categorical data and regression data. It's not essential (optioanl) for tree related algorithms.

In [82]:
# Check the data summary so far
print(df_all.columns)
print(df_all.head())
print(df_all.shape)

Index(['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'BedroomAbvGr', 'BldgType',
       'BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFinType1',
       'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtQual', 'BsmtUnfSF',
       'CentralAir', 'Condition1', 'Condition2', 'Electrical', 'EnclosedPorch',
       'ExterCond', 'ExterQual', 'Exterior1st', 'Exterior2nd', 'FireplaceQu',
       'Fireplaces', 'Foundation', 'FullBath', 'Functional', 'GarageArea',
       'GarageCars', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType',
       'GarageYrBlt', 'GrLivArea', 'HalfBath', 'Heating', 'HeatingQC',
       'HouseStyle', 'KitchenAbvGr', 'KitchenQual', 'LandContour', 'LandSlope',
       'LotArea', 'LotConfig', 'LotFrontage', 'LotShape', 'LowQualFinSF',
       'MSSubClass', 'MSZoning', 'MasVnrArea', 'MasVnrType', 'MiscVal',
       'MoSold', 'Neighborhood', 'OpenPorchSF', 'OverallCond', 'OverallQual',
       'PavedDrive', 'PoolArea', 'RoofMatl', 'RoofStyle', 'SaleCondition',
       

Use one-hot encoding to handle categorical + norminal features below:
MSZoning
Street
LotShape
LandContour
Utilities
LotConfig
LandSlope
Neighborhood
Condition1
Condition2
BldgType
HouseStyle
RoofStyle
RoofMatl
Exterior1st
Exterior2nd
MasVnrType
Foundation
Heating
CentralAir
Electrical
Functional
GarageType
GarageFinish
MoSold
SaleType
SaleCondition


In [83]:
cols = ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'CentralAir', 'Electrical', 'Functional', 'GarageType', 'GarageFinish', 'MoSold', 'SaleType', 'SaleCondition']
onehot = onehot_features(df_all, cols)

# Drop the original features
df_all.drop(cols, axis = 1, inplace = True)

print(type(onehot))
print(onehot.shape)
print(df_all.shape)

df_all = pd.concat([df_all, onehot], axis=1)

print(df_all.shape)

<class 'pandas.core.frame.DataFrame'>
(2919, 191)
(2919, 48)
(2919, 239)


Map all the categorical ordianl features which the order of the values make sense and their values are not numerical below:
ExterQual
ExterCond
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
HeatingQC
KitchenQual
FireplaceQu
GarageQual
GarageCond
PavedDrive
PoolQC
Fence

The mappings can be determined by referring the data_description file which describes the meanings of each features.

In [84]:
mapping = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'NA':0, 'Av':3, 'Mn':2, 'No':1}
df_all['ExterQual'] = df_all['ExterQual'].astype(str)
df_all['ExterQual'] = df_all['ExterQual'].map(mapping)
df_all['ExterCond'] = map_category(df_all['ExterCond'], mapping)
df_all['BsmtQual'] = map_category(df_all['BsmtQual'], mapping)
df_all['BsmtCond'] = map_category(df_all['BsmtCond'], mapping)
df_all['BsmtExposure'] = map_category(df_all['BsmtExposure'], mapping)
df_all['HeatingQC'] = map_category(df_all['HeatingQC'], mapping)
df_all['KitchenQual'] = map_category(df_all['KitchenQual'], mapping)
df_all['FireplaceQu'] = map_category(df_all['FireplaceQu'], mapping)
df_all['GarageQual'] = map_category(df_all['GarageQual'], mapping)
df_all['GarageCond'] = map_category(df_all['GarageCond'], mapping)

mapping = {'GLQ':6, 'ALQ':5, 'BLQ':4, 'Rec':3, 'LwQ':2, 'Unf':1, 'NA':0}
df_all['BsmtFinType1'] = map_category(df_all['BsmtFinType1'], mapping)
df_all['BsmtFinType2'] = map_category(df_all['BsmtFinType2'], mapping)

mapping = {'Y':2, 'P':1, 'N':0}
df_all['PavedDrive'] = map_category(df_all['PavedDrive'], mapping)

In [85]:
df_nu = df_all.isnull().sum()

# Make sure if there is no null value
print(df_nu[df_nu>0])

Series([], dtype: int64)


In [86]:
#df_all.to_csv(r'D:\ML\Kaggle projects\House Prices Advanced Regression Techniques\check None.csv')

# Check the data set dimensions so far again.
print(df_train.shape)
print(df_all.shape)
X_train, X_test = divide_df(df_all, df_train.shape[0])
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)    

(1460, 79)
(2919, 239)
(1460, 239)
(1459, 239)
(1460,)


Scaling features

In [87]:
col = X_train.columns
X_train = X_train.astype(float)
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = col)

print(X_train.head())

col = X_test.columns
X_test = X_test.astype(float)
scaler = StandardScaler()
X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = col)

   1stFlrSF  2ndFlrSF  3SsnPorch  BedroomAbvGr  BsmtCond  BsmtExposure  \
0 -0.793434  1.161852  -0.116339      0.163779 -0.038577     -0.631676   
1  0.257140 -0.795163  -0.116339      0.163779 -0.038577      2.256363   
2 -0.627826  1.189351  -0.116339      0.163779 -0.038577      0.331004   
3 -0.521734  0.937276  -0.116339      0.163779  3.481550     -0.631676   
4 -0.045611  1.617877  -0.116339      1.390023 -0.038577      1.293683   

   BsmtFinSF1  BsmtFinSF2  BsmtFinType1  BsmtFinType2  ...  SaleType_ConLw  \
0    0.575425   -0.288653      1.116589     -0.314282  ...       -0.058621   
1    1.171992   -0.288653      0.631549     -0.314282  ...       -0.058621   
2    0.092907   -0.288653      1.116589     -0.314282  ...       -0.058621   
3   -0.499274   -0.288653      0.631549     -0.314282  ...       -0.058621   
4    0.463568   -0.288653      1.116589     -0.314282  ...       -0.058621   

   SaleType_New  SaleType_Oth  SaleType_WD  SaleCondition_Abnorml  \
0     -0.301962  

In [88]:
# TBD PCA

Discard polynomail features due to lack of data samples comparing number of features

In [89]:
from sklearn.model_selection import train_test_split

In [90]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

poly = PolynomialFeatures(degree = 2).fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

print(X_train_poly.shape)

X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(X_train_poly, y_train, test_size = 0.3, random_state = 42)

# Apply Ridge (L2) to polynomial features
ridge = Ridge(alpha = 1000).fit(X_train_t, y_train_t)

print('Score of Polynomial Ridge on training set', ridge.score(X_train_t, y_train_t))
print('Score of Polynomial Ridge on test set', ridge.score(X_test_t, y_test_t))

(1460, 28920)
Score of Polynomial Ridge on training set 0.9967319384286493
Score of Polynomial Ridge on dev set 0.8147640120562797


In [91]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(X_train, y_train, test_size = 0.3, random_state = 42)

print(X_train_t.shape)
print(X_test_t.shape)



from sklearn.linear_model import LinearRegression



#print(X_train.dtypes)

#X_train.to_csv('X_train_final.csv')
#clf = LinearRegression()
#lr = clf.fit(X_train_t, y_train_t)
#print('Score of Linear Regression ', lr.score(X_test_t, y_test_t))

# TBD Try different alpha values and plot
ridge = Ridge(alpha = 1200).fit(X_train_t, y_train_t)
print('Score of Ridge on training set', ridge.score(X_train_t, y_train_t))
print('Score of Ridge on test set', ridge.score(X_test_t, y_test_t))

(1022, 239)
(438, 239)
Score of Ridge on training set 0.8678218443191563
Score of Ridge on dev set 0.8561071282877735


Implement KNN. KNN is simple with only one parameter for tuning (k) so often be used for baseline for ML computing. However, it's not good for this case as KNN does not perform well on datasets with many features.

In [92]:
from sklearn.neighbors import KNeighborsClassifier 

clf = KNeighborsClassifier(n_neighbors = 100)
knn = clf.fit(X_train_t, y_train_t)
print('Score of knn on training set', knn.score(X_train_t, y_train_t))
print('Score of knn on test set', knn.score(X_test_t, y_test_t))

Score of knn on training set 0.043052837573385516
Score of knn on dev set 0.00684931506849315


As expected, the performace result is poor for KNN. Therefore, it won't be applied and just for reference here.

Implement Tree algorithms

In [93]:
print(X_train_t.shape)
print(X_train_t.head())

(1022, 239)
      1stFlrSF  2ndFlrSF  3SsnPorch  BedroomAbvGr  BsmtCond  BsmtExposure  \
135   1.343941 -0.795163  -0.116339      0.163779 -0.038577     -0.631676   
1452 -0.234508 -0.795163  -0.116339     -1.062465 -0.038577      2.256363   
762  -1.031495  0.999149  -0.116339      0.163779 -0.038577      0.331004   
932   1.920981 -0.795163  -0.116339      0.163779  3.481550     -0.631676   
435  -0.868475  1.116020  -0.116339      0.163779 -0.038577      1.293683   

      BsmtFinSF1  BsmtFinSF2  BsmtFinType1  BsmtFinType2  ...  SaleType_ConLw  \
135    -0.973018   -0.288653     -1.308612     -0.314282  ...       -0.058621   
1452    0.226696   -0.288653      1.116589     -0.314282  ...       -0.058621   
762    -0.920380   -0.288653      1.116589     -0.314282  ...       -0.058621   
932    -0.973018   -0.288653     -1.308612     -0.314282  ...       -0.058621   
435    -0.128612    1.844495      1.116589      4.285742  ...       17.058722   

      SaleType_New  SaleType_Oth  Sale

In [94]:
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(max_depth = 20, random_state = 42, n_estimators = 100)
rf = clf.fit(X_train_t, y_train_t)
print('Score of Kernel Random Forest on training set', rf.score(X_train_t, y_train_t))
print('Score of Kernel Random Forest on test set', rf.score(X_test_t, y_test_t))


Score of Kernel Random Forest on training set 0.9765000156191826
Score of Kernel Random Forest on dev set 0.8988459343986092


In [97]:
# Compare predictions on test samples
pred = rf.predict(X_test_t)
print(pred.shape)
print(pred)

print(y_test_t)

(438,)
[141655.08333333 319998.83       118422.         151426.34
 322121.11        82897.         208848.00666667 150996.
  82791.         129122.87       153173.94166667 120767.27380952
 107345.5        208274.39       178771.95909091 129440.375
 191172.83484848 135420.94642857 118305.26321429 204295.88
 162433.38285714 233576.56       173044.86909091 120455.29
 194477.49666667 166873.22       184148.195      105433.5
 180524.17909091 198176.09       123056.53       249421.63
 184394.66       112350.08035714 255148.4        149800.18333333
 138964.49285714 202156.085      297469.96       108273.
 122872.2        237198.53       120009.73809524 359802.49
 136408.6        134628.03       117088.         127890.35714286
 384766.45       143373.14       120886.6875     194083.28
 120450.47       348625.84       139841.         242274.90285714
 197972.         149649.45       146689.07       109839.
  77462.         145829.6        303019.49       279497.38
 292824.62       205145.       

In [101]:
pred = rf.predict(X_test)