# Real Estate Appraisal using Machine Learning 
> Dominik Huffield


<br>
- This data can be found at: <href>kaggle.com<href> <br>
- The data was collected by Zillow <br>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import *
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
train = pd.read_csv('train-houses.csv')
train.replace([np.inf, -np.inf], np.nan, inplace=True)

In [3]:
# Plots lower triangle heatmap
def plotcorrmatrix(df):
    sns.set(style="white")

    #Compute correlatoin matrix
    corr = df.corr()

    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=np.bool)
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(11, 9))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5})
    plt.show()

In [4]:
all_quant_feat = np.array(train.iloc[:, [1, 3, 4, 17, 18, 19, 20, 26, 34, 36, 37, 38, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 56, 59, 61, 62, 66, 67, 68, 69, 70, 71, 75, 76, 77, 80]].columns)
all_categ_feat = np.array(train.iloc[:, [2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 35, 39, 40, 41, 42, 53, 55, 57, 58, 60, 63, 64, 65, 72, 73, 74, 78, 79]].columns)

data_quant = train.iloc[:, [0, 1, 3, 4, 17, 18, 19, 20, 26, 34, 36, 37, 38, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 56, 59, 61, 62, 66, 67, 68, 69, 70, 71, 75, 76, 77, 80]]
data_categ = train.iloc[:, [0, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 35, 39, 40, 41, 42, 53, 55, 57, 58, 60, 63, 64, 65, 72, 73, 74, 78, 79]]

In [5]:
train.set_index('Id', inplace=True)
data_quant.set_index('Id', inplace=True)
data_categ.set_index('Id', inplace=True)

In [6]:
len(all_categ_feat) == len(data_categ.columns)

True

In [7]:
len(all_quant_feat) == len(data_quant.columns)

True

In [8]:
data_quant.is_copy = False
data_categ.is_copy = False

  object.__getattribute__(self, name)
  return object.__setattr__(self, name, value)


In [9]:
data_categ.head()

Unnamed: 0_level_0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
4,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
5,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [10]:
train.dropna(inplace=True, axis='columns')
train.dropna(inplace=True)
data_quant.dropna(inplace=True)
data_categ.dropna(inplace=True, axis='columns')

In [11]:
# data_categ.dropna(inplace=True)
data_categ.head()

Unnamed: 0_level_0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,ExterCond,Foundation,Heating,HeatingQC,CentralAir,KitchenQual,Functional,PavedDrive,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,TA,PConc,GasA,Ex,Y,Gd,Typ,Y,WD,Normal
2,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,TA,CBlock,GasA,Ex,Y,TA,Typ,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,TA,PConc,GasA,Ex,Y,Gd,Typ,Y,WD,Normal
4,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,TA,BrkTil,GasA,Gd,Y,Gd,Typ,Y,WD,Abnorml
5,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,TA,PConc,GasA,Ex,Y,Gd,Typ,Y,WD,Normal


In [12]:
null_columns = np.array(data_categ.columns)
null_columns = np.setdiff1d(all_quant_feat, null_columns)

In [13]:
# index = np.argwhere(all_categ_feat==null_columns)
# all_quant_feat = np.delete(all_quant_feat, index)
all_categ_feat = data_categ.columns

In [14]:
data_categ.head()

Unnamed: 0_level_0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,ExterCond,Foundation,Heating,HeatingQC,CentralAir,KitchenQual,Functional,PavedDrive,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,TA,PConc,GasA,Ex,Y,Gd,Typ,Y,WD,Normal
2,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,TA,CBlock,GasA,Ex,Y,TA,Typ,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,TA,PConc,GasA,Ex,Y,Gd,Typ,Y,WD,Normal
4,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,TA,BrkTil,GasA,Gd,Y,Gd,Typ,Y,WD,Abnorml
5,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,TA,PConc,GasA,Ex,Y,Gd,Typ,Y,WD,Normal


In [15]:
for col_name in all_categ_feat:
    df = pd.DataFrame()
    data_categ[col_name] = pd.Categorical(data_categ[col_name], categories=data_categ[col_name].unique())

In [16]:
data_categ_dum = pd.get_dummies(data_categ)
all_categ_dum_feat = data_categ_dum.columns

In [17]:
data_categ_dum.head(2)

Unnamed: 0_level_0,MSZoning_RL,MSZoning_RM,MSZoning_C (all),MSZoning_FV,MSZoning_RH,Street_Pave,Street_Grvl,LotShape_Reg,LotShape_IR1,LotShape_IR2,...,SaleType_CWD,SaleType_ConLw,SaleType_Con,SaleType_Oth,SaleCondition_Normal,SaleCondition_Abnorml,SaleCondition_Partial,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [18]:
data_quant_con = pd.DataFrame(RobustScaler(with_scaling=True, with_centering=True).fit_transform(data_quant), columns=all_quant_feat)

In [31]:
# data_categ_dum[data_categ_dum.sum() == 0]
index = data_categ_dum.iloc[data_categ_dum.sum() > 200]
index

# data_categ_dum.drop(columns=index, inplace=True)
# data_categ_dum.loc[(data_categ_dum!=0).any(axis=1)]

ValueError: iLocation based boolean indexing cannot use an indexable as a mask

In [22]:
data_categ_dum.sum()

MSZoning_RL              1151
MSZoning_RM               218
MSZoning_C (all)           10
MSZoning_FV                65
MSZoning_RH                16
Street_Pave              1454
Street_Grvl                 6
LotShape_Reg              925
LotShape_IR1              484
LotShape_IR2               41
LotShape_IR3               10
LandContour_Lvl          1311
LandContour_Bnk            63
LandContour_Low            36
LandContour_HLS            50
Utilities_AllPub         1459
Utilities_NoSeWa            1
LotConfig_Inside         1052
LotConfig_FR2              47
LotConfig_Corner          263
LotConfig_CulDSac          94
LotConfig_FR3               4
LandSlope_Gtl            1382
LandSlope_Mod              65
LandSlope_Sev              13
Neighborhood_CollgCr      150
Neighborhood_Veenker       11
Neighborhood_Crawfor       51
Neighborhood_NoRidge       41
Neighborhood_Mitchel       49
                         ... 
CentralAir_N               95
KitchenQual_Gd            586
KitchenQua

In [21]:
# X = data_quant_con.join(data_categ_dum)
X = data_categ_dum.join(data_quant_con)

In [22]:
y = train['SalePrice']
# train

In [23]:
X.dropna(inplace=True)
# y
y = y.iloc[X.index]

In [24]:
X = pd.DataFrame(SelectKBest(score_func=f_regression, k=13).fit_transform(X, y))

In [25]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [26]:
lin_model= LinearRegression().fit(X_train, y_train)
yHat= lin_model.predict(X_val)

print('R^2 for Linear Regression model: ' , r2_score(y_val, yHat))
# print('RMSLE for Linear Regression model: ', rmsle(yHat, y_val))
print('MSLE for Linear Regression model: ', mean_squared_log_error(yHat, y_val))
print('MSE for Linear Regression model:', mean_squared_error(yHat, y_val))

R^2 for Linear Regression model:  0.012187693732411908
MSLE for Linear Regression model:  0.16185654577787134
MSE for Linear Regression model: 5733528543.754704


In [27]:
forest_model = RandomForestRegressor(n_estimators=10, criterion='mse')
forest_model.fit(X_train, y_train)
forest_predicts = forest_model.predict(X_val)
forest_predicts = np.absolute(forest_predicts)
y_val = np.absolute(y_val)

In [28]:
print('\nR^2 for Random Forest Regression model: ' , r2_score(y_val, forest_predicts))
print('RMSLE for Random Forest Regression model: ', np.sqrt(mean_squared_log_error(forest_predicts, y_val)))


R^2 for Random Forest Regression model:  -0.23171128459406298
RMSLE for Random Forest Regression model:  0.44722968853164013
