# Housing Sale Price Prediction Program
- Load Libraires and Data
- Data Cleaning
- Modelling

In [3]:
# data structure
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt 
import seaborn as sb 

# modelling
from statsmodels.formula.api import ols
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.model_selection import train_test_split # data split
from sklearn.linear_model import LinearRegression # OLS algorithm
from sklearn.linear_model import Ridge # Ridge algorithm
from sklearn.linear_model import Lasso # Lasso algorithm
from sklearn.linear_model import BayesianRidge # Bayesian algorithm
from sklearn.linear_model import ElasticNet # ElasticNet algorithm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import explained_variance_score as evs # evaluation metric
from sklearn.metrics import r2_score as r2 # evaluation metric
from sklearn.ensemble import RandomForestRegressor

In [4]:
df_train = pd.read_csv("Data Science ZExercise_TRAINING_CONFIDENTIAL1.csv")
df_test = pd.read_csv("Data Science ZExercise_TEST_CONFIDENTIAL2.csv")

## Data Cleaning

In [5]:
# Sale Price: create a log value to see which one fits better
df_train['ln_Sale']=np.log(df_train['SaleDollarCnt'])

# Impute or compliment missing values
# 1. Garage: replace missing with 0, and create a new dummy "garage" to indicate
# 2. View: convert to dummies, including missing
# 3. BG-level missing values: replace with mean of all BG 
        # (can be later impute with K-nearest neighboring BG)
# 4. Zoning codes: convert to dummies
df_train['Garage'] = 0
df_train.loc[df_train['GarageSquareFeet'].isnull()==False,'Garage']=1
df_train.loc[df_train['Garage']==0,'GarageSquareFeet']=0
df_train=df_train.join(pd.get_dummies(df_train['ViewType'],dummy_na=True,prefix='view'))
df_train.loc[df_train['BGMedHomeValue'].isnull()==True, 'BGMedHomeValue'] = df_train['BGMedHomeValue'].mean()
df_train.loc[df_train['BGMedRent'].isnull()==True, 'BGMedRent'] = df_train['BGMedRent'].mean()
df_train.loc[df_train['BGMedYearBuilt'].isnull()==True, 'BGMedYearBuilt'] = df_train['BGMedYearBuilt'].mean()
df_train=df_train.join(pd.get_dummies(df_train['ZoneCodeCounty'],dummy_na=True,prefix='zoning'))

## Modelling
1. Data Preparation: pre-processing
2. Linear models: OLS, Lasso, Ridge, EN
3. Non-parametric: Random Forest Regression

In [197]:
# prepare y variables
y1 = ['SaleDollarCnt']
y2 = ['ln_Sale']

# prepare X variables
X1 = ['BedroomCnt', 'BathroomCnt', 'StoryCnt', 'BuiltYear',
          'FinishedSquareFeet','GarageSquareFeet','LotSizeSquareFeet', # Home Characteristics
            'BGMedHomeValue', 'BGMedRent', 'BGMedYearBuilt',
            'BGPctOwn', 'BGPctVacant', 'BGMedIncome',
            'BGPctKids', 'BGMedAge'] # BG Characteristics

# dummies
X2 = ['Garage','view_78.0', 'view_79.0', 'view_82.0', 'view_241.0',
            'view_244.0', 'view_246.0', 'view_247.0']

# polynomials
polyset = ['FinishedSquareFeet','GarageSquareFeet','LotSizeSquareFeet']
X3=[]
for i in polyset:
    for j in range(2,6):
        st = i + "_" + str(j)
        st_std = st+"std"
        df_train[st]=df_train[i]**j
        df_train[st_std]=( df_train[st]-df_train[st].mean() )/df_train[st].std()
        X3.append(st_std)

# discontinuity
#BGMedHomeValue==10**6, BGMedRent==1250, BGMedRent==2000, BGMedYearBuilt==1940, BGPctVacant = 0
df_train['BGMedHomeValue10e6']=0
df_train.loc[df_train['BGMedHomeValue']==10**6,'BGMedHomeValue10e6']=1
df_train['BGMedRent1250']=0
df_train.loc[df_train['BGMedRent']==1250,'BGMedRent1250']=1
df_train['BGMedRent2000']=0
df_train.loc[df_train['BGMedRent']==2000,'BGMedRent2000']=1
df_train['BGMedYearBuilt1940']=0
df_train.loc[df_train['BGMedYearBuilt']==1940,'BGMedYearBuilt1940']=1
df_train['BGPctVacant0']=0
df_train.loc[df_train['BGPctVacant']==0,'BGPctVacant0']=1
X4=['BGMedHomeValue10e6','BGMedRent1250','BGMedRent2000','BGMedYearBuilt1940','BGPctVacant0']

# interaction
ita_set = ['BedroomCnt','BathroomCnt', 'StoryCnt', 'BuiltYear','FinishedSquareFeet','LotSizeSquareFeet',
           'BGMedHomeValue', 'BGMedRent', 'BGMedYearBuilt','BGPctOwn','BGPctKids','BGMedAge']
X5=[]
df_ita = df_train['PropertyID']
df_ita=df_ita.to_frame(name='PropertyID2')
for i in range(len(ita_set)):
    for j in range(i+1,len(ita_set)):
        if i!=j:
            st = ita_set[i] + "X" + ita_set[j]
            st_std = st+"std"
            df_temp=df_train[ita_set[i]]*df_train[ita_set[j]]
            df_tempstd=(df_temp-df_temp.mean())/df_temp.std()
            df_ita.join( df_tempstd.to_frame(name=st_std))
            X5.append(st_std)

# constant term for intercept
df_train['const']=1

In [199]:
# standardize all non-dummy variables in X1
X1_df = df_train[X1]
std_X1_df=(X1_df-X1_df.mean())/X1_df.std()
# rename variable names
names,names_std=std_X1_df.columns,[]
for i in range(len(names)):
    names_std.append(names[i]+'_std')
std_X1_df.set_axis(names_std, axis=1, inplace=True)

In [205]:
model_df = df_train[y1+y2+['const']+X2+X3+X4+X5+X_zoning].join(std_X1_df)
X_var = model_df.drop(y1+y2,axis=1)
y_var1,y_var2 = model_df[y1],model_df[y2]

In [206]:
trainX,testX,trainy1,testy1=train_test_split(X_var,y_var1,test_size=0.3,random_state=42)

In [207]:
trainX.shape

(8111, 285)

In [208]:
# Modelling - OLS, Lasso, Ridge, Elastic Net
ols=LinearRegression().fit(trainX, trainy1)
lasso = linear_model.LassoCV(max_iter=1000,random_state=42).fit(trainX, trainy1.values.ravel())
ridge = linear_model.RidgeCV(alphas=[0.01,0.1,1,2,5,10,100]).fit(trainX, trainy1.values.ravel())
elasticnet = linear_model.ElasticNetCV(l1_ratio=[.1,.2,.3,.4,.5,.6,.7,.8,.9,.95,.99,1],
                                      max_iter=1000,random_state=42).fit(trainX, trainy1.values.ravel())

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [209]:
# Error Comparison
print('ols',ols.score(testX,testy1))
print('lasso',lasso.score(testX,testy1))
print('ridge',ridge.score(testX,testy1))
print('elastic net',elasticnet.score(testX,testy1)) # same result as LASSO

error_ols = abs(ols.predict(testX)-testy1.values.ravel())/testy1.values.ravel()
print("OLS error mean:",error_ols.mean(),"; error median:",np.median(error_ols))
error_lasso = abs(lasso.predict(testX)-testy1.values.ravel())/testy1.values.ravel()
print("Lasso error mean:",error_lasso.mean(),"; error median:",np.median(error_lasso))
error_ridge = abs(ridge.predict(testX)-testy1.values.ravel())/testy1.values.ravel()
print("Ridge error mean:",error_ridge.mean(),"; error median:",np.median(error_ridge))

ols -12349.402365546879
lasso 0.7901775843612568
ridge 0.4710381961522013
elastic net 0.7901775843612568
OLS error mean: 2.7509005382978433 ; error median: 0.44961549112398486
Lasso error mean: 0.17380746918487983 ; error median: 0.13274545563125598
Ridge error mean: 0.1764495565652604 ; error median: 0.1282023292911462


In [210]:
trainX,testX,trainy2,testy2=train_test_split(X_var,y_var2,test_size=0.3,random_state=42)

In [232]:
# Modelling - OLS, Lasso, Ridge, Elastic Net
ols=LinearRegression().fit(trainX, trainy2)
lasso = linear_model.LassoCV(max_iter=10000,random_state=42).fit(trainX, trainy2.values.ravel())
ridge = linear_model.RidgeCV(alphas=[0.01,0.1,1,2,5,10,100]).fit(trainX, trainy2.values.ravel())
elasticnet = linear_model.ElasticNetCV(l1_ratio=[.1,.2,.3,.4,.5,.6,.7,.8,.9,.95,.99,1],
                                      max_iter=10000,random_state=42).fit(trainX, trainy2.values.ravel())

In [212]:
# Error Comparison
print('ols',ols.score(testX,testy2))
print('lasso',lasso.score(testX,testy2))
print('ridge',ridge.score(testX,testy2))
print('elastic net',elasticnet.score(testX,testy2)) # same result as LASSO

from math import e
error_ols = abs(e**ols.predict(testX)-e**testy2.values.ravel())/e**testy2.values.ravel()
print("OLS error mean:",error_ols.mean(),"; error median:",np.median(error_ols))
error_lasso = abs(e**lasso.predict(testX)-e**testy2.values.ravel())/e**testy2.values.ravel()
print("Lasso error mean:",error_lasso.mean(),"; error median:",np.median(error_lasso))
error_ridge = abs(e**ridge.predict(testX)-e**testy2.values.ravel())/e**testy2.values.ravel()
print("Ridge error mean:",error_ridge.mean(),"; error median:",np.median(error_ridge))

  error_ols = abs(e**ols.predict(testX)-e**testy2.values.ravel())/e**testy2.values.ravel()


ols -6279.859458412306
lasso 0.861155760824634
ridge 0.41355642585828367
elastic net 0.861155760824634
OLS error mean: inf ; error median: 0.44011490051807434
Lasso error mean: 0.15272644029144553 ; error median: 0.11704712639929189
Ridge error mean: 0.14967469060814773 ; error median: 0.11266683119495896


## Random Forest Regression

In [215]:
# Fitting Random Forest Regression to the dataset
# import the regressor
  
 # create regressor object
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
  
# fit the regressor with x and y data
regressor.fit(trainX, trainy2.values.ravel())  

  regressor.fit(trainX, trainy2)


RandomForestRegressor(random_state=0)

In [216]:
Y_pred = regressor.predict(testX)

In [229]:
rf_error = abs(e**Y_pred - e**testy2.values.ravel())/e**testy2.values.ravel()

In [230]:
rf_error.mean()

0.1396987915206984

In [231]:
np.median(rf_error)

0.10196993750745657