#### load libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV, ElasticNet
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict,cross_validate, GridSearchCV, train_test_split
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor 
import statistics
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_columns = None
pd.options.display.max_rows = None
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#### load and inspect data

In [2]:
house_df = pd.read_csv('./data/final_df.csv', index_col = 0)

# Checking final_df after generating geoCode, found duplicated observations
house_df.shape

(2603, 179)

In [3]:
# drop duplicated observations
house_df = house_df.drop_duplicates(subset=['PID'],keep = 'first')
house_df.shape

(2558, 179)

#### preprocessing

In [4]:
house_df_use = house_df.copy()

# convert MSSubClass to str
house_df_use['MSSubClass'] = house_df_use[['MSSubClass']].astype('str')

# characterize tax distr and sch distr
house_df_use['TxD_S'] = house_df_use[['TxD_S']].astype('str')
house_df_use['SchD_S'] = house_df_use[['SchD_S']].astype('str')

# convert NA's to 0s in LotFrontage and Alley
house_df_use['LotFrontage'] = np.where(pd.isnull(house_df_use['LotFrontage']), 0, house_df_use['LotFrontage'])
house_df_use['Alley'] = np.where(pd.isnull(house_df_use['Alley']), 0, 1)

# Impute None to Nan value in columns
none_features = ['FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond','Fence','BsmtQual','BsmtCond']
for feature in none_features:
    house_df_use[feature] = house_df_use[feature].fillna('None')

# drop rows with NA garage car / garage area
garage_NA_index = house_df_use.loc[pd.isnull(house_df_use['GarageArea'])].index
house_df_use = house_df_use.drop(garage_NA_index,axis=0)

# add in proper bathroom numbers for 918 and 2328
house_df_use.at[918, 'num_bathroom'] = 1
house_df_use.at[2328, 'num_bathroom'] = 3.5

# remove unnecessary columns
house_df_use = house_df_use.drop(
    ['PA-PreD','PA-PostD','PA-UnTyp','PA-UntNo','X1TPr_D','X1TSc_D','Rcrd_Mo','Legal_Pr','TxD_S',\
    'X2TPr_D','X2TSc_D','X1TPr_S','X1TSc_S','X2TPr_S','X2TSc_S','ISU_lat_long','address','MA_Ownr1','MA_Ownr2',\
    'MA_Line1','MA_Line2','MA_City','MA_State','address3','location2','point2','SaleCond','Source','Date',\
    'ParType','BldgNo_S','DwlgNo_S','YrBuilt','Ext1','Ext2','GLA','GarYrBlt','Cars','MA_Zip1','MA_Zip2',\
    'ZngCdPr','ZngCdSc','ZngOLPr','ZngOLSc','PA-Nmbr','PA-Strt','PA-StSfx','Inst1_No','Inst1_Yr','Inst1_Mo',\
    'Inst1TPr','TtlVal_AsrYr','ValType','OthAc_S','ImpAc_S','LndAc_S','Prop_Addr','HSTtl_D','MilVal_D',\
    'HSTtl_S','MilVal_S','GeoRefNo','Tier','Range','AcreX_S1','ClassPr_S','ClassSc_S','LndAcX1S','ImpAcX1S',\
    'Central Air','ImpAcX2S','AcreGr','AcreNt_S','ParclRel','Rcrd_Yr','address2','SaleType',\
    'latitude2','longitude2','ISU_lat','ISU_long','altitude2','Central Air',\

    'index','MiscVal','YrSold_YYYY','MoSold_MM','PoolArea','PoolQC','MiscFeature','Street','PID','Utilities',\
    'BsmtHalfBath2','FullBath','HalfBath','TtlBsmtSF','HalfBath2','BsmtFullBath','BsmtHalfBath',\
    'BsmtFinSF1','BsmtFinSF2','BsmtFinType1','BsmtFinType2','BsmtUnfSF','LowQualFinSF','BasmtFinSF2','BasmtFinSF1',\
    'GrLivArea','1stFlrSF','2ndFlrSF','MasVnrType','MasVnrArea','GarageArea2','PoolArea2','NmbrBRs',\
    'Neighborhood','BedroomAbvGr','TotalBsmtSF','GarageYrBlt','BasmtFinSF','Heating'],\
    axis=1)

print(house_df_use.columns)
print(f'has null: {house_df_use.columns[house_df_use.isnull().sum() > 0]}')

Index(['SalePrice', 'Distance', 'MSSubClass', 'MSZoning', 'LotFrontage',
       'LotArea', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope',
       'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'HeatingQC', 'CentralAir',
       'Electrical', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
       'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'Fence', 'MoSold', 'YrSold', 'SaleCondition',
       'MasVnrArea2', 'total_LivArea', 'num_bathroom', 'SchD_S'],
      dtype='object')
has null: Index(['BsmtExposure', 'Electrical'], dtype='object')


#### minimum sample/class size

In [5]:
(1.96 * statistics.stdev(house_df_use['SalePrice']) / 20000)**2

53.87076177104101

#### feature engineering

In [6]:
def feature_eng(df):
    # calculate age of building
    df['BldgAge'] = df['YrSold'] - df['YearBuilt']

    # binarize YearRemodAdd
    df['Remodeled'] = np.where(df['YearRemodAdd'] == df['YearBuilt'], 0, 1)

    # binarize MSSubClass to PUD or not PUD
    df['IsPUD'] = np.where(df['MSSubClass'].isin(['120','150','160','180']), 1, 0)
    
    # binarize LotShape to Reg/not Reg
    df['LotIsReg'] = np.where(df['LotShape']=='Reg', 1, 0)
    
    # binarize LandContour to HLS and Low vs not HLS or Low
    df['HillORDepr'] = np.where(df['LandContour'].isin(['HLS','Low']), 1, 0)
    
    # binarize Condition1/2 to positive feature or no positive feature
    df['PosFeat'] = np.where(df['Condition1'].isin(['PosN','PosA'])|df['Condition2'].isin(['PosN','PosA']), 1, 0)

    # combine exterior material 1/2 to one column
    df['ExtMatl'] = np.where((df['Exterior1st']==df['Exterior2nd']),df['Exterior1st'], 'Mixed')
    
    # covnert Electrical to circuit breakers/
    #df['SBrkrElecOnly']=np.where(df['Electrical']=='SBrkr',1,0)
    
    # simply qual/cond features
    for col in ['ExterQual','BsmtQual','KitchenQual','FireplaceQu','GarageQual','ExterCond',\
                'BsmtCond','GarageCond','HeatingQC']:
        df[col+'_num'] = df[col].replace(['Ex','Gd','TA','Fa','Po','None'],[10,8,6,4,2,0])

    # sum up porch area
    df['Total_porch_sf'] = df['OpenPorchSF'] + df['3SsnPorch'] + df['EnclosedPorch'] +\
                                df['ScreenPorch'] + df['WoodDeckSF']
    
    # binarize fences
    df['HasFence'] = np.where(df['Fence']=='None', 0, 1)

    # simplify Functional to 3 classes
    df['Funct_3'] = df['Functional'].replace(['Maj1', 'Maj2', 'Min1', 'Min2', 'Mod', 'Sal', 'Typ'],\
                                             ['ModToSev','ModToSev','Minor','Minor','ModToSev','ModToSev','Normal'])
    
    # binarize sale condition to normal sale condition or not sale condition
    #df['NormalSaleCond'] = np.where(df['SaleCondition']=='Normal', 1, 0)
    
    # drop the original columns or unused columns
    df = df.drop(['MSSubClass','YearBuilt','YearRemodAdd','LotFrontage','LotArea','LotConfig','LandSlope',\
                  'Condition1','Condition2','Exterior1st','Exterior2nd','LotShape','LandContour',\
                  'OpenPorchSF','3SsnPorch','EnclosedPorch','ScreenPorch','WoodDeckSF',\
                  'Fence','Functional','ExterQual','BsmtQual','KitchenQual','FireplaceQu',\
                  'GarageQual','ExterCond','BsmtCond','GarageCond','HeatingQC',\
                  'GarageCars','RoofMatl','RoofStyle','KitchenAbvGr','MSZoning'],\
                 axis = 1)
    return df

In [7]:
house_df_eng = feature_eng(house_df_use)

In [8]:
house_df_eng.columns

Index(['SalePrice', 'Distance', 'Alley', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'Foundation', 'BsmtExposure',
       'CentralAir', 'Electrical', 'TotRmsAbvGrd', 'Fireplaces', 'GarageType',
       'GarageFinish', 'GarageArea', 'PavedDrive', 'MoSold', 'YrSold',
       'SaleCondition', 'MasVnrArea2', 'total_LivArea', 'num_bathroom',
       'SchD_S', 'BldgAge', 'Remodeled', 'IsPUD', 'LotIsReg', 'HillORDepr',
       'PosFeat', 'ExtMatl', 'ExterQual_num', 'BsmtQual_num',
       'KitchenQual_num', 'FireplaceQu_num', 'GarageQual_num', 'ExterCond_num',
       'BsmtCond_num', 'GarageCond_num', 'HeatingQC_num', 'Total_porch_sf',
       'HasFence', 'Funct_3'],
      dtype='object')

#### dummification and log transformation

In [9]:
# dummify the dataset
house_df_dum = pd.get_dummies(house_df_eng, drop_first=True)
print(house_df_dum.shape)
print(house_df_dum.shape)

# apply log()
house_df_dum['SalePrice'] = np.log(house_df_dum['SalePrice'])
house_df_dum['total_LivArea'] = np.log(house_df_dum['total_LivArea'])
#house_df_dum['Distance'] = np.log(X_train_dum['Distance'])
house_df_dum['Total_porch_sf'] = np.log(house_df_dum['Total_porch_sf']+1)

(2557, 84)
(2557, 84)


#### test-train split

In [10]:
len(house_df_dum[house_df_dum['YrSold']==2010])

314

In [11]:
train = house_df_dum[house_df_dum['YrSold']!=2010]
test = house_df_dum[house_df_dum['YrSold']==2010]
print(train.shape)
print(test.shape)

X_train = train.drop(['SalePrice','YrSold'],axis=1)
y_train = train['SalePrice']
X_test = test.drop(['SalePrice','YrSold'],axis=1)
y_test = test['SalePrice']

(2243, 84)
(314, 84)


#### linear regression, p-values, and VIFs

In [12]:
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg.score(X_train, y_train)

x_feature = sm.add_constant(X_train)

model = sm.OLS(y_train, x_feature)
results_feature = model.fit()
print(results_feature.summary())
pValue = results_feature.pvalues
pValue[pValue<0.05]

#X_vif = X_train_dum[pValue[pValue<0.05].index]
X_vif = X_train[pValue[pValue<0.05].drop('const').index]
vif_data = pd.DataFrame() 
vif_data["feature"] = X_vif.columns
vif_data["VIF"] = [variance_inflation_factor(X_vif.values, i) 
                          for i in range(len(X_vif.columns))] 
print(vif_data)

0.9135884502297468

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.914
Model:                            OLS   Adj. R-squared:                  0.910
Method:                 Least Squares   F-statistic:                     278.5
Date:                Sun, 29 Nov 2020   Prob (F-statistic):               0.00
Time:                        11:43:57   Log-Likelihood:                 1720.8
No. Observations:                2243   AIC:                            -3276.
Df Residuals:                    2160   BIC:                            -2801.
Df Model:                          82                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     8.48

const                    0.000000e+00
Distance                 7.342637e-12
OverallQual              2.061792e-73
OverallCond              2.053056e-39
TotRmsAbvGrd             1.885858e-31
Fireplaces               3.340927e-09
GarageArea               1.444154e-31
total_LivArea            1.543069e-96
num_bathroom             2.232136e-06
BldgAge                  2.088693e-12
LotIsReg                 4.522439e-03
HillORDepr               1.224148e-05
ExterQual_num            5.100802e-03
BsmtQual_num             2.948366e-06
KitchenQual_num          1.095704e-05
HeatingQC_num            1.912565e-04
Total_porch_sf           1.444937e-02
BldgType_2fmCon          1.264813e-03
BldgType_Duplex          1.148426e-09
BldgType_Twnhs           7.034307e-03
HouseStyle_1Story        5.700589e-03
HouseStyle_2Story        5.085207e-03
HouseStyle_SLvl          4.581167e-03
Foundation_CBlock        4.059015e-02
Foundation_PConc         3.918517e-03
BsmtExposure_Gd          2.862583e-03
BsmtExposure

                  feature         VIF
0                Distance   15.634547
1             OverallQual   71.727564
2             OverallCond   40.114700
3            TotRmsAbvGrd   39.632187
4              Fireplaces    2.776179
5              GarageArea   10.525371
6           total_LivArea  327.026423
7            num_bathroom   20.995611
8                 BldgAge   13.020247
9                LotIsReg    3.265150
10             HillORDepr    1.197003
11          ExterQual_num  107.570941
12           BsmtQual_num   40.307674
13        KitchenQual_num   72.726906
14          HeatingQC_num   33.735544
15         Total_porch_sf    7.170602
16        BldgType_2fmCon    1.185849
17        BldgType_Duplex    1.357928
18         BldgType_Twnhs    1.216124
19      HouseStyle_1Story    5.369455
20      HouseStyle_2Story    3.958848
21        HouseStyle_SLvl    1.473855
22      Foundation_CBlock    6.580818
23       Foundation_PConc    9.633890
24        BsmtExposure_Gd    1.874520
25        Bs

#### train and test error

In [13]:
lin_reg.score(X_train, y_train)
lin_reg.score(X_test, y_test)

0.9135884502297468

0.8495734360021537

#### lasso

In [14]:
X_train_lasso = X_train.copy()
y_train_lasso = y_train.copy()
X_test_lasso = X_test.copy()
y_test_lasso = y_test.copy()

In [15]:
# best best fit alpha
reg_lasso_cv = LassoCV(normalize = True, n_alphas = 200)
reg_lasso_cv.fit(X_train_lasso, y_train_lasso)
reg_lasso_cv.alpha_

LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=200, n_jobs=None, normalize=True,
        positive=False, precompute='auto', random_state=None,
        selection='cyclic', tol=0.0001, verbose=False)

3.0508293036878217e-05

In [16]:
# put the alpha value back in the original lasso
reg_lasso = Lasso(normalize=True)
reg_lasso.set_params(alpha = 2.846e-05)
reg_lasso.fit(X_train_lasso, y_train_lasso)
lasso_coef_df = pd.DataFrame({'feature':X_train_lasso.columns,
                              'coef':reg_lasso.coef_})
lasso_coef_df[lasso_coef_df['coef']>0]

Lasso(alpha=2.846e-05, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=True, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

Lasso(alpha=2.846e-05, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=True, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

Unnamed: 0,feature,coef
2,OverallQual,0.068231
3,OverallCond,0.036547
4,TotRmsAbvGrd,0.028729
5,Fireplaces,0.045729
6,GarageArea,0.00022
8,MasVnrArea2,0.002674
9,total_LivArea,0.287412
10,num_bathroom,0.022064
15,HillORDepr,0.048149
16,PosFeat,0.017976


#### test and train errors (lasso)

In [17]:
print(reg_lasso.score(X_train_lasso, y_train_lasso))
reg_lasso.score(X_test_lasso, y_test_lasso)

0.9122061513905891


0.8445759479422786

In [None]:
# Decision Tree Model
tree_reg = DecisionTreeRegressor(max_depth=10).fit(X_train1,y_train1)
print(f'R^2 of Train set: {tree_reg.score(X_train1,y_train1)}')
print(f'R^2 Test set: {tree_reg.score(X_test1,y_test1)}')



In [None]:
# Random Forest Model
forest_reg = RandomForestRegressor(n_estimators=100,max_features=5).fit(X_train1,y_train1)
print(f'R^2 of Train set: {forest_reg.score(X_train1,y_train1)}')
print(f'R^2 Test set: {forest_reg.score(X_test1,y_test1)}')

In [None]:
# from sklearn.metrics import mean_squared_error
# from sklearn.metrics import r2_score
# housing_prediction = lin_reg.predict(X_test)
# mean_squared_error(y_train,y_test)
# lin_reg2 = LinearRegression().fit(X_train,y_train)
# lin_reg.r2_score