In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import time

from catboost import CatBoostRegressor

import re

from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVR, LinearSVR

import pickle

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [2]:
def dummify(df, non_dummies, dummies):
    for dummified in dummies:
        for original in non_dummies:
            if original in dummified:
                orig_name = f'{original}_'
                value = dummified.replace(orig_name, '')
                df[dummified] = df[original].map(lambda x: 1 if x == value else 0)
    df=df.drop(non_dummies,axis=1)
    return df

In [3]:
pd.set_option('display.max_rows', 500)

In [4]:
df = pd.read_csv('./../data/ames_housing_price_data_v5.csv')

In [5]:
list(df.columns)

['PID',
 'GrLivArea',
 'SalePrice',
 'LotFrontage',
 'LotArea',
 'Street_paved',
 'Alley',
 'LandContour',
 'Utilities',
 'LandSlope',
 'Neighborhood',
 'BldgType',
 'OverallQual',
 'OverallCond',
 'RoofStyle',
 'RoofMatl',
 'MasVnrType',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'CentralAir',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Fireplaces',
 'FireplaceQu',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SaleType',
 'SaleCondition',
 'SalePrice_log',
 'HeatingQC_ord',
 'LotShape_com',
 'MSZoning_com',
 'LF_Near_NS_RR',
 'LF_Near_Positive_Feature',
 'LF_Adjacent_Arterial_St',
 'LF_Near_EW_RR',
 'LF_Adj

In [6]:
df = df[(df['PID'] != 902207130) & (df['PID'] != 908154205)]
pid_drop_list = [
    905450020,
    902477120,
    531477050,
    916403040,
    916252170,
    916253320,
    902401130,
    902205020,
    907230240,
    916477060,
    912251110,
    902103150,
    911175360,
    908154040,
    909129100,
    904101170,
    923125030,
    902205010,
    902401120,
    535300120,
    535426150
]

mask = df['PID'].map(lambda x: False if x in pid_drop_list else True)
df = df[mask]

df = df[(df['SaleCondition'] == 'Normal') | (df['SaleCondition'] == 'Partial')]
df = df[df['BedroomAbvGr'] != 0]
df = df[df['MSZoning_com'] != 'Nonresidential']
df.reset_index(drop=True, inplace = True)

In [7]:
#separate price from table
price=df['SalePrice']
price_log = np.log10(price)
df=df.drop(['SalePrice_log'],axis=1)

In [8]:
#add radial average price feature
avg_price_df=pd.read_csv('./../data/house_surrounding_avg_prices.csv')
avg_price_df2=avg_price_df[['PID','AvgPrice-0.5']].drop_duplicates() #,'AvgPrice-0.25'

df2=df.merge(avg_price_df2,how='left')

In [9]:
#add geographical features
radial = pd.read_csv('./../data/house_coordinates_1.0.csv')
radial.drop(columns = ('2204_park'), inplace = True)
for col in radial.columns:
    prefix = str(col)[0:4]
    if re.search('^\d\d\d\d_', str(col)):
        radial.rename(columns = {col: col[5:]}, inplace = True)
rad_drops = [
    'Address',
    'Coords4',
    'latitude',
    'longitude',
    'town_hall',
    'cemetery',
    'motel',
    'camp_site',
    'general',
    'picnic_site',
    'wastewater_plant',
    'spring',
    'beach',
    'street_lamp',
    'helipad',
    'vineyard',
    'crossing',
    'tree',
    'grass',
    'christian',
    'bus_stop',
    'parking',
    'toilet',
    'bench',
    'commercial',
    'waste_basket',
    'drinking_water',
    'convenience',
    'camera_surveillance',
    'comms_tower',
    'residential',
    'gift_shop',
    'jeweller',
    'hairdresser',
    'bookshop',
    'clothes',
    'retail',
    'food_court',
    'artwork',
    'cafe',
    'traffic_signals',
    'beauty_shop',
    'sports_shop',
    'weir',
    'track',
    'turning_circle',
    'computer_shop',
    'bicycle_shop',
    'department_store',
    'parking_bicycle',
    'golf_course',
    'tower',
    'beverages',
    'university'
]
radial.drop(columns = rad_drops, inplace = True)

df2=df2.merge(radial, how='left')

In [10]:
#other rows to drop:
droplist=['GarageFinish','SaleCondition','GarageType_com','Garage_age_bin','sold_datetime']
df2=df2.drop(droplist,axis=1)

In [11]:
#fillnas
df2=df2.fillna(0)

In [12]:
scaler = MinMaxScaler()

def fit_scale(col):
    scaler.fit(df2[[col]])
    df2[[col]]=scaler.transform(df2[[col]])

fit_scale('OverallQual')
fit_scale('ExterQual')
fit_scale('OverallCond')
fit_scale('KitchenQual')

df2['PorchSF']=df2['OpenPorchSF']+df2['EnclosedPorch']+df2['3SsnPorch']+df2['ScreenPorch']


df2['SaleTypeNew']=(df2['SaleType']=='New')
df2['SaleTypeNew']=df2['SaleTypeNew'].apply(lambda x: 1 if x == True else 0)


df2['BSMT_LowQual']=df2['TotalBsmtSF']-df2['BSMT_GLQ']-df2['BSMT_ALQ']
df2['BSMT_HighQual']=df2['BSMT_GLQ']+df2['BSMT_ALQ']

In [13]:
features_to_use=[
    ### placeholders
    'PID',
    'SalePrice',
    
    ### from original dataset
    'GrLivArea', 
    'LotArea', 
    'OverallQual',
    'BSMT_LowQual', 
    'house_age_years', 
    'GarageCars',
    'MasVnrType',
    'FullBath',
    'HalfBath',
    'BsmtExposure_ord',
    'SaleTypeNew',
    'Neighborhood',
    'BldgType',
    'PorchSF',
    'BSMT_HighQual',
    'Fireplaces',
    'Pool',
    'BedroomAbvGr',
    'ExterQual',
    'OverallCond',
    'KitchenQual',
    
    ### from radial location data for catboost
    'water_tower',
    'graveyard',
    'police', 
    'optician',
    'slipway',
    'bar',
    'cinema',
    'supermarket',
    'hotel',
    'stop',
    'farmyard',
    'christian_catholic', 
    'jewish',
    'muslim',
    'garden_centre',
    'christian_lutheran'                 
]

In [14]:
front_end=df2[features_to_use]

# front_end.to_csv('./../data/ames_housing_price_data_v6.csv')

In [15]:
#function and dictionaries required to transform front-end to back-end
dummies = [   
    'Neighborhood_Blueste',
    'Neighborhood_BrDale',
    'Neighborhood_BrkSide',
    'Neighborhood_ClearCr',
    'Neighborhood_CollgCr',
    'Neighborhood_Crawfor',
    'Neighborhood_Edwards',
    'Neighborhood_Gilbert', 
    'Neighborhood_Greens', 
    'Neighborhood_GrnHill',
    'Neighborhood_IDOTRR',
    'Neighborhood_Landmrk',
    'Neighborhood_MeadowV',
    'Neighborhood_Mitchel',
    'Neighborhood_NAmes',
    'Neighborhood_NPkVill',
    'Neighborhood_NWAmes',
    'Neighborhood_NoRidge',
    'Neighborhood_NridgHt',
    'Neighborhood_OldTown',
    'Neighborhood_SWISU',
    'Neighborhood_Sawyer',
    'Neighborhood_SawyerW',
    'Neighborhood_Somerst',
    'Neighborhood_StoneBr',
    'Neighborhood_Timber',
    'Neighborhood_Veenker',
    'BldgType_2fmCon',
    'BldgType_Duplex',
    'BldgType_Twnhs',
    'BldgType_TwnhsE',
    'MasVnrType_None',
    'MasVnrType_Stone'
    ]

non_dummies=['Neighborhood', 'BldgType', 'MasVnrType']

In [16]:
#transformation of front-end to back-end, and catboost application
back_end = front_end.copy()
back_end.drop(columns = ['PID', 'SalePrice'], axis = 1, inplace = True)
back_end['ExterQualDisc']=back_end['ExterQual']-back_end['OverallQual']
back_end['OverallCondDisc']=back_end['OverallCond']-back_end['OverallQual']
back_end['KitchenQualDisc']=back_end['KitchenQual']-back_end['OverallQual']
back_end=back_end.drop(['ExterQual','OverallCond','KitchenQual'],axis=1)

back_end = dummify(back_end, non_dummies, dummies)

cbl = CatBoostRegressor();
cbl.load_model("./../Moritz/HousePriceCatBoost", "cbm")
cbl_pred = cbl.predict(back_end)

In [17]:
dummies_linear = [   
    'Neighborhood_Blueste',
    'Neighborhood_BrDale',
    'Neighborhood_BrkSide',
    'Neighborhood_ClearCr',
    'Neighborhood_CollgCr',
    'Neighborhood_Crawfor',
    'Neighborhood_Edwards',
    'Neighborhood_Gilbert', 
    'Neighborhood_Greens', 
    'Neighborhood_GrnHill',
    'Neighborhood_IDOTRR',
    'Neighborhood_Landmrk',
    'Neighborhood_MeadowV',
    'Neighborhood_Mitchel',
    'Neighborhood_NAmes',
    'Neighborhood_NPkVill',
    'Neighborhood_NWAmes',
    'Neighborhood_NoRidge',
    'Neighborhood_NridgHt',
    'Neighborhood_OldTown',
    'Neighborhood_SWISU',
    'Neighborhood_Sawyer',
    'Neighborhood_SawyerW',
    'Neighborhood_Somerst',
    'Neighborhood_StoneBr',
    'Neighborhood_Timber',
    'Neighborhood_Veenker',
    'BldgType_2fmCon',
    'BldgType_Duplex',
    'BldgType_Twnhs',
    'BldgType_TwnhsE',
    'MasVnrType_None',
    'MasVnrType_Stone',
    'BSMT_HighQual_bin_500-1000',
    'BSMT_HighQual_bin_0-500',
    'BSMT_HighQual_bin_1000-1500',
    'BSMT_HighQual_bin_1500+',
    'BSMT_LowQual_bin_0-500',
    'BSMT_LowQual_bin_500-1000',
    'BSMT_LowQual_bin_1000-1500',
    'BSMT_LowQual_bin_1500+'
    ]

non_dummies_linear = ['Neighborhood', 'BldgType', 'MasVnrType', 'BSMT_HighQual_bin', 'BSMT_LowQual_bin']

In [18]:
back_end_linear = front_end.copy()
back_end_linear.drop(columns = ['PID', 'SalePrice'], axis = 1, inplace = True)
back_end_linear['GrLivArea_log'] = np.log10(back_end_linear['GrLivArea'])
back_end_linear['LotArea_log'] = np.log10(back_end_linear['LotArea'])
back_end_linear['ExterQualDisc'] = back_end_linear['ExterQual'] - back_end_linear['OverallQual']
back_end_linear['OverallCondDisc'] = back_end_linear['OverallCond'] - back_end_linear['OverallQual']
back_end_linear['KitchenQualDisc'] = back_end_linear['KitchenQual'] - back_end_linear['OverallQual']
back_end_linear = back_end_linear.drop(['ExterQual','OverallCond','KitchenQual'], axis=1)

back_end_linear['BSMT_LowQual_bin'] = pd.cut(back_end_linear['BSMT_LowQual'], [-1, 1, 500, 1000, 1500, 2500], labels = ['No basement', '0-500', '500-1000', '1000-1500', '1500+']) 
back_end_linear['BSMT_HighQual_bin'] = pd.cut(back_end_linear['BSMT_HighQual'], [-1, 1, 500, 1000, 1500, 2500], labels = ['No basement', '0-500', '500-1000', '1000-1500', '1500+']) 
back_end_linear.drop(['BSMT_HighQual', 'BSMT_LowQual', 'GrLivArea', 'LotArea'], axis = 1, inplace = True)

back_end_linear = dummify(back_end_linear, non_dummies_linear, dummies_linear)

In [19]:
# # code used to generate model

# kfold = KFold(n_splits=5, shuffle = True, random_state = 1)
# params_log = {'alpha' : [1e-7, 1e-6, 1e-5, 1e-4]
#           }
# lasso = Lasso(normalize = True, max_iter = 1000, tol = 0.001)
# lasso_tuner = GridSearchCV(lasso, params_log, cv=kfold, return_train_score = True)
# lasso_tuner.fit(back_end_linear, np.log10(price))
# with open('linearmodel.pickle', mode = 'wb') as file:
#     pickle.dump(lasso_tuner.best_estimator_, file)

In [57]:
with open('linearmodel.pickle', mode = 'rb') as file:
    lm = pickle.load(file)

preds_lm = 10**lm.predict(back_end_linear)

In [21]:
lm.score(back_end_linear, price_log)

0.9397351388453661

In [22]:
feat_imp_lin = pd.Series(data = lm.coef_, index = back_end_linear.columns)
feat_imp_lin = feat_imp_lin.sort_values(ascending = False)
ignored_lin = feat_imp_lin[feat_imp_lin == 0]
feat_imp_lin = feat_imp_lin[feat_imp_lin != 0]
print(len(feat_imp_lin))
print(feat_imp_lin)
print(len(ignored_lin))
print(ignored_lin)

69
GrLivArea_log                  0.429077
OverallQual                    0.347738
OverallCondDisc                0.132957
BSMT_HighQual_bin_1500+        0.122300
LotArea_log                    0.102171
BSMT_HighQual_bin_1000-1500    0.083511
Neighborhood_Crawfor           0.053938
Neighborhood_StoneBr           0.050696
BSMT_LowQual_bin_1500+         0.049974
BSMT_HighQual_bin_500-1000     0.047652
BSMT_LowQual_bin_1000-1500     0.046351
garden_centre                  0.038379
KitchenQualDisc                0.034571
BSMT_LowQual_bin_500-1000      0.029659
Neighborhood_NridgHt           0.028675
ExterQualDisc                  0.027102
Neighborhood_Blueste           0.025467
Neighborhood_Greens            0.024994
Neighborhood_NPkVill           0.023628
Neighborhood_NoRidge           0.022447
Neighborhood_Timber            0.021883
Neighborhood_Somerst           0.021333
BSMT_HighQual_bin_0-500        0.020100
SaleTypeNew                    0.019075
BSMT_LowQual_bin_0-500         0.0183

In [23]:
svr = SVR(kernel = 'linear')

In [24]:
kfold = KFold(n_splits=5, shuffle = True, random_state = 0)

In [25]:
svr_back_end = back_end_linear.copy()

In [26]:
svr_backend_scaler = StandardScaler()
svr_price_scaler = StandardScaler()

In [27]:
back_end_svr = pd.DataFrame(svr_backend_scaler.fit_transform(back_end_linear), columns = back_end_linear.columns)
price_std = pd.DataFrame(svr_price_scaler.fit_transform(np.array(price_log).reshape(-1,1)))

In [28]:
price_log_std = price_std[0]

In [29]:
params = {
    'C' : [100, 250, 500],
    'epsilon' : [0.01, 0.1, 1]
}

In [30]:
svr_tuner = GridSearchCV(svr, params, cv = kfold, verbose = 2, n_jobs = 4)

In [31]:
# svr_tuner.fit(back_end_svr, price_log_std)

In [None]:
# svr_tuner.best_score_ #old

In [None]:
# svr_model = svr_tuner.best_estimator_

In [None]:
# with open('SVR_model2.pickle', mode = 'wb') as file:
#      pickle.dump(svr_tuner.best_estimator_, file)

In [34]:
with open('SVR_model.pickle', mode = 'rb') as file:
    svr_lin = pickle.load(file)

In [35]:
svr_lin.get_params()

{'C': 100,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.1,
 'gamma': 'scale',
 'kernel': 'linear',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [36]:
feat_imp_svr = pd.Series(data = svr_tuner.best_estimator_.coef_[0], index = back_end_linear.columns)
feat_imp_svr = feat_imp_svr.sort_values(ascending = False)
print(feat_imp_svr)

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [52]:
preds_svr_lin = 10**svr_price_scaler.inverse_transform(svr_lin.predict(back_end_svr))

In [53]:
fig = px.scatter(x = preds_svr_lin, y = price)
fig.update_layout(
                    xaxis_title = 'predicted',
                    yaxis_title = 'actual'
                )

In [62]:
fig = px.scatter(x = preds_lm, y = price)
fig.update_layout(
                    xaxis_title = 'predicted',
                    yaxis_title = 'actual'
                )

In [63]:
fig = px.scatter(x = preds_cbl, y = price)
fig.update_layout(
                    xaxis_title = 'predicted',
                    yaxis_title = 'actual'
                )

In [76]:
resids_svr_lin = preds_svr_lin - price 
pd.DataFrame(resids_svr_lin).describe()

Unnamed: 0,SalePrice
count,2455.0
mean,-937.752494
std,18810.76335
min,-169950.481521
25%,-8434.732021
50%,-219.844162
75%,8317.159426
max,105415.394926


In [77]:
resids_lm = preds_lm - price
pd.DataFrame(resids_lm).describe()

Unnamed: 0,SalePrice
count,2455.0
mean,-867.049023
std,18408.182714
min,-158368.835551
25%,-9101.285975
50%,-455.848276
75%,8467.482896
max,102862.062172


In [78]:
preds_cbl = cbl.predict(back_end)
resids_cbl = preds_cbl - price
pd.DataFrame(resids_cbl).describe()

Unnamed: 0,SalePrice
count,2455.0
mean,1.220756
std,11112.664802
min,-63946.66877
25%,-6597.414896
50%,-19.588876
75%,6567.162572
max,54612.580812


In [85]:
resid_scaler = MinMaxScaler(feature_range = (1, 10))

In [98]:
price_score = resid_scaler.fit_transform(pd.DataFrame(resids_cbl))

In [99]:
price_score

array([[4.99959396],
       [5.60849168],
       [5.56351311],
       ...,
       [6.63645982],
       [6.26883022],
       [6.58283688]])

In [100]:
front_end['price_score'] = price_score



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [101]:
front_end.loc[:,['PID','price_score']]

Unnamed: 0,PID,price_score
0,909176150,4.999594
1,905476230,5.608492
2,535377150,5.563513
3,534177230,5.554598
4,908128060,5.224652
...,...,...
2450,903205040,5.728736
2451,905402060,5.522612
2452,909275030,6.636460
2453,907192040,6.268830


In [102]:
px.histogram(front_end, x = 'price_score')

In [107]:
from dictionaries import *;
from RegressorEncapsulation import *;
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone


# from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
class EncapsulatedModel(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, method="cat", instance = CatBoostRegressor()):
        self.method = method;
        self.instance = instance;
        self.x_scaler = StandardScaler();
        self.y_scaler = StandardScaler();
        self.fitted = False;
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        back_end = front_to_back(X, self.method, self.x_scaler, True);
        y_proc = predictor_processing(y, self.method, self.y_scaler);
        self.instance.fit(back_end, y_proc);
        self.fitted = True;
        return self;
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        return predict_from_front(X, self.method, self.instance, self.x_scaler, self.y_scaler);

ModuleNotFoundError: No module named 'dictionaries'

In [106]:
with open('./../Gaussian-SVR.pickle', mode = 'rb') as file:
    svrg = pickle.load(file)

AttributeError: Can't get attribute 'EncapsulatedModel' on <module '__main__'>