In [579]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
import time
import re
from datetime import datetime
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [580]:
pd.set_option('display.max_rows', 500)

In [581]:
def dummify_linear(data, to_dummify, drops):
    """One-hot encode `data` for a linear model after removing `drops`.

    Garage and basement categoricals are encoded first; their "absent" level
    (`<col>_0` or `<col>_No garage`) is removed explicitly so that it becomes
    the reference level. The remaining columns in `to_dummify` are encoded
    with `drop_first=True`.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified.
    to_dummify : list of str
        Columns to one-hot encode with `drop_first=True`. The list is NOT
        modified (the previous implementation removed dropped names from the
        caller's list, which silently changed later calls).
    drops : list of str
        Columns to remove from `data` before encoding. Names that are also
        categoricals are skipped during encoding.

    Returns
    -------
    pandas.DataFrame
        The encoded frame. The shape is printed after each stage.
    """
    # Work on filtered copies so neither the built-in lists nor the caller's
    # `to_dummify` are mutated.
    bsmt_categoricals = [
        col for col in ('BsmtCond_ord', 'BsmtQual_ord', 'BsmtExposure_ord')
        if col not in drops]
    grg_categoricals = [
        col for col in ('GarageQual', 'GarageCond', 'GarageType_com',
                        'Garage_age_bin', 'GarageFinish')
        if col not in drops]
    dummies = [col for col in to_dummify if col not in drops]

    df = data.drop(columns=drops)
    print('before dummifying: ', df.shape)

    # Garage features: encode every level, then drop the "no garage" level.
    df = pd.get_dummies(df, columns=grg_categoricals)
    no_garage_levels = [
        name
        for col in grg_categoricals
        for name in (f'{col}_0', f'{col}_No garage')
        if name in df.columns]
    df = df.drop(columns=no_garage_levels)
    print('after garage stuff: ', df.shape)

    # Basement features: encode every level, then drop the `_0` level.
    df = pd.get_dummies(df, columns=bsmt_categoricals)
    no_basement_levels = [
        f'{col}_0' for col in bsmt_categoricals if f'{col}_0' in df.columns]
    df = df.drop(columns=no_basement_levels)
    print('after basement stuff: ', df.shape)

    df = pd.get_dummies(df, columns=dummies, drop_first=True)
    print('after full dummification: ', df.shape)

    return df

In [582]:
def drop_dummies(dummified, orig_col):
    """Return `dummified` without the dummy columns derived from `orig_col`.

    A column is treated as a dummy of `orig_col` when its name starts with
    `"<orig_col>_"`. The match is a prefix test, not a substring test, so
    dropping `"Qual"` does not also remove `"OverallQual_9"`.

    Parameters
    ----------
    dummified : pandas.DataFrame
        Frame that contains one-hot encoded columns.
    orig_col : str
        Name of the column the dummies were generated from.

    Returns
    -------
    pandas.DataFrame
        The columns of `dummified` that are not dummies of `orig_col`,
        in their original order.
    """
    prefix = f'{orig_col}_'
    keep_list = [col for col in dummified.columns
                 if not str(col).startswith(prefix)]
    return dummified.loc[:, keep_list]

In [583]:
# Load the housing table and keep only the rows used for modelling:
# two specific PIDs are excluded, only Normal/Partial sales are kept,
# and rows with zero bedrooms or nonresidential zoning are removed.
df = (
    pd.read_csv('./../data/ames_housing_price_data_v5.csv')
    .loc[lambda t: ~t['PID'].isin([902207130, 908154205])]
    .loc[lambda t: t['SaleCondition'].isin(['Normal', 'Partial'])]
    .loc[lambda t: t['BedroomAbvGr'] != 0]
    .loc[lambda t: t['MSZoning_com'] != 'Nonresidential']
    .reset_index(drop=True)
)

# Targets: raw sale price and the log sale price column from the file.
price = df['SalePrice']
price_log = df['SalePrice_log']

In [625]:
df['PID']

0       909176150
1       905476230
2       535377150
3       534177230
4       908128060
          ...    
2466    903205040
2467    905402060
2468    909275030
2469    907192040
2470    906223180
Name: PID, Length: 2471, dtype: int64

In [516]:
# NOTE(review): `to_dummify`, `garage_categoricals` and `basement_categoricals` are not
# defined in any cell visible in this notebook; this cell presumably relies on names left
# in the kernel by another cell — confirm they exist before running on a fresh kernel.
# Combined collection of every categorical column name.
all_dummies = to_dummify + garage_categoricals + basement_categoricals

In [517]:
# NOTE(review): `always_drop` is not defined in any cell visible here — confirm its source.
df2 = df.drop(columns = always_drop, axis = 1)
# Shuffled 5-fold splitter with a fixed seed; passed as `cv=` to the grid searches.
kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

In [8]:
# kfold = KFold(n_splits=5, shuffle = True, random_state = 1)
# folds = list(kfold.split(df3))
# xtrain = [0]*5
# ytrain = [0]*5
# xtest = [0]*5
# ytest = [0]*5
# for i in range(0,5):
#     xtrain[i] = df3.loc[folds[i][0],:].drop(columns = 'SalePrice_log')
#     ytrain[i] = df3.loc[folds[i][0],'SalePrice_log']
#     xtest[i] = df3.loc[folds[i][1],:].drop(columns = 'SalePrice_log')
#     ytest[i] = df3.loc[folds[i][1],'SalePrice_log']

# feature_selection_df = df2.drop(columns = ['SalePrice_log'], axis = 1)
# score_dict_total={}
# col_dict_total2={}
# j=1
# xtrain_red = xtrain.copy()
# xtest_red = xtest.copy()
# start = datetime.now()
# while len(feature_selection_df.columns)> 50:
#     score_dict={}
#     for col in feature_selection_df.columns:
#         if col in all_dummies:
#             xtrain_red = list(map(lambda x: drop_dummies(x, col), xtrain))
#             xtest_red = list(map(lambda x: drop_dummies(x, col), xtest))
#         else:
#             xtrain_red = list(map(lambda x: x.drop(col, axis = 1), xtrain))
#             xtest_red = list(map(lambda x: x.drop(col, axis = 1), xtest))
            
#         #adjust these lines to reflect your model
#         models = list(map(lambda x, y: Lasso(alpha = 1e-6, normalize = True, max_iter = 1000, tol = 0.001).fit(x, y), xtrain, ytrain))
#         scores = list(map(lambda x, y, z: x.score(y, z), models, xtest, ytest))
#         score_dict[col] = np.mean(scores)
#         #-------

#     min_col=max(score_dict.items(), key=lambda x: x[1])[0]
#     score_dict_total[j]=max(score_dict.items(), key=lambda x: x[1])[1]
#     col_dict_total2[j]=min_col
#     feature_selection_df = feature_selection_df.drop(min_col, axis=1)
#     if min_col in all_dummies:
#         xtrain = list(map(lambda x: drop_dummies(x, min_col), xtrain))
#         xtest = list(map(lambda x: drop_dummies(x, min_col), xtest))
#     else:
#         xtrain = list(map(lambda x: x.drop(min_col, axis = 1), xtrain))
#         xtest = list(map(lambda x: x.drop(min_col, axis = 1), xtest))
#     print(f'{j} columns removed: {min_col}, best score is {score_dict_total[j]}; time elapsed is {datetime.now()-start}')
#     j+=1

In [9]:
# col_dict_total2

In [10]:
# folds = list(kfold.split(df3))
# xtrain = [0]*5
# ytrain = [0]*5
# xtest = [0]*5
# ytest = [0]*5
# for i in range(0,5):
#     xtrain[i] = df3.loc[folds[i][0],:].drop(columns = 'SalePrice_log')
#     ytrain[i] = df3.loc[folds[i][0],'SalePrice_log']
#     xtest[i] = df3.loc[folds[i][1],:].drop(columns = 'SalePrice_log')
#     ytest[i] = df3.loc[folds[i][1],'SalePrice_log']

# feature_selection_df = df2.drop(columns = ['SalePrice_log'], axis = 1)
# score_dict_total={}
# col_dict_total_neighborhood={}
# j=1
# xtrain_red = xtrain.copy()
# xtest_red = xtest.copy()
# start = datetime.now()
# while len(feature_selection_df.columns)> 50:
#     score_dict={}
#     for col in feature_selection_df.columns:
#         if col in all_dummies:
#             xtrain_red = list(map(lambda x: drop_dummies(x, col), xtrain))
#             xtest_red = list(map(lambda x: drop_dummies(x, col), xtest))
#         else:
#             xtrain_red = list(map(lambda x: x.drop(col, axis = 1), xtrain))
#             xtest_red = list(map(lambda x: x.drop(col, axis = 1), xtest))
            
#         #adjust these lines to reflect your model
#         models = list(map(lambda x, y: Lasso(alpha = 1e-6, normalize = True, max_iter = 1000, tol = 0.001).fit(x, y), xtrain, ytrain))
#         scores = list(map(lambda x, y, z: x.score(y, z), models, xtest, ytest))
#         score_dict[col] = np.mean(scores)
#         #-------

#     min_col=max(score_dict.items(), key=lambda x: x[1])[0]
#     score_dict_total[j]=max(score_dict.items(), key=lambda x: x[1])[1]
#     col_dict_total_neighborhood[j]=min_col
#     feature_selection_df = feature_selection_df.drop(min_col, axis=1)
#     if min_col in all_dummies:
#         xtrain = list(map(lambda x: drop_dummies(x, min_col), xtrain))
#         xtest = list(map(lambda x: drop_dummies(x, min_col), xtest))
#     else:
#         xtrain = list(map(lambda x: x.drop(min_col, axis = 1), xtrain))
#         xtest = list(map(lambda x: x.drop(min_col, axis = 1), xtest))
#     print(f'{j} columns removed: {min_col}, best score is {score_dict_total[j]}; time elapsed is {datetime.now()-start}')
#     j+=1

In [11]:
# col_dict_total_neighborhood

In [12]:
# score_dict_total

In [13]:
# train, test = train_test_split(df3, test_size = 0.2, shuffle = True, random_state = 0)
# x_train = train.drop(columns = 'SalePrice_log')
# x_test = test.drop(columns = 'SalePrice_log')
# y_train = train['SalePrice_log']
# y_test = test['SalePrice_log']

In [14]:
# x_train2, x_test2, y_train2, y_test2 = x_train.copy(), x_test.copy(), y_train.copy(), y_test.copy()
# feature_selection_df = df2.drop(columns = ['SalePrice_log'], axis = 1)
# score_dict_total={}
# col_dict_total={}
# j=1
# start = datetime.now()
# while len(feature_selection_df.columns)> 20:
#     score_dict={}
#     for col in feature_selection_df.columns:
#         if col in all_dummies:
#             x_train_red = drop_dummies(x_train2, col)
#             x_test_red = drop_dummies(x_test2, col)
#         else:
#             x_train_red = x_train2.drop(col, axis = 1)
#             x_test_red = x_test2.drop(col, axis = 1)
#         #adjust these lines to reflect your model
#         model = Lasso(alpha = 1e-6, normalize = True, max_iter = 1000, tol = 0.001).fit(x_train_red, y_train2)
#         score=model.score(x_test_red, y_test2)
#         score_dict[col]=score
#         #-------

#     min_col=max(score_dict.items(), key=lambda x: x[1])[0]
#     score_dict_total[j]=max(score_dict.items(), key=lambda x: x[1])[1]
#     col_dict_total[j]=min_col
#     feature_selection_df = feature_selection_df.drop(min_col,axis=1)
#     if min_col in all_dummies:
#         x_train2 = drop_dummies(x_train2, min_col)
#         x_test2 = drop_dummies(x_test2, min_col)
#     else:
#         x_train2 = x_train2.drop(min_col, axis = 1)
#         x_test2 = x_test2.drop(min_col, axis = 1)
#     print(f'{j} columns removed: {min_col}, best score is {score_dict_total[j]}; time elapsed is {datetime.now()-start}')
#     j+=1

In [15]:
# Columns removed from `df2` before dummifying (passed as `drops` to dummify_linear).
# It includes the target `SalePrice_log`, so the target is not part of the features.
droplist = ['TotalBsmtSF',
            'BsmtCond_ord',
            'BsmtQual_ord',
            'GarageCond',
            'GarageQual',
            'GarageType_com',
            'SalePrice_log',
            'Garage_age_bin',
            'Remod_age_bin',
            '1stFlrSF_log',
            '2ndFlrSF',
            'KitchenAbvGr',
            'TotRmsAbvGrd',
            'GarageArea',
            'GarageFinish',
            'Fence',
            'Alley',
            'number_floors',
            'FireplaceQu',
            'LotFrontage',
            'LowQualFinSF',
            'BsmtExposure_ord',
            'MasVnrArea'
           ]

# Minimal drop list (target only); not referenced by any other cell visible here.
droplist3 = ['SalePrice_log']

In [16]:
df3 = dummify_linear(df2, to_dummify, droplist)

before dummifying:  (2471, 58)
after garage stuff:  (2471, 58)
after basement stuff:  (2471, 58)
after full dummification:  (2471, 124)


In [17]:
params_log = {'alpha' : [1e-7, 1e-6, 1e-5, 1e-4]
          }

In [18]:
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older scikit-learn versions. Migrating means scaling in a
# Pipeline, which is not numerically identical — results would need to be re-checked.
lasso2 = Lasso(normalize = True, max_iter = 1000, tol = 0.001)
# Grid search over the alphas in `params_log` using the shared `kfold` splitter.
lasso_tuner2 = GridSearchCV(lasso2, params_log, cv=kfold, return_train_score = True)
# Fit on the dummified frame against the log sale price.
lasso_tuner2.fit(df3, price_log)

GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
             estimator=Lasso(normalize=True, tol=0.001),
             param_grid={'alpha': [1e-07, 1e-06, 1e-05, 0.0001]},
             return_train_score=True)

In [19]:
#lasso_tuner2.cv_results_

In [20]:
lasso_tuner2.cv_results_['mean_test_score']

array([0.94744747, 0.94757162, 0.94470538, 0.91400025])

In [21]:
lasso_tuner2.cv_results_['mean_train_score']

array([0.95496864, 0.95486537, 0.95097223, 0.91901587])

In [22]:
# Coefficients of the best Lasso model, labelled by feature and sorted high to low.
feat_imp_log = pd.Series(
    lasso_tuner2.best_estimator_.coef_, index=df3.columns
).sort_values(ascending=False)

# Split into features Lasso zeroed out and features it kept.
ignored_log = feat_imp_log.loc[feat_imp_log.eq(0)]
feat_imp_log = feat_imp_log.loc[feat_imp_log.ne(0)]

# Report the count and contents of the kept features, then of the ignored ones.
print(len(feat_imp_log), feat_imp_log, len(ignored_log), ignored_log, sep='\n')

118
GrLivArea_log                                                 0.424990
OverallQual_10                                                0.184668
Neighborhood_GrnHill                                          0.181966
OverallQual_9                                                 0.152077
OverallCond_9                                                 0.135713
OverallCond_8                                                 0.120620
OverallCond_7                                                 0.108497
OverallQual_8                                                 0.105630
LotArea_log                                                   0.098563
OverallCond_6                                                 0.084857
OverallQual_7                                                 0.074798
OverallCond_5                                                 0.068410
Functional_ord_7                                              0.067255
OverallQual_6                                                 0.055466
Fu

In [23]:
# preds_log = lasso_tuner2.predict(df4)

# fig = px.scatter(x = preds_log, y = price_log, hover_name = df['PID'])
# fig.update_layout(
#                     xaxis_title = 'predicted',
#                     yaxis_title = 'actual'
#                 )

In [24]:
# resid = abs(preds_log - price_log)

# fig = px.histogram(x = resid)
# fig.update_layout(
#                     xaxis_title = 'residuals'
#                 )

In [25]:
# 10**np.mean(price_log)*(1-lasso_tuner2.best_score_)

In [26]:
df4 = dummify_linear(df2, to_dummify, droplist)

before dummifying:  (2471, 58)
after garage stuff:  (2471, 58)
after basement stuff:  (2471, 58)
after full dummification:  (2471, 124)


In [27]:
# Variance inflation factor for every column of the dummified frame `df4`.
# NOTE(review): no intercept column is added to `df4` before computing the VIFs —
# confirm whether a constant should be included in the design matrix.
vif_df = pd.DataFrame({
    'feature': df4.columns,
    'vif': [variance_inflation_factor(df4.values, position)
            for position, _ in enumerate(df4.columns)],
})

In [28]:
vif_df.sort_values(by = 'vif', ascending = False)

Unnamed: 0,feature,vif
32,GrLivArea_log,1778.268217
18,LotArea_log,1091.801677
107,"MSZoning_com_Residential, low density",110.706511
123,Functional_ord_7,107.26729
87,ExterQual_3,86.996665
119,PUD_1,69.835688
113,"Electrical_com_Standard circuit breakers, all ...",68.648088
90,ExterCond_3,59.048342
88,ExterQual_4,50.074533
69,BldgType_TwnhsE,45.904481


In [29]:
print('No radials: ', droplist, ': ', lasso_tuner2.best_params_['alpha'], ', ', max(lasso_tuner2.cv_results_['mean_test_score']), ', ', sum(vif_df['vif'])/len(vif_df))

No radials:  ['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea', 'GarageFinish', 'Fence', 'Alley', 'number_floors', 'FireplaceQu', 'LotFrontage', 'LowQualFinSF', 'BsmtExposure_ord', 'MasVnrArea'] :  1e-06 ,  0.9475716176776527 ,  36.054356259285186


log:  ['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin'] :  1e-06 ,  0.943454281780628

log:  ['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log'] :  1e-06 ,  0.9424740060473317

log:  ['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF'] :  1e-06 ,  0.9420770647332499

log:  ['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr'] :  1e-06 ,  0.9417616764602397

['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea'] :  1e-06 ,  0.9416195613781179 ,  39.9001979416710

['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea', 'GarageFinish', 'Fence'] :  1e-06 ,  0.9417193728756516 ,  41.05487105565105

['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea', 'GarageFinish', 'Fence'] :  1e-06 ,  0.9416530339287166 ,  39.99237976054064

['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea', 'GarageFinish', 'Fence', 'Alley'] :  1e-06 ,  0.9417665575176081 ,  41.1929734948757

['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea', 'GarageFinish', 'Fence', 'Alley', 'number_floors'] :  1e-06 ,  0.9418035097906687 ,  39.559875261890845

['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea', 'GarageFinish', 'Fence', 'Alley', 'number_floors', 'FireplaceQu'] :  1e-06 ,  0.9417500104695516 ,  40.30945260646638

['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea', 'GarageFinish', 'Fence', 'Alley', 'number_floors', 'FireplaceQu', 'LotFrontage', 'LowQualFinSF', 'BsmtExposure_ord', 'MasVnrArea'] :  1e-06 ,  0.9468407408759749 ,  36.57884336374308

No radials:  ['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea', 'GarageFinish', 'Fence', 'Alley', 'number_floors', 'FireplaceQu', 'LotFrontage', 'LowQualFinSF', 'BsmtExposure_ord', 'MasVnrArea', 'LotShape_com'] :  1e-06 ,  0.946833771716485 ,  35.69657081462711

In [30]:
# Load the per-house location features and drop the `2204_park` column.
# NOTE(review): presumably `2204_park` would collide with another `park` column once the
# numeric prefix is stripped — confirm.
radial = pd.read_csv('./../data/house_coordinates_1.0.csv')
radial = radial.drop(columns='2204_park')

# Strip the leading four-digit code ("NNNN_") from column names in a single rename.
# The pattern is a raw string so `\d` is not treated as an invalid string escape.
radial = radial.rename(columns={
    col: col[5:]
    for col in radial.columns
    if re.search(r'^\d\d\d\d_', str(col))
})

# Location columns removed before modelling: address/coordinate fields plus a
# hand-picked set of feature columns.
rad_drops = [
    'Address',
    'Coords4',
    'latitude',
    'longitude',
    'town_hall',
    'cemetery',
    'motel',
    'camp_site',
    'general',
    'picnic_site',
    'wastewater_plant',
    'spring',
    'beach',
    'street_lamp',
    'helipad',
    'vineyard',
    'crossing',
    'tree',
    'grass',
    'christian',
    'bus_stop',
    'parking',
    'toilet',
    'bench',
    'commercial',
    'waste_basket',
    'drinking_water',
    'convenience',
    'camera_surveillance',
    'comms_tower',
    'residential',
    'gift_shop',
    'jeweller',
    'hairdresser',
    'bookshop',
    'clothes',
    'retail',
    'food_court',
    'artwork',
    'cafe',
    'traffic_signals',
    'beauty_shop',
    'sports_shop',
    'weir',
    'track',
    'turning_circle',
    'computer_shop',
    'bicycle_shop',
    'department_store',
    'parking_bicycle',
    'golf_course',
    'tower',
    'beverages',
    'university'
]

# In-place drop: re-running this cell without reloading `radial` raises KeyError.
radial.drop(columns = rad_drops, inplace = True)
sub = df.loc[:,['PID', 'SalePrice_log']]
# Right merge keeps one row per row of `sub` (i.e. of `df`).
# NOTE(review): the later fit against `price_log` assumes the merged rows line up with
# `df` row for row, which requires `PID` to be unique in `radial` — confirm.
radial = pd.merge(radial, sub, how = 'right', on = 'PID')

In [31]:
radial.drop(columns = ['PID','SalePrice_log'], inplace = True)

In [32]:
lasso_tuner3 = GridSearchCV(lasso2, params_log, cv=kfold, return_train_score = True)
lasso_tuner3.fit(radial, price_log)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
             estimator=Lasso(normalize=True, tol=0.001),
             param_grid={'alpha': [1e-07, 1e-06, 1e-05, 0.0001]},
             return_train_score=True)

In [33]:
lasso_tuner3.cv_results_['mean_test_score']

array([0.54590049, 0.54646727, 0.54404021, 0.46670631])

In [34]:
lasso_tuner3.cv_results_['mean_train_score']

array([0.58646805, 0.58634686, 0.57780744, 0.48220605])

In [35]:
len(radial.columns)

81

In [36]:
# Coefficients of the best location-only Lasso model, sorted high to low.
feat_imp_rad = pd.Series(
    lasso_tuner3.best_estimator_.coef_, index=radial.columns
).sort_values(ascending=False)

# Split into features Lasso zeroed out and features it kept.
ignored_rad = feat_imp_rad.loc[feat_imp_rad.eq(0)]
feat_imp_rad = feat_imp_rad.loc[feat_imp_rad.ne(0)]

# Report the count and contents of the kept features, then of the ignored ones.
print(len(feat_imp_rad), feat_imp_rad, len(ignored_rad), ignored_rad, sep='\n')

81
slipway                  0.166635
garden_centre            0.157494
guesthouse               0.101396
stop                     0.083934
library                  0.073591
dam                      0.059375
furniture_shop           0.050300
christian_catholic       0.045075
stationery               0.040657
viewpoint                0.040383
laundry                  0.039904
tourist_info             0.039035
pier                     0.037632
theatre                  0.036032
recreation_ground        0.034843
pub                      0.034486
dog_park                 0.032927
water_tower              0.031247
doctors                  0.029168
apron                    0.026762
scrub                    0.025278
hotel                    0.024577
shoe_shop                0.021256
jewish                   0.019733
pharmacy                 0.017854
christian_lutheran       0.017127
playground               0.016967
fire_station             0.015375
doityourself             0.014012
nursing_hom

In [37]:
# Variance inflation factor for every location feature in `radial`.
# NOTE(review): no intercept column is added before computing the VIFs — confirm
# whether a constant should be included in the design matrix.
vif_rad = pd.DataFrame({
    'feature': radial.columns,
    'vif': [variance_inflation_factor(radial.values, position)
            for position, _ in enumerate(radial.columns)],
})

In [38]:
print(sum(vif_rad['vif'])/len(vif_rad))
vif_rad.sort_values(by = 'vif', ascending = False)

32.2462129077583


Unnamed: 0,feature,vif
15,playground,116.693882
21,restaurant,115.125299
69,park,102.889255
24,bar,96.291573
43,bank,94.572975
62,parking_multistorey,87.753351
68,forest,84.649819
44,atm,83.004114
61,fuel,76.037614
27,shelter,71.530917


In [176]:
radial.columns

Index(['police', 'fire_station', 'post_box', 'post_office', 'library',
       'nursing_home', 'graveyard', 'school', 'pharmacy', 'hospital',
       'doctors', 'dentist', 'veterinary', 'theatre', 'cinema', 'playground',
       'dog_park', 'sports_centre', 'pitch', 'swimming_pool', 'stadium',
       'restaurant', 'fast_food', 'pub', 'bar', 'hotel', 'guesthouse',
       'shelter', 'supermarket', 'bakery', 'mall', 'florist', 'shoe_shop',
       'optician', 'stationery', 'outdoor_shop', 'mobile_phone_shop',
       'car_dealership', 'doityourself', 'furniture_shop', 'garden_centre',
       'car_wash', 'laundry', 'bank', 'atm', 'tourist_info', 'attraction',
       'museum', 'memorial', 'viewpoint', 'fountain', 'water_tower',
       'water_works', 'christian_catholic', 'christian_evangelical',
       'christian_lutheran', 'christian_methodist', 'jewish', 'muslim', 'stop',
       'motorway_junction', 'fuel', 'parking_multistorey', 'slipway', 'pier',
       'dam', 'airport', 'apron', 'forest', '

In [608]:
# Reload the per-house location features (all columns kept this time) and drop `2204_park`.
# NOTE(review): presumably `2204_park` would collide with another `park` column once the
# numeric prefix is stripped — confirm.
radial = pd.read_csv('./../data/house_coordinates_1.0.csv')
radial = radial.drop(columns='2204_park')

# Strip the leading four-digit code ("NNNN_") from column names in a single rename.
# The pattern is a raw string so `\d` is not treated as an invalid string escape.
radial = radial.rename(columns={
    col: col[5:]
    for col in radial.columns
    if re.search(r'^\d\d\d\d_', str(col))
})

In [609]:
df6 = pd.merge(df.copy(), radial, on = 'PID', how = 'left')

In [610]:
from sklearn.preprocessing import MinMaxScaler

# One shared scaler; it is refit on every fit_scale call, so after this cell it
# holds the fit of the last column scaled.
scaler = MinMaxScaler()


def fit_scale(df, col):
    """Min-max scale column `col` of `df` in place using the module-level `scaler`."""
    df[[col]] = scaler.fit_transform(df[[col]])


# Put the quality/condition scores on a common scale before taking differences.
fit_scale(df6, 'OverallQual')
fit_scale(df6, 'ExterQual')
fit_scale(df6, 'OverallCond')
fit_scale(df6, 'KitchenQual')

# Engineered features; each lambda sees the columns created by the entries before it.
df6 = df6.assign(
    PorchSF=lambda t: (
        t['OpenPorchSF'] + t['EnclosedPorch'] + t['3SsnPorch'] + t['ScreenPorch']
    ),
    ExterQualDisc=lambda t: t['ExterQual'] - t['OverallQual'],
    OverallCondDisc=lambda t: t['OverallCond'] - t['OverallQual'],
    KitchenQualDisc=lambda t: t['KitchenQual'] - t['OverallQual'],
    # 1 when the sale type is 'New', otherwise 0.
    SaleTypeNew=lambda t: (t['SaleType'] == 'New').astype('int64'),
    BSMT_LowQual=lambda t: t['TotalBsmtSF'] - t['BSMT_GLQ'] - t['BSMT_ALQ'],
    BSMT_HighQual=lambda t: t['BSMT_GLQ'] + t['BSMT_ALQ'],
    # log10 of living area per bedroom.
    AreaPerPerson=lambda t: np.log10(t['GrLivArea'] / t['BedroomAbvGr']),
)

# Disabled experiments kept for reference:
#df2['Porch']=((df2['OpenPorchSF']>0) | (df2['EnclosedPorch']>0) | (df2['3SsnPorch']>0) | (df2['ScreenPorch']>0))
#df2['1stFloorArea%']=df2['1stFlrSF']/df2['GrLivArea']
#df2['2ndFloorArea%']=df2['2ndFlrSF']/df2['GrLivArea']
#df2['BSMT_GLQ%']=df2['BSMT_GLQ']/df2['TotalBsmtSF']
#df2['BSMT_ALQ%']=df2['BSMT_ALQ']/df2['TotalBsmtSF']
#df2['BSMT_GLQ%']=df2['BSMT_GLQ%'].fillna(0)
#df2['BSMT_ALQ%']=df2['BSMT_ALQ%'].fillna(0)

In [611]:
# Bucket BSMT_HighQual into labelled ranges. The last edge is open-ended so that values
# above 2500 fall into '1500+' instead of silently becoming NaN.
df6['BSMT_HighQual_bin'] = pd.cut(
    df6['BSMT_HighQual'],
    [-1, 1, 500, 1000, 1500, np.inf],
    labels = ['No basement', '0-500', '500-1000', '1000-1500', '1500+'])

In [612]:
# Bucket BSMT_LowQual into labelled ranges. The last edge is open-ended so that values
# above 2500 fall into '1500+' instead of silently becoming NaN.
df6['BSMT_LowQual_bin'] = pd.cut(
    df6['BSMT_LowQual'],
    [-1, 1, 500, 1000, 1500, np.inf],
    labels = ['No basement', '0-500', '500-1000', '1000-1500', '1500+'])

In [615]:
# Columns of `df6` selected as model features.
# NOTE(review): this list names the un-binned 'BSMT_LowQual' / 'BSMT_HighQual', but
# `non_dummies` expects 'BSMT_LowQual_bin' / 'BSMT_HighQual_bin' and the saved `df7`
# output shows the binned columns and none of the radial features. With the list as
# written, `dummify(df7, non_dummies, dummies)` would raise KeyError — confirm which
# version of the list is intended.
feat_incl =[
    ### from original dataset
    'GrLivArea', 
    'LotArea', 
    'OverallQual',
    'BSMT_LowQual', 
    'house_age_years', 
    'GarageCars',
    'MasVnrType',
    'FullBath',
    'HalfBath',
    'BsmtExposure_ord',
    'SaleTypeNew',
    'Neighborhood',
    'BldgType',
    'PorchSF',
    'BSMT_HighQual',
    'Fireplaces',
    'Pool',
    'BedroomAbvGr',
    'ExterQual',
    'OverallCond',
    'KitchenQual',
    
    ### from radial location data
    'water_tower',
    'graveyard',
    'police', 
    'optician',
    'slipway',
    'bar',
    'cinema',
    'supermarket',
    'hotel',
    'stop',
    'farmyard',
    'christian_catholic', 
    'jewish',
    'muslim',
    'garden_centre',
    'christian_lutheran'                 
]

In [589]:
list(radial.columns)

['PID',
 'Address',
 'Coords4',
 'latitude',
 'longitude',
 'police',
 'fire_station',
 'post_box',
 'post_office',
 'library',
 'town_hall',
 'nursing_home',
 'graveyard',
 'university',
 'school',
 'pharmacy',
 'hospital',
 'doctors',
 'dentist',
 'veterinary',
 'theatre',
 'cinema',
 'playground',
 'dog_park',
 'sports_centre',
 'pitch',
 'swimming_pool',
 'golf_course',
 'stadium',
 'track',
 'restaurant',
 'fast_food',
 'cafe',
 'pub',
 'bar',
 'food_court',
 'hotel',
 'motel',
 'guesthouse',
 'shelter',
 'camp_site',
 'supermarket',
 'bakery',
 'mall',
 'department_store',
 'general',
 'convenience',
 'clothes',
 'florist',
 'bookshop',
 'shoe_shop',
 'beverages',
 'optician',
 'jeweller',
 'gift_shop',
 'sports_shop',
 'stationery',
 'outdoor_shop',
 'mobile_phone_shop',
 'beauty_shop',
 'car_dealership',
 'bicycle_shop',
 'doityourself',
 'furniture_shop',
 'computer_shop',
 'garden_centre',
 'hairdresser',
 'car_wash',
 'laundry',
 'bank',
 'atm',
 'tourist_info',
 'attraction

In [616]:
# Take an explicit copy of the selected features so later column assignments on `df7`
# never write into (or warn about) a slice of `df6`.
df7 = df6.loc[:, feat_incl].copy()
df7

Unnamed: 0,GrLivArea,LotArea,OverallQual,BSMT_LowQual_bin,house_age_years,GarageCars,MasVnrType,FullBath,HalfBath,BsmtExposure_ord,...,Neighborhood,BldgType,PorchSF,BSMT_HighQual_bin,Fireplaces,Pool,BedroomAbvGr,ExterQual,OverallCond,KitchenQual
0,856,7890,0.428571,500-1000,71.210959,2.0,,1,0,1,...,SWISU,1Fam,166,No basement,1,0,2,0.333333,0.500000,0.333333
1,1049,4235,0.285714,0-500,25.104110,1.0,Brick Face,2,0,2,...,Edwards,TwnhsE,105,500-1000,0,0,2,0.666667,0.333333,0.666667
2,1039,8146,0.142857,0-500,109.402740,1.0,,1,0,1,...,OldTown,1Fam,279,No basement,0,0,2,0.666667,0.833333,0.333333
3,1665,8400,0.714286,0-500,8.838356,2.0,,2,1,1,...,NWAmes,1Fam,45,500-1000,0,0,3,0.666667,0.500000,0.666667
4,1922,7301,0.571429,No basement,6.501370,2.0,Brick Face,3,0,0,...,Edwards,1Fam,177,No basement,1,0,4,0.666667,0.333333,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,952,8854,0.428571,500-1000,93.394521,1.0,,1,0,1,...,BrkSide,1Fam,138,No basement,1,0,2,0.333333,0.500000,0.000000
2467,1733,13680,0.000000,No basement,54.452055,2.0,,2,0,0,...,Edwards,1Fam,0,No basement,1,0,4,0.333333,0.333333,0.333333
2468,2002,6270,0.285714,1000-1500,58.619178,3.0,,2,0,1,...,Crawfor,Duplex,0,No basement,0,0,4,0.333333,0.500000,0.333333
2469,1842,8826,0.571429,0-500,7.501370,2.0,Brick Face,2,1,1,...,CollgCr,1Fam,96,500-1000,1,0,3,0.666667,0.333333,0.666667


In [617]:
# Categorical source columns that `dummify` expands into 0/1 columns and then drops.
non_dummies = [
    'MasVnrType',
    'Neighborhood',
    'BldgType',
    'BSMT_HighQual_bin',
    'BSMT_LowQual_bin'
]

In [618]:
# Explicit list of dummy columns to create, each named "<source column>_<level>".
# Levels not listed here get no column of their own and so act as the reference level.
# NOTE(review): 'MasVnrType_None' only matches rows whose MasVnrType is the string
# 'None'; if those values were read as missing (NaN) this column is all zeros — confirm.
dummies = [
    'Neighborhood_Blueste',
       'Neighborhood_BrDale', 'Neighborhood_BrkSide', 'Neighborhood_ClearCr',
       'Neighborhood_CollgCr', 'Neighborhood_Crawfor', 'Neighborhood_Edwards',
       'Neighborhood_Gilbert', 'Neighborhood_Greens', 'Neighborhood_GrnHill',
       'Neighborhood_IDOTRR', 'Neighborhood_Landmrk', 'Neighborhood_MeadowV',
       'Neighborhood_Mitchel', 'Neighborhood_NAmes', 'Neighborhood_NPkVill',
       'Neighborhood_NWAmes', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt',
       'Neighborhood_OldTown', 'Neighborhood_SWISU', 'Neighborhood_Sawyer',
       'Neighborhood_SawyerW', 'Neighborhood_Somerst', 'Neighborhood_StoneBr',
       'Neighborhood_Timber', 'Neighborhood_Veenker', 'BldgType_2fmCon',
       'BldgType_Duplex', 'BldgType_Twnhs', 'BldgType_TwnhsE',
       'MasVnrType_None', 'MasVnrType_Stone',
    
        'BSMT_HighQual_bin_500-1000', 'BSMT_HighQual_bin_0-500',
    'BSMT_HighQual_bin_1000-1500', 'BSMT_HighQual_bin_1500+',
    'BSMT_LowQual_bin_0-500', 'BSMT_LowQual_bin_500-1000', 'BSMT_LowQual_bin_1000-1500',
    'BSMT_LowQual_bin_1500+'
]

In [620]:
def dummify(df, non_dummies, dummies):
    """Create a fixed set of 0/1 dummy columns and drop their source columns.

    Unlike `pd.get_dummies`, the output columns are fully determined by
    `dummies`, so the result has the same layout regardless of which levels
    occur in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the work is done on a copy.
    non_dummies : list of str
        Categorical source columns. Each must exist in `df`.
    dummies : list of str
        Dummy column names of the form "<source>_<level>". A name is bound to
        a source column only when it starts with "<source>_" (a prefix match,
        so a source name that merely appears inside another name is ignored).
        Names that match no source column are skipped.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the dummy columns appended (1 where the source
        equals the level, else 0) and the `non_dummies` columns removed.
    """
    out = df.copy()
    for dummified in dummies:
        for original in non_dummies:
            prefix = f'{original}_'
            if dummified.startswith(prefix):
                # Everything after "<source>_" is the level this column flags.
                value = dummified[len(prefix):]
                out[dummified] = (out[original] == value).astype('int64')
    return out.drop(columns=non_dummies)

In [621]:
df7.columns

Index(['GrLivArea', 'LotArea', 'OverallQual', 'BSMT_LowQual_bin',
       'house_age_years', 'GarageCars', 'MasVnrType', 'FullBath', 'HalfBath',
       'BsmtExposure_ord', 'SaleTypeNew', 'Neighborhood', 'BldgType',
       'PorchSF', 'BSMT_HighQual_bin', 'Fireplaces', 'Pool', 'BedroomAbvGr',
       'ExterQual', 'OverallCond', 'KitchenQual'],
      dtype='object')

In [622]:
df7 = dummify(df7, non_dummies, dummies)

In [623]:
lasso_tuner4 = GridSearchCV(lasso2, params_log, cv=kfold, return_train_score = True)
lasso_tuner4.fit(df7, price_log)

GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
             estimator=Lasso(normalize=True, tol=0.001),
             param_grid={'alpha': [1e-07, 1e-06, 1e-05, 0.0001]},
             return_train_score=True)

In [624]:
lasso_tuner4.cv_results_['mean_test_score']

array([0.93103317, 0.93108776, 0.93068733, 0.91015996])

In [531]:
lasso_tuner4.best_params_

{'alpha': 1e-06}

In [539]:
import pickle

In [536]:
# Predictions from the refit best estimator on df7, the same frame the search was fit on.
lasso_tuner4.best_estimator_.predict(df7)

array([5.05790928, 5.10238662, 5.00562717, ..., 5.22816107, 5.34002457,
       5.32274894])

In [586]:
# Create (or truncate to empty) linear_model.txt. The with-block closes the handle
# immediately instead of leaving it open until a later cell runs; asdf stays bound
# to the closed file, so a subsequent asdf.close() is a harmless no-op.
with open('linear_model.txt', mode = 'wb') as asdf:
    pass

In [587]:
# Close the asdf file handle; nothing was written through it in the visible cells.
asdf.close()

In [558]:
# Serialize the grid search's best estimator to linearmodel.pickle (binary mode).
with open('linearmodel.pickle', mode = 'wb') as file:
    pickle.dump(lasso_tuner4.best_estimator_, file)

In [559]:
# Read the pickled estimator back from disk and bind it to lm.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('linearmodel.pickle', mode = 'rb') as file:
    lm = pickle.load(file)

In [560]:
# Check the pickle round trip by predicting with the reloaded estimator.
# The load cell binds the unpickled model to `lm`; `loaded_obj` is never assigned
# there, so using it would fail on a fresh kernel.
lm.predict(df7)

array([5.05790928, 5.10238662, 5.00562717, ..., 5.22816107, 5.34002457,
       5.32274894])

In [532]:
# Print the location features in use, the best mean CV test score, and the mean of vif_df['vif'].
# NOTE(review): the pasted outputs below show the same VIF mean for different feature
# lists, and the VIF cells further down fill vif_min rather than vif_df — vif_df looks
# stale here; confirm it is recomputed for the current df7.
print(loc_feat_incl, ': ', max(lasso_tuner4.cv_results_['mean_test_score']), ', ', sum(vif_df['vif'])/len(vif_df))

['slipway', 'bar', 'farmyard', 'christian_catholic', 'jewish', 'muslim', 'garden_centre', 'christian_methodist', 'christian_evangelical', 'christian_lutheran'] :  0.9345469007364361 ,  36.054356259285186


['slipway', 'bar', 'cinema', 'supermarket', 'farmyard', 'christian_catholic', 'jewish', 'muslim', 'garden_centre', 'christian_methodist', 'christian_evangelical', 'christian_lutheran'] :  0.9350983215981801 ,  36.054356259285186

['slipway', 'bar', 'cinema', 'supermarket', 'farmyard', 'christian_catholic', 'jewish', 'muslim', 'garden_centre', 'christian_methodist', 'christian_evangelical', 'christian_lutheran'] :  0.9351894282916218 ,  36.054356259285186



In [533]:
# Label each fitted coefficient with its df7 column name, ordered largest first.
feat_imp_min = pd.Series(
    data = lasso_tuner4.best_estimator_.coef_,
    index = df7.columns,
).sort_values(ascending = False)

# Separate the coefficients that are exactly zero from the ones that are not.
ignored_min = feat_imp_min[feat_imp_min == 0]
feat_imp_min = feat_imp_min[feat_imp_min != 0]

# One item per line: count and values of the kept coefficients, then of the zeroed ones.
print(len(feat_imp_min), feat_imp_min, len(ignored_min), ignored_min, sep = '\n')

69
GrLivArea_log                  0.427147
OverallQual                    0.153860
OverallCond                    0.133836
BSMT_HighQual_bin_1500+        0.122679
LotArea_log                    0.101925
BSMT_HighQual_bin_1000-1500    0.083916
Neighborhood_Crawfor           0.054819
BSMT_LowQual_bin_1500+         0.051154
Neighborhood_StoneBr           0.048564
BSMT_HighQual_bin_500-1000     0.047971
BSMT_LowQual_bin_1000-1500     0.047562
garden_centre                  0.036461
KitchenQual                    0.034954
BSMT_LowQual_bin_500-1000      0.030745
ExterQual                      0.029386
Neighborhood_NridgHt           0.026092
Neighborhood_Blueste           0.026039
Neighborhood_NPkVill           0.024142
Neighborhood_Greens            0.023714
Neighborhood_NoRidge           0.020121
BSMT_HighQual_bin_0-500        0.020092
Neighborhood_Somerst           0.019603
BSMT_LowQual_bin_0-500         0.019339
SaleTypeNew                    0.018759
GarageCars                     0.0170

In [513]:
# One row per df7 column: its name and its variance inflation factor.
# NOTE(review): df7.values is passed as-is; if the VIFs are meant to include an
# intercept, a constant column has to be added first — confirm this is intended.
vif_min = pd.DataFrame({
    'feature': df7.columns,
    'vif': [
        variance_inflation_factor(df7.values, col_idx)
        for col_idx in range(df7.shape[1])
    ],
})

In [514]:
# Print the mean VIF over all features, then display the table with the highest VIF first.
print(sum(vif_min['vif'])/len(vif_min))
vif_min.sort_values(by = 'vif', ascending = False)

36.862252944353344


Unnamed: 0,feature,vif
0,GrLivArea_log,1181.094634
1,LotArea_log,898.646589
55,Neighborhood_NAmes,30.62592
12,BedroomAbvGr,28.824356
5,FullBath,25.182219
2,OverallQual,24.678437
28,christian_lutheran,23.071881
29,ExterQual,22.379857
62,Neighborhood_CollgCr,22.197046
3,house_age_years,19.249966


In [331]:
# Human-readable titles keyed by feature (column) name.
# NOTE(review): keys are base feature names; whether binned or dummified columns
# (e.g. 'BSMT_LowQual_bin', 'Neighborhood_*') are mapped back to these keys
# elsewhere is not visible here — confirm against the code that reads this dict.
column_title_dict = {
    ### from original dataset
    'GrLivArea' : 'Above-ground living area in sq ft',
    'LotArea' : 'Lot area in sq ft',
    'OverallQual' : 'Overall quality',
    'BSMT_LowQual' : 'Low-quality basement area in sq ft',
    'BSMT_HighQual' : 'High-quality basement area in sq ft',
    'house_age_years' : 'House age in years',
    'GarageCars' : 'Number of cars held by garage',
    'FullBath' : 'Number of full bathrooms',
    'HalfBath' : 'Number of half-bathrooms',
    'BsmtExposure_ord' : 'Basement exposure',
    'Neighborhood' : 'Neighborhood',
    'BldgType' : 'Building type',
    'PorchSF' : 'Porch area in sq ft',
    'ExterQualDisc' : 'Exterior quality score - overall quality score',
    'OverallCondDisc' : 'Overall condition score - overall quality score',
    'KitchenQualDisc' : 'Kitchen quality score - overall quality score',
    'Fireplaces' : 'Number of fireplaces',
    'Pool' : 'Pool',
    'BedroomAbvGr' : 'Number of bedrooms',
    'ext_Asbestos_Shingles' : 'Asbestos used in walls',
    
    ### location features
    # Fixed: this label was missing "of", unlike every other count label below.
    'graveyard' : 'Number of graveyards within 1 mile',
    'police' : 'Number of police stations within 1 mile',
    'optician' : 'Number of opticians within 1 mile',
    'stop' : 'Number of stop signs within 1 mile',
    'slipway' : 'Number of slipways within 1 mile',
    'bar' : 'Number of bars within 1 mile',
    'cinema' : 'Number of cinemas within 1 mile',
    'supermarket' : 'Number of supermarkets within 1 mile',
    'hotel' : 'Number of hotels within 1 mile',
    'farmyard' : 'Number of farmyards within 1 mile',
    'water_tower' : 'Number of water towers within 1 mile',
    'christian_catholic' : 'Number of catholic churches within 1 mile',
    'jewish' : 'Number of synagogues within 1 mile',
    'muslim' : 'Number of mosques within 1 mile',
    'garden_centre' : 'Number of garden centers within 1 mile',
    'christian_lutheran' : 'Number of lutheran churches within 1 mile'
}