In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
import time
import re
from datetime import datetime
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Let pandas render up to 500 rows before truncating displayed frames/series.
pd.options.display.max_rows = 500

In [3]:
def dummify_linear(data, to_dummify, drops):
    """Drop unwanted columns and one-hot encode the categorical ones.

    Garage and basement categoricals are encoded without ``drop_first``;
    instead their ``<col>_0`` (and, for garage columns, ``<col>_No garage``)
    dummy is removed when present. Every remaining column named in
    ``to_dummify`` is encoded with ``drop_first=True``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``drops`` plus every garage,
        basement and ``to_dummify`` column that is not being dropped.
    to_dummify : list of str
        Columns to one-hot encode with ``drop_first=True``. The list is NOT
        modified; names that also appear in ``drops`` are simply skipped.
    drops : list of str
        Columns to remove from ``data`` before encoding.

    Returns
    -------
    pandas.DataFrame
        The encoded frame. The intermediate shapes are printed as a side
        effect.
    """
    bsmt_categoricals = [
        'BsmtCond_ord',
        'BsmtQual_ord',
        'BsmtExposure_ord']

    grg_categoricals = [
        'GarageQual',
        'GarageCond',
        'GarageType_com',
        'Garage_age_bin',
        'GarageFinish']

    # Filter into fresh lists instead of calling .remove() on the inputs:
    # the previous implementation aliased `to_dummify` and therefore mutated
    # the caller's list on every call.
    grg_categoricals = [col for col in grg_categoricals if col not in drops]
    bsmt_categoricals = [col for col in bsmt_categoricals if col not in drops]
    dummies = [col for col in to_dummify if col not in drops]

    df = data.drop(columns = drops)
    print('before dummifying: ', df.shape)

    df = pd.get_dummies(df, columns = grg_categoricals)
    # Remove the garage dummy levels named "<col>_0" / "<col>_No garage".
    garage_baselines = [
        f'{col}{suffix}'
        for col in grg_categoricals
        for suffix in ('_0', '_No garage')
    ]
    df = df.drop(columns = [name for name in garage_baselines if name in df.columns])
    print('after garage stuff: ', df.shape)

    df = pd.get_dummies(df, columns = bsmt_categoricals)
    # Same idea for the basement columns: remove the "<col>_0" level.
    bsmt_baselines = [f'{col}_0' for col in bsmt_categoricals]
    df = df.drop(columns = [name for name in bsmt_baselines if name in df.columns])
    print('after basement stuff: ', df.shape)

    df = pd.get_dummies(df, columns = dummies, drop_first = True)
    print('after full dummification: ', df.shape)

    return df

In [4]:
def drop_dummies(dummified, orig_col):
    """Return ``dummified`` without the dummy columns derived from ``orig_col``.

    Dummy columns are recognised by the ``"<orig_col>_"`` prefix. The match is
    anchored at the start of the column name, so a column that merely contains
    the name somewhere inside it (e.g. ``OverallQual_2`` when ``orig_col`` is
    ``Qual``) is kept.

    Parameters
    ----------
    dummified : pandas.DataFrame
        Frame that may contain one-hot columns named ``<orig_col>_<level>``.
    orig_col : str
        Name of the original (pre-encoding) column.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``dummified``, in their original order.
    """
    prefix = f'{orig_col}_'
    keep_list = [col for col in dummified.columns if not str(col).startswith(prefix)]
    return dummified.loc[:, keep_list]

In [5]:
# Load the housing data and keep only the rows used for modelling:
# two specific PIDs are excluded, only 'Normal'/'Partial' sale conditions are
# kept, and rows with zero bedrooms above grade or a 'Nonresidential' zoning
# are removed.
df = pd.read_csv('./../data/ames_housing_price_data_v5.csv')
df = df.loc[~df['PID'].isin([902207130, 908154205])]
df = df.loc[df['SaleCondition'].isin(['Normal', 'Partial'])]
df = df.loc[df['BedroomAbvGr'].ne(0)]
df = df.loc[df['MSZoning_com'].ne('Nonresidential')]
#df = df[(df['Neighborhood'] != 'GrnHill') & (df['Neighborhood'] != 'Landmrk')]
df = df.reset_index(drop=True)

# Targets: sale price and the 'SalePrice_log' column, aligned with df's rows.
price = df['SalePrice']
price_log = df['SalePrice_log']

# Basement and garage categorical column names; together with `to_dummify`
# they make up `all_dummies`.
basement_categoricals = ['BsmtCond_ord', 'BsmtQual_ord', 'BsmtExposure_ord']

garage_categoricals = [
    'GarageQual', 'GarageCond', 'GarageType_com', 'Garage_age_bin', 'GarageFinish',
]

# Columns removed from the modelling frame up front (used to build `df2`).
always_drop = [
    'Street_paved', 'RoofMatl', 'SaleType', 'SaleCondition',
    'Garage_age_years', 'Remod_age_years', 'MoSold', 'Utilities',
    'PID', 'PoolArea', 'PoolQC', 'SalePrice', 'sold_datetime',
    '2ndFlrSF_log', 'GrLivArea', 'MiscVal', 'MiscFeature',
    '1stFlrSF', 'LotArea', 'LotFrontage_log', 'YrSold',
]

# Experiment-specific drop list; starts empty and is reassigned later.
droplist = []

# Columns to one-hot encode with drop_first=True.
# Disabled entries kept for reference (each of them is in `always_drop`):
# 'Street_paved', 'Utilities', 'RoofMatl', 'PoolQC', 'MiscFeature', 'MoSold',
# 'SaleType', 'SaleCondition'.
to_dummify = [
    'Alley', 'LandContour', 'LandSlope', 'Neighborhood', 'BldgType',
    'OverallQual', 'OverallCond', 'RoofStyle', 'MasVnrType',
    'ExterQual', 'ExterCond', 'Foundation', 'CentralAir', 'KitchenQual',
    'FireplaceQu', 'PavedDrive', 'Fence',
    'HeatingQC_ord', 'LotShape_com', 'MSZoning_com', 'Heating_com',
    'Electrical_com', 'LotConfig_com', 'number_floors', 'attic', 'PUD',
    'Functional_ord', 'Remod_age_bin',
]

In [6]:
# Every categorical column name, in the order: general, garage, basement.
all_dummies = [*to_dummify, *garage_categoricals, *basement_categoricals]

In [7]:
# Base modelling frame: df without the columns listed in always_drop.
df2 = df.drop(columns=always_drop)
# Shared 5-fold shuffled splitter with a fixed seed, reused by every grid search.
kfold = KFold(shuffle=True, n_splits=5, random_state=1)

In [8]:
# kfold = KFold(n_splits=5, shuffle = True, random_state = 1)
# folds = list(kfold.split(df3))
# xtrain = [0]*5
# ytrain = [0]*5
# xtest = [0]*5
# ytest = [0]*5
# for i in range(0,5):
#     xtrain[i] = df3.loc[folds[i][0],:].drop(columns = 'SalePrice_log')
#     ytrain[i] = df3.loc[folds[i][0],'SalePrice_log']
#     xtest[i] = df3.loc[folds[i][1],:].drop(columns = 'SalePrice_log')
#     ytest[i] = df3.loc[folds[i][1],'SalePrice_log']

# feature_selection_df = df2.drop(columns = ['SalePrice_log'], axis = 1)
# score_dict_total={}
# col_dict_total2={}
# j=1
# xtrain_red = xtrain.copy()
# xtest_red = xtest.copy()
# start = datetime.now()
# while len(feature_selection_df.columns)> 50:
#     score_dict={}
#     for col in feature_selection_df.columns:
#         if col in all_dummies:
#             xtrain_red = list(map(lambda x: drop_dummies(x, col), xtrain))
#             xtest_red = list(map(lambda x: drop_dummies(x, col), xtest))
#         else:
#             xtrain_red = list(map(lambda x: x.drop(col, axis = 1), xtrain))
#             xtest_red = list(map(lambda x: x.drop(col, axis = 1), xtest))
            
#         #adjust these lines to reflect your model
#         models = list(map(lambda x, y: Lasso(alpha = 1e-6, normalize = True, max_iter = 1000, tol = 0.001).fit(x, y), xtrain, ytrain))
#         scores = list(map(lambda x, y, z: x.score(y, z), models, xtest, ytest))
#         score_dict[col] = np.mean(scores)
#         #-------

#     min_col=max(score_dict.items(), key=lambda x: x[1])[0]
#     score_dict_total[j]=max(score_dict.items(), key=lambda x: x[1])[1]
#     col_dict_total2[j]=min_col
#     feature_selection_df = feature_selection_df.drop(min_col, axis=1)
#     if min_col in all_dummies:
#         xtrain = list(map(lambda x: drop_dummies(x, min_col), xtrain))
#         xtest = list(map(lambda x: drop_dummies(x, min_col), xtest))
#     else:
#         xtrain = list(map(lambda x: x.drop(min_col, axis = 1), xtrain))
#         xtest = list(map(lambda x: x.drop(min_col, axis = 1), xtest))
#     print(f'{j} columns removed: {min_col}, best score is {score_dict_total[j]}; time elapsed is {datetime.now()-start}')
#     j+=1

In [9]:
# col_dict_total2

In [10]:
# folds = list(kfold.split(df3))
# xtrain = [0]*5
# ytrain = [0]*5
# xtest = [0]*5
# ytest = [0]*5
# for i in range(0,5):
#     xtrain[i] = df3.loc[folds[i][0],:].drop(columns = 'SalePrice_log')
#     ytrain[i] = df3.loc[folds[i][0],'SalePrice_log']
#     xtest[i] = df3.loc[folds[i][1],:].drop(columns = 'SalePrice_log')
#     ytest[i] = df3.loc[folds[i][1],'SalePrice_log']

# feature_selection_df = df2.drop(columns = ['SalePrice_log'], axis = 1)
# score_dict_total={}
# col_dict_total_neighborhood={}
# j=1
# xtrain_red = xtrain.copy()
# xtest_red = xtest.copy()
# start = datetime.now()
# while len(feature_selection_df.columns)> 50:
#     score_dict={}
#     for col in feature_selection_df.columns:
#         if col in all_dummies:
#             xtrain_red = list(map(lambda x: drop_dummies(x, col), xtrain))
#             xtest_red = list(map(lambda x: drop_dummies(x, col), xtest))
#         else:
#             xtrain_red = list(map(lambda x: x.drop(col, axis = 1), xtrain))
#             xtest_red = list(map(lambda x: x.drop(col, axis = 1), xtest))
            
#         #adjust these lines to reflect your model
#         models = list(map(lambda x, y: Lasso(alpha = 1e-6, normalize = True, max_iter = 1000, tol = 0.001).fit(x, y), xtrain, ytrain))
#         scores = list(map(lambda x, y, z: x.score(y, z), models, xtest, ytest))
#         score_dict[col] = np.mean(scores)
#         #-------

#     min_col=max(score_dict.items(), key=lambda x: x[1])[0]
#     score_dict_total[j]=max(score_dict.items(), key=lambda x: x[1])[1]
#     col_dict_total_neighborhood[j]=min_col
#     feature_selection_df = feature_selection_df.drop(min_col, axis=1)
#     if min_col in all_dummies:
#         xtrain = list(map(lambda x: drop_dummies(x, min_col), xtrain))
#         xtest = list(map(lambda x: drop_dummies(x, min_col), xtest))
#     else:
#         xtrain = list(map(lambda x: x.drop(min_col, axis = 1), xtrain))
#         xtest = list(map(lambda x: x.drop(min_col, axis = 1), xtest))
#     print(f'{j} columns removed: {min_col}, best score is {score_dict_total[j]}; time elapsed is {datetime.now()-start}')
#     j+=1

In [11]:
# col_dict_total_neighborhood

In [12]:
# score_dict_total

In [13]:
# train, test = train_test_split(df3, test_size = 0.2, shuffle = True, random_state = 0)
# x_train = train.drop(columns = 'SalePrice_log')
# x_test = test.drop(columns = 'SalePrice_log')
# y_train = train['SalePrice_log']
# y_test = test['SalePrice_log']

In [14]:
# x_train2, x_test2, y_train2, y_test2 = x_train.copy(), x_test.copy(), y_train.copy(), y_test.copy()
# feature_selection_df = df2.drop(columns = ['SalePrice_log'], axis = 1)
# score_dict_total={}
# col_dict_total={}
# j=1
# start = datetime.now()
# while len(feature_selection_df.columns)> 20:
#     score_dict={}
#     for col in feature_selection_df.columns:
#         if col in all_dummies:
#             x_train_red = drop_dummies(x_train2, col)
#             x_test_red = drop_dummies(x_test2, col)
#         else:
#             x_train_red = x_train2.drop(col, axis = 1)
#             x_test_red = x_test2.drop(col, axis = 1)
#         #adjust these lines to reflect your model
#         model = Lasso(alpha = 1e-6, normalize = True, max_iter = 1000, tol = 0.001).fit(x_train_red, y_train2)
#         score=model.score(x_test_red, y_test2)
#         score_dict[col]=score
#         #-------

#     min_col=max(score_dict.items(), key=lambda x: x[1])[0]
#     score_dict_total[j]=max(score_dict.items(), key=lambda x: x[1])[1]
#     col_dict_total[j]=min_col
#     feature_selection_df = feature_selection_df.drop(min_col,axis=1)
#     if min_col in all_dummies:
#         x_train2 = drop_dummies(x_train2, min_col)
#         x_test2 = drop_dummies(x_test2, min_col)
#     else:
#         x_train2 = x_train2.drop(min_col, axis = 1)
#         x_test2 = x_test2.drop(min_col, axis = 1)
#     print(f'{j} columns removed: {min_col}, best score is {score_dict_total[j]}; time elapsed is {datetime.now()-start}')
#     j+=1

In [15]:
# Columns dropped for the current experiment (passed to dummify_linear as `drops`).
droplist = [
    'TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord',
    'GarageCond', 'GarageQual', 'GarageType_com',
    'SalePrice_log',
    'Garage_age_bin', 'Remod_age_bin',
    '1stFlrSF_log', '2ndFlrSF',
    'KitchenAbvGr', 'TotRmsAbvGrd',
    'GarageArea', 'GarageFinish',
    'Fence', 'Alley', 'number_floors', 'FireplaceQu',
    'LotFrontage', 'LowQualFinSF', 'BsmtExposure_ord', 'MasVnrArea',
]

# Minimal drop list containing only the target column.
droplist3 = ['SalePrice_log']

In [16]:
# Design matrix: df2 minus the droplist columns, with categoricals one-hot encoded.
df3 = dummify_linear(data=df2, to_dummify=to_dummify, drops=droplist)

before dummifying:  (2471, 58)
after garage stuff:  (2471, 58)
after basement stuff:  (2471, 58)
after full dummification:  (2471, 124)


In [17]:
# Lasso regularisation strengths tried by every grid search below.
params_log = dict(alpha=[1e-7, 1e-6, 1e-5, 1e-4])

In [18]:
# Base Lasso estimator shared by all grid searches.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2 - confirm the pinned scikit-learn version before re-running this cell.
lasso2 = Lasso(tol=0.001, max_iter=1000, normalize=True)
# Tune alpha with the shared 5-fold splitter, keeping train scores so they can
# be compared with the held-out scores.
lasso_tuner2 = GridSearchCV(
    estimator=lasso2,
    param_grid=params_log,
    cv=kfold,
    return_train_score=True,
)
lasso_tuner2.fit(X=df3, y=price_log)

GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
             estimator=Lasso(normalize=True, tol=0.001),
             param_grid={'alpha': [1e-07, 1e-06, 1e-05, 0.0001]},
             return_train_score=True)

In [19]:
#lasso_tuner2.cv_results_

In [20]:
# Mean cross-validated test score for each alpha, in the order listed in params_log.
lasso_tuner2.cv_results_['mean_test_score']

array([0.94744747, 0.94757162, 0.94470538, 0.91400025])

In [21]:
# Mean training-fold score for each alpha (available because return_train_score=True).
lasso_tuner2.cv_results_['mean_train_score']

array([0.95496864, 0.95486537, 0.95097223, 0.91901587])

In [22]:
# Coefficients of the best Lasso fit, labelled by design-matrix column and
# sorted from largest to smallest.
feat_imp_log = pd.Series(
    data=lasso_tuner2.best_estimator_.coef_,
    index=df3.columns,
).sort_values(ascending=False)
# Split into features the Lasso zeroed out and features it kept.
ignored_log = feat_imp_log.loc[feat_imp_log.eq(0)]
feat_imp_log = feat_imp_log.loc[feat_imp_log.ne(0)]
# Report count + values for the kept features, then for the ignored ones.
print(len(feat_imp_log), feat_imp_log, sep='\n')
print(len(ignored_log), ignored_log, sep='\n')

118
GrLivArea_log                                                 0.424990
OverallQual_10                                                0.184668
Neighborhood_GrnHill                                          0.181966
OverallQual_9                                                 0.152077
OverallCond_9                                                 0.135713
OverallCond_8                                                 0.120620
OverallCond_7                                                 0.108497
OverallQual_8                                                 0.105630
LotArea_log                                                   0.098563
OverallCond_6                                                 0.084857
OverallQual_7                                                 0.074798
OverallCond_5                                                 0.068410
Functional_ord_7                                              0.067255
OverallQual_6                                                 0.055466
Fu

In [23]:
# preds_log = lasso_tuner2.predict(df4)

# fig = px.scatter(x = preds_log, y = price_log, hover_name = df['PID'])
# fig.update_layout(
#                     xaxis_title = 'predicted',
#                     yaxis_title = 'actual'
#                 )

In [24]:
# resid = abs(preds_log - price_log)

# fig = px.histogram(x = resid)
# fig.update_layout(
#                     xaxis_title = 'residuals'
#                 )

In [25]:
# 10**np.mean(price_log)*(1-lasso_tuner2.best_score_)

In [26]:
# Rebuild the same design matrix (same inputs as df3) for the VIF computation.
df4 = dummify_linear(data=df2, to_dummify=to_dummify, drops=droplist)

before dummifying:  (2471, 58)
after garage stuff:  (2471, 58)
after basement stuff:  (2471, 58)
after full dummification:  (2471, 124)


In [27]:
# Variance inflation factor of every column of df4, one row per feature.
vif_df = pd.DataFrame({
    'feature': df4.columns,
    'vif': [
        variance_inflation_factor(df4.values, position)
        for position in range(df4.shape[1])
    ],
})

In [28]:
# Show the features ordered from highest to lowest VIF.
vif_df.sort_values(by = 'vif', ascending = False)

Unnamed: 0,feature,vif
32,GrLivArea_log,1778.268217
18,LotArea_log,1091.801677
107,"MSZoning_com_Residential, low density",110.706511
123,Functional_ord_7,107.26729
87,ExterQual_3,86.996665
119,PUD_1,69.835688
113,"Electrical_com_Standard circuit breakers, all ...",68.648088
90,ExterCond_3,59.048342
88,ExterQual_4,50.074533
69,BldgType_TwnhsE,45.904481


In [29]:
# One-line experiment log: the drop list, the selected alpha, the best mean
# cross-validated test score, and the mean VIF over all features.
print('No radials: ', droplist, ': ', lasso_tuner2.best_params_['alpha'], ', ', max(lasso_tuner2.cv_results_['mean_test_score']), ', ', sum(vif_df['vif'])/len(vif_df))

No radials:  ['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea', 'GarageFinish', 'Fence', 'Alley', 'number_floors', 'FireplaceQu', 'LotFrontage', 'LowQualFinSF', 'BsmtExposure_ord', 'MasVnrArea'] :  1e-06 ,  0.9475716176776527 ,  36.054356259285186


log:  ['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin'] :  1e-06 ,  0.943454281780628

log:  ['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log'] :  1e-06 ,  0.9424740060473317

log:  ['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF'] :  1e-06 ,  0.9420770647332499

log:  ['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr'] :  1e-06 ,  0.9417616764602397

['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea'] :  1e-06 ,  0.9416195613781179 ,  39.9001979416710

['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea', 'GarageFinish', 'Fence'] :  1e-06 ,  0.9417193728756516 ,  41.05487105565105

['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea', 'GarageFinish', 'Fence'] :  1e-06 ,  0.9416530339287166 ,  39.99237976054064

['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea', 'GarageFinish', 'Fence', 'Alley'] :  1e-06 ,  0.9417665575176081 ,  41.1929734948757

['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea', 'GarageFinish', 'Fence', 'Alley', 'number_floors'] :  1e-06 ,  0.9418035097906687 ,  39.559875261890845

['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea', 'GarageFinish', 'Fence', 'Alley', 'number_floors', 'FireplaceQu'] :  1e-06 ,  0.9417500104695516 ,  40.30945260646638

['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea', 'GarageFinish', 'Fence', 'Alley', 'number_floors', 'FireplaceQu', 'LotFrontage', 'LowQualFinSF', 'BsmtExposure_ord', 'MasVnrArea'] :  1e-06 ,  0.9468407408759749 ,  36.57884336374308

No radials:  ['TotalBsmtSF', 'BsmtCond_ord', 'BsmtQual_ord', 'GarageCond', 'GarageQual', 'GarageType_com', 'SalePrice_log', 'Garage_age_bin', 'Remod_age_bin', '1stFlrSF_log', '2ndFlrSF', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageArea', 'GarageFinish', 'Fence', 'Alley', 'number_floors', 'FireplaceQu', 'LotFrontage', 'LowQualFinSF', 'BsmtExposure_ord', 'MasVnrArea', 'LotShape_com'] :  1e-06 ,  0.946833771716485 ,  35.69657081462711

In [32]:
# Load the per-house radial/location feature table.
radial = pd.read_csv('./../data/house_coordinates_1.0.csv')
# NOTE(review): '2204_park' is removed before the prefixes are stripped,
# presumably so it cannot collide with another '*_park' column - confirm.
radial.drop(columns = ['2204_park'], inplace = True)
# Strip a leading four-digit code and underscore ("NNNN_") from column names.
# A raw string is used for the pattern so that `\d` is not treated as an
# (invalid) string escape, and all columns are renamed in a single pass.
radial.rename(
    columns = lambda name: name[5:] if re.search(r'^\d\d\d\d_', str(name)) else name,
    inplace = True,
)

# Columns of the radial table that are removed before modelling.
rad_drops = [
    'Address', 'Coords4', 'latitude', 'longitude',
    'town_hall', 'cemetery', 'motel', 'camp_site', 'general', 'picnic_site',
    'wastewater_plant', 'spring', 'beach', 'street_lamp', 'helipad',
    'vineyard', 'crossing', 'tree', 'grass', 'christian',
    'bus_stop', 'parking', 'toilet', 'bench', 'commercial',
    'waste_basket', 'drinking_water', 'convenience', 'camera_surveillance',
    'comms_tower', 'residential', 'gift_shop', 'jeweller', 'hairdresser',
    'bookshop', 'clothes', 'retail', 'food_court', 'artwork', 'cafe',
    'traffic_signals', 'beauty_shop', 'sports_shop', 'weir', 'track',
    'turning_circle', 'computer_shop', 'bicycle_shop', 'department_store',
    'parking_bicycle', 'golf_course', 'tower', 'beverages', 'university',
]

# Remove the unwanted radial columns.
radial = radial.drop(columns=rad_drops)
# Right-join onto the modelling rows so radial follows df's PID order and rows.
sub = df[['PID', 'SalePrice_log']]
radial = radial.merge(sub, how='right', on='PID')

In [33]:
# Keep only predictor columns: the join key and the target are removed.
radial = radial.drop(columns=['PID', 'SalePrice_log'])

In [34]:
# Same alpha search as before, fitted on the radial (location) features only.
# NOTE(review): the recorded output of this cell shows repeated
# coordinate-descent warnings - consider a larger max_iter if they persist.
lasso_tuner3 = GridSearchCV(
    estimator=lasso2,
    param_grid=params_log,
    cv=kfold,
    return_train_score=True,
)
lasso_tuner3.fit(X=radial, y=price_log)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
             estimator=Lasso(normalize=True, tol=0.001),
             param_grid={'alpha': [1e-07, 1e-06, 1e-05, 0.0001]},
             return_train_score=True)

In [35]:
# Mean cross-validated test score per alpha for the radial-only model.
lasso_tuner3.cv_results_['mean_test_score']

array([0.54590049, 0.54646727, 0.54404021, 0.46670631])

In [36]:
# Mean training-fold score per alpha for the radial-only model.
lasso_tuner3.cv_results_['mean_train_score']

array([0.58646805, 0.58634686, 0.57780744, 0.48220605])

In [37]:
# Number of radial predictor columns fed to the model.
len(radial.columns)

81

In [38]:
# Coefficients of the best radial-only Lasso fit, sorted from largest to smallest.
feat_imp_rad = pd.Series(
    data=lasso_tuner3.best_estimator_.coef_,
    index=radial.columns,
).sort_values(ascending=False)
# Separate the zeroed-out features from the ones that were kept.
ignored_rad = feat_imp_rad.loc[feat_imp_rad.eq(0)]
feat_imp_rad = feat_imp_rad.loc[feat_imp_rad.ne(0)]
# Report count + values for the kept features, then for the ignored ones.
print(len(feat_imp_rad), feat_imp_rad, sep='\n')
print(len(ignored_rad), ignored_rad, sep='\n')

81
slipway                  0.166635
garden_centre            0.157494
guesthouse               0.101396
stop                     0.083934
library                  0.073591
dam                      0.059375
furniture_shop           0.050300
christian_catholic       0.045075
stationery               0.040657
viewpoint                0.040383
laundry                  0.039904
tourist_info             0.039035
pier                     0.037632
theatre                  0.036032
recreation_ground        0.034843
pub                      0.034486
dog_park                 0.032927
water_tower              0.031247
doctors                  0.029168
apron                    0.026762
scrub                    0.025278
hotel                    0.024577
shoe_shop                0.021256
jewish                   0.019733
pharmacy                 0.017854
christian_lutheran       0.017127
playground               0.016967
fire_station             0.015375
doityourself             0.014012
nursing_hom

In [39]:
# Variance inflation factor of every radial column, one row per feature.
vif_rad = pd.DataFrame({
    'feature': radial.columns,
    'vif': [
        variance_inflation_factor(radial.values, position)
        for position in range(radial.shape[1])
    ],
})

In [40]:
# Mean VIF across the radial features, followed by the features ordered from
# highest to lowest VIF.
print(sum(vif_rad['vif'])/len(vif_rad))
vif_rad.sort_values(by = 'vif', ascending = False)

32.2462129077583


Unnamed: 0,feature,vif
15,playground,116.693882
21,restaurant,115.125299
69,park,102.889255
24,bar,96.291573
43,bank,94.572975
62,parking_multistorey,87.753351
68,forest,84.649819
44,atm,83.004114
61,fuel,76.037614
27,shelter,71.530917


In [41]:
# Independent copy of the filtered data for feature engineering, so df stays unchanged.
df6 = df.copy(deep=True)

In [42]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
def fit_scale(df, col):
    """Min-max scale column ``col`` of ``df`` in place.

    The module-level ``scaler`` is refitted on the column each time, so after
    the call it holds the fit for the most recently scaled column only.
    """
    df[[col]] = scaler.fit_transform(df[[col]])
# Put the quality/condition ratings on a common scale before differencing them.
fit_scale(df6, 'OverallQual')
fit_scale(df6, 'ExterQual')
fit_scale(df6, 'OverallCond')
fit_scale(df6, 'KitchenQual')
#df2['Porch']=((df2['OpenPorchSF']>0) | (df2['EnclosedPorch']>0) | (df2['3SsnPorch']>0) | (df2['ScreenPorch']>0))
# Total porch area over the four porch columns.
df6['PorchSF'] = (
    df6['OpenPorchSF']
    .add(df6['EnclosedPorch'])
    .add(df6['3SsnPorch'])
    .add(df6['ScreenPorch'])
)
#df2['1stFloorArea%']=df2['1stFlrSF']/df2['GrLivArea']
#df2['2ndFloorArea%']=df2['2ndFlrSF']/df2['GrLivArea']
# Differences between scaled overall quality and the other scaled ratings.
df6['ExterQualDisc'] = df6['OverallQual'].sub(df6['ExterQual'])
df6['OverallCondDisc'] = df6['OverallQual'].sub(df6['OverallCond'])
df6['KitchenQualDisc'] = df6['OverallQual'].sub(df6['KitchenQual'])
#df2['BSMT_GLQ%']=df2['BSMT_GLQ']/df2['TotalBsmtSF']
#df2['BSMT_ALQ%']=df2['BSMT_ALQ']/df2['TotalBsmtSF']
#df2['BSMT_GLQ%']=df2['BSMT_GLQ%'].fillna(0)
#df2['BSMT_ALQ%']=df2['BSMT_ALQ%'].fillna(0)
# Split total basement area into the GLQ + ALQ part and the remainder.
df6['BSMT_LowQual'] = df6['TotalBsmtSF'].sub(df6['BSMT_GLQ']).sub(df6['BSMT_ALQ'])
df6['BSMT_HighQual'] = df6['BSMT_GLQ'].add(df6['BSMT_ALQ'])

In [43]:
# Bin BSMT_HighQual into right-closed intervals (-1, 1], (1, 500], ... (1500, 2500].
# Values above 2500 fall outside every bin and become NaN.
# NOTE(review): the first bin is labelled 'No basement', but a value of 0 here
# only means BSMT_GLQ + BSMT_ALQ is 0 - confirm the label is intended.
df6['BSMT_HighQual_bin'] = pd.cut(
    df6['BSMT_HighQual'],
    bins=[-1, 1, 500, 1000, 1500, 2500],
    labels=['No basement', '0-500', '500-1000', '1000-1500', '1500+'],
)

In [44]:
# Bin BSMT_LowQual into right-closed intervals (-1, 1], (1, 500], ... (1500, 2500].
# Values above 2500 fall outside every bin and become NaN.
df6['BSMT_LowQual_bin'] = pd.cut(
    df6['BSMT_LowQual'],
    bins=[-1, 1, 500, 1000, 1500, 2500],
    labels=['No basement', '0-500', '500-1000', '1000-1500', '1500+'],
)

In [45]:
# Hand-picked feature set for the reduced ("minimal") model.
feat_incl = [
    'GrLivArea_log', 'LotArea_log', 'OverallQual',
    'BSMT_LowQual_bin', 'house_age_years', 'GarageCars', 'MasVnrType',
    'FullBath', 'HalfBath',
    'BsmtExposure_ord',
    'Neighborhood',
    'BldgType', 'PorchSF',
    'ExterQualDisc', 'OverallCondDisc', 'BSMT_HighQual_bin',
    'KitchenQualDisc',
    'Fireplaces', 'Pool', 'BedroomAbvGr',
]

In [46]:
# Restrict the engineered frame to the selected features and display it.
df7 = df6[feat_incl]
df7

Unnamed: 0,GrLivArea_log,LotArea_log,OverallQual,BSMT_LowQual_bin,house_age_years,GarageCars,MasVnrType,FullBath,HalfBath,BsmtExposure_ord,Neighborhood,BldgType,PorchSF,ExterQualDisc,OverallCondDisc,BSMT_HighQual_bin,KitchenQualDisc,Fireplaces,Pool,BedroomAbvGr
0,2.932474,3.897077,0.428571,500-1000,71.210959,2.0,,1,0,1,SWISU,1Fam,166,0.095238,-0.071429,No basement,0.095238,1,0,2
1,3.020775,3.626853,0.285714,0-500,25.104110,1.0,Brick Face,2,0,2,Edwards,TwnhsE,105,-0.380952,-0.047619,500-1000,-0.380952,0,0,2
2,3.016616,3.910944,0.142857,0-500,109.402740,1.0,,1,0,1,OldTown,1Fam,279,-0.523810,-0.690476,No basement,-0.190476,0,0,2
3,3.221414,3.924279,0.714286,0-500,8.838356,2.0,,2,1,1,NWAmes,1Fam,45,0.047619,0.214286,500-1000,0.047619,0,0,3
4,3.283753,3.863382,0.571429,No basement,6.501370,2.0,Brick Face,3,0,0,Edwards,1Fam,177,-0.095238,0.238095,No basement,-0.095238,1,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,2.978637,3.947140,0.428571,500-1000,93.394521,1.0,,1,0,1,BrkSide,1Fam,138,0.095238,-0.071429,No basement,0.428571,1,0,2
2467,3.238799,4.136086,0.000000,No basement,54.452055,2.0,,2,0,0,Edwards,1Fam,0,-0.333333,-0.333333,No basement,-0.333333,1,0,4
2468,3.301464,3.797268,0.285714,1000-1500,58.619178,3.0,,2,0,1,Crawfor,Duplex,0,-0.047619,-0.214286,No basement,-0.047619,0,0,4
2469,3.265290,3.945764,0.571429,0-500,7.501370,2.0,Brick Face,2,1,1,CollgCr,1Fam,96,-0.095238,0.238095,500-1000,-0.095238,1,0,3


In [47]:
# Categorical columns of the reduced feature set that get one-hot encoded.
new_dummies = ['MasVnrType', 'Neighborhood', 'BldgType', 'BSMT_HighQual_bin', 'BSMT_LowQual_bin']

In [48]:
# One-hot encode the categorical columns, dropping the first level of each.
df7 = pd.get_dummies(data=df7, columns=new_dummies, drop_first=True)

In [49]:
# Same alpha search, fitted on the reduced hand-picked feature set.
lasso_tuner4 = GridSearchCV(
    estimator=lasso2,
    param_grid=params_log,
    cv=kfold,
    return_train_score=True,
)
lasso_tuner4.fit(X=df7, y=price_log)

GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
             estimator=Lasso(normalize=True, tol=0.001),
             param_grid={'alpha': [1e-07, 1e-06, 1e-05, 0.0001]},
             return_train_score=True)

In [50]:
# Mean cross-validated test score per alpha for the reduced-feature model.
lasso_tuner4.cv_results_['mean_test_score']

array([0.93478798, 0.93486576, 0.93440456, 0.90907684])

In [53]:
# Coefficients of the best reduced-feature Lasso fit, sorted from largest to smallest.
feat_imp_min = pd.Series(
    data=lasso_tuner4.best_estimator_.coef_,
    index=df7.columns,
).sort_values(ascending=False)
# Separate the zeroed-out features from the ones that were kept.
ignored_min = feat_imp_min.loc[feat_imp_min.eq(0)]
feat_imp_min = feat_imp_min.loc[feat_imp_min.ne(0)]
# Report count + values for the kept features, then for the ignored ones.
print(len(feat_imp_min), feat_imp_min, sep='\n')
print(len(ignored_min), ignored_min, sep='\n')

54
GrLivArea_log                  0.428595
OverallQual                    0.355797
Neighborhood_GrnHill           0.195895
BSMT_HighQual_bin_1500+        0.123480
LotArea_log                    0.100682
BSMT_HighQual_bin_1000-1500    0.083587
BSMT_LowQual_bin_1500+         0.050424
Neighborhood_Crawfor           0.050064
Neighborhood_StoneBr           0.048123
BSMT_LowQual_bin_1000-1500     0.047783
BSMT_HighQual_bin_500-1000     0.047754
Neighborhood_NridgHt           0.033312
Neighborhood_Greens            0.032040
BSMT_LowQual_bin_500-1000      0.029855
Neighborhood_Somerst           0.028410
Neighborhood_NoRidge           0.027968
BSMT_HighQual_bin_0-500        0.019793
Neighborhood_NPkVill           0.019305
BSMT_LowQual_bin_0-500         0.018307
GarageCars                     0.017095
Fireplaces                     0.012302
Neighborhood_ClearCr           0.010282
Neighborhood_Blueste           0.007866
BsmtExposure_ord               0.007222
Neighborhood_BrkSide           0.0069

In [51]:
# Variance inflation factor of every column of the reduced design matrix.
vif_min = pd.DataFrame({
    'feature': df7.columns,
    'vif': [
        variance_inflation_factor(df7.values, position)
        for position in range(df7.shape[1])
    ],
})

In [52]:
# Show the reduced-model features ordered from highest to lowest VIF.
vif_min.sort_values(by = 'vif', ascending = False)

Unnamed: 0,feature,vif
0,GrLivArea_log,1114.1371
1,LotArea_log,862.800588
2,OverallQual,33.245155
14,BedroomAbvGr,28.422996
5,FullBath,24.808349
31,Neighborhood_NAmes,23.593342
3,house_age_years,16.205276
36,Neighborhood_OldTown,14.078907
4,GarageCars,13.89336
21,Neighborhood_CollgCr,13.395175
