# This Notebook uses Lasso to guide feature selection.  
## Selected features are then run through OLS for interpretability

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline 
import matplotlib.pyplot as plt
pd.options.display.max_columns = 50
import warnings
warnings.filterwarnings('ignore')


housing = pd.read_csv('housing_mar5.csv', index_col=0)
housing.shape

(2578, 178)

In [2]:
housetarg = pd.Series(housing.SalePrice_Log, name='target')
housefeature = housing.drop('SalePrice_Log',axis=1)

In [3]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(housefeature, housetarg, random_state=8)
Xtrain.shape, Xtest.shape

((1933, 177), (645, 177))

### Lasso for feature selection
- alpha=.0005

In [4]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=.0005, normalize=True)
# lasso.get_params()

In [5]:
lasso.fit(housefeature,housetarg)

Lasso(alpha=0.0005, normalize=True)

In [6]:
print('Train score: ', lasso.score(Xtrain,ytrain))
print('Test score: ', lasso.score(Xtest,ytest))

Train score:  0.8903367659447151
Test score:  0.893375907302457


In [7]:
print('the intercept is: %.2f' %(lasso.intercept_))
coefs = pd.Series(abs(lasso.coef_), index=housefeature.columns)

the intercept is: 3.91


In [8]:
coefs.sort_values(ascending=False).head(21)

GrLivArea_Log    0.340627
OverallQual      0.083636
LotArea_Log      0.060683
CentralAir       0.042589
KitchenQual      0.029075
Fireplaces       0.022461
MSZoning__RM     0.018237
GarageCars       0.017368
ExterQual        0.015450
OverallCond      0.009230
HeatingQC        0.006735
BsmtQual         0.005944
PavedDrive       0.005268
BsmtExposure     0.002964
BsmtFinType1     0.001662
YearBuilt        0.001194
YearRemodAdd     0.000863
TotalBsmtSF      0.000083
GarageArea       0.000078
BsmtFinSF1       0.000073
MasVnrType__0    0.000000
dtype: float64

### OLS using 20 features identified by Lasso

In [46]:
housefeature.columns

Index(['LotArea_Log', 'Street_Paved', 'LotShape', 'LandContour', 'Utilities',
       'LandSlope', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       ...
       'FireplaceQu__2', 'FireplaceQu__4', 'FireplaceQu__5', 'FireplaceQu__TA',
       'GarageCond__0.0', 'GarageCond__1.0', 'GarageCond__2.0',
       'GarageCond__4.0', 'GarageCond__5.0', 'ISU_dist'],
      dtype='object', length=177)

In [113]:
#removed: 'GarageCars' 'MSZoning__RM',
housefeature_LM = housefeature[['GrLivArea_Log','OverallQual','LotArea_Log','CentralAir','KitchenQual',
                             'Fireplaces','HeatingQC','OverallCond',
                             'BsmtQual','PavedDrive','BsmtExposure','YearBuilt','YearRemodAdd',
                            'ExterQual','GarageArea','TotalBsmtSF','BsmtFinSF1','BsmtFinType1' 
                             ]]
housing_LM = housefeature_LM.merge(housetarg, how="inner", left_index=True, right_index=True)
housing_LM.to_csv('housing_LM.csv')

In [114]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(housefeature_LM, housetarg, random_state=8)
Xtrain.shape, Xtest.shape

((1933, 18), (645, 18))

In [115]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(Xtrain,ytrain)

LinearRegression()

### Check for fit

In [116]:
print(f'train score: {lm.score(Xtrain,ytrain):.3f}')
print(f'test score: {lm.score(Xtest,ytest):.3f}')

train score: 0.912
test score: 0.919


- fit is good

### Refit to the full dataset

In [117]:
lm.all = LinearRegression()
lm.all.fit(housefeature_LM, housetarg)

LinearRegression()

In [118]:
print('R^2 is equal to %.3f' %(lm.all.score(housefeature_LM,housetarg)))
#print('RSS is equal to %.3f' %(np.sum((housetarg-lm.predict(housefeature2))**2)))
print('The intercept is %.3f' %(lm.all.intercept_))
# print('The slopes are %s' %(lm.all.coef_))

R^2 is equal to 0.914
The intercept is 1.434


In [119]:
coefs = pd.Series(lm.all.coef_, index=housefeature_LM.columns)

In [120]:
coefs.sort_values(ascending=False).head(20)

GrLivArea_Log    0.389052
LotArea_Log      0.098805
OverallQual      0.069213
OverallCond      0.047838
CentralAir       0.041008
Fireplaces       0.038519
ExterQual        0.031465
KitchenQual      0.029637
PavedDrive       0.021341
HeatingQC        0.016745
BsmtExposure     0.013500
BsmtQual         0.009904
BsmtFinType1     0.004445
YearBuilt        0.002420
YearRemodAdd     0.000379
GarageArea       0.000128
TotalBsmtSF      0.000084
BsmtFinSF1       0.000077
dtype: float64

### Calculate RMSE

In [121]:
mse = np.mean((lm.all.predict(housefeature_LM)-housetarg)**2)
RMSE = np.sqrt(mse)
print(f'RMSE: {RMSE:.3f}')

RMSE: 0.114


### Interpreting the coefs where the dependent target is log transformed but the independent variable is not

In [122]:
import numpy as np
def exp_coef(coef):
    return (np.exp(coef)-1)*100
    
# exp_coef(.068919)

In [123]:
coef_exp = coefs.apply(lambda x: exp_coef(x))
coef_exp.sort_values(ascending=False)

GrLivArea_Log    47.558153
LotArea_Log      10.385097
OverallQual       7.166478
OverallCond       4.900031
CentralAir        4.186009
Fireplaces        3.927014
ExterQual         3.196554
KitchenQual       3.008039
PavedDrive        2.156994
HeatingQC         1.688602
BsmtExposure      1.359183
BsmtQual          0.995367
BsmtFinType1      0.445473
YearBuilt         0.242249
YearRemodAdd      0.037949
GarageArea        0.012759
TotalBsmtSF       0.008445
BsmtFinSF1        0.007711
dtype: float64

### Statsmodel

In [124]:
import statsmodels.api as sm 
X_add_const = sm.add_constant(housefeature_LM)
ols = sm.OLS(housetarg, housefeature_LM)
ans = ols.fit()
print(ans.summary())

                                 OLS Regression Results                                
Dep. Variable:                 target   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          1.584e+06
Date:                Sat, 06 Mar 2021   Prob (F-statistic):                        0.00
Time:                        23:51:17   Log-Likelihood:                          1942.7
No. Observations:                2578   AIC:                                     -3849.
Df Residuals:                    2560   BIC:                                     -3744.
Df Model:                          18                                                  
Covariance Type:            nonrobust                                                  
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------