# This Notebook uses Lasso to guide feature selection.  
## Selected features are then run through OLS for interpretability

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline 
import matplotlib.pyplot as plt
pd.options.display.max_columns = 50
import warnings
warnings.filterwarnings('ignore')


housing = pd.read_csv('housing_mar5.csv', index_col=0)
housing.shape

(2578, 178)

In [2]:
housetarg = pd.Series(housing.SalePrice_Log, name='target')
housefeature = housing.drop('SalePrice_Log',axis=1)

In [3]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(housefeature, housetarg, random_state=8)
Xtrain.shape, Xtest.shape

((1933, 177), (645, 177))

### Lasso for feature selection
- alpha=.0005

In [4]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=.0005, normalize=True)
# lasso.get_params()

In [5]:
lasso.fit(housefeature,housetarg)

Lasso(alpha=0.0005, normalize=True)

In [6]:
print('Train score: ', lasso.score(Xtrain,ytrain))
print('Test score: ', lasso.score(Xtest,ytest))

Train score:  0.8903367659447151
Test score:  0.893375907302457


In [7]:
print('the intercept is: %.2f' %(lasso.intercept_))
coefs = pd.Series(abs(lasso.coef_), index=housefeature.columns)

the intercept is: 3.91


In [8]:
coefs.sort_values(ascending=False).head(21)

GrLivArea_Log    0.340627
OverallQual      0.083636
LotArea_Log      0.060683
CentralAir       0.042589
KitchenQual      0.029075
Fireplaces       0.022461
MSZoning__RM     0.018237
GarageCars       0.017368
ExterQual        0.015450
OverallCond      0.009230
HeatingQC        0.006735
BsmtQual         0.005944
PavedDrive       0.005268
BsmtExposure     0.002964
BsmtFinType1     0.001662
YearBuilt        0.001194
YearRemodAdd     0.000863
TotalBsmtSF      0.000083
GarageArea       0.000078
BsmtFinSF1       0.000073
MasVnrType__0    0.000000
dtype: float64

### OLS using 20 features identified by Lasso

In [9]:
housefeature_LM = housefeature[['GrLivArea_Log','OverallQual','LotArea_Log','CentralAir','KitchenQual',
                             'Fireplaces','MSZoning__RM','GarageCars','ExterQual','OverallCond','HeatingQC',
                             'BsmtQual','PavedDrive','BsmtExposure','BsmtFinType1','YearBuilt','YearRemodAdd',
                             'TotalBsmtSF','GarageArea','BsmtFinSF1']]

In [19]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(housefeature_LM, housetarg, random_state=8)
Xtrain.shape, Xtest.shape

((1933, 20), (645, 20))

In [25]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(Xtrain,ytrain)

LinearRegression()

### Check for fit

In [39]:
print(f'train score: {lm.score(Xtrain,ytrain):.3f}')
print(f'test score: {lm.score(Xtest,ytest):.3f}')

train score: 0.913
test score: 0.920


- fit is good

### Refit to the full dataset

In [42]:
lm.all = LinearRegression()
lm.all.fit(housefeature_LM, housetarg)

LinearRegression()

In [44]:
print('R^2 is equal to %.3f' %(lm.all.score(housefeature_LM,housetarg)))
#print('RSS is equal to %.3f' %(np.sum((housetarg-lm.predict(housefeature2))**2)))
print('The intercept is %.3f' %(lm.all.intercept_))
print('The slopes are %s' %(lm.all.coef_))

R^2 is equal to 0.915
The intercept is 2.147
The slopes are [ 3.89075037e-01  6.89192364e-02  8.58536505e-02  4.42672911e-02
  3.02570552e-02  3.58055126e-02 -4.20048887e-02  2.20801974e-02
  3.15289684e-02  4.76306809e-02  1.71754442e-02  1.03074386e-02
  1.97055558e-02  1.47913220e-02  3.86193672e-03  2.09126618e-03
  4.02913870e-04  8.38569561e-05  7.07488930e-05  7.93734890e-05]


In [45]:
coefs = pd.Series(lm.all.coef_, index=housefeature_LM.columns)

In [46]:
coefs.sort_values(ascending=False).head(20)

GrLivArea_Log    0.389075
LotArea_Log      0.085854
OverallQual      0.068919
OverallCond      0.047631
CentralAir       0.044267
Fireplaces       0.035806
ExterQual        0.031529
KitchenQual      0.030257
GarageCars       0.022080
PavedDrive       0.019706
HeatingQC        0.017175
BsmtExposure     0.014791
BsmtQual         0.010307
BsmtFinType1     0.003862
YearBuilt        0.002091
YearRemodAdd     0.000403
TotalBsmtSF      0.000084
BsmtFinSF1       0.000079
GarageArea       0.000071
MSZoning__RM    -0.042005
dtype: float64

### Calculate RMSE

In [49]:
mse = np.mean((lm.all.predict(housefeature_LM)-housetarg)**2)
RMSE = np.sqrt(mse)
print(f'RMSE: {RMSE:.3f}')

RMSE: 0.113


### Interpreting the coefs where the dependent target is log transformed but the independent variable is not

In [76]:
import numpy as np
def exp_coef(coef):
    return (np.exp(coef)-1)*100
    
# exp_coef(.068919)

In [77]:
coef_exp = coefs.apply(lambda x: exp_coef(x))
coef_exp

GrLivArea_Log    47.561527
OverallQual       7.134968
LotArea_Log       8.964685
CentralAir        4.526171
KitchenQual       3.071945
Fireplaces        3.645425
MSZoning__RM     -4.113491
GarageCars        2.232577
ExterQual         3.203127
OverallCond       4.878325
HeatingQC         1.732379
BsmtQual          1.036074
PavedDrive        1.990099
BsmtExposure      1.490125
BsmtFinType1      0.386940
YearBuilt         0.209345
YearRemodAdd      0.040300
TotalBsmtSF       0.008386
GarageArea        0.007075
BsmtFinSF1        0.007938
dtype: float64