Running Elastic net on the Ames, Iowa Housing dataset
https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data


Data Cleaning (Reduced Pipeline Version)
Helper Functions

    

In [175]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from joblib import dump, load

In [176]:
# %run -i '../jen/feature_engineering.py'
#train = pd.read_csv("../jen/house-prices-advanced-regression-techniques/train.csv")

In [177]:


train = pd.read_csv("../jen/house-prices-advanced-regression-techniques/train.csv")
train.columns
train.set_index("Id", inplace=True)

train = train.loc[train['GrLivArea'] < 4500] # outlier removal

train['FullBath'] = train['FullBath'] + train['BsmtFullBath']
train['HalfBath'] = train['HalfBath'] + train['BsmtHalfBath']
train['Total_porch_sf'] = (train['OpenPorchSF'] + train['3SsnPorch'] +train['EnclosedPorch'] + train['ScreenPorch'] + train['WoodDeckSF'])

train.drop(['FireplaceQu', 'Street', 'Utilities', 'LandContour', 'MasVnrType',
	'Condition2', 'PoolArea', 'LotFrontage', 'CentralAir', 'Functional',
	'LandSlope', 'LotConfig', 'Fence', 'BldgType', 'Street', 'Electrical',
	'Alley', 'RoofStyle', 'KitchenAbvGr', 'BsmtFinType2', 'Heating',
	'PavedDrive', 'LandContour', 'Condition1', 'GarageCond', 'ExterCond',
	'MSZoning', 'MiscFeature', 'SaleCondition', 'BsmtFinSF2', 'SaleType',
	'BsmtCond', 'MiscVal', 'GarageQual','EnclosedPorch','3SsnPorch',
	'RoofMatl', 'ScreenPorch', 'Condition2', 'PoolQC', 
	'PoolArea', 'BsmtHalfBath', 'BsmtFullBath', 'BsmtQual', 'BsmtExposure',
	'BsmtFinType1', 'BsmtFinSF1', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF'], 
	axis=1, inplace=True)


train.drop(['MSSubClass', 'LotShape', 'Neighborhood', 'YearRemodAdd',
    'Exterior1st', 'Exterior2nd', 'GarageType', 'GarageYrBlt', 'Foundation',
	'GarageFinish', 'GarageCars','OpenPorchSF','WoodDeckSF'], axis=1, inplace=True)


train['MasVnrArea'] = train['MasVnrArea'].fillna(0)
train['TotSF'] = train['GrLivArea']-train['LowQualFinSF'] + train['TotalBsmtSF']
train.drop(['GrLivArea', 'LowQualFinSF', 'TotalBsmtSF'], axis=1, inplace=True)
train.to_csv('../train_clean.csv', index=False)



#Dummify
HS_dummy = pd.get_dummies(train['HouseStyle'], prefix='HS', prefix_sep='__', drop_first = True)
train = pd.concat([train.drop('HouseStyle', axis=1), HS_dummy], axis=1)
EQ_dummy = pd.get_dummies(train['ExterQual'], prefix='EQ', prefix_sep='__', drop_first = True)
train = pd.concat([train.drop('ExterQual', axis=1), EQ_dummy], axis=1)
HQC_dummy = pd.get_dummies(train['HeatingQC'], prefix='HQC', prefix_sep='__',drop_first = True)
train = pd.concat([train.drop('HeatingQC', axis=1), HQC_dummy], axis=1)
KQ_dummy = pd.get_dummies(train['KitchenQual'], prefix='KQ', prefix_sep='__',drop_first = True)
train = pd.concat([train.drop('KitchenQual', axis=1), KQ_dummy], axis=1)
## 
print(train.columns)


Index(['LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'MasVnrArea',
       'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageArea', 'MoSold', 'YrSold', 'SalePrice', 'Total_porch_sf',
       'TotSF', 'HS__1.5Unf', 'HS__1Story', 'HS__2.5Fin', 'HS__2.5Unf',
       'HS__2Story', 'HS__SFoyer', 'HS__SLvl', 'EQ__Fa', 'EQ__Gd', 'EQ__TA',
       'HQC__Fa', 'HQC__Gd', 'HQC__Po', 'HQC__TA', 'KQ__Fa', 'KQ__Gd',
       'KQ__TA'],
      dtype='object')


Helper funcs

In [178]:
def get_error(X_train, y_train, X_test, y_test, model, show = True):
    """
    This function takes a train/testing split and returns a list of 
    training error and testing error
    """
    model.fit(X_train, y_train)
    train_error = 1 - model.score(X_train, y_train)
    test_error  = 1 - model.score(X_test, y_test)
    return [train_error, test_error]


def get_zeros(data_frame):
    """
    This Function gives the percantage of Zero's after running 
    Lasso with many coefficients 
    """
    to_zero_coef = []
    to_zero_names = []
    for i in range(len(data_frame.columns)):
        if 0 in data_frame[data_frame.columns[i]].value_counts():
            to_zero_names.append(data_frame.columns[i])
            to_zero_coef.append(data_frame[data_frame.columns[i]].value_counts().loc[0]/len(data_frame[data_frame.columns[i]]))

        else:
            pass

    all_zeros = list(zip(to_zero_coef,to_zero_names))
    all_zeros = sorted(all_zeros, key = lambda x: x[0],reverse=True)
    return all_zeros



Run Ridge and Lasso

In [179]:
#Y = train.loc[:,'SalePrice']
Y = np.log1p(train.SalePrice)
X = train.drop(['SalePrice'], axis=1)

K-Fold Cross Validation

In [180]:
from sklearn.model_selection import RepeatedKFold
#KFold(n_splits=5, shuffle=False, random_state=None)



kf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1) 
for train_index, test_index in kf.split(X):
      #print("Train:", train_index, "Validation:", test_index)
      X_train, X_test = X.iloc[train_index], X.iloc[test_index] 
      y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

Set up Models and standardize

In [181]:
#Models
lasso = Lasso(alpha = 1e-4)
ridge = Ridge()
lm = LinearRegression()


ss = StandardScaler()
X_train = pd.DataFrame( ss.fit_transform( X_train ), columns = X_train.columns )
X_test = pd.DataFrame( ss.transform( X_test ), columns = X_test.columns )


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  if __name__ == '__main__':


Run Base Model after Rigorous Feature Selection

In [182]:
lm1 = get_error(X_train,y_train, X_test, y_test, lm ,show=True)
ridge1 = get_error(X_train,y_train, X_test, y_test, ridge ,show=True)
lasso1 = get_error(X_train,y_train, X_test, y_test, lasso ,show=True)

model_df = pd.DataFrame({"Model":['Linear', 'Ridge', 'Lasso (e-4)'],\
                         "Train Error" :[lm1[0], ridge1[0], lasso1[0]],\
                         "Test Error":[lm1[1], ridge1[1], lasso1[1]]})


In [183]:
model_df = pd.DataFrame({"Model":['Linear', 'Ridge', 'Lasso (e-4)'],\
                         "Train Error" :[lm1[0], ridge1[0], lasso1[0]],\
                         "Test Error":[lm1[1], ridge1[1], lasso1[1]]})

In [184]:
model_df

Unnamed: 0,Model,Train Error,Test Error
0,Linear,0.100432,0.118274
1,Ridge,0.100433,0.118202
2,Lasso (e-4),0.100437,0.118102


In [185]:
lasso  = Lasso()
alphas = np.linspace(1e-10,1e-4,100)
lasso.set_params(normalize=True)
coefs_lasso  = []
means_squared = []

for alpha in alphas:
        lasso.set_params(alpha=alpha)
        lasso.fit(X_train, y_train)  
        coefs_lasso.append(lasso.coef_)
        means_squared.append(mean_squared_error(lasso.predict(X_train),y_train))
        
mse_vs_lambda = pd.DataFrame(np.sqrt(means_squared), index = alphas, columns = ["RSS"])
coefs_lasso = pd.DataFrame(coefs_lasso, index = alphas, columns = X_train.columns)  
pd.set_option('display.max_columns', None)
get_zeros(coefs_lasso)
#mse_vs_lambda

[(1.0, 'HQC__Po'),
 (0.97, 'HS__1.5Unf'),
 (0.76, 'EQ__Gd'),
 (0.66, 'HS__1Story'),
 (0.31, 'MoSold'),
 (0.21, 'HS__2Story'),
 (0.17, 'HS__SFoyer'),
 (0.15, 'BedroomAbvGr'),
 (0.1, 'HS__2.5Unf')]

In [186]:
# drop cols with high Zeros' percentage for small lambda coefs
X_train.drop(['HS__1.5Unf','HQC__Po','EQ__Gd','HS__1Story','HS__2Story'],axis=1,inplace=True)
X_test.drop(['HS__1.5Unf','HQC__Po','EQ__Gd','HS__1Story','HS__2Story'],axis=1,inplace=True)

KeyError: "['HS__1.5Unf' 'HQC__Po' 'EQ__Gd' 'HS__1Story' 'HS__2Story'] not found in axis"

In [188]:
lm2 = get_error(X_train,y_train, X_test, y_test, lm ,show=True)
ridge2 = get_error(X_train,y_train, X_test, y_test, ridge ,show=True)
lasso2 = get_error(X_train,y_train, X_test, y_test, lasso ,show=True)
model_df = pd.DataFrame({"Model":['Linear', 'Ridge', 'Lasso (e-4)'],\
                         "Train Error" :[lm2[0], ridge2[0], lasso2[0]],\
                         "Test Error":[lm2[1], ridge2[1], lasso2[1]]})
model_df

Unnamed: 0,Model,Train Error,Test Error
0,Linear,0.100591,0.117832
1,Ridge,0.100591,0.117779
2,Lasso (e-4),0.103327,0.115286


Unnamed: 0,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,FullBath,HalfBath,BedroomAbvGr,TotRmsAbvGrd,Fireplaces,GarageArea,MoSold,YrSold,Total_porch_sf,TotSF,HS__1Story,HS__2.5Fin,HS__2.5Unf,HS__2Story,EQ__Fa,EQ__Gd,EQ__TA,HQC__Gd,HQC__TA,KQ__Fa,KQ__Gd,KQ__TA
0.010000,7055.201613,13801.423583,7969.256414,9770.165558,3833.914192,4858.637660,1308.386133,-6892.196481,4568.411466,3434.807235,6303.570995,-772.412298,-1927.325995,3303.167668,32458.798684,-1048.648088,1331.717417,-1797.557152,2096.101873,-2525.796592,-15728.641355,-18981.697245,-1945.172659,-3353.773196,-5970.670701,-19635.306426,-21448.114074
0.514949,7036.027048,13877.493228,7946.788569,9741.105529,3842.067855,4856.992689,1309.093924,-6861.486269,4563.124803,3432.491948,6312.787969,-751.572297,-1897.533378,3299.273803,32451.497675,-1015.857455,1313.205055,-1783.683706,2082.193592,-2458.143475,-15459.388873,-18706.469869,-1924.786262,-3342.973793,-5922.348210,-19540.276157,-21340.091924
1.019899,7016.857153,13953.526825,7924.296074,9712.035417,3850.167219,4855.341767,1309.801925,-6830.746613,4557.770060,3430.182543,6321.995429,-730.742549,-1867.764822,3295.365062,32444.227694,-983.083672,1294.713465,-1769.826113,2068.300995,-2390.738270,-15191.209415,-18432.349098,-1904.404375,-3332.169898,-5873.859196,-19444.736757,-21231.589662
1.524848,6997.695065,14029.500785,7901.763614,9682.949354,3858.179177,4853.681731,1310.510122,-6799.957809,4552.304924,3427.882872,6331.187461,-709.929331,-1838.035130,3291.432480,32437.005487,-950.337033,1276.255179,-1755.993582,2054.433896,-2323.731732,-14924.755662,-18160.007675,-1884.029360,-3321.358380,-5825.107004,-19348.389189,-21122.326953
2.029798,6978.543226,14105.397077,7879.179990,9653.843297,3866.079879,4852.010576,1311.218403,-6769.104568,4546.698366,3425.595863,6340.359720,-689.137180,-1808.355044,3287.469654,32429.841817,-917.624971,1257.839149,-1742.192449,2040.599541,-2257.232204,-14660.496763,-17889.928952,-1863.662674,-3310.536742,-5776.025082,-19251.025224,-21012.109676
2.534747,6959.389441,14181.307761,7856.605250,9624.740417,3873.999487,4850.341002,1311.926789,-6738.263496,4541.116414,3423.306519,6349.535421,-668.341434,-1778.666437,3283.511893,32422.669699,-884.907005,1239.416031,-1728.386314,2026.759430,-2190.646821,-14395.866097,-17619.467220,-1843.294851,-3299.717101,-5726.995694,-19153.825796,-20902.045711
3.039697,6940.233388,14257.235117,7834.040614,9595.641048,3881.940454,4848.673108,1312.635335,-6707.436580,4535.562432,3421.014469,6358.715026,-647.541622,-1748.968172,3279.559820,32415.488563,-852.182353,1220.984940,-1714.574637,2012.912759,-2123.964553,-14130.815860,-17348.573316,-1822.925835,-3288.899813,-5678.024457,-19056.809445,-20792.151899
3.544646,6921.087288,14333.087580,7811.427482,9566.523141,3889.776561,4846.995170,1313.343792,-6676.547193,4529.874317,3418.734610,6367.875872,-626.761690,-1719.316794,3275.579343,32408.360499,-819.490333,1202.593550,-1700.792084,1999.097227,-2057.759740,-13867.832688,-17079.810021,-1802.564280,-3278.072593,-5628.747379,-18958.846649,-20681.370937
4.049596,6901.940833,14408.942528,7788.815620,9537.405544,3897.615167,4845.317287,1314.052322,-6645.660003,4524.189706,3416.454348,6377.037200,-605.981277,-1689.664245,3271.599491,32401.232078,-786.797516,1184.201274,-1687.009029,1985.280851,-1991.543734,-13604.800990,-16810.996859,-1782.202710,-3267.245775,-5579.475527,-18860.901578,-20570.605867
4.554545,6882.794007,14484.800042,7766.205008,9508.288213,3905.456181,4843.639408,1314.760936,-6614.775107,4518.508538,3414.173665,6386.199000,-585.200399,-1660.010555,3267.620226,32394.103552,-754.103928,1165.808160,-1673.225540,1971.463637,-1925.316996,-13341.722755,-16542.135914,-1761.841167,-3256.419384,-5530.208188,-18762.972357,-20459.854775


In [134]:
X_train.shape


(1167, 27)

In [135]:
X_test.shape

(291, 27)

In [136]:
1167+291

1458

In [193]:
train_end = pd.concat([X_train, X_test], join="inner")
target_end = pd.concat([y_train, y_test], join="inner")

In [195]:
target_end.shape

(1458,)

In [196]:
parametersGrid = {"max_iter": [1, 5, 10],
                      "alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                      "l1_ratio": np.arange(0.01, 1.0, 0.1)}

In [197]:
eNet = ElasticNet()

In [198]:
grid = GridSearchCV(eNet, parametersGrid, scoring='r2', cv=10)

In [214]:
grid.fit(X_train,y_train)

















































































GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_iter': [1, 5, 10], 'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100], 'l1_ratio': array([0.01, 0.11, 0.21, 0.31, 0.41, 0.51, 0.61, 0.71, 0.81, 0.91])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='r2', verbose=0)

In [200]:
print(grid.best_estimator_.score(train_end,target_end))


0.8923279901588705


In [215]:
print(grid.best_estimator_.score(X_train,y_train))
print(grid.best_estimator_.score(X_test,y_test))

0.8980823119743877
0.8790542504889062


In [131]:
pd.concat()

Unnamed: 0,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,FullBath,HalfBath,BedroomAbvGr,TotRmsAbvGrd,Fireplaces,GarageArea,MoSold,YrSold,Total_porch_sf,TotSF,HS__2.5Fin,HS__2.5Unf,HS__SFoyer,HS__SLvl,EQ__Fa,EQ__TA,HQC__Fa,HQC__Gd,HQC__TA,KQ__Fa,KQ__Gd,KQ__TA
0,-0.192524,0.651865,-0.496973,1.046058,0.531172,1.381498,1.003507,0.173577,0.913929,-0.947056,0.346726,-1.605030,0.136814,-0.767891,0.017407,-0.065597,-0.092968,-0.153897,-0.226608,-0.097548,-1.285408,-0.193218,-0.447903,-0.642521,-0.165193,1.222125,-1.016416
1,0.069123,0.651865,-0.496973,0.979300,0.336903,1.381498,1.003507,0.173577,-0.320510,0.611779,0.630530,0.990277,0.136814,-0.891850,0.198643,-0.065597,-0.092968,-0.153897,-0.226608,-0.097548,-1.285408,-0.193218,-0.447903,-0.642521,-0.165193,1.222125,-1.016416
2,-0.089734,0.651865,-0.496973,-1.891326,-0.588731,0.010573,-0.792647,0.173577,0.296710,0.611779,0.791353,-1.605030,-1.369435,0.837046,-0.102985,-0.065597,-0.092968,-0.153897,-0.226608,-0.097548,0.777963,-0.193218,2.232625,-0.642521,-0.165193,1.222125,-1.016416
3,0.350393,1.376367,-0.496973,0.945920,1.411097,1.381498,1.003507,1.408722,1.531148,0.611779,1.708987,2.102551,0.136814,0.634798,1.023267,-0.065597,-0.092968,-0.153897,-0.226608,-0.097548,-1.285408,-0.193218,-0.447903,-0.642521,-0.165193,1.222125,-1.016416
4,0.336844,-0.797138,-0.496973,0.712265,-0.588731,0.010573,1.003507,-2.296714,-0.937729,-0.947056,0.025081,1.361035,0.889939,1.378549,-0.510766,-0.065597,-0.092968,-0.153897,-0.226608,-0.097548,0.777963,-0.193218,-0.447903,-0.642521,-0.165193,-0.818247,0.983849
5,-0.039834,1.376367,-0.496973,1.079438,0.474034,1.381498,-0.792647,0.173577,0.296710,0.611779,0.762972,0.619519,-0.616310,0.869666,1.071165,-0.065597,-0.092968,-0.153897,-0.226608,-0.097548,-1.285408,-0.193218,-0.447903,-0.642521,-0.165193,1.222125,-1.016416
6,-0.011988,0.651865,0.414926,0.044677,0.782579,1.381498,1.003507,0.173577,0.296710,2.170614,0.044001,1.731793,0.889939,3.185734,0.834263,-0.065597,-0.092968,-0.153897,-0.226608,-0.097548,0.777963,-0.193218,-0.447903,-0.642521,-0.165193,-0.818247,0.983849
7,-0.410251,0.651865,-0.496973,-1.357256,-0.588731,0.010573,-0.792647,-1.061569,0.913929,2.170614,-0.031680,-0.863514,0.136814,0.758756,0.224534,-0.065597,-0.092968,-0.153897,-0.226608,-0.097548,0.777963,-0.193218,2.232625,-0.642521,-0.165193,-0.818247,0.983849
8,-0.288773,-0.797138,0.414926,-1.090221,-0.588731,0.010573,-0.792647,-1.061569,-0.937729,2.170614,-1.275689,-1.975788,0.136814,-1.139767,-0.627275,-0.065597,-0.092968,-0.153897,-0.226608,-0.097548,0.777963,-0.193218,-0.447903,-0.642521,-0.165193,-0.818247,0.983849
9,0.064451,-0.797138,-0.496973,-0.222357,-0.588731,0.010573,-0.792647,0.173577,-0.937729,-0.947056,-0.429006,-1.605030,0.136814,-1.165863,-0.611741,-0.065597,-0.092968,-0.153897,-0.226608,-0.097548,0.777963,-0.193218,-0.447903,-0.642521,-0.165193,-0.818247,0.983849


In [201]:
grid.best_params_

{'alpha': 0.01, 'l1_ratio': 0.6100000000000001, 'max_iter': 10}

In [40]:
grid.best_params_

{'alpha': 0.01, 'l1_ratio': 0.32, 'max_iter': 10}

In [216]:
grid.best_params_

{'alpha': 0.01, 'l1_ratio': 0.01, 'max_iter': 10}

In [217]:
elastic_net = grid.best_estimator_

In [226]:
dump(elastic_net, 'elastic_net.joblib')

['elastic_net.joblib']

In [218]:
test = pd.read_csv('test_clean.csv')

In [219]:
test = pd.DataFrame( ss.fit_transform( test ), columns = test.columns )

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [220]:
predictions1 = elastic_net.predict(test)

In [221]:
predictions = np.expm1(predictions1)

In [222]:
predictions

array([127791.17580794, 152241.51943806, 167502.58930414, ...,
       186109.39644628, 112101.95753763, 227563.58442861])

In [210]:
predictions

array([128075.56617358, 157454.65352765, 171387.19648969, ...,
       177413.68027591, 115899.68181459, 220053.19261634])

In [126]:
predictions

array([127791.17580794, 152241.51943806, 167502.58930414, ...,
       186109.39644628, 112101.95753763, 227563.58442861])

In [211]:
index = range(1461,(1461+len(predictions)))

In [223]:
predictions = pd.DataFrame(data=predictions, index=index, columns=['SalePrice'])
predictions

Unnamed: 0,SalePrice
1461,127791.175808
1462,152241.519438
1463,167502.589304
1464,198051.841272
1465,184405.720767
1466,168624.285396
1467,185455.721803
1468,159200.756471
1469,186571.044723
1470,110816.554260


In [224]:
predictions.to_csv("kaggle_submission1.csv")

In [225]:
dump(elastic_net, 'elastic_net.joblib')

['my_randomforest_model.joblib']