# Elastic Net Regression

In [11]:
%matplotlib inline
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import pickle
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import cross_val_score,KFold
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.preprocessing import RobustScaler
import warnings


warnings.filterwarnings("ignore")

# environment settings
data_path = 'Data/'

# Deserialize previously saved data from "preprocessing"
with open(data_path+'train_pp.obj', 'rb') as train_pp, \
open(data_path+'test_pp.obj','rb') as test_pp:
    train_df = pickle.load(train_pp)
    test_df = pickle.load(test_pp)
train_df["SalePrice"] = np.log1p(train_df["SalePrice"])

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,FullBath-Sq,TotRmsAbvGrd-Sq,Fireplaces-Sq,MasVnrArea-Sq,BsmtFinSF1-Sq,LotFrontage-Sq,WoodDeckSF-Sq,OpenPorchSF-Sq,2ndFlrSF-Sq,SalePrice
0,1,6.377215,6.557896,25.503637,2.578583,2.154845,17.874391,17.874391,9.383456,13.571795,...,1.108447,1.661092,0.0,3.063243,3.683992,2.560839,0.0,2.532642,3.780394,12.247699
1,2,4.192081,7.041123,26.291998,2.378866,2.759228,17.812419,17.812419,0.0,14.821045,...,1.108447,1.542357,0.86226,0.0,3.849811,2.653512,3.26117,0.0,0.0,12.109016
2,3,6.377215,6.661108,27.300424,2.578583,2.154845,17.869824,17.872108,8.848653,12.23756,...,1.108447,1.542357,0.86226,2.974669,3.498222,2.580912,0.0,2.368277,3.787509,12.317171
3,4,6.727938,6.377215,26.259338,2.578583,2.154845,17.669874,17.798555,0.0,9.664321,...,0.86226,1.605797,0.86226,0.0,3.108749,2.525315,0.0,2.288747,3.718514,11.849405
4,5,6.377215,7.157766,28.868815,2.759228,2.154845,17.867539,17.867539,11.144754,13.295773,...,1.108447,1.710107,0.86226,3.338376,3.646337,2.6754,3.053615,2.6754,3.887909,12.42922


In [12]:
X = train_df.loc[:,'Id':'2ndFlrSF-Sq']
y = train_df['SalePrice']
print("Shape of training set {}.\nShape of test set {}".format(X.shape,y.shape))

Shape of training set (1456, 349).
Shape of test set (1456,)


# Scaling
Using a robust scaler useful with outliers

In [13]:
scaler = RobustScaler()
X_scaled=scaler.fit(X).transform(X)
test_scaled=scaler.transform(test_df)

In [14]:
ENet = ElasticNetCV(alphas = [1, 0.1, 0.001, 0.0005],
                      cv=KFold(10, shuffle=True, 
                       random_state=1)).fit(X_scaled,y)
kf = KFold(5, shuffle=True, random_state=42).get_n_splits(X_scaled)
rmse_cv_enet= np.sqrt(-cross_val_score(ENet, X_scaled, y, scoring="neg_mean_squared_error", cv = kf))


print("The 10-fold crossvalidation RMSE of ENet is {:.5f} +/- {:.3f} , alpha :{}".format(rmse_cv_enet.mean(),
                                                                                          rmse_cv_enet.std(),
                                                                                          ENet.alpha_))

The 10-fold crossvalidation RMSE of ENet is 0.11029 +/- 0.006 , alpha :0.0005


In [15]:
ENet = ElasticNet(alpha=0.0008, l1_ratio=0.55, random_state=1)
kf = KFold(10, shuffle=True, random_state=42).get_n_splits(X_scaled)
rmse_cv_enet= np.sqrt(-cross_val_score(ENet, X_scaled, y, scoring="neg_mean_squared_error", cv = kf))
print("The 10-fold crossvalidation RMSE of Elastic Net is {:.5f} +/- {:.3f}".format(rmse_cv_enet.mean(),
                                                                                          rmse_cv_enet.std()))


The 10-fold crossvalidation RMSE of Elastic Net is 0.10758 +/- 0.015


Best is alpha .0008 , l1_ratio .55 , without label encoding -> 0.10854 +/- 0.014

In [None]:
# #Prediction on real test set using ENet
# ENet = ElasticNet(alpha=0.0008,l1_ratio=.55)
# ENet.fit(X_scaled,y)
# pred_results =ENet.predict(test_scaled)
# pred_results = np.expm1(pred_results)
# result_df = pd.DataFrame(data={'Id': test_df["Id"].values,
#                                'SalePrice': pred_results})
# #Create output csv file
# result_df.to_csv(data_path+"outputs/enet_0008_Scaled", index=False)