In [1]:
%matplotlib notebook
# We start off with the baseline import statements we need to do the basic data manipulation and visualization.
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew

import matplotlib.pyplot as plt
import calendar
from sklearn.ensemble import RandomForestClassifier

sns.set_style("whitegrid")

#We create and set aside a copy of the data for initial exploration
housing_train = pd.read_csv('../data/train.csv')
housing = housing_train.copy()

#MISSING DATA
total = housing.isnull().sum().sort_values(ascending=False)
percent = (housing.isnull().sum()/housing.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

#CORRELATION CHECK
corr_matrix = housing.corr()
top_corr = corr_matrix['SalePrice'].sort_values(ascending = False)

top_corr

SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
GarageYrBlt      0.486362
MasVnrArea       0.477493
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.351799
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
Id              -0.021917
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
Name: SalePr

In [2]:
#DROPPING SOME COLUMNS
drop = ['PoolQC', 'PoolArea','MiscFeature', 'MiscVal', 'Alley', 'Fence', 'FireplaceQu', 'Fireplaces', 'LotFrontage']
drop2 = ['Id','GarageArea','1stFlrSF','GarageYrBlt','MSSubClass','BsmtFinSF2'] 
#dropped MSSubclass and BsmtFinSF2 and RoofMatl and Exterior2nd, and Condition2 for collinearity reasons
housing.drop(columns = drop + drop2, inplace = True)
housing['Age'] = housing['YrSold'] - housing['YearBuilt']
housing['AgeRemodel'] = housing['YrSold'] - housing['YearRemodAdd']
housing = housing[housing.AgeRemodel >= 0]
housing = housing[housing.GrLivArea < 4000]
housing.drop(columns = ['YearBuilt','YearRemodAdd'], inplace = True)

housing.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1456 entries, 0 to 1459
Data columns (total 66 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSZoning       1456 non-null   object 
 1   LotArea        1456 non-null   int64  
 2   Street         1456 non-null   object 
 3   LotShape       1456 non-null   object 
 4   LandContour    1456 non-null   object 
 5   Utilities      1456 non-null   object 
 6   LotConfig      1456 non-null   object 
 7   LandSlope      1456 non-null   object 
 8   Neighborhood   1456 non-null   object 
 9   Condition1     1456 non-null   object 
 10  Condition2     1456 non-null   object 
 11  BldgType       1456 non-null   object 
 12  HouseStyle     1456 non-null   object 
 13  OverallQual    1456 non-null   int64  
 14  OverallCond    1456 non-null   int64  
 15  RoofStyle      1456 non-null   object 
 16  RoofMatl       1456 non-null   object 
 17  Exterior1st    1456 non-null   object 
 18  Exterior

In [3]:
#FURTHER DATA CLEANING
housing_cat = housing.select_dtypes(exclude=[np.number])
housing_numeric = housing.select_dtypes(include=[np.number])

#Numeric
numeric_unbounded = ['LotArea', 'MasVnrArea','BsmtFinSF1', 'BsmtUnfSF',
                     'TotalBsmtSF','2ndFlrSF','LowQualFinSF','GrLivArea','WoodDeckSF',
                     'OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch', 'SalePrice',
                     'Age','AgeRemodel']

#numeric_one_hot = ['MSSubClass','MoSold']
numeric_one_hot = ['MoSold']
numeric_ordinal = [x for x in housing_numeric.columns 
                   if (x not in numeric_unbounded and x not in numeric_one_hot)]

housing_numeric_unbounded = housing_numeric[numeric_unbounded]
#housing_numeric_unbounded['MasVnrArea'] = housing_numeric_unbounded['MasVnrArea'].fillna(0)
housing_numeric_one_hot = housing_numeric[numeric_one_hot]
housing_numeric_ordinal = housing_numeric[numeric_ordinal]

#housing_numeric_one_hot['MSSubClass'] = housing_numeric_one_hot['MSSubClass'].astype('str')
housing_numeric_one_hot['MoSold'] = housing_numeric_one_hot['MoSold'].replace({i:calendar.month_name[i][:3] for i in range(1,13)})
housing_numeric_one_hot = pd.get_dummies(housing_numeric_one_hot)
#Categorical
cat_ordinal = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
               'BsmtFinType1', 'HeatingQC', 'KitchenQual','Functional','GarageFinish',
               'GarageQual', 'GarageCond']

housing_cat_ordinal = housing_cat[cat_ordinal]
housing_cat_ordinal.fillna('No', inplace = True)
housing_cat_one_hot = housing_cat.drop(columns = cat_ordinal)

housing_cat_one_hot = pd.get_dummies(housing_cat_one_hot)
#housing_cat_one_hot = housing_cat_one_hot.drop(columns = 'Exterior2nd_CBlock')

def mapper(cat):
    if cat in ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
               'HeatingQC', 'KitchenQual']:
        mapper = {'No':0, 'Po':1, 'Fa':2,'TA':3,'Gd':4,'Ex':5}
    elif cat == 'BsmtExposure':
            mapper = {'No':0,'No':1, 'Mn':2, 'Av':3,'Gd':4}
    elif cat == 'BsmtFinType1':
        mapper = {'No':0,'Unf':1,'LwQ':2,'Rec':3,'BLQ':4,'ALQ':5,'GLQ':6}
    elif cat == 'Functional':
        mapper = {'Sal':0,'Sev':1,'Maj2':2,'Maj1':3,'Mod':4,'Min2':5, 'Min1':6,'Typ':7}
    else:
        mapper = {'No':0,'Unf':1,'RFn':2,'Fin':3}
        
    return mapper

for cat in cat_ordinal:
    housing_cat_ordinal[cat].replace(mapper(cat), inplace = True)

#Combining numeric and categorical
housing_ordinal = pd.concat([housing_numeric_ordinal,housing_cat_ordinal], axis = 'columns')
housing_one_hot = pd.concat([housing_numeric_one_hot, housing_cat_one_hot], axis = 'columns')
housing_clean = pd.concat([housing_one_hot, housing_ordinal, housing_numeric_unbounded], 
                          axis = 'columns')
#Getting rid of skew
skewed_feats = housing_clean[numeric_unbounded].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

housing_clean[skewed_feats] = np.log1p(housing_clean[skewed_feats])

#MORE CORRELATION
ordinal_prices = pd.concat([housing_ordinal, housing['SalePrice']], axis = 'columns')
ordinal_corr_matrix = ordinal_prices.corr()
top_corr_ordinal = ordinal_corr_matrix['SalePrice'].sort_values(ascending = False)

one_hot_prices = pd.concat([housing_one_hot, housing['SalePrice']], axis = 'columns')
one_hot_corr_matrix = one_hot_prices.corr()
top_corr_one_hot = one_hot_corr_matrix['SalePrice'].filter(like = 'Neighborhood').sort_values(ascending = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  housing_numeric_one_hot['MoSold'] = housing_numeric_one_hot['MoSold'].replace({i:calendar.month_name[i][:3] for i in range(1,13)})
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [4]:
#REMOVAL OF THE REMAINING NaN
housing_clean.isnull().sum().sort_values(ascending=False)
df = housing_clean.copy()
problem_col = df.isin([np.nan, np.inf, -np.inf]).sum(axis=0)[df.isin([np.nan, np.inf, -np.inf]).sum(axis=0) != 0] 
index_to_drop = df[problem_col.index[0]][df[problem_col.index[0]].isin([np.nan, np.inf, -np.inf])].index
df.drop(index = index_to_drop, inplace = True)
df.isnull().sum().sort_values(ascending=False)

MoSold_Apr          0
Foundation_Slab     0
Foundation_Wood     0
BsmtFinType2_ALQ    0
BsmtFinType2_BLQ    0
                   ..
BldgType_1Fam       0
BldgType_2fmCon     0
BldgType_Duplex     0
BldgType_Twnhs      0
AgeRemodel          0
Length: 228, dtype: int64

In [5]:
#RANDOM FOREST FOR FEATURE IMPORTANCE
X_train = df.drop(columns = ['SalePrice'])
y_train = df['SalePrice']

forest = RandomForestClassifier(n_estimators=500, max_depth=4)

forest.fit(X_train, y_train)

forest.feature_importances_
score_df = pd.DataFrame({'feature':X_train.columns,
                            'importance_score': forest.feature_importances_})

ValueError: Unknown label type: 'continuous'

In [None]:
#We will look at feature importances and their correlation with the 'SalePrice'
score_df.sort_values('feature', inplace=True)
top_corr = df.corr()['SalePrice'].abs().drop(index = ['SalePrice']) #I suppose we want to look at the absolute value
                                                                    #of the correlation. Is that right?
top_corr.sort_index(inplace=True) 
#now rows of score_df and top_corr match and we can add the values of correlation
score_df['correlation'] = top_corr.values

In [None]:
score_df.sort_values(ascending=False, by = ['importance_score'], inplace = True)
score_df['importance_score_rank'] = [k for k in range(1,1+len(score_df.index))]
score_df.sort_values(ascending=False, by = ['correlation'], inplace = True)
score_df['correlation_rank'] = [k for k in range(1,1+len(score_df.index))]
score_df['overall_rank'] = (score_df['importance_score_rank'] + score_df['correlation_rank'])/2 
score_df.sort_values(ascending=True, by = ['overall_rank'], inplace = True)
score_df.reset_index(drop = True, inplace = True)
score_df.head(20)

Correlation measures only linear dependence between variables and it does not detect non-linear dependence (in particular cor(X,Y) can be 0 for random variables X and Y=X^2, which are of course completely dependent). So, if a given feature has high feature_importance score, but low correlation it means that 'SalePrice' depend on it in a non-linear manner.

In [None]:
plt.figure(figsize=(7, 7))
plt.scatter(score_df['importance_score_rank'][:20], score_df['correlation_rank'][:20], c ="blue",
            linewidths = 1)
plt.title('importance_rank vs correlation_rank')
plt.xticks(np.arange(0, 40, step=1))
plt.yticks(np.arange(0, 40, step=1))
plt.xlabel("importance_score_rank")
plt.ylabel("correlation_rank")
plt.show()

## Using Linear Regression models to predict housing prices
Below, we compare several candidate models to predict the housing prices. We will compare the baselined model of sale price average along with ordinary least squares, ridge regression, lasso regression, and elastic net regression. We will be operating with log housing prices to combat heteroscedasticity.

### Ordinary Least Squares Regression Coefficients blow up due to collinearity in categorical data
One weird thing is that the ordinary Least Squares Regression Coefficients are on radically different scales from one another. We see very negative coefficients along with very large positive coefficients. We make some plots of the scales of the regression coefficients below. We notice that the scales of the categorical variables seem to be much greater than those of the unbounded or the ordinal variables. Ordinary Least Squares is ill fit to deal with the multicollinearity of these categorical variables. This is taken care of via regularization which limits the size of the coefficients. We'll describe regularization later. OLS blowup can be mitigated by dummy encoding the variables instead of one-hot but I was still having collinearity problems on cross validation for the dummy encoding.

In [None]:
#We scale all the data note X_train and y_train are fairly clean
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
X = df.drop(columns = 'SalePrice')
y = (df['SalePrice'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 20)

#We call the standard Scaler object and fit it
scaler = StandardScaler()
scaler.fit(X_train)


#We work with the scaled training set and log prices
X_train = (scaler.transform(X_train))
X_test = (scaler.transform(X_test))

In [None]:
#Our baseline model looking at the average
y_pred_average = np.ones(len(y_test)) * np.mean(y_train)

In [None]:
#Our Simple Linear Regression Model
from sklearn.linear_model import LinearRegression
slr = LinearRegression(copy_X = True)
slr.fit(X_train, y_train)
y_pred_slr = (slr.predict(X_test))


In [None]:
#Comparison of the RMSE. It is huge on the new data but relatively good on the training set suggesting overfit
from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_test, y_pred_average, squared = False))
print(mean_squared_error(y_train, (slr.predict(X_train)), squared = False))

print(mean_squared_error((y_test), (y_pred_slr), squared = False))

Here we record some of the blow up of the coefficients

In [None]:
#Dataframe showing which coefficients correspond
print('Linear Regression Coefficients')
slr_coefficients = pd.DataFrame(slr.coef_.reshape(1,-1), columns = X.columns)
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')
print_full(slr_coefficients.T[0].sort_values(ascending = False))

In [None]:
# A plot of the scales of the linear regression coefficients.
log_slr_coef = np.log10(np.abs(slr.coef_))

fig, ax = plt.subplots()
sns.scatterplot(data = log_slr_coef)
plt.show()


In [None]:
#Plots of the unbounded and ordinal variable coefficient scales
numeric_train_unbounded = [x for x in numeric_unbounded if x != 'SalePrice']
log_coefs = np.log10(np.abs(slr_coefficients))
fig, ax = plt.subplots()
sns.scatterplot(data = log_coefs[numeric_train_unbounded])
fig, ax = plt.subplots()
sns.scatterplot(data = log_coefs[numeric_ordinal + cat_ordinal])


# Ordinary least squares on the numerical data
Instead we look at an OLS baseline for the numerical data, excluding the categorical variables.

In [None]:
X = df.drop(columns = 'SalePrice')
y = df['SalePrice']
numeric_train_unbounded = [x for x in numeric_unbounded if x!= 'SalePrice']
numeric_train = numeric_train_unbounded + numeric_ordinal + cat_ordinal

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 20)

#We call the standard Scaler object and fit it
X_train = X_train[numeric_train]
scaler = StandardScaler()
scaler.fit_transform(X_train)

slr = LinearRegression(copy_X = True)
slr.fit(X_train, y_train)
y_pred_slr = (slr.predict(X_train))

In [None]:
slr_coefficients = pd.DataFrame(slr.coef_.reshape(1,-1), columns = X_train.columns)
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')
print_full(slr_coefficients.T[0].sort_values(ascending = False))

In [None]:
# A plot of the scales of the linear regression coefficients.
log_slr_coef = np.log10(np.abs(slr.coef_))
fig, ax = plt.subplots()
sns.scatterplot(data = log_slr_coef)
plt.show()

plt.figure(figsize=(10,8))


residuals = y_train - y_pred_slr
plt.scatter(y_pred_slr, residuals)


plt.xlabel("predictions", fontsize=16)
plt.ylabel("residuals", fontsize=16)

plt.show()

In [None]:
X = df.drop(columns = 'SalePrice')
y = (df['SalePrice'])
numeric_train_unbounded = [x for x in numeric_unbounded if x!= 'SalePrice']
numeric_train = numeric_train_unbounded + numeric_ordinal + cat_ordinal
X = X[numeric_train]
from sklearn.model_selection import KFold
kfold = KFold(5, shuffle = True, random_state = 69)

mses = np.zeros((2,5))
i = 0

for train_index, test_index in kfold.split(X):
    x_t = X.iloc[train_index]
    scaler = StandardScaler()
    scaler.fit(x_t)
    x_t_scale = scaler.transform(x_t)
    y_t = y.iloc[train_index]
    
    x_ho = X.iloc[test_index]
    x_ho_scale = scaler.transform(x_ho)
    y_ho = y.iloc[test_index]
    
    pred0 = y_t.mean() * np.ones(len(x_ho))
    
    
    model = LinearRegression(copy_X = True)
    model.fit(x_t_scale,y_t)
    
    pred1 = (model.predict(x_ho_scale))
    
    mses[0,i] = mean_squared_error(np.exp(y_ho),np.exp(pred0), squared = False)
    
    mses[1,i] = mean_squared_error(np.exp(y_ho),np.exp(pred1), squared = False)
    
    i+=1

In [None]:
plt.figure(figsize=(8,5))

plt.scatter(np.zeros(5), 
            mses[0,:], 
            s=60, 
            c='white',
            edgecolor='black',
            label="Single Split")
plt.scatter(np.ones(5), 
            mses[1,:], 
            s=60, 
            c='white',
            edgecolor='black')

plt.scatter([0,1], np.mean(mses, axis=1), s=60, c='r', label="Mean")

plt.legend(fontsize=14)

plt.xticks([0,1],["Baseline", "OLS"], fontsize=14)
plt.yticks(fontsize=14)

plt.xlabel("Model", fontsize=16)
plt.ylabel("RMSE", fontsize=16)

plt.show()

(np.mean(mses[1]))

## Ridge Regression
The OLS coefficient blowup is due to the artifically high model complexity and the multicollinearity indtroduced by the one hot encoding of the categorical variables. We can keep them in check by constraining them with lasso,ridge and elastic net regularization models and seeing how they control the coefficients.

In [None]:
#We use gridsearchCV to fit a Ridge Regression model to the data
X = df.drop(columns = 'SalePrice')
y = (df['SalePrice'])

#Import Statements
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler


ridge_pipe = Pipeline([('scale',RobustScaler()),
                              ('ridge',Ridge())])


#Grid Search CV
parameters = {'ridge__alpha': np.logspace(0, 4, 10)}
ridge_cv = GridSearchCV(ridge_pipe,parameters,scoring='neg_mean_squared_error',cv=5)
ridge_cv.fit(X, y)

In [None]:
print(ridge_cv.best_score_)

# Lasso Estimator

In [None]:
X = df.drop(columns = 'SalePrice')
y = (df['SalePrice'])

from sklearn.linear_model import Lasso

lasso_pipe = Pipeline([('scale',RobustScaler()),
                              ('lasso',Lasso())])

parameters = {'lasso__alpha': np.logspace(-3, -1, 10)}
lasso_cv = GridSearchCV(lasso_pipe,parameters,scoring='neg_mean_squared_error',cv=5)
lasso_cv.fit(X,y)


In [None]:
lasso_cv.best_score_

# Comparison of the Ridge and Lasso Estimators
Here we look at the coefficients of the best Ridge and Lasso estimators.

In [None]:
ridge_estimator = ridge_cv.best_estimator_
lasso_estimator = lasso_cv.best_estimator_

ridge_coefs = ridge_estimator['ridge'].coef_
lasso_coefs = lasso_estimator['lasso'].coef_

In [None]:
ridge_coefs = pd.DataFrame(ridge_coefs.reshape(1,-1), columns = X.columns)
lasso_coefs= pd.DataFrame(lasso_coefs.reshape(1,-1), columns = X.columns)

In [None]:
#Coefficients of the Ridge Regression
print_full(ridge_coefs.T[0].sort_values(ascending = False))

In [None]:
#Coefficients of the Lasso Regression
print_full(lasso_coefs[lasso_coefs != 0].T[0].sort_values(ascending = False))

In [None]:
y_hat = ridge_estimator.predict(X)
residuals = y - y_hat

plt.figure(figsize=(10,8))

plt.scatter(y_hat, residuals)


plt.xlabel("predictions", fontsize=16)
plt.ylabel("residuals", fontsize=16)

plt.show()

plt.figure(figsize=(10,8))

plt.scatter(y, y_hat)


plt.xlabel("y", fontsize=16)
plt.ylabel("y_hat", fontsize=16)

plt.show()

In [None]:
y_hat = lasso_estimator.predict(X)
residuals = y - y_hat

plt.figure(figsize=(10,8))

plt.scatter(y_hat, residuals)
plt.xlabel("predictions", fontsize=16)
plt.ylabel("residuals", fontsize=16)

plt.show()

plt.figure(figsize=(10,8))

plt.scatter(y, y_hat)


plt.xlabel("y", fontsize=16)
plt.ylabel("y_hat", fontsize=16)

plt.show()

## Elastic Net
A mixture of both the lasso and the ridge regression.

In [None]:
X = df.drop(columns = 'SalePrice')
y = (df['SalePrice'])
from sklearn.linear_model import ElasticNet

elastic_pipe = Pipeline([('scale',RobustScaler()),
                              ('elastic',ElasticNet())])
parameters = {'elastic__alpha': np.logspace(-3, 0, 10), 'elastic__l1_ratio': [.1, .5, .7, .9, .95, .99,
1]}

elastic_cv = GridSearchCV(elastic_pipe,parameters,scoring='neg_mean_squared_error',cv=5)
elastic_cv.fit(X,y)


In [None]:
elastic_cv.best_score_

In [None]:
elastic_estimator = elastic_cv.best_estimator_
elastic_coefs = elastic_estimator['elastic'].coef_
elastic_coefs = pd.DataFrame(elastic_coefs.reshape(1,-1), columns = X.columns)
print_full(elastic_coefs[elastic_coefs != 0].T[0].sort_values(ascending = False))

In [None]:
y_hat = elastic_estimator.predict(X)
residuals = y - y_hat

plt.figure(figsize=(10,8))

plt.scatter(y_hat, residuals)


plt.xlabel("predictions", fontsize=16)
plt.ylabel("residuals", fontsize=16)

plt.show()

plt.figure(figsize=(10,8))

plt.scatter(y, y_hat)


plt.xlabel("y", fontsize=16)
plt.ylabel("y_hat", fontsize=16)

plt.show()

## Using ElasticNet for feature selection
Running ElasticNet on the dataset with zero'd features removed

In [None]:
non_zero = elastic_coefs.loc[:, (elastic_coefs != 0).any(axis=0)]

In [None]:
X = df[non_zero.columns]
y = (df['SalePrice'])

In [None]:
elastic_pipe = Pipeline([('scale',RobustScaler()),
                              ('elastic',ElasticNet())])
parameters = {'elastic__alpha': np.logspace(-3, 0, 10), 'elastic__l1_ratio': [.1, .5, .7, .9, .95, .99,
1]}

elastic_cv = GridSearchCV(elastic_pipe,parameters,scoring='neg_mean_squared_error',cv=5)
elastic_cv.fit(X,y)


In [None]:
print(elastic_cv.best_score_)
elastic_estimator = elastic_cv.best_estimator_
elastic_coefs = elastic_estimator['elastic'].coef_
elastic_coefs = pd.DataFrame(elastic_coefs.reshape(1,-1), columns = X.columns)
print_full(elastic_coefs[elastic_coefs != 0].T[0].sort_values(ascending = False))

In [None]:
y_hat = elastic_estimator.predict(X)
print(mean_squared_error(np.exp(y),np.exp(y_hat), squared = False))

# Further Ideas/Comments
- Introduce interaction terms between the top k most important features which are numeric and categorical after elastic net selection. (Regression after elastic net selection seems to do a good job?).
- For some reason after I introduced the skew the forest stopped working?
- Check out the boosted tree models, and ensemble them with the linear regression models