

# Modeling
- **Establish your baseline score.**
- Fit linear regression. Look at your coefficients. Are any of them wildly overblown?
- Fit lasso/ridge/elastic net with default parameters.
- Go back and remove features that might be causing issues in your models.
- Tune hyperparameters.
- **Identify a production model.** (This does not have to be your best performing Kaggle model, but rather the model that best answers your problem statement.)
- Refine and interpret your production model.

encode ordinal columns (0,1,2,3)
dummy nominal columns (ohe)

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer, make_column_selector
import statistics

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [3]:
# Lowercaseifying
def lowercaseify(df):
	# List lowercase columns
	new_cols = [column.lower().replace(" ", "_") for column in df.columns]
	
	# Dictionary of old:new columns
	old_cols = df.columns
	to_map = dict(zip(df.columns, new_cols))

	# Rename columns
	df.rename(columns = to_map, inplace = True)

	return df.columns

In [4]:
lowercaseify(train)

Index(['id', 'pid', 'ms_subclass', 'ms_zoning', 'lot_frontage', 'lot_area',
       'street', 'alley', 'lot_shape', 'land_contour', 'utilities',
       'lot_config', 'land_slope', 'neighborhood', 'condition_1',
       'condition_2', 'bldg_type', 'house_style', 'overall_qual',
       'overall_cond', 'year_built', 'year_remod/add', 'roof_style',
       'roof_matl', 'exterior_1st', 'exterior_2nd', 'mas_vnr_type',
       'mas_vnr_area', 'exter_qual', 'exter_cond', 'foundation', 'bsmt_qual',
       'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_sf_1',
       'bsmtfin_type_2', 'bsmtfin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf',
       'heating', 'heating_qc', 'central_air', 'electrical', '1st_flr_sf',
       '2nd_flr_sf', 'low_qual_fin_sf', 'gr_liv_area', 'bsmt_full_bath',
       'bsmt_half_bath', 'full_bath', 'half_bath', 'bedroom_abvgr',
       'kitchen_abvgr', 'kitchen_qual', 'totrms_abvgrd', 'functional',
       'fireplaces', 'fireplace_qu', 'garage_type', 'garage_yr_blt',
       'g

In [5]:
lowercaseify(test)

Index(['id', 'pid', 'ms_subclass', 'ms_zoning', 'lot_frontage', 'lot_area',
       'street', 'alley', 'lot_shape', 'land_contour', 'utilities',
       'lot_config', 'land_slope', 'neighborhood', 'condition_1',
       'condition_2', 'bldg_type', 'house_style', 'overall_qual',
       'overall_cond', 'year_built', 'year_remod/add', 'roof_style',
       'roof_matl', 'exterior_1st', 'exterior_2nd', 'mas_vnr_type',
       'mas_vnr_area', 'exter_qual', 'exter_cond', 'foundation', 'bsmt_qual',
       'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_sf_1',
       'bsmtfin_type_2', 'bsmtfin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf',
       'heating', 'heating_qc', 'central_air', 'electrical', '1st_flr_sf',
       '2nd_flr_sf', 'low_qual_fin_sf', 'gr_liv_area', 'bsmt_full_bath',
       'bsmt_half_bath', 'full_bath', 'half_bath', 'bedroom_abvgr',
       'kitchen_abvgr', 'kitchen_qual', 'totrms_abvgrd', 'functional',
       'fireplaces', 'fireplace_qu', 'garage_type', 'garage_yr_blt',
       'g

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               2051 non-null   int64  
 1   pid              2051 non-null   int64  
 2   ms_subclass      2051 non-null   int64  
 3   ms_zoning        2051 non-null   object 
 4   lot_frontage     1721 non-null   float64
 5   lot_area         2051 non-null   int64  
 6   street           2051 non-null   object 
 7   alley            140 non-null    object 
 8   lot_shape        2051 non-null   object 
 9   land_contour     2051 non-null   object 
 10  utilities        2051 non-null   object 
 11  lot_config       2051 non-null   object 
 12  land_slope       2051 non-null   object 
 13  neighborhood     2051 non-null   object 
 14  condition_1      2051 non-null   object 
 15  condition_2      2051 non-null   object 
 16  bldg_type        2051 non-null   object 
 17  house_style   

In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               878 non-null    int64  
 1   pid              878 non-null    int64  
 2   ms_subclass      878 non-null    int64  
 3   ms_zoning        878 non-null    object 
 4   lot_frontage     718 non-null    float64
 5   lot_area         878 non-null    int64  
 6   street           878 non-null    object 
 7   alley            58 non-null     object 
 8   lot_shape        878 non-null    object 
 9   land_contour     878 non-null    object 
 10  utilities        878 non-null    object 
 11  lot_config       878 non-null    object 
 12  land_slope       878 non-null    object 
 13  neighborhood     878 non-null    object 
 14  condition_1      878 non-null    object 
 15  condition_2      878 non-null    object 
 16  bldg_type        878 non-null    object 
 17  house_style     

## Pre-processing
- One-hot encode categorical variables.
- Train/test split your data.
- Scale your data.
- Consider using automated feature selection.


In [8]:
train.select_dtypes(include='int64').isna().sum()

id                 0
pid                0
ms_subclass        0
lot_area           0
overall_qual       0
overall_cond       0
year_built         0
year_remod/add     0
1st_flr_sf         0
2nd_flr_sf         0
low_qual_fin_sf    0
gr_liv_area        0
full_bath          0
half_bath          0
bedroom_abvgr      0
kitchen_abvgr      0
totrms_abvgrd      0
fireplaces         0
wood_deck_sf       0
open_porch_sf      0
enclosed_porch     0
3ssn_porch         0
screen_porch       0
pool_area          0
misc_val           0
mo_sold            0
yr_sold            0
saleprice          0
dtype: int64

In [9]:
train.select_dtypes(include='float64').isna().sum()

lot_frontage      330
mas_vnr_area       22
bsmtfin_sf_1        1
bsmtfin_sf_2        1
bsmt_unf_sf         1
total_bsmt_sf       1
bsmt_full_bath      2
bsmt_half_bath      2
garage_yr_blt     114
garage_cars         1
garage_area         1
dtype: int64

In [10]:
train.select_dtypes(include='object').isna().sum()

ms_zoning            0
street               0
alley             1911
lot_shape            0
land_contour         0
utilities            0
lot_config           0
land_slope           0
neighborhood         0
condition_1          0
condition_2          0
bldg_type            0
house_style          0
roof_style           0
roof_matl            0
exterior_1st         0
exterior_2nd         0
mas_vnr_type        22
exter_qual           0
exter_cond           0
foundation           0
bsmt_qual           55
bsmt_cond           55
bsmt_exposure       58
bsmtfin_type_1      55
bsmtfin_type_2      56
heating              0
heating_qc           0
central_air          0
electrical           0
kitchen_qual         0
functional           0
fireplace_qu      1000
garage_type        113
garage_finish      114
garage_qual        114
garage_cond        114
paved_drive          0
pool_qc           2042
fence             1651
misc_feature      1986
sale_type            0
dtype: int64

### Simple Imputing

#### Change year columns to integers representing time from 2010

In [11]:
train[['year_remod/add', 'year_built', 'yr_sold', 'garage_yr_blt']] = train[['year_remod/add', 'year_built', 'yr_sold', 'garage_yr_blt']].apply(lambda x: 2010 - x)

#### Impute discrete values with mode

In [12]:
si_most_frequent = SimpleImputer(strategy='most_frequent')

In [13]:
discretes = ['year_built',
             'year_remod/add', 
             'bsmt_full_bath', 
             'bsmt_half_bath',
             'garage_yr_blt',
             'garage_cars',
            ]

In [14]:
discretes

['year_built',
 'year_remod/add',
 'bsmt_full_bath',
 'bsmt_half_bath',
 'garage_yr_blt',
 'garage_cars']

In [15]:
train[discretes] = si_most_frequent.fit_transform(train[discretes])

#### Impute object datatypes with 'NA'

In [16]:
si_na = SimpleImputer(strategy='constant', fill_value='NA')

In [17]:
objects = train.select_dtypes(include='object').columns

In [18]:
objects

Index(['ms_zoning', 'street', 'alley', 'lot_shape', 'land_contour',
       'utilities', 'lot_config', 'land_slope', 'neighborhood', 'condition_1',
       'condition_2', 'bldg_type', 'house_style', 'roof_style', 'roof_matl',
       'exterior_1st', 'exterior_2nd', 'mas_vnr_type', 'exter_qual',
       'exter_cond', 'foundation', 'bsmt_qual', 'bsmt_cond', 'bsmt_exposure',
       'bsmtfin_type_1', 'bsmtfin_type_2', 'heating', 'heating_qc',
       'central_air', 'electrical', 'kitchen_qual', 'functional',
       'fireplace_qu', 'garage_type', 'garage_finish', 'garage_qual',
       'garage_cond', 'paved_drive', 'pool_qc', 'fence', 'misc_feature',
       'sale_type'],
      dtype='object')

In [19]:
train[objects] = si_na.fit_transform(train[objects])

#### Impute continous values with mean

In [20]:
si_mean = SimpleImputer(strategy='mean')

In [21]:
continous = ['lot_frontage', 
             'mas_vnr_area',
             'bsmtfin_sf_1',
             'bsmtfin_sf_2',
             'bsmt_unf_sf',
             'total_bsmt_sf',
             'garage_area',
           ]

In [22]:
continous             

['lot_frontage',
 'mas_vnr_area',
 'bsmtfin_sf_1',
 'bsmtfin_sf_2',
 'bsmt_unf_sf',
 'total_bsmt_sf',
 'garage_area']

In [23]:
train[continous] = si_mean.fit_transform(train[continous])

In [24]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               2051 non-null   int64  
 1   pid              2051 non-null   int64  
 2   ms_subclass      2051 non-null   int64  
 3   ms_zoning        2051 non-null   object 
 4   lot_frontage     2051 non-null   float64
 5   lot_area         2051 non-null   int64  
 6   street           2051 non-null   object 
 7   alley            2051 non-null   object 
 8   lot_shape        2051 non-null   object 
 9   land_contour     2051 non-null   object 
 10  utilities        2051 non-null   object 
 11  lot_config       2051 non-null   object 
 12  land_slope       2051 non-null   object 
 13  neighborhood     2051 non-null   object 
 14  condition_1      2051 non-null   object 
 15  condition_2      2051 non-null   object 
 16  bldg_type        2051 non-null   object 
 17  house_style   

#### Impute Test Data

In [25]:
test[['year_remod/add', 'year_built', 'yr_sold', 'garage_yr_blt']] = test[['year_remod/add', 'year_built', 'yr_sold', 'garage_yr_blt']].apply(lambda x: 2010 - x)

In [26]:
test[discretes] = si_most_frequent.fit_transform(test[discretes])

In [27]:
test[objects] = si_na.fit_transform(test[objects])

In [28]:
test[continous] = si_mean.fit_transform(test[continous])

In [29]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               878 non-null    int64  
 1   pid              878 non-null    int64  
 2   ms_subclass      878 non-null    int64  
 3   ms_zoning        878 non-null    object 
 4   lot_frontage     878 non-null    float64
 5   lot_area         878 non-null    int64  
 6   street           878 non-null    object 
 7   alley            878 non-null    object 
 8   lot_shape        878 non-null    object 
 9   land_contour     878 non-null    object 
 10  utilities        878 non-null    object 
 11  lot_config       878 non-null    object 
 12  land_slope       878 non-null    object 
 13  neighborhood     878 non-null    object 
 14  condition_1      878 non-null    object 
 15  condition_2      878 non-null    object 
 16  bldg_type        878 non-null    object 
 17  house_style     

### Ordinal Transforming

In [30]:
def ord1(a_string):
    cats = {'Reg':1,'IR1':2,'IR2':3,'IR3':4, 'NA':0}
    return cats[a_string]

train.lot_shape = train.lot_shape.apply(ord1)
test.lot_shape = test.lot_shape.apply(ord1)

In [31]:
def ord2(a_string):
    cats = {'AllPub':1,'NoSewr':2,'NoSeWa':3,'ELO':4,'NA':0}
    return cats[a_string]

train.utilities = train.utilities.apply(ord2)
test.utilities = test.utilities.apply(ord2)

In [32]:
def ord3(a_string):
    cats = {'Gtl':1,'Mod':2,'Sev':3,'NA':0}
    return cats[a_string]

train.land_slope = train.land_slope.apply(ord3)
test.land_slope = test.land_slope.apply(ord3)

In [33]:
def ord4(a_string):
    cats = {'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,'NA':0}
    return cats[a_string]

train.exter_qual = train.exter_qual.apply(ord4)
test.exter_qual = test.exter_qual.apply(ord4)

train.exter_cond = train.exter_cond.apply(ord4)
test.exter_cond = test.exter_cond.apply(ord4)

train.heating_qc = train.heating_qc.apply(ord4)
test.heating_qc = test.heating_qc.apply(ord4)

train.kitchen_qual = train.kitchen_qual.apply(ord4)
test.kitchen_qual = test.kitchen_qual.apply(ord4)

train.bsmt_qual = train.bsmt_qual.apply(ord4)
test.bsmt_qual = test.bsmt_qual.apply(ord4)

train.bsmt_cond = train.bsmt_cond.apply(ord4)
test.bsmt_cond = test.bsmt_cond.apply(ord4)

train.fireplace_qu = train.fireplace_qu.apply(ord4)
test.fireplace_qu = test.fireplace_qu.apply(ord4)

train.garage_qual = train.garage_qual.apply(ord4)
test.garage_qual = test.garage_qual.apply(ord4)

train.garage_cond = train.garage_cond.apply(ord4)
test.garage_cond = test.garage_cond.apply(ord4)

train.pool_qc = train.pool_qc.apply(ord4)
test.pool_qc = test.pool_qc.apply(ord4)

In [34]:
def ord5(a_string):
    cats = {'Gd':1,'Av':2,'Mn':3,'No':4,'NA':0}
    return cats[a_string]

train.bsmt_exposure = train.bsmt_exposure.apply(ord5)
test.bsmt_exposure = test.bsmt_exposure.apply(ord5)

In [35]:
def ord6(a_string):
    cats = {'GLQ':1,'ALQ':2,'BLQ':3,'Rec':4,'LwQ':5,'Unf':6,'NA':0}
    return cats[a_string]

train.bsmtfin_type_1 = train.bsmtfin_type_1.apply(ord6)
test.bsmtfin_type_1 = test.bsmtfin_type_1.apply(ord6)

train.bsmtfin_type_2 = train.bsmtfin_type_2.apply(ord6)
test.bsmtfin_type_2 = test.bsmtfin_type_2.apply(ord6)

In [36]:
train.central_air = train.central_air.apply(lambda x: 1 if x == 'Y' else 0)
test.central_air = test.central_air.apply(lambda x: 1 if x == 'Y' else 0)

In [37]:
def ord7(a_string):
    cats = {'SBrkr':1,'FuseA':2,'FuseF':3,'FuseP':4,'Mix':5,'NA':0}
    return cats[a_string]

train.electrical = train.electrical.apply(ord7)
test.electrical = test.electrical.apply(ord7)

In [38]:
def ord8(a_string):
    cats = {'Typ':1,'Min1':2,'Min2':3,'Mod':4,'Maj1':5,'Maj2':6,'Sev':7,'Sal':8, 'NA':0}
    return cats[a_string]

train.functional = train.functional.apply(ord8)
test.functional = test.functional.apply(ord8)

In [39]:
def ord9(a_string):
    cats = {'Fin':1,'RFn':2,'Unf':3,'NA':0}
    return cats[a_string]

train.garage_finish = train.garage_finish.apply(ord9)
test.garage_finish = test.garage_finish.apply(ord9)

In [40]:
def ord10(a_string):
    cats = {'Y':1,'P':1.5,'N':0, 'NA':0}
    return cats[a_string]

train.paved_drive = train.paved_drive.apply(ord10)
test.paved_drive = test.paved_drive.apply(ord10)

In [41]:
def ord11(a_string):
    cats = {'GdPrv':1,'MnPrv':2,'GdWo':3,'MnWw':4,'NA':0}
    return cats[a_string]

train.fence = train.fence.apply(ord11)
test.fence = test.fence.apply(ord11)

### Check for numeric values
All object datatypes should be nominal variables.

In [42]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               2051 non-null   int64  
 1   pid              2051 non-null   int64  
 2   ms_subclass      2051 non-null   int64  
 3   ms_zoning        2051 non-null   object 
 4   lot_frontage     2051 non-null   float64
 5   lot_area         2051 non-null   int64  
 6   street           2051 non-null   object 
 7   alley            2051 non-null   object 
 8   lot_shape        2051 non-null   int64  
 9   land_contour     2051 non-null   object 
 10  utilities        2051 non-null   int64  
 11  lot_config       2051 non-null   object 
 12  land_slope       2051 non-null   int64  
 13  neighborhood     2051 non-null   object 
 14  condition_1      2051 non-null   object 
 15  condition_2      2051 non-null   object 
 16  bldg_type        2051 non-null   object 
 17  house_style   

## Train/Test Split

In [43]:
X = train.drop(columns=['saleprice'])
X = pd.get_dummies(data=X, columns=X.select_dtypes('object').columns)

In [44]:
y = train.saleprice

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Model 1: Baseline with Linear Regression

### Model: Linear Regression

In [46]:
lr = LinearRegression()

In [47]:
lr.fit(X_train, y_train)

### Score

In [48]:
lr.score(X_train, y_train)

0.932399922306766

In [49]:
lr.score(X_test, y_test)

0.904776059928151

In [50]:
rmse_baseline = mean_squared_error(y_test, lr.predict(X_test), squared=True)
rmse_baseline

565774515.3238001

### Pipeline

In [51]:
lr1 = Pipeline(
    steps=[
           ("ss", StandardScaler()), 
           ("lr2", LinearRegression())]
)

In [52]:
lr1.fit(X_train, y_train)

In [53]:
lr1.score(X_test, y_test)

-2.8137765678061345e+23

In [54]:
mean_squared_error(y_test, lr1.predict(X_test), squared=True)

1.6718097073895532e+33

## Model 2: StandardScaler(), OneHotEncoder() & Linear Regression

In [55]:
lr2 = Pipeline(
    steps=[("poly", PolynomialFeatures()),
           ("ss", StandardScaler()), 
           ("lr2", LinearRegression())]
)

### Pipeline

In [56]:
lr2.fit(X_train, y_train)

### Score

In [57]:
lr2.score(X_train, y_train)

1.0

In [58]:
lr2.score(X_test, y_test)

-5487.8658372581385

In [59]:
rmse_lr2 = mean_squared_error(y_train, lr2.predict(X_train), squared=True)
rmse_lr2

8.100321981182873e-19

In [60]:
#Compare Model 1 to Model 2
rmse_lr2 - rmse_baseline 

-565774515.3238001

*Model 2 perfomed better than the baseline*

### Preprocessor

### Pipeline

### Score

## Model 3: Polynomial Features, Linear Regression

### Preprocessing

In [61]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer

In [62]:
preprocessor = ColumnTransformer([("ss", 
                                  StandardScaler(),
                                  make_column_selector(dtype_exclude='object')),
                                 ("poly", 
                                  PolynomialFeatures(include_bias=False),  
                                  make_column_selector(dtype_exclude='object'))])

### Pipeline

In [63]:
pipe3 = Pipeline(
    steps=[("preprocessor", preprocessor), 
           ("lr", LinearRegression())]
)

In [64]:
pipe3.fit(X_train, y_train)

In [65]:
pipe3.score(X_train, y_train)

0.9783621840401989

In [66]:
pipe3.score(X_test, y_test)

0.33020255888124517

In [67]:
rmse_lr3 = mean_squared_error(y_test, pipe3.predict(X_test), squared=True)
rmse_lr3

3979611874.1584744

In [68]:
(rmse_lr3 - rmse_baseline) < (rmse_lr2 - rmse_baseline)

False

*Model 3 did not perform better than Model 2*

## Model 4: Polynomial Features, Ridge, & GridSearchCV

In [69]:
pipe4 = Pipeline(
    steps=[("preprocessor", preprocessor), 
           ("ridge", Ridge())]
)

In [70]:
pipe4.fit(X_train, y_train)



In [71]:
params = {'alpha': np.logspace(0,4,50)}
ridge_grid = GridSearchCV(pipe4.named_steps["ridge"], param_grid=params, n_jobs=-1)
ridge_grid.fit(X_train,y_train)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, 

In [72]:
ridge_grid.best_score_

0.827773356928966

In [73]:
rmse_ridge = mean_squared_error(y_test, ridge_grid.predict(X_test), squared=True)
rmse_ridge

570245860.7059993

In [74]:
(rmse_ridge - rmse_baseline) < (rmse_lr2 - rmse_baseline)

False

*Model 4 did not perform better than model 2*

## Model 5: Lasso & GridSearchCV

In [75]:
pipe5 = Pipeline(steps=[("prep", preprocessor), 
                       ("lasso", Lasso())])

In [76]:
pipe5.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [77]:
params = {'alpha': np.logspace(0,4,50)}
lasso_grid = GridSearchCV(pipe5.named_steps["lasso"], param_grid=params, n_jobs=-1)
lasso_grid.fit(X_train,y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [78]:
lasso_grid.best_score_

0.8315836028533703

In [79]:
rmse_lasso = mean_squared_error(y_test, lasso_grid.predict(X_test), squared=True)
rmse_lasso

496756404.6341859

In [80]:
(rmse_lasso - rmse_baseline) < (rmse_lr2 - rmse_baseline)

False

*Model 5 did not perform better than model 2*

## Model 6: Polynomial Features, ElasticNet & GridSearchCV

In [81]:
pipe6 = Pipeline(steps=[("prep", preprocessor), 
                       ("elastic", ElasticNet())])

In [82]:
pipe6.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [83]:
params = {'alpha': np.logspace(0,4,50)}
elastic_grid = GridSearchCV(pipe6.named_steps["elastic"], param_grid=params, n_jobs=-1)
elastic_grid.fit(X_train,y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [84]:
elastic_grid.best_score_

0.7921547883335492

In [85]:
rmse_elastic = mean_squared_error(y_test, elastic_grid.predict(X_test), squared=True)
rmse_elastic

662231662.0303545

In [86]:
(rmse_elastic - rmse_baseline) < (rmse_lr2 - rmse_baseline)

False

*Model 6 did not perform better than Model 2*

## Process for submitting to Kaggle

**Model 2 performed the best according to RMSE**

In [110]:
predictions = pd.DataFrame(lr2.predict(X_test), columns=['sample_soln'])

In [111]:
predictions['Id'] = list(range(predictions.shape[0]))

In [112]:
predictions

Unnamed: 0,sample_soln,Id
0,133651.014836,0
1,184953.407944,1
2,266077.013624,2
3,86934.366633,3
4,168527.079864,4
...,...,...
406,293585.204874,406
407,53843.037251,407
408,264223.411307,408
409,288620.633856,409
