# Cross-Validation with Ridge and Lasso

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 200)
#SK LEARN
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.metrics import mean_squared_error


We will euse the Kings County housing dataset in this notebook

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/learn-co-curriculum/dsc-mod-2-project-v2-1/master/kc_house_data.csv',index_col=0)

## Data Cleaning and Prep

In [3]:
df['date'] = pd.to_datetime(df['date'])

df['yr_sold']=  df['date'].map(lambda x: x.year)

df['yrs_old'] =  df['yr_built'].map(lambda x: 2016-x)

df['yr_since_reno'] =  df['yr_renovated'].map(lambda x: 2016-x if x > 0 else np.nan)

df['yrs_since_update'] = df.apply(lambda x: min(x['yrs_old'], x['yr_since_reno']), axis=1)


df['bedrooms']=df['bedrooms'].map(lambda x: x if x < 10 else 10)

df.replace('?', 0, inplace=True)

df['sqft_basement'] = pd.to_numeric(df['sqft_basement'])

df.fillna(0, inplace=True)

In [4]:
zip_df = pd.get_dummies(df['zipcode'], drop_first=True)


In [5]:
df.shape

(21597, 24)

In [6]:
zip_df.shape

(21597, 69)

In [7]:
target= np.log(df.price)

features = df.drop(columns=['date', 'price', 'lat', 'long', 'yr_built', 'yr_renovated', 'yr_since_reno', 'zipcode'])

In [8]:
polynomial_features_2= PolynomialFeatures(degree=2, include_bias=False)
features_poly = polynomial_features_2.fit_transform(features)
poly_columns = polynomial_features_2.get_feature_names(features.columns)

In [9]:
features_poly = pd.DataFrame(features_poly, columns=poly_columns)

In [10]:
zip_df.reset_index(drop=True, inplace=True)

In [11]:
features_poly.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,sqft_living15,sqft_lot15,yr_sold,yrs_old,yrs_since_update,bedrooms^2,bedrooms bathrooms,bedrooms sqft_living,bedrooms sqft_lot,bedrooms floors,bedrooms waterfront,bedrooms view,bedrooms condition,bedrooms grade,bedrooms sqft_above,bedrooms sqft_basement,bedrooms sqft_living15,bedrooms sqft_lot15,bedrooms yr_sold,bedrooms yrs_old,bedrooms yrs_since_update,bathrooms^2,bathrooms sqft_living,bathrooms sqft_lot,bathrooms floors,bathrooms waterfront,bathrooms view,bathrooms condition,bathrooms grade,bathrooms sqft_above,bathrooms sqft_basement,bathrooms sqft_living15,bathrooms sqft_lot15,bathrooms yr_sold,bathrooms yrs_old,bathrooms yrs_since_update,sqft_living^2,sqft_living sqft_lot,sqft_living floors,...,view sqft_living15,view sqft_lot15,view yr_sold,view yrs_old,view yrs_since_update,condition^2,condition grade,condition sqft_above,condition sqft_basement,condition sqft_living15,condition sqft_lot15,condition yr_sold,condition yrs_old,condition yrs_since_update,grade^2,grade sqft_above,grade sqft_basement,grade sqft_living15,grade sqft_lot15,grade yr_sold,grade yrs_old,grade yrs_since_update,sqft_above^2,sqft_above sqft_basement,sqft_above sqft_living15,sqft_above sqft_lot15,sqft_above yr_sold,sqft_above yrs_old,sqft_above yrs_since_update,sqft_basement^2,sqft_basement sqft_living15,sqft_basement sqft_lot15,sqft_basement yr_sold,sqft_basement yrs_old,sqft_basement yrs_since_update,sqft_living15^2,sqft_living15 sqft_lot15,sqft_living15 yr_sold,sqft_living15 yrs_old,sqft_living15 yrs_since_update,sqft_lot15^2,sqft_lot15 yr_sold,sqft_lot15 yrs_old,sqft_lot15 yrs_since_update,yr_sold^2,yr_sold yrs_old,yr_sold yrs_since_update,yrs_old^2,yrs_old yrs_since_update,yrs_since_update^2
0,3.0,1.0,1180.0,5650.0,1.0,0.0,0.0,3.0,7.0,1180.0,0.0,1340.0,5650.0,2014.0,61.0,61.0,9.0,3.0,3540.0,16950.0,3.0,0.0,0.0,9.0,21.0,3540.0,0.0,4020.0,16950.0,6042.0,183.0,183.0,1.0,1180.0,5650.0,1.0,0.0,0.0,3.0,7.0,1180.0,0.0,1340.0,5650.0,2014.0,61.0,61.0,1392400.0,6667000.0,1180.0,...,0.0,0.0,0.0,0.0,0.0,9.0,21.0,3540.0,0.0,4020.0,16950.0,6042.0,183.0,183.0,49.0,8260.0,0.0,9380.0,39550.0,14098.0,427.0,427.0,1392400.0,0.0,1581200.0,6667000.0,2376520.0,71980.0,71980.0,0.0,0.0,0.0,0.0,0.0,0.0,1795600.0,7571000.0,2698760.0,81740.0,81740.0,31922500.0,11379100.0,344650.0,344650.0,4056196.0,122854.0,122854.0,3721.0,3721.0,3721.0
1,3.0,2.25,2570.0,7242.0,2.0,0.0,0.0,3.0,7.0,2170.0,400.0,1690.0,7639.0,2014.0,65.0,25.0,9.0,6.75,7710.0,21726.0,6.0,0.0,0.0,9.0,21.0,6510.0,1200.0,5070.0,22917.0,6042.0,195.0,75.0,5.0625,5782.5,16294.5,4.5,0.0,0.0,6.75,15.75,4882.5,900.0,3802.5,17187.75,4531.5,146.25,56.25,6604900.0,18611940.0,5140.0,...,0.0,0.0,0.0,0.0,0.0,9.0,21.0,6510.0,1200.0,5070.0,22917.0,6042.0,195.0,75.0,49.0,15190.0,2800.0,11830.0,53473.0,14098.0,455.0,175.0,4708900.0,868000.0,3667300.0,16576630.0,4370380.0,141050.0,54250.0,160000.0,676000.0,3055600.0,805600.0,26000.0,10000.0,2856100.0,12909910.0,3403660.0,109850.0,42250.0,58354321.0,15384946.0,496535.0,190975.0,4056196.0,130910.0,50350.0,4225.0,1625.0,625.0
2,2.0,1.0,770.0,10000.0,1.0,0.0,0.0,3.0,6.0,770.0,0.0,2720.0,8062.0,2015.0,83.0,83.0,4.0,2.0,1540.0,20000.0,2.0,0.0,0.0,6.0,12.0,1540.0,0.0,5440.0,16124.0,4030.0,166.0,166.0,1.0,770.0,10000.0,1.0,0.0,0.0,3.0,6.0,770.0,0.0,2720.0,8062.0,2015.0,83.0,83.0,592900.0,7700000.0,770.0,...,0.0,0.0,0.0,0.0,0.0,9.0,18.0,2310.0,0.0,8160.0,24186.0,6045.0,249.0,249.0,36.0,4620.0,0.0,16320.0,48372.0,12090.0,498.0,498.0,592900.0,0.0,2094400.0,6207740.0,1551550.0,63910.0,63910.0,0.0,0.0,0.0,0.0,0.0,0.0,7398400.0,21928640.0,5480800.0,225760.0,225760.0,64995844.0,16244930.0,669146.0,669146.0,4060225.0,167245.0,167245.0,6889.0,6889.0,6889.0
3,4.0,3.0,1960.0,5000.0,1.0,0.0,0.0,5.0,7.0,1050.0,910.0,1360.0,5000.0,2014.0,51.0,51.0,16.0,12.0,7840.0,20000.0,4.0,0.0,0.0,20.0,28.0,4200.0,3640.0,5440.0,20000.0,8056.0,204.0,204.0,9.0,5880.0,15000.0,3.0,0.0,0.0,15.0,21.0,3150.0,2730.0,4080.0,15000.0,6042.0,153.0,153.0,3841600.0,9800000.0,1960.0,...,0.0,0.0,0.0,0.0,0.0,25.0,35.0,5250.0,4550.0,6800.0,25000.0,10070.0,255.0,255.0,49.0,7350.0,6370.0,9520.0,35000.0,14098.0,357.0,357.0,1102500.0,955500.0,1428000.0,5250000.0,2114700.0,53550.0,53550.0,828100.0,1237600.0,4550000.0,1832740.0,46410.0,46410.0,1849600.0,6800000.0,2739040.0,69360.0,69360.0,25000000.0,10070000.0,255000.0,255000.0,4056196.0,102714.0,102714.0,2601.0,2601.0,2601.0
4,3.0,2.0,1680.0,8080.0,1.0,0.0,0.0,3.0,8.0,1680.0,0.0,1800.0,7503.0,2015.0,29.0,29.0,9.0,6.0,5040.0,24240.0,3.0,0.0,0.0,9.0,24.0,5040.0,0.0,5400.0,22509.0,6045.0,87.0,87.0,4.0,3360.0,16160.0,2.0,0.0,0.0,6.0,16.0,3360.0,0.0,3600.0,15006.0,4030.0,58.0,58.0,2822400.0,13574400.0,1680.0,...,0.0,0.0,0.0,0.0,0.0,9.0,24.0,5040.0,0.0,5400.0,22509.0,6045.0,87.0,87.0,64.0,13440.0,0.0,14400.0,60024.0,16120.0,232.0,232.0,2822400.0,0.0,3024000.0,12605040.0,3385200.0,48720.0,48720.0,0.0,0.0,0.0,0.0,0.0,0.0,3240000.0,13505400.0,3627000.0,52200.0,52200.0,56295009.0,15118545.0,217587.0,217587.0,4060225.0,58435.0,58435.0,841.0,841.0,841.0


In [14]:
features_ploy = pd.merge(features_poly, zip_df, left_index=True, right_index=True)

In [16]:
features_ploy

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,sqft_living15,sqft_lot15,yr_sold,yrs_old,yrs_since_update,bedrooms^2,bedrooms bathrooms,bedrooms sqft_living,bedrooms sqft_lot,bedrooms floors,bedrooms waterfront,bedrooms view,bedrooms condition,bedrooms grade,bedrooms sqft_above,bedrooms sqft_basement,bedrooms sqft_living15,bedrooms sqft_lot15,bedrooms yr_sold,bedrooms yrs_old,bedrooms yrs_since_update,bathrooms^2,bathrooms sqft_living,bathrooms sqft_lot,bathrooms floors,bathrooms waterfront,bathrooms view,bathrooms condition,bathrooms grade,bathrooms sqft_above,bathrooms sqft_basement,bathrooms sqft_living15,bathrooms sqft_lot15,bathrooms yr_sold,bathrooms yrs_old,bathrooms yrs_since_update,sqft_living^2,sqft_living sqft_lot,sqft_living floors,...,98032,98033,98034,98038,98039,98040,98042,98045,98052,98053,98055,98056,98058,98059,98065,98070,98072,98074,98075,98077,98092,98102,98103,98105,98106,98107,98108,98109,98112,98115,98116,98117,98118,98119,98122,98125,98126,98133,98136,98144,98146,98148,98155,98166,98168,98177,98178,98188,98198,98199
0,3.0,1.00,1180.0,5650.0,1.0,0.0,0.0,3.0,7.0,1180.0,0.0,1340.0,5650.0,2014.0,61.0,61.0,9.0,3.00,3540.0,16950.0,3.0,0.0,0.0,9.0,21.0,3540.0,0.0,4020.0,16950.0,6042.0,183.0,183.0,1.0000,1180.0,5650.0,1.0,0.0,0.0,3.00,7.00,1180.0,0.0,1340.0,5650.00,2014.0,61.00,61.00,1392400.0,6667000.0,1180.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,3.0,2.25,2570.0,7242.0,2.0,0.0,0.0,3.0,7.0,2170.0,400.0,1690.0,7639.0,2014.0,65.0,25.0,9.0,6.75,7710.0,21726.0,6.0,0.0,0.0,9.0,21.0,6510.0,1200.0,5070.0,22917.0,6042.0,195.0,75.0,5.0625,5782.5,16294.5,4.5,0.0,0.0,6.75,15.75,4882.5,900.0,3802.5,17187.75,4531.5,146.25,56.25,6604900.0,18611940.0,5140.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2.0,1.00,770.0,10000.0,1.0,0.0,0.0,3.0,6.0,770.0,0.0,2720.0,8062.0,2015.0,83.0,83.0,4.0,2.00,1540.0,20000.0,2.0,0.0,0.0,6.0,12.0,1540.0,0.0,5440.0,16124.0,4030.0,166.0,166.0,1.0000,770.0,10000.0,1.0,0.0,0.0,3.00,6.00,770.0,0.0,2720.0,8062.00,2015.0,83.00,83.00,592900.0,7700000.0,770.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4.0,3.00,1960.0,5000.0,1.0,0.0,0.0,5.0,7.0,1050.0,910.0,1360.0,5000.0,2014.0,51.0,51.0,16.0,12.00,7840.0,20000.0,4.0,0.0,0.0,20.0,28.0,4200.0,3640.0,5440.0,20000.0,8056.0,204.0,204.0,9.0000,5880.0,15000.0,3.0,0.0,0.0,15.00,21.00,3150.0,2730.0,4080.0,15000.00,6042.0,153.00,153.00,3841600.0,9800000.0,1960.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,3.0,2.00,1680.0,8080.0,1.0,0.0,0.0,3.0,8.0,1680.0,0.0,1800.0,7503.0,2015.0,29.0,29.0,9.0,6.00,5040.0,24240.0,3.0,0.0,0.0,9.0,24.0,5040.0,0.0,5400.0,22509.0,6045.0,87.0,87.0,4.0000,3360.0,16160.0,2.0,0.0,0.0,6.00,16.00,3360.0,0.0,3600.0,15006.00,4030.0,58.00,58.00,2822400.0,13574400.0,1680.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21592,3.0,2.50,1530.0,1131.0,3.0,0.0,0.0,3.0,8.0,1530.0,0.0,1530.0,1509.0,2014.0,7.0,7.0,9.0,7.50,4590.0,3393.0,9.0,0.0,0.0,9.0,24.0,4590.0,0.0,4590.0,4527.0,6042.0,21.0,21.0,6.2500,3825.0,2827.5,7.5,0.0,0.0,7.50,20.00,3825.0,0.0,3825.0,3772.50,5035.0,17.50,17.50,2340900.0,1730430.0,4590.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21593,4.0,2.50,2310.0,5813.0,2.0,0.0,0.0,3.0,8.0,2310.0,0.0,1830.0,7200.0,2015.0,2.0,2.0,16.0,10.00,9240.0,23252.0,8.0,0.0,0.0,12.0,32.0,9240.0,0.0,7320.0,28800.0,8060.0,8.0,8.0,6.2500,5775.0,14532.5,5.0,0.0,0.0,7.50,20.00,5775.0,0.0,4575.0,18000.00,5037.5,5.00,5.00,5336100.0,13428030.0,4620.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
21594,2.0,0.75,1020.0,1350.0,2.0,0.0,0.0,3.0,7.0,1020.0,0.0,1020.0,2007.0,2014.0,7.0,7.0,4.0,1.50,2040.0,2700.0,4.0,0.0,0.0,6.0,14.0,2040.0,0.0,2040.0,4014.0,4028.0,14.0,14.0,0.5625,765.0,1012.5,1.5,0.0,0.0,2.25,5.25,765.0,0.0,765.0,1505.25,1510.5,5.25,5.25,1040400.0,1377000.0,2040.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
21595,3.0,2.50,1600.0,2388.0,2.0,0.0,0.0,3.0,8.0,1600.0,0.0,1410.0,1287.0,2015.0,12.0,12.0,9.0,7.50,4800.0,7164.0,6.0,0.0,0.0,9.0,24.0,4800.0,0.0,4230.0,3861.0,6045.0,36.0,36.0,6.2500,4000.0,5970.0,5.0,0.0,0.0,7.50,20.00,4000.0,0.0,3525.0,3217.50,5037.5,30.00,30.00,2560000.0,3820800.0,3200.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
#call train_test_split on the data and capture the results
X_train, X_test, y_train, y_test = train_test_split(features_poly, target, random_state=22,test_size=0.25)


In [18]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled =pd.DataFrame(data=scaler.transform(X_train), columns=poly_columns)
X_test_scaled =pd.DataFrame(data=scaler.transform(X_test), columns=poly_columns)

In [19]:
X_test_scaled

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,sqft_living15,sqft_lot15,yr_sold,yrs_old,yrs_since_update,bedrooms^2,bedrooms bathrooms,bedrooms sqft_living,bedrooms sqft_lot,bedrooms floors,bedrooms waterfront,bedrooms view,bedrooms condition,bedrooms grade,bedrooms sqft_above,bedrooms sqft_basement,bedrooms sqft_living15,bedrooms sqft_lot15,bedrooms yr_sold,bedrooms yrs_old,bedrooms yrs_since_update,bathrooms^2,bathrooms sqft_living,bathrooms sqft_lot,bathrooms floors,bathrooms waterfront,bathrooms view,bathrooms condition,bathrooms grade,bathrooms sqft_above,bathrooms sqft_basement,bathrooms sqft_living15,bathrooms sqft_lot15,bathrooms yr_sold,bathrooms yrs_old,bathrooms yrs_since_update,sqft_living^2,sqft_living sqft_lot,sqft_living floors,...,view sqft_living15,view sqft_lot15,view yr_sold,view yrs_old,view yrs_since_update,condition^2,condition grade,condition sqft_above,condition sqft_basement,condition sqft_living15,condition sqft_lot15,condition yr_sold,condition yrs_old,condition yrs_since_update,grade^2,grade sqft_above,grade sqft_basement,grade sqft_living15,grade sqft_lot15,grade yr_sold,grade yrs_old,grade yrs_since_update,sqft_above^2,sqft_above sqft_basement,sqft_above sqft_living15,sqft_above sqft_lot15,sqft_above yr_sold,sqft_above yrs_old,sqft_above yrs_since_update,sqft_basement^2,sqft_basement sqft_living15,sqft_basement sqft_lot15,sqft_basement yr_sold,sqft_basement yrs_old,sqft_basement yrs_since_update,sqft_living15^2,sqft_living15 sqft_lot15,sqft_living15 yr_sold,sqft_living15 yrs_old,sqft_living15 yrs_since_update,sqft_lot15^2,sqft_lot15 yr_sold,sqft_lot15 yrs_old,sqft_lot15 yrs_since_update,yr_sold^2,yr_sold yrs_old,yr_sold yrs_since_update,yrs_old^2,yrs_old yrs_since_update,yrs_since_update^2
0,0.698568,-0.474522,0.136010,-0.181476,0.930263,-0.078021,-0.303802,-0.628011,0.291369,0.501745,-0.647824,-0.107805,-0.190030,1.446652,-0.236968,-0.173623,0.572810,-0.117102,0.269874,-0.146784,1.164248,-0.073901,-0.29044,0.128589,0.629841,0.579045,-0.578532,0.195623,-0.140322,0.700065,0.043532,0.109782,-0.542118,-0.263112,-0.191848,0.061846,-0.072187,-0.278164,-0.683035,-0.341176,-0.103535,-0.537788,-0.391958,-0.200087,-0.473759,-0.293331,-0.232239,-0.056599,-0.159966,0.509276,...,-0.283617,-0.116044,-0.303803,-0.261719,-0.259414,-0.620832,-0.346034,0.216671,-0.623635,-0.394025,-0.226044,-0.626491,-0.391734,-0.341037,0.207787,0.352367,-0.604220,-0.061403,-0.176433,0.293688,-0.120753,-0.049805,0.235876,-0.471805,0.075146,-0.122050,0.502656,0.276448,0.369168,-0.441842,-0.539149,-0.215453,-0.647828,-0.566993,-0.561861,-0.228493,-0.208074,-0.106862,-0.181833,-0.118398,-0.092790,-0.189931,-0.207197,-0.199343,1.446652,-0.236535,-0.173184,-0.452995,-0.416630,-0.404742
1,0.698568,-0.474522,0.103180,-0.254240,-0.913867,-0.078021,-0.303802,-0.628011,-0.563268,-0.625955,1.408665,-0.342119,-0.324746,1.446652,0.714699,0.791977,0.572810,-0.117102,0.245257,-0.227542,-0.457905,-0.073901,-0.29044,0.128589,0.195405,-0.313058,1.357447,0.018473,-0.294102,0.700065,1.106272,1.200823,-0.542118,-0.276054,-0.239340,-0.778122,-0.072187,-0.278164,-0.683035,-0.555261,-0.572751,0.683965,-0.487639,-0.282906,-0.473759,0.547921,0.673254,-0.081037,-0.211308,-0.502387,...,-0.283617,-0.116044,-0.303803,-0.261719,-0.259414,-0.620832,-0.868131,-0.793959,1.037730,-0.585501,-0.345671,-0.626491,0.321648,0.375485,-0.577401,-0.609195,1.083052,-0.449521,-0.321293,-0.561279,0.642955,0.733659,-0.549414,0.634432,-0.524622,-0.260783,-0.625444,0.280915,0.373914,0.874384,0.828663,-0.005919,1.409376,1.766566,1.899987,-0.406213,-0.320195,-0.341260,0.597931,0.686591,-0.097532,-0.324689,-0.229963,-0.223602,1.446652,0.715454,0.792741,0.460432,0.530046,0.539025
2,-0.411261,-0.474522,-0.115684,-0.116716,-0.913867,-0.078021,-0.303802,0.912485,0.291369,0.222851,-0.647824,-0.019937,-0.160093,-0.691251,0.170889,0.240206,-0.474802,-0.535811,-0.322988,-0.144025,-0.863443,-0.073901,-0.29044,0.128589,-0.239031,-0.114013,-0.578532,-0.283236,-0.195854,-0.411796,0.024555,0.090299,-0.542118,-0.362334,-0.149581,-0.778122,-0.072187,-0.278164,-0.051305,-0.341176,-0.219578,-0.537788,-0.356077,-0.181683,-0.474890,0.067205,0.155829,-0.235381,-0.133207,-0.593119,...,-0.283617,-0.116044,-0.303803,-0.261719,-0.259414,0.811738,1.046225,0.680329,-0.623635,0.463626,-0.106418,0.911568,0.338634,0.392545,0.207787,0.149236,-0.604220,0.000081,-0.148167,0.290294,0.343272,0.426224,0.002487,-0.471805,-0.022144,-0.124100,0.222478,0.578979,0.690581,-0.441842,-0.539149,-0.215453,-0.647828,-0.566993,-0.561861,-0.157842,-0.177762,-0.020394,0.289077,0.367746,-0.091336,-0.160141,-0.086758,-0.071004,-0.691251,0.170617,0.239928,-0.121752,-0.073330,-0.062497
3,0.698568,-0.474522,-0.389264,-0.153583,-0.913867,-0.078021,-0.303802,-0.628011,-0.563268,-1.123114,1.317266,-0.283540,-0.146996,-0.691251,0.000948,0.067777,0.572810,-0.117102,-0.123999,-0.115827,-0.457905,-0.073901,-0.29044,0.128589,0.195405,-0.706350,1.271404,0.062760,-0.091198,0.697861,0.309217,0.382543,-0.542118,-0.470183,-0.173643,-0.778122,-0.072187,-0.278164,-0.683035,-0.555261,-0.779610,0.629665,-0.463719,-0.173631,-0.474890,-0.083018,-0.005866,-0.407341,-0.172691,-0.706534,...,-0.283617,-0.116044,-0.303803,-0.261719,-0.259414,-0.620832,-0.868131,-1.239505,0.963892,-0.537632,-0.187830,-0.628785,-0.213389,-0.161907,-0.577401,-0.926036,1.008062,-0.413656,-0.174445,-0.564249,-0.067584,0.004740,-0.761925,0.244007,-0.731595,-0.231642,-1.123296,-0.635206,-0.599384,0.759987,0.797746,0.222531,1.316969,0.953356,1.042070,-0.363240,-0.190971,-0.283962,-0.037035,0.031084,-0.090654,-0.147046,-0.110855,-0.096681,-0.691251,0.000703,0.067527,-0.270749,-0.227750,-0.216442
4,-0.411261,0.176627,-0.630015,-0.181112,-0.913867,-0.078021,-0.303802,-0.628011,-0.563268,-0.771465,0.151922,-0.649656,-0.150776,-0.691251,-0.440897,-0.380538,-0.474802,-0.176917,-0.612238,-0.197628,-0.863443,-0.073901,-0.29044,-0.646340,-0.564858,-0.703952,-0.013871,-0.640305,-0.187877,-0.411796,-0.487838,-0.435738,0.000623,-0.380206,-0.157169,-0.538131,-0.072187,-0.278164,-0.141553,-0.127092,-0.467520,0.073089,-0.350097,-0.119105,0.176165,-0.198905,-0.130602,-0.539391,-0.200332,-0.806339,...,-0.283617,-0.116044,-0.303803,-0.261719,-0.259414,-0.620832,-0.868131,-0.924362,0.022451,-0.836812,-0.191186,-0.628785,-0.544602,-0.494578,-0.577401,-0.701929,0.051942,-0.637816,-0.177568,-0.564249,-0.507441,-0.446495,-0.620080,-0.082251,-0.661422,-0.202783,-0.771698,-0.673784,-0.640370,-0.242783,-0.071053,-0.039261,0.151799,-0.126995,-0.097674,-0.615889,-0.225924,-0.650028,-0.605144,-0.555403,-0.090854,-0.150825,-0.218242,-0.211113,-0.691251,-0.441072,-0.380716,-0.584739,-0.553170,-0.540862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5395,-1.521091,-1.451244,-1.034914,-0.301779,-0.913867,-0.078021,-0.303802,0.912485,0.291369,-0.795717,-0.647824,-0.444631,-0.354683,-0.691251,1.496426,1.585149,-1.223096,-1.313414,-1.071758,-0.315838,-1.268982,-0.073901,-0.29044,-0.904649,-1.107903,-0.989329,-0.578532,-0.989069,-0.396622,-1.521454,0.290240,0.363060,-1.101820,-0.933626,-0.288278,-1.138109,-0.072187,-0.278164,-1.134270,-1.075180,-0.887724,-0.537788,-0.960069,-0.332861,-1.451473,0.092958,0.183548,-0.720783,-0.264636,-0.974194,...,-0.283617,-0.116044,-0.303803,-0.261719,-0.259414,0.811738,1.046225,-0.536773,-0.623635,0.000893,-0.336810,0.911568,1.663487,1.723229,0.207787,-0.592635,-0.604220,-0.297092,-0.331893,0.290294,1.851355,1.973317,-0.631177,-0.471805,-0.622821,-0.278232,-0.795946,0.621009,0.735234,-0.441842,-0.539149,-0.215453,-0.647828,-0.566993,-0.561861,-0.479079,-0.344422,-0.445031,1.216001,1.324656,-0.098186,-0.354694,-0.210869,-0.203256,-0.691251,1.495943,1.584656,1.578689,1.689009,1.694426
5396,-0.411261,-0.148948,-0.257946,-0.291228,-0.913867,-0.078021,-0.303802,2.452981,-0.563268,-1.050359,1.454365,-0.737524,-0.333203,1.446652,1.462438,1.550664,-0.474802,-0.356364,-0.402993,-0.289289,-0.863443,-0.073901,-0.29044,0.903517,-0.564858,-0.869423,0.905719,-0.690128,-0.344059,-0.410144,1.106272,1.200823,-0.287708,-0.305020,-0.256527,-0.658127,-0.072187,-0.278164,1.031659,-0.341176,-0.683028,0.889530,-0.522666,-0.275702,-0.148069,1.586609,1.791259,-0.327708,-0.244255,-0.652095,...,-0.283617,-0.116044,-0.303803,-0.261719,-0.259414,2.653615,1.568323,-0.507795,2.206839,0.272150,-0.269575,2.455746,2.376870,2.439752,-0.577401,-0.879669,1.120547,-0.691615,-0.328280,-0.561279,1.387330,1.497288,-0.735936,0.347372,-0.785466,-0.279947,-1.049998,0.222845,0.312220,0.933533,0.643335,-0.013365,1.455091,2.613561,2.793546,-0.670881,-0.341296,-0.736808,0.865725,0.963048,-0.097731,-0.333149,-0.167437,-0.156975,1.446652,1.463445,1.551682,1.523168,1.631467,1.637061
5397,1.808398,3.106794,2.105788,-0.202990,0.930263,-0.078021,-0.303802,-0.628011,1.146006,2.684391,-0.647824,2.498938,-0.227451,1.446652,-1.494529,-1.449595,1.919739,3.591467,2.567470,-0.125484,1.975325,-0.073901,-0.29044,0.903517,2.041759,3.264944,-0.578532,3.187800,-0.112556,1.810273,-1.351314,-1.322209,4.122063,3.225036,-0.038895,2.701747,-0.072187,-0.278164,2.295118,2.900671,3.975977,-0.537788,4.140118,0.015505,3.108824,-1.357772,-1.377964,2.023697,-0.082763,2.142453,...,-0.283617,-0.116044,-0.303803,-0.261719,-0.259414,-0.620832,0.176063,2.172729,-0.623635,1.736141,-0.259274,-0.626491,-1.334418,-1.287870,1.097667,2.383681,-0.604220,2.235285,-0.182616,1.148655,-1.546665,-1.512601,2.951578,-0.471805,3.274111,-0.006677,2.686077,-1.339758,-1.347909,-0.441842,-0.539149,-0.215453,-0.647828,-0.566993,-0.561861,2.796630,-0.061315,2.500824,-1.433416,-1.410470,-0.094402,-0.227364,-0.466252,-0.475390,1.446652,-1.494520,-1.449585,-0.905630,-0.885741,-0.872412
5398,-0.411261,-0.148948,-0.257946,0.042031,-0.913867,-0.078021,-0.303802,2.452981,-0.563268,-1.001856,1.362965,-0.254251,0.151698,-0.691251,-0.033040,0.033291,-0.474802,-0.356364,-0.402993,-0.011884,-0.863443,-0.073901,-0.29044,0.903517,-0.564858,-0.840645,0.841186,-0.416099,0.071082,-0.411796,-0.146243,-0.085046,-0.287708,-0.305020,-0.007944,-0.658127,-0.072187,-0.278164,1.031659,-0.341176,-0.659963,0.827473,-0.297130,0.064987,-0.149362,0.075789,0.165069,-0.327708,-0.051617,-0.652095,...,-0.283617,-0.116044,-0.303803,-0.261719,-0.259414,2.653615,1.568323,-0.435348,2.083774,0.930348,0.448075,2.451922,0.508487,0.563146,-0.577401,-0.848758,1.045557,-0.395723,0.072321,-0.564249,-0.101419,-0.029970,-0.717636,0.345823,-0.671542,-0.129883,-1.002056,-0.563330,-0.523022,0.816536,0.844122,0.641550,1.362662,0.954142,1.042899,-0.341389,0.027160,-0.254677,-0.053568,0.014016,-0.067556,0.151594,0.203515,0.238310,-0.691251,-0.033280,0.033047,-0.298666,-0.256683,-0.245287


In [20]:
#instantiate a linear regression object
lm = LinearRegression()

#fit the linear regression to the data
lm = lm.fit(X_train_scaled, y_train)


print(lm.intercept_)
print(lm.coef_)

13.049903861156425
[-2.91305445e+00  1.23870022e+01  2.07153924e+01 -4.39179096e+00
 -1.90808328e+01 -8.41455939e+00 -1.51481175e+01  3.36490873e+01
 -1.07452644e+01 -8.86740641e+00 -1.33832994e+01  9.00938543e+00
 -2.62928673e+00  1.48549963e-02  2.10498164e+01 -6.74514164e+01
  4.20955744e-02  3.60245391e-02 -1.77223612e-01  4.69499387e-02
  2.56430934e-02  5.35700275e-03 -1.93386422e-02 -2.32943924e-02
  3.72462724e-02  1.07697021e-01  7.81923727e-03  7.95898903e-02
 -1.07808292e-02  2.73691438e+00  4.69290332e-02  8.41215226e-02
 -8.72827541e-02  7.98901874e-01 -5.02274369e-03 -7.48220924e-02
  4.87133674e-03 -1.65172959e-02  3.24840845e-02  1.10047810e-01
 -5.89293144e-01 -2.08101398e-01 -8.55275875e-02 -2.37199242e-02
 -1.23101934e+01 -4.18824069e-02 -2.14053709e-02 -8.29762053e-01
  4.32568850e-02 -1.43949691e-01  4.00095127e-03 -7.88699499e-04
  4.87221244e-03 -2.03428149e-01  8.39757778e-01  6.52766937e-02
 -2.97020647e-01 -8.64609879e-02 -2.01017450e+01 -1.03290882e-01
  8.86

In [22]:
y_train_pred = lm.predict(X_train)
y_pred = lm.predict(X_test)

In [23]:
train_mae = metrics.mean_absolute_error(y_train, y_train_pred)
train_mse = metrics.mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))


print('Mean Absolute Error:', train_mae )
print('Mean Squared Error:',  train_mse)
print('Root Mean Squared Error:' , train_rmse)

Mean Absolute Error: 188323862.07597286
Mean Squared Error: 8.28947623045237e+17
Root Mean Squared Error: 910465607.8321888


In [26]:
sum(np.abs(lm.coef_))

525.1409558976251

In [27]:
from sklearn.feature_selection import RFECV
ols = LinearRegression()

In [29]:
#recursive wrapper method

# Create recursive feature eliminator that scores features by mean squared errors
selector = RFECV(estimator=ols, step=3, cv=5, scoring='neg_mean_squared_error', verbose =1, n_jobs=-1)

# Fit recursive feature eliminator 
selector.fit(X_train_scaled, y_train)



Fitting estimator with 152 features.
Fitting estimator with 149 features.
Fitting estimator with 146 features.
Fitting estimator with 143 features.
Fitting estimator with 140 features.
Fitting estimator with 137 features.
Fitting estimator with 134 features.
Fitting estimator with 131 features.
Fitting estimator with 128 features.
Fitting estimator with 125 features.
Fitting estimator with 122 features.
Fitting estimator with 119 features.
Fitting estimator with 116 features.
Fitting estimator with 113 features.
Fitting estimator with 110 features.
Fitting estimator with 107 features.
Fitting estimator with 104 features.
Fitting estimator with 101 features.
Fitting estimator with 98 features.
Fitting estimator with 95 features.
Fitting estimator with 92 features.
Fitting estimator with 89 features.
Fitting estimator with 86 features.
Fitting estimator with 83 features.


RFECV(cv=5,
      estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                 normalize=False),
      min_features_to_select=1, n_jobs=-1, scoring='neg_mean_squared_error',
      step=3, verbose=1)

In [30]:
selected_columns = X_train_scaled.columns[selector.support_]
removed_columns = X_train_scaled.columns[~selector.support_]

In [31]:
len(selected_columns)

80

In [43]:
removed_columns

Index(['yr_sold', 'bedrooms sqft_lot', 'bedrooms floors',
       'bedrooms waterfront', 'bedrooms view', 'bedrooms condition',
       'bedrooms grade', 'bedrooms sqft_basement', 'bedrooms sqft_lot15',
       'bedrooms yrs_since_update', 'bathrooms sqft_lot',
       'bathrooms waterfront', 'bathrooms view', 'bathrooms condition',
       'bathrooms sqft_lot15', 'bathrooms yrs_since_update',
       'sqft_living waterfront', 'sqft_living view', 'sqft_living condition',
       'sqft_living sqft_lot15', 'sqft_living yrs_since_update', 'sqft_lot^2',
       'sqft_lot floors', 'sqft_lot waterfront', 'sqft_lot view',
       'sqft_lot condition', 'sqft_lot sqft_above', 'sqft_lot sqft_basement',
       'sqft_lot sqft_lot15', 'sqft_lot yrs_since_update', 'floors waterfront',
       'floors view', 'floors grade', 'floors sqft_lot15', 'floors yrs_old',
       'waterfront view', 'waterfront condition', 'waterfront grade',
       'waterfront sqft_above', 'waterfront sqft_basement',
       'waterfront s

In [32]:
lm_rfe = LinearRegression()

lm_rfe = lm_rfe.fit(X_train_scaled[selected_columns], y_train)

y_rfe=lm_rfe.predict(X_train_scaled[selected_columns])

trainRFE_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_rfe))

print('Training Root Mean Squared Error:' , trainRFE_rmse)

y_pred_rfe = lm_rfe.predict(X_test_scaled[selected_columns])

testRFE_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred_rfe))

print('Testing Root Mean Squared Error:' , testRFE_rmse)


print("Train RMSE", int(trainRFE_rmse), "Test RMSE: ", int(testRFE_rmse))

Training Root Mean Squared Error: 0.2934380808883874
Testing Root Mean Squared Error: 0.29886499473433853
Train RMSE 0 Test RMSE:  0


In [44]:
#Calculate sum of coefficients for this model
sum(np.abs(lm_rfe.coef_))

541.4110461177663

In [45]:
## training the model
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.01, normalize=False)

lasso.fit(X_train,y_train)

y_train_pred_lasso = lasso.predict(X_train_scaled)
y_pred_lasso = lasso.predict(X_test_scaled)

train_rmse_lasso = metrics.mean_absolute_error(y_train, y_train_pred_lasso)
test_rmse_lasso = np.sqrt(metrics.mean_squared_error(y_test, y_pred_lasso))
print('Training Error: '+ str(train_rmse_lasso) )
print('Testing Error: '+ str(test_rmse_lasso) )



Training Error: 44.891729447054225
Testing Error: 44.88817795524521


  positive)


In [46]:
from sklearn.linear_model import SGDRegressor


In [47]:
lasso = SGDRegressor(penalty='l1', alpha=1, early_stopping=True, verbose=1)

lasso.fit(X_train_scaled,y_train)




-- Epoch 1
Norm: 6.83, NNZs: 0, Bias: 13.048590, T: 14577, Avg. loss: 0.968616
Total training time: 0.03 seconds.
-- Epoch 2
Norm: 6.91, NNZs: 0, Bias: 13.059233, T: 29154, Avg. loss: 0.121341
Total training time: 0.07 seconds.
-- Epoch 3
Norm: 6.97, NNZs: 0, Bias: 13.058774, T: 43731, Avg. loss: 0.121360
Total training time: 0.11 seconds.
-- Epoch 4
Norm: 7.02, NNZs: 0, Bias: 13.052976, T: 58308, Avg. loss: 0.121366
Total training time: 0.14 seconds.
-- Epoch 5
Norm: 7.07, NNZs: 0, Bias: 13.058428, T: 72885, Avg. loss: 0.121336
Total training time: 0.17 seconds.
-- Epoch 6
Norm: 7.11, NNZs: 0, Bias: 13.052514, T: 87462, Avg. loss: 0.121333
Total training time: 0.20 seconds.
Convergence after 6 epochs took 0.21 seconds


SGDRegressor(alpha=1, average=False, early_stopping=True, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l1', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=1,
             warm_start=False)

In [48]:
y_train_pred_lasso = lasso.predict(X_train_scaled)
y_pred_lasso = lasso.predict(X_test_scaled)

train_rmse_lasso = metrics.mean_absolute_error(y_train, y_train_pred_lasso)
test_rmse_lasso = np.sqrt(metrics.mean_squared_error(y_test, y_pred_lasso))
print('Training Error: '+ str(train_rmse_lasso) )
print('Testing Error: '+ str(test_rmse_lasso) )

Training Error: 0.41134272482995826
Testing Error: 0.5380729040193435


In [49]:
#Calculate sum of coefficients for this model

In [52]:
ridge = SGDRegressor(penalty='l2', alpha=1, early_stopping=True, verbose=1)

ridge.fit(X_train_scaled,y_train)



-- Epoch 1
Norm: 29963178409.69, NNZs: 152, Bias: -1240170486.549013, T: 14577, Avg. loss: 2941763450783918981120.000000
Total training time: 0.01 seconds.
-- Epoch 2
Norm: 35718231.52, NNZs: 152, Bias: 534755.295555, T: 29154, Avg. loss: 142626393679406628864.000000
Total training time: 0.04 seconds.
-- Epoch 3
Norm: 4064.99, NNZs: 152, Bias: 610.568530, T: 43731, Avg. loss: 43831842018299.242188
Total training time: 0.06 seconds.
-- Epoch 4
Norm: 1.14, NNZs: 152, Bias: 13.026520, T: 58308, Avg. loss: 1415791.847110
Total training time: 0.08 seconds.
-- Epoch 5
Norm: 0.11, NNZs: 152, Bias: 13.041625, T: 72885, Avg. loss: 0.072558
Total training time: 0.12 seconds.
-- Epoch 6
Norm: 0.11, NNZs: 152, Bias: 13.059753, T: 87462, Avg. loss: 0.054903
Total training time: 0.14 seconds.
-- Epoch 7
Norm: 0.18, NNZs: 152, Bias: 13.062134, T: 102039, Avg. loss: 0.055063
Total training time: 0.16 seconds.
-- Epoch 8
Norm: 0.10, NNZs: 152, Bias: 13.052131, T: 116616, Avg. loss: 0.049709
Total train

SGDRegressor(alpha=1, average=False, early_stopping=True, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=1,
             warm_start=False)

In [51]:
y_train_pred_ridge = ridge.predict(X_train_scaled)
y_pred_ridge = ridge.predict(X_test_scaled)

train_rmse_ridge = metrics.mean_absolute_error(y_train, y_train_pred_ridge)
test_rmse_ridge = np.sqrt(metrics.mean_squared_error(y_test, y_pred_ridge))
print('Training Error: '+ str(train_rmse_ridge) )
print('Testing Error: '+ str(test_rmse_ridge) )

Training Error: 8435285439.610875
Testing Error: 20584632107.038567


In [41]:
#Calculate sum of coefficients for this model

In [42]:
lasso_coef01 = pd.DataFrame(data=lasso.coef_).T
lasso_coef01.columns = X_train_scaled.columns
lasso_coef01 = lasso_coef01.T.sort_values(by=0).T
#lasso_coef01.plot(kind='bar', title='Modal Coefficients', legend=False, figsize=(16,8))
lasso_coef01.T

Unnamed: 0,0
sqft_lot15^2,-25930660000.0
sqft_lot sqft_lot15,-17869150000.0
sqft_lot waterfront,-11844770000.0
view sqft_lot15,-11244210000.0
sqft_above sqft_lot15,-9042343000.0
floors sqft_lot15,-8230898000.0
sqft_living sqft_lot15,-7269173000.0
sqft_lot^2,-7176319000.0
sqft_lot view,-6110454000.0
bathrooms sqft_lot15,-5700345000.0


We want to pick our best model, but this is more complicated than just choosing between linear regression, Lasso, or Ridge. We now have to also consider the different models that we get from different alpha values for Ridge and Lasso.


How do we determine the best model that will not overfit to the training data? 

___

## Cross Validation

Cross-validation is a statistical method used to protect against overfitting a predictive model, particularly in a case where the amount of data may be limited. In cross-validation, you make a fixed number of folds (or partitions) of the data, run the analysis on each fold, and then average the overall error estimate.

### Steps for K-fold cross-validation



1. Split the dataset into K **equal** partitions (or "folds").
2. Use fold 1 as the **testing set** and the union of the other folds as the **training set**.
3. Calculate **testing accuracy**.
4. Repeat steps 2 and 3 K times, using a **different fold** as the testing set each time.
5. Use the **average testing accuracy** as the estimate of out-of-sample accuracy.

Diagram of **5-fold cross-validation:**

<img src="https://miro.medium.com/max/1354/1*qPMFLEbvc8QQf38Cf77wQg.png">

In [53]:
# simulate splitting a dataset of 25 observations into 5 folds
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=False).split(range(25))

In [54]:
# print the contents of each training and testing set
print('{} {:^61} {}'.format('Iteration', 'Training set observations', 'Testing set observations'))
for iteration, data in enumerate(kf, start=1):
    print('{:^9} {} {:^25}'.format(iteration, data[0], str(data[1])))

Iteration                   Training set observations                   Testing set observations
    1     [ 5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24]        [0 1 2 3 4]       
    2     [ 0  1  2  3  4 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24]        [5 6 7 8 9]       
    3     [ 0  1  2  3  4  5  6  7  8  9 15 16 17 18 19 20 21 22 23 24]     [10 11 12 13 14]     
    4     [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 20 21 22 23 24]     [15 16 17 18 19]     
    5     [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]     [20 21 22 23 24]     


- Dataset contains **25 observations** (numbered 0 through 24)
- 5-fold cross-validation, thus it runs for **5 iterations**
- For each iteration, every observation is either in the training set or the testing set, **but not both**
- Every observation is in the testing set **exactly once**

### Comparing cross-validation to train/test split



Advantages of **cross-validation:**

- More accurate estimate of out-of-sample accuracy
- More "efficient" use of data (every observation is used for both training and testing)

Advantages of **train/test split:**

- Runs K times faster than K-fold cross-validation
- Simpler to examine the detailed results of the testing process

### Cross-validation recommendations



1. K can be any number, but **K=10** is generally recommended
2. For classification problems, **stratified sampling** is recommended for creating the folds
    - Each response class should be represented with equal proportions in each of the K folds
    - scikit-learn's `cross_val_score` function does this by default

In [55]:
from sklearn.linear_model import LassoCV, RidgeCV

In [56]:
lassoCV_model = LassoCV(cv=5, random_state=0, verbose=1)
lassoCV_model.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.4s finished
  positive)


LassoCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=0, selection='cyclic',
        tol=0.0001, verbose=1)

In [57]:
lassoCV_model.alpha_

0.0003732575008003002

In [58]:
RidgeCV_model = RidgeCV(cv=5)
RidgeCV_model.fit(X_train_scaled, y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=5, fit_intercept=True,
        gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

In [59]:
RidgeCV_model.alpha_

10.0

Now that we have used cross validation to help us determine the best **alpha** for Ridge and Lasso, we can then use those fitted models to compare on our test set.  

## Improvements to cross-validation

**Repeated cross-validation**

- Repeat cross-validation multiple times (with **different random splits** of the data) and average the results
- More reliable estimate of out-of-sample performance by **reducing the variance** associated with a single trial of cross-validation

**Creating a hold-out set**

- "Hold out" a portion of the data **before** beginning the model building process
- Locate the best model using cross-validation on the remaining data, and test it **using the hold-out set**
- More reliable estimate of out-of-sample performance since hold-out set is **truly out-of-sample**

**Feature engineering and selection within cross-validation iterations**

- Normally, feature engineering and selection occurs **before** cross-validation
- Instead, perform all feature engineering and selection **within each cross-validation iteration**
- More reliable estimate of out-of-sample performance since it **better mimics** the application of the model to out-of-sample data


## Resources


- scikit-learn documentation: [Cross-validation](http://scikit-learn.org/stable/modules/cross_validation.html), [Model evaluation](http://scikit-learn.org/stable/modules/model_evaluation.html)
- scikit-learn issue on GitHub: [MSE is negative when returned by cross_val_score](https://github.com/scikit-learn/scikit-learn/issues/2439)
- Section 5.1 of [An Introduction to Statistical Learning](http://www-bcf.usc.edu/~gareth/ISL/) (11 pages) and related videos: [K-fold and leave-one-out cross-validation](https://www.youtube.com/watch?v=nZAM5OXrktY&list=PL5-da3qGB5IA6E6ZNXu7dp89_uv8yocmf) (14 minutes), [Cross-validation the right and wrong ways](https://www.youtube.com/watch?v=S06JpVoNaA0&list=PL5-da3qGB5IA6E6ZNXu7dp89_uv8yocmf) (10 minutes)
- Scott Fortmann-Roe: [Accurately Measuring Model Prediction Error](http://scott.fortmann-roe.com/docs/MeasuringError.html)
- Machine Learning Mastery: [An Introduction to Feature Selection](http://machinelearningmastery.com/an-introduction-to-feature-selection/)
- Harvard CS109: [Cross-Validation: The Right and Wrong Way](https://github.com/cs109/content/blob/master/lec_10_cross_val.ipynb)
- Journal of Cheminformatics: [Cross-validation pitfalls when selecting and assessing regression and classification models](http://www.jcheminf.com/content/pdf/1758-2946-6-10.pdf)