 ## 5.3.1 The Validation Set Approach

In [115]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
import statsmodels.formula.api as smf

In [57]:
auto_df = pd.read_csv('auto.csv')
auto_df['horsepower'] = pd.to_numeric(auto_df['horsepower'], errors='coerce')
auto_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [50]:
train, test = train_test_split(auto_df, test_size=0.5, random_state=2)
lin_reg = smf.ols(formula='mpg ~ horsepower', data=train).fit()
((test['mpg'] - lin_reg.predict(test)) ** 2).mean()

24.33064225172346

In [37]:
lin_reg = smf.ols(formula='mpg ~ np.power(horsepower, 2) + horsepower', data=train).fit()
((test['mpg'] - lin_reg.predict(test)) ** 2).mean()

19.209652383908576

In [39]:
lin_reg = smf.ols(formula='mpg ~ np.power(horsepower, 3) + horsepower', data=train).fit()
((test['mpg'] - lin_reg.predict(test)) ** 2).mean()

19.65547708519472

## 5.3.2 Leave-One-Out Cross Validation

In [407]:
errs = []
for train_idx, test_idx in KFold(n_splits=auto_df.shape[0]).split(auto_df):
    train_dat = auto_df.iloc[train_idx]
    test_dat = auto_df.iloc[test_idx]
    lin_reg = smf.ols(formula='mpg ~ horsepower', data=train_dat).fit()
    errs.append(((test_dat['mpg'] - lin_reg.predict(test_dat)) ** 2).mean())

pd.DataFrame(errs).mean()[0]

24.231513517929212

In [409]:
errs = []
order = 5
for o in np.arange(1, order + 1):
    o_errs = []
    for train_idx, test_idx in KFold(n_splits=auto_df.shape[0]).split(auto_df):
        train_dat = auto_df.iloc[train_idx]
        test_dat = auto_df.iloc[test_idx]
        formula = 'mpg ~ np.power(horsepower, {}) + horsepower'.format(o)
        lin_reg = smf.ols(formula=formula, data=train_dat).fit()
        o_errs.append(((test_dat['mpg'] - lin_reg.predict(test_dat)) ** 2).mean())
    errs.append(pd.DataFrame(o_errs).mean()[0])
    
errs

[24.23151351792922,
 19.24821312448964,
 19.427138604618637,
 19.732435277021285,
 20.07129811316734]

## 5.3.3 K-Fold Cross Validation

In [410]:
errs = []
order = 5
for o in np.arange(1, order + 1):
    o_errs = []
    for train_idx, test_idx in KFold(n_splits=10).split(auto_df):
        train_dat = auto_df.iloc[train_idx]
        test_dat = auto_df.iloc[test_idx]
        formula = 'mpg ~ np.power(horsepower, {}) + horsepower'.format(o)
        lin_reg = smf.ols(formula=formula, data=train_dat).fit()
        o_errs.append(((test['mpg'] - lin_reg.predict(test_dat)) ** 2).mean())
    errs.append(pd.DataFrame(o_errs).mean()[0])
    
errs

[28.21589473841133,
 22.14238910645165,
 22.660858346264526,
 23.169316496852,
 23.596680293888777]

## 5.3.4 The Bootstrap

In [120]:
portfolio_dat = pd.read_csv('portfolio.csv')
portfolio_dat = portfolio_dat.drop(portfolio_dat.columns[0], axis=1)
portfolio_dat.head()

Unnamed: 0,X,Y
0,-0.895251,-0.234924
1,-1.562454,-0.885176
2,-0.41709,0.271888
3,1.044356,-0.734198
4,-0.315568,0.841983


In [394]:
def boot(data, func, r):
    i = 0
    stats = pd.DataFrame()
    while i < r:
        sample = data.sample(data.shape[0], replace=True)
        value = func(sample)
        stats[i] = value
        i+=1
    std = stats.std(axis=1)
    return pd.DataFrame(data={'original': func(data), 'std_err' : std})

In [396]:
def alpha(data):
    cov = data.cov()
    value = (cov['Y']['Y'] - cov['X']['Y']) / (cov['X']['X'] + cov['Y']['Y'] - (2 * cov['X']['Y']))
    return pd.Series(value, index=['alpha'])

boot(portfolio_dat, alpha, 1000)

Unnamed: 0,original,std_err
alpha,0.575832,0.087763


In [398]:
def reg_coeff(data):
    lin_reg = smf.ols(formula='mpg ~ horsepower', data=data).fit()
    return lin_reg.params

boot(auto_df, reg_coeff, 1000)

Unnamed: 0,original,std_err
Intercept,39.935861,0.83958
horsepower,-0.157845,0.007339


In [361]:
smf.ols(formula='mpg ~ horsepower', data=auto_df).fit().summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.606
Model:,OLS,Adj. R-squared:,0.605
Method:,Least Squares,F-statistic:,599.7
Date:,"Mon, 24 Sep 2018",Prob (F-statistic):,7.03e-81
Time:,12:25:56,Log-Likelihood:,-1178.7
No. Observations:,392,AIC:,2361.0
Df Residuals:,390,BIC:,2369.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,39.9359,0.717,55.660,0.000,38.525,41.347
horsepower,-0.1578,0.006,-24.489,0.000,-0.171,-0.145

0,1,2,3
Omnibus:,16.432,Durbin-Watson:,0.92
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17.305
Skew:,0.492,Prob(JB):,0.000175
Kurtosis:,3.299,Cond. No.,322.0


In [401]:
def quad_reg_coeff(data):
    lin_reg = smf.ols(formula='mpg ~ np.power(horsepower, 2) + horsepower', data=data).fit()
    return lin_reg.params

boot(auto_df, quad_reg_coeff, 1000)

Unnamed: 0,original,std_err
Intercept,56.9001,2.079696
"np.power(horsepower, 2)",0.001231,0.000118
horsepower,-0.46619,0.032899


In [403]:
smf.ols(formula='mpg ~ np.power(horsepower, 2) + horsepower', data=auto_df).fit().summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.688
Model:,OLS,Adj. R-squared:,0.686
Method:,Least Squares,F-statistic:,428.0
Date:,"Mon, 24 Sep 2018",Prob (F-statistic):,5.4000000000000005e-99
Time:,12:37:23,Log-Likelihood:,-1133.2
No. Observations:,392,AIC:,2272.0
Df Residuals:,389,BIC:,2284.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,56.9001,1.800,31.604,0.000,53.360,60.440
"np.power(horsepower, 2)",0.0012,0.000,10.080,0.000,0.001,0.001
horsepower,-0.4662,0.031,-14.978,0.000,-0.527,-0.405

0,1,2,3
Omnibus:,16.158,Durbin-Watson:,1.078
Prob(Omnibus):,0.0,Jarque-Bera (JB):,30.662
Skew:,0.218,Prob(JB):,2.2e-07
Kurtosis:,4.299,Cond. No.,129000.0
