# Chapter 5 Resampling Methods

In [125]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures

import statsmodels.formula.api as smf
import statsmodels.api as sm

  from pandas.core import datetools


In [80]:
auto = pd.read_csv('Data/Auto.csv')
auto = auto[['mpg', 'horsepower']]
auto = auto.replace({'?': np.nan}).dropna().astype(float)

In [81]:
auto.head()

Unnamed: 0,mpg,horsepower
0,18.0,130.0
1,15.0,165.0
2,18.0,150.0
3,16.0,150.0
4,17.0,140.0


## Labs

### 5.3.1 The Validation Set Approach

In [82]:
X = auto.horsepower
# Generating Polynomial Features
poly = PolynomialFeatures(2)
X_poly = poly.fit_transform(X.values.reshape(-1, 1))

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X_poly, auto.mpg.ravel(),test_size=.5, random_state=0)

In [84]:
# Scikit-Learn Linear Regression
regr = skl_lm.LinearRegression()
regr.fit(X_train, y_train)
pred = regr.predict(X_test)
mse = mean_squared_error(y_test, pred)

In [85]:
mse

18.763031346897744

### 5.3.2 Leave One Out Cross Validation (LOOCV)

In [86]:
regr = skl_lm.LinearRegression()
loo = LeaveOneOut()
loo.get_n_splits(X_poly)

392

In [87]:
score = cross_val_score(regr, X_poly, auto.mpg, cv=loo, scoring='neg_mean_squared_error').mean()

In [88]:
score

-19.248213124489411

The average mean-squared-error using the LOOCV approach actually performs slightly worse that the Validation set approach. This is because the 392 splits of our data, when juxtaposed, are incredibly similar, leaving our model prone to great variance.

### 5.3.3 k-Fold Cross-Validation

In [89]:
kf = KFold(n_splits=10, random_state=0, shuffle=False)
kf.get_n_splits(X_poly)

10

In [90]:
score = cross_val_score(regr, X_poly, auto.mpg, cv=kf, scoring='neg_mean_squared_error').mean()
score

-21.235840055802111

### 5.3.4 The Bootstrap

In [91]:
def alpha(data, num_samples=100):
    # make a num_samples random choice of indices WITH REPLACEMENT
    indices = np.random.choice(data.index, num_samples, replace=True)
    
    X = data.X[indices].values
    Y = data.Y[indices].values
    
    # np.cov returns full cov matrix we need [0][1] cov(x,y)
    return (np.var(Y) - np.cov(X,Y)[0][1])/(np.var(X) + np.var(Y) - 2*np.cov(X,Y)[0][1])

In [92]:
portfolio = pd.read_csv('Data/Portfolio.csv')

In [93]:
alpha(portfolio)

0.61799249553851809

Since Scikit-Learn no longer supports Bootstrap, we'll write our own pseudo-bootstrap:

In [106]:
def boot(data, statistic_calculator, num_samples = 1000):
    stat_samples = []
    for sample in range(num_samples):
        stat_samples.append(statistic_calculator(data))
    
    se_estimate = np.std(stat_samples)
    print('Bootstrapped Std. Error(s) =', se_estimate)

In [111]:
np.random.seed(0)
boot(portfolio, alpha)

Bootstrapped Std. Error(s) = 0.089804091613


---

Now, we apply the Bootstrap method to estimate the accuracy of the coefficients $b_0$ and $B_1$, the slope and intercept terms that use horsepower to predict mpg.

In [112]:
auto = pd.read_csv('Data/Auto.csv')
auto['horsepower'] = pd.to_numeric(auto.horsepower, errors='coerce')
auto['mpg'] = pd.to_numeric(auto.mpg, errors='coerce')
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [140]:
est = smf.ols('mpg ~ horsepower', auto).fit()
est.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,39.9359,0.717,55.660,0.000,38.525,41.347
horsepower,-0.1578,0.006,-24.489,0.000,-0.171,-0.145
