In [40]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
summarize,
poly)
from sklearn.model_selection import train_test_split

In [2]:
from functools import partial
from sklearn.model_selection import \
(cross_validate,
KFold,
ShuffleSplit)
from sklearn.base import clone
from ISLP.models import sklearn_sm

# Validation set (VS) approach

In [4]:
# Load the Auto data and hold out half of its rows as a validation set.
Auto = load_data('Auto')
Auto_train, Auto_valid = train_test_split(
    Auto,
    test_size=len(Auto) // 2,
    random_state=0)

In [5]:
# Regress mpg on horsepower using only the training rows.
y_train = Auto_train['mpg']
hp_mm = MS(['horsepower'])
X_train = hp_mm.fit_transform(Auto_train)
model = sm.OLS(y_train, X_train)
results = model.fit()

In [7]:
# Score the fitted model on the held-out rows, reusing the design built on the training set.
y_valid = Auto_valid['mpg']
X_valid = hp_mm.transform(Auto_valid)
valid_pred = results.predict(X_valid)
# Validation MSE of our model: an estimate of its true (test) MSE.
((y_valid - valid_pred)**2).mean()

23.61661706966988

In [8]:
def evalMSE(terms,
            response,
            train,
            test):
    """Fit OLS on `train` and return the mean squared error on `test`.

    Parameters
    ----------
    terms : list
        Model terms handed to ``MS`` to build the design matrix.
    response : str
        Name of the response column in both data frames.
    train, test : data frames
        Rows used for fitting and for evaluation, respectively.

    Returns
    -------
    The mean of the squared differences between ``test[response]`` and the
    fitted model's predictions for ``test``.
    """
    design = MS(terms)
    # The design is fitted on the training rows and only applied to the test rows.
    fitted = sm.OLS(train[response], design.fit_transform(train)).fit()
    errors = test[response] - fitted.predict(design.transform(test))
    return np.mean(errors**2)

In [9]:
# Validation MSE for polynomial fits of degree 1, 2 and 3 on the current train/validation split.
MSE = np.zeros(3)
for idx, degree in enumerate((1, 2, 3)):
    MSE[idx] = evalMSE(
        [poly('horsepower', degree)],
        'mpg',
        Auto_train,
        Auto_valid)
MSE

array([23.61661707, 18.76303135, 18.79694163])

In [12]:
# Same experiment on a different random split (random_state=3 instead of 0).
Auto_train, Auto_valid = train_test_split(
    Auto,
    test_size=196,
    random_state=3)
MSE = np.zeros(3)
for idx, degree in enumerate((1, 2, 3)):  # degrees tested: 1, 2, 3
    MSE[idx] = evalMSE(
        [poly('horsepower', degree)],
        'mpg',
        Auto_train,
        Auto_valid)
MSE

array([20.75540796, 16.94510676, 16.97437833])

# Cross-validation (CV) approach - sklearn

In [13]:
# ISLP provides a wrapper, sklearn_sm(), that lets us easily use the cross-validation tools of sklearn with models fit by statsmodels.

In [14]:
# sklearn_sm() can take two additional optional arguments: model_str, which can be used to specify a formula, and model_args, which should be a dictionary of additional arguments used when fitting the model.

In [17]:
# Wrap the statsmodels OLS fit so that cross_validate() can drive it: it needs an
# object with the appropriate fit(), predict(), and score() methods.
hp_model = sklearn_sm(sm.OLS, MS(['horsepower']))

Y = Auto['mpg']
X = Auto.drop(columns=['mpg'])

# cv is our K. With K = n, each fold holds a single observation, i.e. LOOCV.
cv_results = cross_validate(hp_model, X, Y, cv=Auto.shape[0])
cv_err = cv_results['test_score'].mean()
cv_err

24.231513517929216

In [18]:
# The cross_validate() function produces a dictionary with several components
# (arrays such as 'fit_time' and 'test_score').
# NOTE(review): displaying the whole dictionary prints every entry of each array;
# consider showing only the keys or a short slice instead.
cv_results

{'fit_time': array([0.0073843 , 0.00432014, 0.01768398, 0.01152992, 0.01166439,
        0.00457335, 0.01327014, 0.01237726, 0.01612473, 0.01587605,
        0.        , 0.01581407, 0.01879883, 0.0130024 , 0.00847316,
        0.01106977, 0.00747442, 0.        , 0.        , 0.01002908,
        0.00174212, 0.01614571, 0.01617146, 0.01580119, 0.00802422,
        0.0068171 , 0.00907397, 0.0160656 , 0.01610947, 0.        ,
        0.        , 0.02428102, 0.00955105, 0.00160861, 0.        ,
        0.        , 0.        , 0.        , 0.01608086, 0.        ,
        0.        , 0.01596427, 0.01610518, 0.01595998, 0.01586843,
        0.00636387, 0.        , 0.01740456, 0.01473856, 0.01002741,
        0.        , 0.        , 0.01567435, 0.01096988, 0.00501657,
        0.        , 0.        , 0.01588964, 0.0137918 , 0.01249528,
        0.01052451, 0.01254725, 0.01003623, 0.01174688, 0.01034522,
        0.00967431, 0.01106071, 0.00724578, 0.00544477, 0.00552511,
        0.00625134, 0.00360203, 0.00

In [27]:
# LOOCV error for polynomial fits of degree 1 through 5.
H = np.array(Auto['horsepower'])
M = sklearn_sm(sm.OLS)
cv_error = np.zeros(5)

for i, degree in enumerate(range(1, 6)):
    # Columns are H**0, H**1, ..., H**degree: we start from the linear fit and
    # add one higher-degree term per iteration.
    X = np.power.outer(H, np.arange(degree + 1))
    M_CV = cross_validate(M, X, Y, cv=Auto.shape[0])
    cv_error[i] = M_CV['test_score'].mean()
cv_error

array([24.23151352, 19.24821312, 19.33498406, 19.42443031, 19.03320903])

In [28]:
# np.power.outer() is the outer() method of the np.power() function. It takes two arrays as arguments and forms a larger array where the operation is applied to each pair of elements of the two arrays.

In [29]:
# Small demonstration of outer() using np.add: entry [i, j] of the result is A[i] + B[j],
# so a length-3 and a length-2 array give a 3x2 array.
A = np.array([3, 5, 9])
B = np.array([2, 4])
np.add.outer(A, B)

array([[ 5,  7],
       [ 7,  9],
       [11, 13]])

In [31]:
# 10-fold CV error for polynomial fits of degree 1 through 5.
# A single shuffled KFold with a fixed seed, so each degree uses the same splits.
cv = KFold(n_splits=10, shuffle=True, random_state=0)
cv_error = np.zeros(5)

for i, d in enumerate(range(1, 6)):
    X = np.power.outer(H, np.arange(d + 1))
    M_CV = cross_validate(M, X, Y, cv=cv)
    cv_error[i] = M_CV['test_score'].mean()
cv_error

array([24.20766449, 19.18533142, 19.27626666, 19.47848402, 19.13722633])

In [32]:
# ShuffleSplit with one split of 196 held-out rows implements the validation set
# approach inside cross_validate().
validation = ShuffleSplit(n_splits=1, test_size=196, random_state=0)
results = cross_validate(hp_model,
                         Auto.drop(columns=['mpg']),
                         Auto['mpg'],
                         cv=validation)
results['test_score']

array([23.61661707])

In [36]:
# Estimate variability in the test error

# 10 random splits, each holding out 196 observations for validation. Like 10-fold CV
# there are 10 scores, but every split is drawn at random: ShuffleSplit reshuffles the
# data before each split, so held-out sets may overlap (KFold shuffles once and then
# partitions the data in a structured way).
validation = ShuffleSplit(n_splits=10, test_size=196, random_state=0)

results = cross_validate(hp_model,
                         Auto.drop(columns=['mpg']),
                         Auto['mpg'],
                         cv=validation)

# Mean and standard deviation of the validation MSE across the 10 splits.
results['test_score'].mean(), results['test_score'].std()

(23.802232661034164, 1.4218450941091847)

##### Not a valid variability estimate: ShuffleSplit allows observations to overlap across folds (and may leave some observations uncovered), which makes the fold errors correlated
##### std() here captures the variation due to randomness in fold selection (like running multiple random experiments) -> Monte Carlo variation incurred by picking different random folds

# The Bootstrap

In [38]:
# Example of investment in X and Y (described in the notebook) - Portfolio data frame

In [45]:
# Load the Portfolio data through ISLP's load_data() (already used for Auto) instead of
# a local 'Portfolio.csv', so the notebook does not depend on a file in the working directory.
# NOTE(review): assumes the bundled data set matches the local CSV (columns X and Y,
# 100 rows labelled 0-99) - confirm.
Portfolio = load_data('Portfolio')

# Estimate alpha by applying the minimum variance formula to the observations of D indexed by idx
def alpha_func(D, idx):
    """Return the minimum-variance estimate of alpha from selected rows of D.

    Parameters
    ----------
    D : data frame with columns 'X' and 'Y'.
    idx : row labels to use (looked up with ``.loc``; repeats are allowed).

    Returns
    -------
    (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 * cov(X, Y)), computed from the
    sample covariance matrix of the selected rows.
    """
    sample = D[['X', 'Y']].loc[idx]
    # Rows of the 2x2 covariance matrix: (var_x, cov_xy) and (cov_xy, var_y).
    (var_x, cov_xy), (_, var_y) = np.cov(sample, rowvar=False)
    return (var_y - cov_xy) / (var_x + var_y - 2 * cov_xy)

In [46]:
# Estimate alpha using all 100 observations (row labels 0 through 99).
alpha_func(Portfolio, range(100)) # estimating alpha using all 100 obs

0.57583207459283

In [47]:
# One bootstrap replicate: draw 100 row labels from 0-99 with replacement
# and re-estimate alpha on that resample.
rng = np.random.default_rng(0)
alpha_func(Portfolio, rng.choice(100, 100, replace=True))

0.6074452469619003

In [49]:
# Computing the bootstrap standard error for arbitrary functions that take a data frame and an array of row indices as arguments

def boot_SE(func,
            D,
            n=None,
            B=1000,
            seed=0):
    """Bootstrap estimate of the standard error of a statistic.

    Parameters
    ----------
    func : callable
        Called as ``func(D, idx)``; returns the statistic computed on the rows
        of ``D`` labelled by ``idx``. The statistic may be a scalar or anything
        supporting elementwise arithmetic (NumPy array, pandas Series).
    D : data frame
        Source data; its ``index`` is resampled with replacement.
    n : int, optional
        Number of rows drawn per replicate. ``None`` (or 0) means the number
        of rows of ``D``.
    B : int
        Number of bootstrap replicates.
    seed : int
        Seed for ``np.random.default_rng`` so results are reproducible.

    Returns
    -------
    The standard deviation (divisor ``B``) of the ``B`` replicate values, with
    the same shape/type as the statistic.
    """
    rng = np.random.default_rng(seed)
    n = n or D.shape[0]

    # Collect every replicate; the loop counter itself is irrelevant.
    values = []
    for _ in range(B):
        idx = rng.choice(D.index,
                         n,
                         replace=True)
        values.append(func(D, idx))

    # Two-pass variance: centre first, then square. The one-pass form
    # mean(x**2) - mean(x)**2 loses all precision (and can go negative,
    # giving NaN) when the replicates are nearly constant or far from zero.
    mean_ = sum(values) / B
    sq_dev = sum((value - mean_)**2 for value in values)
    return np.sqrt(sq_dev / B)

In [50]:
# Bootstrap standard error of the alpha estimate: 1000 resamples of Portfolio, seeded for reproducibility.
alpha_SE = boot_SE(alpha_func,
                   Portfolio,
                   B=1000,
                   seed=0)
alpha_SE

0.09118176521277668