## 5.3.4 The Bootstrap

This code is a translation of chapter five of ISL from https://github.com/hardikkamboj

In [36]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

In [37]:
portfolio = pd.read_csv(r'../data/Portfolio.csv')
print(portfolio.shape)
portfolio.head()

(100, 2)


Unnamed: 0,X,Y
0,-0.895251,-0.234924
1,-1.562454,-0.885176
2,-0.41709,0.271888
3,1.044356,-0.734198
4,-0.315568,0.841983


In [22]:
# we first define a function equivalent to func alpha defined in the Lab, i would recommend you to go through the function in lab
# This function takes two arguements, data and indeces, this indices are used to calculate the estimate for alpha for this bootstrap

In [38]:
def alpha(data,index):
    X = data['X'].loc[index]
    y = data['Y'].loc[index]
    
    return (np.var(y) - np.cov(X,y)[0][1]) / (np.var(X) + np.var(y) - 2*(np.cov(X,y)[0][1]))

In [39]:
# equivalent to sample function in lab
def get_indices(data,num_samples):
    return  np.random.choice(data.index, num_samples, replace=True)

In [42]:
np.arange(0,100)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [43]:
alpha(portfolio,np.arange(0,100))

0.5766511516104116

In [53]:
#np.random.seed(2)
alpha(portfolio,get_indices(portfolio,100))

0.43481105466741643

In [66]:
# there is no built in function like boot, so, we will define one
def boot(data,func,R):
    estimates = []
    for i in range(R):
        estimates.append(func(data,get_indices(data,100)))
    bootstrap_statistics = {'estimated_value':np.mean(estimates),'std_error':np.std(estimates)}   
    return bootstrap_statistics, estimates

In [67]:
np.random.seed(0)
results = boot(portfolio,alpha,1000)
results[0]
results[1]

[0.560336658007497,
 0.6517460709422399,
 0.6419836035890761,
 0.5265088969761127,
 0.612948131460442,
 0.5220990429150474,
 0.46774999790450295,
 0.7534400396651767,
 0.5174396271611135,
 0.34067531376902926,
 0.5546745734082348,
 0.5072637067599198,
 0.45677697004420914,
 0.8423230879591778,
 0.5143789540560136,
 0.5805341099440867,
 0.5513765217561827,
 0.6162375108796402,
 0.6564123035990482,
 0.45009758976211567,
 0.4126220016476827,
 0.4861767351446104,
 0.616857207263681,
 0.631811734571275,
 0.5070456037771566,
 0.5365819164424747,
 0.5384000563157036,
 0.6125353806080611,
 0.4759029693761378,
 0.567682587282195,
 0.5966444325068891,
 0.4478063859011215,
 0.5937119675282219,
 0.5329203923725829,
 0.5511526764494888,
 0.6785966281677543,
 0.5669849533897657,
 0.5239220480311275,
 0.49284051774966436,
 0.5741044120632086,
 0.5497199987174151,
 0.44702787492540713,
 0.5810848676405479,
 0.6214292190155755,
 0.6578019726680298,
 0.5365414409490076,
 0.6694617437525653,
 0.405579070

### Estimating the Accuracy of a Linear Regression Model


In [68]:
data = pd.read_csv(r'../data/Auto.csv')

In [34]:
# auto data used earlier in the notebook
data = data.reset_index()
data.head()

Unnamed: 0,index,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [149]:
# similar to boot.fn in lab

def get_estimates(data,index):
    X = data['horsepower'].loc[index]
    y = data['mpg'].loc[index]
    
    lr = LinearRegression()
    lr.fit(X.to_frame(),y)
    intercept = lr.intercept_
    coef = lr.coef_
    return [intercept,coef]

In [150]:
get_estimates(data,np.arange(0,392))

[39.93586102117047, array([-0.15784473])]

In [151]:
#modifying the boot mentioned that we used earlier
def boot(data,func,R):
    intercept = []
    coeff = []
    for i in range(R):
        intercept.append(func(data,get_indices(data,100))[0])
        coeff.append(func(data,get_indices(data,100))[1]) 
    intercept_statistics = {'estimated_value':np.mean(intercept),'std_error':np.std(intercept)}   
    coeff_statistices = {'estimated_value':np.mean(coeff),'std_error':np.std(coeff)}   
    return {'intercept':intercept_statistics,'coeff_statistices':coeff_statistices}

In [152]:
results = boot(data,get_estimates,1000)

In [153]:
print('Result for intercept ',results['intercept'])
print('Result for coefficient term ',results['coeff_statistices'])

Result for intercept  {'estimated_value': 39.971231978520734, 'std_error': 1.7370193344068778}
Result for coefficient term  {'estimated_value': -0.15881709682801645, 'std_error': 0.014778176069058804}


In [143]:
# for bootstraping we have std error 1.69 for intercept, and 0.0144 for ceoff

In [148]:
# for lets see what the model predicts
import statsmodels.api as sm
X = data['horsepower']
y = data['mpg']

X = sm.add_constant(X)
results = sm.OLS(y,X).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.606
Model:                            OLS   Adj. R-squared:                  0.605
Method:                 Least Squares   F-statistic:                     599.7
Date:                Wed, 08 Jul 2020   Prob (F-statistic):           7.03e-81
Time:                        11:03:19   Log-Likelihood:                -1178.7
No. Observations:                 392   AIC:                             2361.
Df Residuals:                     390   BIC:                             2369.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         39.9359      0.717     55.660      0.0

In [155]:
# standard error are less for estimations using model
# But still bootstrap estimates are mode preices, because they don't rely on assumptions, while there is a lot of assumptions
# when calculating std errors using sm(model)

In [158]:
# Adding a quad term
# similar to boot.fn in lab
data['horsepower_2'] = data['horsepower']**2

def get_estimates(data,index):
    X = data[['horsepower','horsepower_2']].loc[index]
    y = data['mpg'].loc[index]
    
    lr = LinearRegression()
    lr.fit(X,y)
    intercept = lr.intercept_
    coef = lr.coef_
    return [intercept,coef]

get_estimates(data,np.arange(0,392))

#modifying the boot mentioned that we used earlier
def boot(data,func,R):
    intercept = []
    coeff_1 = []
    coeff_2 = []
    for i in range(R):
        intercept.append(func(data,get_indices(data,100))[0])
        coeff_1.append(func(data,get_indices(data,100))[1][0]) 
        coeff_2.append(func(data,get_indices(data,100))[1][1])
    intercept_statistics = {'estimated_value':np.mean(intercept),'std_error':np.std(intercept)}   
    coeff_1_statistices = {'estimated_value':np.mean(coeff_1),'std_error':np.std(coeff_1)}   
    coeff_2_statistices = {'estimated_value':np.mean(coeff_2),'std_error':np.std(coeff_2)}   
    return {'intercept':intercept_statistics,'coeff_1_statistices':coeff_1_statistices,'coeff_2_statistics':coeff_2_statistices}

results = boot(data,get_estimates,1000)

print('Result for intercept ',results['intercept'])
print('Result for coefficient term horsepower',results['coeff_1_statistices'])
print('Result for coefficient term horsepower**2',results['coeff_2_statistics'])


# for bootstraping we have std error 1.69 for intercept, and 0.0144 for ceoff

# for lets see what the model predicts
import statsmodels.api as sm
X = data[['horsepower','horsepower_2']]
y = data['mpg']

X = sm.add_constant(X)
results = sm.OLS(y,X).fit()
print(results.summary())

Result for intercept  {'estimated_value': 57.22194337269758, 'std_error': 4.268750160472158}
Result for coefficient term horsepower {'estimated_value': -0.47175109701024376, 'std_error': 0.07214757143694592}
Result for coefficient term horsepower**2 {'estimated_value': 0.0012511071230436879, 'std_error': 0.00026005778901839197}
                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.688
Model:                            OLS   Adj. R-squared:                  0.686
Method:                 Least Squares   F-statistic:                     428.0
Date:                Wed, 08 Jul 2020   Prob (F-statistic):           5.40e-99
Time:                        11:20:43   Log-Likelihood:                -1133.2
No. Observations:                 392   AIC:                             2272.
Df Residuals:                     389   BIC:                             2284.
Df Model:                           2 

  return ptp(axis=axis, out=out, **kwargs)
