## 5.3.4 The Bootstrap

This code is a translation of chapter five of ISL from https://github.com/hardikkamboj

In [126]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

- First me load the data into 

In [135]:
# uploading the data
Penn = pd.read_csv("../data/penn_jae.dat" , sep='\s', engine='python')
print(Penn.shape)
Penn.head()

(13913, 24)


Unnamed: 0,abdt,tg,inuidur1,inuidur2,female,black,hispanic,othrace,dep,q1,...,q5,q6,recall,agelt35,agegt54,durable,nondurable,lusd,husd,muld
0,10824,0,18,18,0,0,0,0,2,0,...,0,0,0,0,0,0,0,1,0,
1,10635,2,7,3,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,
2,10551,5,18,6,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,
3,10824,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,
4,10747,0,27,27,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,


- We filter the data for the tratment group number 4 and the control group. Also we create the variable **T4**, change the type to the **dep** variable and convert into log the cariable **inuidur1**. 

In [128]:
#Subsetting the data in treatment group 4 and control group
Penn = Penn[ (Penn['tg'] == 4) | (Penn['tg'] == 0) ]
Penn['T4'] = (Penn[['tg']]==4).astype(int)

# Create category variable
Penn['dep'] = Penn['dep'].astype( 'category' )
#transform the inuidur1 varaible to log to run into the model 
Penn['inuidur1'] = np.log(Penn['inuidur1'])

Penn.head()

Unnamed: 0,abdt,tg,inuidur1,inuidur2,female,black,hispanic,othrace,dep,q1,...,q6,recall,agelt35,agegt54,durable,nondurable,lusd,husd,muld,T4
0,10824,0,2.890372,18,0,0,0,0,2,0,...,0,0,0,0,0,0,1,0,,0
3,10824,0,0.0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,,0
4,10747,0,3.295837,27,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,,0
11,10607,4,2.197225,9,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,,1
12,10831,0,3.295837,27,0,0,0,0,1,0,...,0,0,1,1,0,1,0,0,,0


- Create the **get_indices** function to get a sample with repetition 

In [129]:
# equivalent to sample function in lab
def get_indices(data,num_samples):
    return  np.random.choice(data.index, num_samples, replace=True)

- it works

In [130]:
get_indices(Penn, 13913)

array([ 1672,  8589, 12868, ..., 12462,  6370,  6232], dtype=int64)

### Estimating the Accuracy of a Linear Regression Model


- importing the libraries

In [131]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import numpy as np

- build the model and printing the result

In [132]:
model = "inuidur1~T4+ (female+black+othrace+C(dep)+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)"

model_results = smf.ols( model , data = Penn ).fit().get_robustcov_results(cov_type = "HC1")

In [64]:
print(model_results.summary())

                            OLS Regression Results                            
Dep. Variable:               inuidur1   R-squared:                       0.038
Model:                            OLS   Adj. R-squared:                  0.035
Method:                 Least Squares   F-statistic:                     15.32
Date:                Fri, 12 Nov 2021   Prob (F-statistic):           6.43e-42
Time:                        01:18:59   Log-Likelihood:                -8128.2
No. Observations:                5099   AIC:                         1.629e+04
Df Residuals:                    5082   BIC:                         1.640e+04
Df Model:                          16                                         
Covariance Type:                  HC1                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       1.7723      0.050     35.154      

- build the **get_estimates** function to run the model with the sample of **get_indices**

In [100]:
def get_estimates(data,index):
    data_1 = data.loc[index]

    model = "inuidur1~T4+ (female+black+othrace+C(dep)+q2+q3+q4+q5+q6+agelt35+agegt54+durable+lusd+husd)"

    model_results = smf.ols( model , data = data_1 ).fit().get_robustcov_results(cov_type = "HC1")

    T4_coef = model_results.summary2().tables[1]['Coef.']['T4']
    female_coef = model_results.summary2().tables[1]['Coef.']['female']
    black_coef = model_results.summary2().tables[1]['Coef.']['black']

    return [T4_coef,female_coef, black_coef]


    



- we build the **boot** function to run the model 1000 times with differents samples and finally get a mean of the coefficient of the **T4**,**FEMALE** and **BLACK** variables.

In [111]:
def boot(data,func,R):
    T4 = []
    female = []
    black = []
    for i in range(R):
        T4.append(func(data,get_indices(data,5099))[0])
        female.append(func(data,get_indices(data,5099))[1]) 
        black.append(func(data,get_indices(data,5099))[2]) 
    T4_statistics = {'estimated_value':np.mean(T4),'std_error':np.std(T4)}   
    female_statistices = {'estimated_value':np.mean(female),'std_error':np.std(female)}   
    black_statistices = {'estimated_value':np.mean(black),'std_error':np.std(black)} 
    return {'T4_statistices':T4_statistics,'female_statistices':female_statistices,'black_statistices':black_statistices}

- saving and printing the results

In [112]:
results = boot(Penn,get_estimates,1000)

In [115]:
print('Result for T4 ',results['T4_statistices'])
print('Result for Female ',results['female_statistices'])
print('Result for Black ',results['black_statistices'])


Result for T4  {'estimated_value': -0.07386034975497395, 'std_error': 0.035047911639941434}
Result for Female  {'estimated_value': 0.1378708139136148, 'std_error': 0.03478910203104117}
Result for Black  {'estimated_value': -0.3053560163804113, 'std_error': 0.06121784519409958}


- Finally we store on a table the result from the first estimation with the original dataset and then with the bootstrap estimates for the 3 target variables(**T4**,**Female**,**Black**). This will help to vizualise the results and compare. 


In [134]:
table2 = np.zeros((2, 6))
table2[0,0] = model_results.summary2().tables[1]['Coef.']['T4']
table2[0,1] = results['T4_statistices']['estimated_value']

table2[0,2] = model_results.summary2().tables[1]['Coef.']['female']
table2[0,3] = results['female_statistices']['estimated_value']

table2[0,4] = model_results.summary2().tables[1]['Coef.']['black']
table2[0,5] = results['black_statistices']['estimated_value']


table2[1,0] = model_results.summary2().tables[1]['Std.Err.']['T4']
table2[1,1] = results['T4_statistices']['std_error']

table2[1,2] = model_results.summary2().tables[1]['Std.Err.']['T4']
table2[1,3] = results['female_statistices']['std_error']

table2[1,4] = model_results.summary2().tables[1]['Std.Err.']['black']
table2[1,5] = results['black_statistices']['std_error']


table2 = pd.DataFrame(table2, columns = ["T4", "T4_boot", "Female", "Female_boot",'Black','Black_boot'], \
                      index = ["estimate","standard error"])
table2

Unnamed: 0,T4,T4_boot,Female,Female_boot,Black,Black_boot
estimate,-0.076206,-0.07386,0.138128,0.137871,-0.307905,-0.305356
standard error,0.035211,0.035048,0.035211,0.034789,0.059723,0.061218


- The results shows that the bootstrap estimates are very similat with the coefficient of the model with the original dataset.