# Question 1 - Bootstrap

In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.utils import resample
from sklearn.linear_model import LinearRegression

In [2]:
# Load the raw data and build the regression outcome: the natural log of `inuidur1`.
data = pd.read_csv("../../data/penn_jae.csv")
data['log_inuidur1'] = np.log(data['inuidur1'])

#Creating T4
# Keep only rows whose `tg` is 0 or 4. The explicit .copy() makes the filtered
# frame independent of the frame it was sliced from, so the column assignments
# below write to a real DataFrame instead of triggering SettingWithCopyWarning.
data = data[data['tg'].isin([0, 4])].copy()
# T4 is a 0/1 indicator: 1 when tg == 4, 0 otherwise (i.e. tg == 0 after the filter).
data['T4'] = np.where(data['tg'] == 4, 1, 0)
print(data[['T4', 'tg']])

#dep is categorical
# NOTE(review): the bootstrap cell passes 'dep' straight into LinearRegression,
# which does not expand a categorical column into dummy variables by itself --
# confirm whether one-hot encoding of `dep` was intended.
data['dep'] = pd.Categorical(data['dep'])

       T4  tg
0       0   0
3       0   0
4       0   0
11      1   4
12      0   0
...    ..  ..
13904   1   4
13905   1   4
13906   0   0
13910   1   4
13911   0   0

[5099 rows x 2 columns]


### (1) Function to get coefficients

In [4]:
#-------------------------------------#
#---Function to get coefficients------#
#-------------------------------------#
def get_coef(data, exp, numlist):
    """Regress ``log_inuidur1`` on the columns in ``exp`` and pick out coefficients.

    Parameters
    ----------
    data : table indexable by column label(s); must contain ``'log_inuidur1'``
        and every column named in ``exp``.
    exp : list of column names used as regressors, in the order they are
        passed to the model.
    numlist : iterable of integer positions into the fitted ``coef_`` vector
        (position ``i`` corresponds to ``exp[i]``).

    Returns
    -------
    list
        The fitted coefficients at the requested positions, in the same order
        as ``numlist``.
    """
    fitted = LinearRegression().fit(data[exp], data['log_inuidur1'])
    return [fitted.coef_[position] for position in numlist]

### (2) Bootstrapping 

In [5]:
#----------------------#
#---Bootstrapping------#
#----------------------#
# Bootstrap: redraw the rows of `data` with replacement `nboot` times, refit the
# regression on each redraw, and keep the coefficients at positions 0, 1, 2 of
# `exp` (T4, female, black).

# Kept so any later use of the stdlib `random` module stays seeded as before.
# It does NOT control `resample`, which draws from NumPy's generator.
random.seed(1234)
# Seeded NumPy generator handed to `resample`; its state advances on every draw,
# so each replication differs while the whole run is reproducible.
rng = np.random.RandomState(1234)

nboot = 10000
# Regressor list is the same for every replication, so define it once.
exp = ['T4', 'female', 'black', 'othrace', 'dep', 'q2', 'q3', 'q4', 'q5', 'q6', 'agelt35', 'agegt54', 'durable', 'lusd', 'husd']
point_estimates = []

for k in range(nboot):
    sampled_data = resample(data, replace=True, random_state=rng)
    result = get_coef(sampled_data, exp, [0, 1, 2])

    #appending results
    point_estimates.append(result)

### (3) Statistics

In [6]:
#-------------------#
#---Statistics------#
#-------------------#
# Stack the per-replication coefficient lists into a 2-D array: one row per
# bootstrap draw, one column per tracked coefficient (0: T4, 1: female, 2: black).
point_estimates = np.array(point_estimates)

draws_T4 = point_estimates[:, 0]
draws_female = point_estimates[:, 1]
draws_black = point_estimates[:, 2]

# Bootstrap point estimate = mean of the replicated coefficients.
mean_T4 = np.mean(draws_T4)
mean_female = np.mean(draws_female)
mean_black = np.mean(draws_black)

# Bootstrap standard error = sample standard deviation (ddof=1) of the replications.
sd_T4 = np.std(draws_T4, ddof=1)
sd_female = np.std(draws_female, ddof=1)
sd_black = np.std(draws_black, ddof=1)

### (4) Results

In [7]:
#--------------#
#---Table------#
#--------------#
# Summary table: one row per coefficient, with its bootstrap mean and bootstrap
# standard error. Built in a single constructor call rather than by enlarging an
# empty frame row by row, so the numeric columns get a float dtype directly.
table = pd.DataFrame(
    [
        ["T4", mean_T4, sd_T4],
        ["Female", mean_female, sd_female],
        ["Black", mean_black, sd_black],
    ],
    columns=["Variable", "Coefficient (bootstrap)", "Standard error (bootstrap)"],
)

print(table)

  Variable  Coefficient (bootstrap)  Standard error (bootstrap)
0       T4                -0.071609                    0.035251
1   Female                 0.125620                    0.035109
2    Black                -0.292796                    0.058986
