## Part 1 - Bootstraping

In [27]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from scipy.stats import norm, t
from sklearn.utils import resample
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from itertools import combinations
from collections import defaultdict

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('framingham_heart_disease.csv')

### Section 1
Reminder from previous part:<br>
We would like to explore the relations between: Number of cigaretes per day (discrete), total cholesterol (continous), diaBP (diastolic BP) and sysBP (systolic BP - continous). <br>
Hence, our research question would be:
**What are the effects of number of cigaretes per day, total cholesterol and diaBP over sysBP**

Preparing the subset that we will work with, as done in part 3

In [3]:
df.dropna(subset=['cigsPerDay', 'totChol', 'diaBP', 'sysBP'], inplace=True)
df_train, df_test, y_train, y_test = train_test_split(df[['cigsPerDay', 'totChol', 'diaBP']], df[['sysBP']], test_size=0.9519115171916326, random_state=42)

#### section a - calculating confidence intervals based on the variance matrix and normal approximation 
Calculating the parameters

In [4]:
X = sm.add_constant(df_train)
model = sm.OLS(y_train,X)
model_res = model.fit()
model_res.params

const        -6.795017
cigsPerDay   -0.002428
totChol       0.055739
diaBP         1.520925
dtype: float64

In [5]:
# Since the variance is unknown we will compute it using the unbiased estimator as shown in tutorial 5
dof = len(df_train)-2
sigma_squared = np.dot(model_res.resid.to_numpy().T,model_res.resid.to_numpy())/(len(df_train)-4)
ci_t_percentile = norm().ppf(0.95)
C = np.linalg.inv(np.matmul(X.T,X))  # C = (X.T * X)^-1

In [6]:
for i, beta in enumerate(model_res.params):
    print(f"Confidence interval for beta_{i} is: [{beta - ci_t_percentile*np.sqrt(sigma_squared*C[i,i])},{beta + ci_t_percentile*np.sqrt(sigma_squared*C[i,i])}]")

Confidence interval for beta_0 is: [-20.120873608415245,6.530840172599772]
Confidence interval for beta_1 is: [-0.12499805793941654,0.12014199808401821]
Confidence interval for beta_2 is: [0.024470264676292097,0.08700841545378751]
Confidence interval for beta_3 is: [1.3755587622140237,1.6662906536246527]


#### section b - confidence intervals based on bootsrtap standart error

In [56]:
def cal_bootstrap_normal_se(B, df, features_cols, label_cols):
    df_len = len(df)
    num_param = len(df.columns)
    param_sum = np.zeros((num_param))
    param_sum2 = np.zeros((num_param))
    for i in range(B):
        boot_df = df.sample(replace=True, n=200)
        y = boot_df[label_cols]
        X = boot_df[features_cols]
        X['const'] = np.ones((df_len))
        X = X[['const', *features_cols]]
        model = sm.OLS(y,X)
        model_res = model.fit()
        param_sum = param_sum + model_res.params
        param_sum2 = param_sum2 + (model_res.params) ** 2
    return np.sqrt((1/B)*(param_sum2) - ((1/B)*(param_sum))**2)
        

In [57]:
bootstrap_se = cal_bootstrap_normal_se(400, pd.concat([df_train, y_train], axis=1),
                        features_cols=['cigsPerDay', 'totChol', 'diaBP'], label_cols=['sysBP'])

In [58]:
for i, beta in enumerate(model_res.params):
    print(f"Confidence interval based on bootstrap s.e for beta_{i} is: [{beta - ci_t_percentile*bootstrap_se[i]},{beta + ci_t_percentile*bootstrap_se[i]}]")

Confidence interval based on bootstrap s.e for beta_0 is: [-23.24972522909058,9.659691793275108]
Confidence interval based on bootstrap s.e for beta_1 is: [-0.12157323646605833,0.11671717661066]
Confidence interval based on bootstrap s.e for beta_2 is: [0.021400344513457188,0.09007833561662243]
Confidence interval based on bootstrap s.e for beta_3 is: [1.3426202113096413,1.6992292045290351]


#### section c - pivotals confidence intervals 

In [104]:
def cal_boot_pivot_quantile(B, df, features_cols, labels_cols, alpha):
    num_sample = len(df)
    q = 1 - alpha/2
    g_dict = defaultdict(list)
    q_list = [0] *(len(features_cols) + 1)
    # calculate the real sample param
    X = df[features_cols]
    X['const'] = np.ones((num_sample))
    X = X[['const', *features_cols]]
    Y = df[labels_cols]
    model = sm.OLS(Y, X)
    model_res = model.fit()
    b_r = model_res.params
    for i in range(B):
        new_df = df.sample(num_sample, replace=True)
        X = new_df[features_cols]
        X['const'] = np.ones((num_sample))
        X = X[['const', *features_cols]]
        Y = new_df[labels_cols]
        model = sm.OLS(Y, X)
        model_res = model.fit()
        for i in range(len(features_cols) + 1):
            g_dict[i].append((num_sample*0.5)*(model_res.params[i] - b_r[i]))
    for i in range(len(features_cols) + 1):
        q_list[i] = (np.quantile(g_dict[i], q), np.quantile(g_dict[i], alpha/2))
    return q_list

In [105]:
qunatiles_list = cal_boot_pivot_quantile(400, pd.concat([df_train, y_train], axis=1),
                        features_cols=['cigsPerDay', 'totChol', 'diaBP'], labels_cols=['sysBP'], alpha=0.05)

In [106]:
for i, beta in enumerate(model_res.params):
    print(f"Pivotal confidence intervals based on bootstrap for beta_{i} is: [{beta - qunatiles_list[i][0]/np.sqrt(len(df_train))},{beta - qunatiles_list[i][1]/np.sqrt(len(df_train))}]")

Pivotal confidence intervals based on bootstrap for beta_0 is: [-142.2246230709868,136.31292791267322]
Pivotal confidence intervals based on bootstrap for beta_1 is: [-0.9973885015651721,1.012397979801853]
Pivotal confidence intervals based on bootstrap for beta_2 is: [-0.23723715197733333,0.33246568810088317]
Pivotal confidence intervals based on bootstrap for beta_3 is: [-0.0499624887754353,2.8640585092514037]


### Section 2 - confidence intervals based on quantiles method

In [99]:
def cal_boot_quantile(B, df, features_cols, labels_cols, alpha):
    num_sample = len(df)
    q1 = 1 - (alpha / 2)
    q2 = alpha / 2
    g_dict = defaultdict(list)
    q_list = [0] * (len(features_cols) + 1)
    for i in range(B):
        new_df = df.sample(num_sample, replace=True)
        X = new_df[features_cols]
        X['const'] = np.ones((num_sample))
        X = X[['const', *features_cols]]
        Y = new_df[labels_cols]
        model = sm.OLS(Y, X)
        model_res = model.fit()
        for i in range(len(features_cols) + 1):
            g_dict[i].append(model_res.params[i])
    for i in range(len(features_cols) + 1):
        q_list[i] = (np.quantile(g_dict[i], q2), np.quantile(g_dict[i], q1))
    return q_list

In [100]:
confidence_intervals = cal_boot_quantile(400, pd.concat([df_train, y_train], axis=1),
                        features_cols=['cigsPerDay', 'totChol', 'diaBP'], labels_cols=['sysBP'], alpha=0.05)

In [103]:
for ci in confidence_intervals:
    print(f"Quantiles confidence intervals based on bootstrap for beta_{i} is: [{ci[0]},{ci[1]}]")

Quantiles confidence intervals based on bootstrap for beta_3 is: [-28.044054623349616,9.120851869597175]
Quantiles confidence intervals based on bootstrap for beta_3 is: [-0.16041141204810144,0.1417458305429191]
Quantiles confidence intervals based on bootstrap for beta_3 is: [0.01591730129880664,0.09482804535626096]
Quantiles confidence intervals based on bootstrap for beta_3 is: [1.3396107610635424,1.7668155629224438]


### Comparing the confidence intervals

**Confidence intervals for $\beta_1$**

In [None]:
Confidence interval for beta_0 is: [-20.120873608415245,6.530840172599772]
Confidence interval for beta_1 is: [-0.12499805793941654,0.12014199808401821]
Confidence interval for beta_2 is: [0.024470264676292097,0.08700841545378751]
Confidence interval for beta_3 is: [1.3755587622140237,1.6662906536246527]

In [None]:
Confidence interval based on bootstrap s.e for beta_0 is: [-23.24972522909058,9.659691793275108]
Confidence interval based on bootstrap s.e for beta_1 is: [-0.12157323646605833,0.11671717661066]
Confidence interval based on bootstrap s.e for beta_2 is: [0.021400344513457188,0.09007833561662243]
Confidence interval based on bootstrap s.e for beta_3 is: [1.3426202113096413,1.6992292045290351]

In [None]:
Pivotal confidence intervals based on bootstrap for beta_0 is: [-159.0955329137899,127.2672041798435]
Pivotal confidence intervals based on bootstrap for beta_1 is: [-1.0780884830760757,1.1102611332927885]
Pivotal confidence intervals based on bootstrap for beta_2 is: [-0.1950850780216041,0.36993707354491834]
Pivotal confidence intervals based on bootstrap for beta_3 is: [-0.017978470252370204,3.0708637474471248]

In [None]:
Quantiles confidence intervals based on bootstrap for beta_3 is: [-28.044054623349616,9.120851869597175]
Quantiles confidence intervals based on bootstrap for beta_3 is: [-0.16041141204810144,0.1417458305429191]
Quantiles confidence intervals based on bootstrap for beta_3 is: [0.01591730129880664,0.09482804535626096]
Quantiles confidence intervals based on bootstrap for beta_3 is: [1.3396107610635424,1.7668155629224438]

### Section 3

#### Section a

In [108]:
new_df = pd.concat([df_test, y_test], axis=1).sample(100)

In [111]:
model_res.predict(df_test)

ValueError: shapes (3959,3) and (4,) not aligned: 3 (dim 1) != 4 (dim 0)