# 1. The Validation Set Approach

In [1]:
import numpy as np
import pandas as pd

pd.set_option('precision',2)
pd.set_option('display.max_rows', 12)
pd.set_option('display.float_format', '{:20.2f}'.format) #e같은 애들 안 나오게 해줌

In [2]:
#data 불러오기

auto=pd.read_csv('../datasets/Auto.csv', na_values=['?'])
#? 표시인 애들이 na값이다.

auto.dropna(axis=0, inplace=True)
#na 처리

auto.cylinders = auto.cylinders.astype('category')
auto.name = auto.name.astype('category')
#카테고리 변수 카테고리화

auto.reset_index(inplace=True)
#inplace 하면 원본 객체가 변경됨.

#polynomial 변수 생성
auto['horsepower_2']=np.power(auto.horsepower, 2)
auto['horsepower_3']=np.power(auto.horsepower, 3)
auto['horsepower_4']=np.power(auto.horsepower, 4)
auto['horsepower_5']=np.power(auto.horsepower, 5)

#sklearn을 이용한 polynomial feature 설정
from sklearn.preprocessing import PolynomialFeatures
pol = PolynomialFeatures(degree = 5, interaction_only = False, include_bias=False)
polf = pol.fit_transform(auto.loc[:,'horsepower'].values.reshape(-1,1)) 
#이렇게 하면 polf는 1열이 horsepower1차 2열이 2차 ... 5열이 5차인 매트릭스 형태의 array로 만들어짐.
#-1이 의미하는 거는 만약 열이 1이라고 했을 때 행은 자동으로 추정되도록 하는 것.
#그냥 auto.loc[]으로 하면 얘는 index를 포함하고 있기 때문에 error가 뜬다. 따라서 values만 따로 뽑아주기 위해서 values.reshape 시행


In [3]:
#Train Set과 Validation Set으로 나누기
#Test Set으로 되어있지만 사실상 Validation Set

from sklearn.model_selection import train_test_split

X, y = auto.loc[:, ['horsepower', 'horsepower_2', 'horsepower_3']], auto.mpg
#horsepower 1차 2차 3차 항을 X변수, mpg를 종속변수로 둠

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state=42)

In [4]:
#Linear Model 만들어서 적용

from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_squared_error as mse

#ols model with intercept
lm1 = LR(fit_intercept = True)
lm2 = LR(fit_intercept = True)
lm3 = LR(fit_intercept = True)

lm1_fit = lm1.fit(X_train.loc[:, 'horsepower'].values.reshape(-1,1), y_train)
lm2_fit = lm2.fit(X_train.loc[:,['horsepower', 'horsepower_2']], y_train)
lm3_fit = lm3.fit(X_train.loc[:,['horsepower', 'horsepower_2','horsepower_3']], y_train)
#1차일때만 reshape해주면 됨. 2차 이상인 경우 열이 2개 이상이여서 자동으로 dataframe으로 바껴서 reshape 필요 없는듯.

lm1_predict = lm1_fit.predict(X_test.loc[:, 'horsepower'].values.reshape(-1,1))
lm2_predict = lm2_fit.predict(X_test.loc[:, ['horsepower','horsepower_2']])
lm3_predict = lm3_fit.predict(X_test.loc[:, ['horsepower','horsepower_2','horsepower_3']])

print('lm1 MSE:', mse(y_test, lm1_predict))
print('lm2 MSE:', mse(y_test, lm2_predict))
print('lm3 MSE:', mse(y_test, lm3_predict))

#이렇게 나온 값들은 진짜 test mse의 추정값들. 
#2차인 애의 mse가 제일 작으므로 2차 polynomial로 fitting

lm1 MSE: 25.573878189684407
lm2 MSE: 22.218020050032855
lm3 MSE: 22.66767543553442


# 2. Leave-One-Out Cross Validation

In [5]:
from sklearn.model_selection import LeaveOneOut as LOOCV

X, y = auto.loc[:,['horsepower', 'horsepower_2', 'horsepower_3','horsepower_4','horsepower_5']], auto.mpg

loocv = LOOCV()
loocv.get_n_splits(X)

#몇 개로 나눠지는 지 알려줌

392

In [7]:
#1차 poly만 볼때

loocv_mse=[]
lm = LR(fit_intercept=True)

for train_index, test_index in loocv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    lm1_fit = lm.fit(X_train.loc[:, 'horsepower'].values.reshape(-1,1), y_train)
    lm1_predict = lm1_fit.predict(X_test.loc[:, 'horsepower'].values.reshape(-1,1))
    
    loocv_mse.append(mse(y_test, lm1_predict))
    
np.array(loocv_mse).mean()

24.231513517929226

In [12]:
#sklearn을 이용하면 CV를 더 쉽게 할 수 있다.

from sklearn.model_selection import cross_val_score as cvs

lm = LR(fit_intercept=True)

cval = cvs(lm,
           auto.loc[:, 'horsepower'].values.reshape(-1,1),
           auto.mpg,
           cv = len(auto), #k=n인 k-fold cv가 곧 LOOCV이므로
           n_jobs=-1,
           scoring='neg_mean_squared_error')

-cval.mean()
#우리의 방법이 scoring이 neg_mean_squred_error였기 때문에 값을 정확히 알려면 -를 붙여주어야 함.


24.23151351792923

In [14]:
# 이제 1차부터 5차까지 한번에 cv_mse를 구할 수 있는 loop를 만들어보자.

loocv_poly={}

for i in range(1,6):
    
    loocv_mse=[]
    
    for train_index, test_index in loocv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        if i == 1:
            X_train = X_train.iloc[:, 0:1].values.reshape(-1,1)
            X_test = X_test.iloc[:, 0:1].values.reshape(-1,1)
        
        else:
            X_train = X_train.iloc[:, 0:i]
            X_test = X_test.iloc[:, 0:i]
        
        MSE = mse(y_test, LR(fit_intercept=True).fit(X_train, y_train).predict(X_test))
    
        loocv_mse.append(MSE)
        
        loocv_poly['lm'+str(i)+'_MSE']= np.array(loocv_mse).mean()

In [15]:
loocv_poly

{'lm1_MSE': 24.231513517929226,
 'lm2_MSE': 19.248213124489677,
 'lm3_MSE': 19.334984064029175,
 'lm4_MSE': 19.424430310525526,
 'lm5_MSE': 19.033212804000605}

# 3. K-Fold Cross-Validation

In [16]:
from sklearn.model_selection import KFold

X, y = auto.loc[:,['horsepower', 'horsepower_2', 'horsepower_3','horsepower_4','horsepower_5']], auto.mpg

kf = KFold(n_splits=10, shuffle = True, random_state=42)
#10개로 쪼개고 셔플할 것임

In [18]:
kf_poly = {}

for i in range(1,6):
    
    kf_mse=[]
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        if i == 1:
            X_train = X_train.iloc[:,0:1].values.reshape(-1,1)
            X_test = X_test.iloc[:,0:1].values.reshape(-1,1)
        else:
            X_train = X_train.iloc[:,0:i]
            X_test = X_test.iloc[:,0:i]
        
        MSE = mse(y_test, LR(fit_intercept=True).fit(X_train, y_train).predict(X_test))
        kf_mse.append(MSE)
        
        kf_poly['lm'+str(i)+'_MSE']=np.array(kf_mse).mean()

kf_poly

{'lm1_MSE': 24.199808197692477,
 'lm2_MSE': 19.22863661426802,
 'lm3_MSE': 19.266265346631833,
 'lm4_MSE': 19.351092272961104,
 'lm5_MSE': 19.02323300932039}

# 4. The Bootstrap

In [22]:
pd.read_csv('../datasets/Portfolio.csv')

Unnamed: 0.1,Unnamed: 0,X,Y
0,1,-0.90,-0.23
1,2,-1.56,-0.89
2,3,-0.42,0.27
3,4,1.04,-0.73
4,5,-0.32,0.84
...,...,...,...
95,96,0.48,1.45
96,97,-0.54,-0.40
97,98,-0.77,-0.96
98,99,0.40,1.40


In [20]:
portfolio = pd.read_csv('../datasets/Portfolio.csv', index_col=0)
#index column이 이미 있으므로 1번째 열이 index column 임을 알려준다.

In [21]:
portfolio

Unnamed: 0,X,Y
1,-0.90,-0.23
2,-1.56,-0.89
3,-0.42,0.27
4,1.04,-0.73
5,-0.32,0.84
...,...,...
96,0.48,1.45
97,-0.54,-0.40
98,-0.77,-0.96
99,0.40,1.40


In [26]:
def alpha_fn(data, start_index, end_index):
    X = data['X'][start_index:end_index]
    Y = data['Y'][start_index:end_index]
    return ((np.var(Y) - np.cov(X,Y)[0][1]) / (np.var(X) + np.var(Y) - 2*np.cov(X,Y)[0][1]))

In [27]:
#X, Y가 전체집단이라고 할 경우에 모수

alpha_fn(portfolio, 0, 100)

0.5766511516104118

In [31]:
#Python에서 Bootstrap은 sklearn 안에 있는 resample을 이용한다. (bootstrap 기능 자체는 없음)

from sklearn.utils import resample

portfolio_bs = resample(portfolio, replace = True, n_samples = 100)

alpha_fn(portfolio_bs, 0, 100)
#portfolio 데이터에서 랜덤으로 100개를 중복으로 뽑아서 데이터를 새로 구한 뒤의 alpha값

0.5517469372674121

In [33]:
bs_alpha =[]

for i in range(0,1000):
    alpha = alpha_fn(resample(portfolio, replace=True, n_samples=100), 0, 100)
    bs_alpha.append(alpha)

bs_alpha = np.array(bs_alpha)

print('Bootstrapped alpha hat:', bs_alpha.mean()) #Bootstrap으로 구한 alpha hat
print('SE:', bs_alpha.std())

Bootstrapped alpha hat: 0.5799715788046581
SE: 0.09371490124310197


In [37]:
def boot_fn(data, start_index, end_index):
    m = LR(fit_intercept=True).fit(
         data['horsepower'][start_index:end_index].values.reshape(-1,1),
         data['mpg'][start_index:end_index]
    )
    
    return m.intercept_, m.coef_[0]

boot_fn(auto, 0, 392)
#전체 데이터를 사용했을 때 OLS intercept값과 horsepower intercept 값

(39.93586102117047, -0.15784473335365365)

In [38]:
boot_fn(resample(auto, replace=True, n_samples=392), 0, 392)
#auto 데이터에서 392개를 resampling해서 새로운 데이터를 fitting해서 구한 베타 값들

(39.78371083147526, -0.15574205332468838)

In [52]:
bs_boot = {'intercept':[], 'beta 1':[]}

for i in range(0,1000):
    bs_boot['intercept'].append(
        boot_fn(resample(auto, replace=True, n_samples=392), 0, 392)[0]
    )
    bs_boot['beta 1'].append(
        boot_fn(resample(auto, replace=True, n_samples=392), 0, 392)[1]
    )

intercept_hat = np.array(bs_boot['intercept']).mean()
intercept_se = np.array(bs_boot['intercept']).std()
b1_hat = np.array(bs_boot['beta 1']).mean()
b1_se = np.array(bs_boot['beta 1']).std()

print('intercept bs hat & se:', intercept_hat, intercept_se)
print('b1 bs hat & se:', b1_hat, b1_se)
#위에서 boot_fn(auto, 0, 392)를 사용했을 때와 거의 차이가 없다는 것을 알 수 있다.

intercept bs hat & se: 39.962596747215926 0.8472821945395944
b1 bs hat & se: -0.15826843959184062 0.00747410123974911


In [57]:
# horsepower 2 변수가 추가되는 경우

def boot_fn2(data, start_index, end_index):
    m = LR(fit_intercept=True).fit(
        data[['horsepower', 'horsepower_2']][start_index:end_index],
        data['mpg'][start_index:end_index]
    )
    
    return m.intercept_, m.coef_[0], m.coef_[1]

In [58]:
bs_boot2 = {'intercept':[], 'beta 1':[], 'beta 2':[]}

for i in range(0,1000):
    bs_boot2['intercept'].append(
        boot_fn2(resample(auto, replace=True, n_samples=392), 0, 392)[0]
    )
    bs_boot2['beta 1'].append(
        boot_fn2(resample(auto, replace=True, n_samples=392), 0, 392)[1]
    )
    bs_boot2['beta 2'].append(
        boot_fn2(resample(auto, replace=True, n_samples=392), 0, 392)[2]
    )

intercept_hat = np.array(bs_boot2['intercept']).mean()
intercept_se = np.array(bs_boot2['intercept']).std()
b1_hat = np.array(bs_boot2['beta 1']).mean()
b1_se = np.array(bs_boot2['beta 1']).std()
b2_hat = np.array(bs_boot2['beta 2']).mean()
b2_se = np.array(bs_boot2['beta 2']).std()

print('intercept bs hat & se:', intercept_hat, intercept_se)
print('beta 1 bs hat & se:', b1_hat, b1_se)
print('beta 2 bs hat & se:', b2_hat, b2_se)

intercept bs hat & se: 56.93246800779829 2.0133303879344746
beta 1 bs hat & se: -0.46704350129262445 0.03362387137249098
beta 2 bs hat & se: 0.0012355562744599086 0.0001184440097654577


In [59]:
import statsmodels.formula.api as sm

ols2 = sm.ols('mpg ~ horsepower + horsepower_2', data =auto).fit()
ols2.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,56.9001,1.800,31.604,0.000,53.360,60.440
horsepower,-0.4662,0.031,-14.978,0.000,-0.527,-0.405
horsepower_2,0.0012,0.000,10.080,0.000,0.001,0.001
