# 선형회귀 이해하기.

In [None]:
'''
선형회귀 part1 - 직접 OLS 구현하기.

sklearn의 datasets에서 make_regression 함수를 사용하면 선형회귀용 데이터셋을 만들 수 있다.
인자로 n_samples, n_features, bias, noise, coef, random_state를 필요로 한다.
coef=True일 때 리턴값은 X0, y, w이다. y는 1차원 배열이므로 열벡터로 reshape 해야 한다.
'''

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import make_regression
from statsmodels.api import add_constant

In [3]:
# Generate a synthetic regression dataset: 100 samples, 5 features, true
# intercept 100 and Gaussian noise with standard deviation 10.
# With coef=True the call also returns the true coefficients (bound to w here;
# w is overwritten by the OLS estimate in a later cell).
X0, y, w = make_regression(
    n_samples=100,
    n_features=5,
    bias=100,
    noise=10,
    coef=True,
    # Explicit integer seed. The original passed True, which only worked
    # because bool is an int subclass and is interpreted as seed 1.
    random_state=1,
)

In [12]:
# Turn y into a column vector; -1 lets NumPy infer the row count.
y = y.reshape(-1, 1)

In [15]:
# Add the constant column to the feature matrix so the intercept is
# estimated together with the feature weights.
X = add_constant(X0)

In [19]:
# Compute the OLS weight vector directly from the normal equations
#   (X^T X) w = X^T y.
# Solving the linear system instead of forming inv(X^T X) explicitly is
# cheaper and numerically more accurate, while giving the same solution.
w = np.linalg.solve(X.T @ X, X.T @ y)

In [20]:
# Fitted weights: the first entry is the intercept (constant term),
# followed by the five feature coefficients.
w

array([[100.97198982],
       [  8.23155766],
       [ 51.62669014],
       [ 35.68489779],
       [ 26.46372731],
       [ 22.49300908]])

In [24]:
# Predicted-value vector: design matrix times the fitted weights.
y_ = X @ w

In [32]:
# Target (ground-truth) vector: first five rows.
y[:5]

array([[ 132.23471426],
       [ 156.99996383],
       [ 209.50432051],
       [  47.77297442],
       [-161.04915117]])

In [33]:
# Prediction vector: first five rows, for comparison with the targets.
y_[:5]

array([[ 130.34130807],
       [ 181.32511203],
       [ 206.9112515 ],
       [  51.19991119],
       [-171.7617931 ]])

In [27]:
# Residual vector: target minus prediction.
e = y - y_

In [29]:
# Residual sum of squares, e^T e. Because e is a column vector the result
# is a 1x1 array rather than a scalar.
RSS = e.T @ e

In [30]:
# Display the residual sum of squares.
RSS

array([[8609.81952755]])

In [None]:
'''
선형회귀 part2 - sklearn의 LinearRegression 사용하기.

sklearn의 linear_model에 있는 LinearRegression 클래스를 사용하면 보다 쉽게 선형회귀식을 얻을 수 있다.
'''

In [35]:
from sklearn.linear_model import LinearRegression

In [36]:
# Create the linear regression model.
# If the data already contains the constant column, pass
# fit_intercept=False so the intercept is not estimated a second time.
model = LinearRegression()

In [38]:
# fit returns the linear regression model fitted to the given X and y.
# X0 (without the constant column) is used because the model estimates
# the intercept itself.
model = model.fit(X0, y)

In [39]:
# coef_ holds the weight vector and intercept_ the constant term.
model.coef_, model.intercept_

(array([[ 8.23155766, 51.62669014, 35.68489779, 26.46372731, 22.49300908]]),
 array([100.97198982]))

In [46]:
# Test data: two samples with five features each, holding the integers
# 1..10 laid out row by row.
X_test = np.arange(1, 11).reshape(2, 5)

In [47]:
# Predictions for the test data (one row of output per test sample).
model.predict(X_test)

array([[ 537.83157578],
       [1260.33098567]])

In [None]:
'''
선형회귀 part3 - statsmodels 패키지 사용하기.

statsmodels의 OLS를 사용한다.
OLS의 인자로 y와 X를 데이터프레임 형식으로 전달해주면 모델이 반환된다. 이 때 X는 상수항 결합이 완료되어야 한다.
model의 fit 함수를 사용하면 RegressionResults 객체가 반환된다.
RegressionResults 객체의 predict 함수를 사용하면 예측 결과를 얻을 수 있고 summary 함수를 사용하면 선형회귀의 결과 요약을 얻을 수 있다.
'''

In [77]:
import statsmodels.api as sm

In [78]:
# Wrap the raw features in a DataFrame with columns f1..f5 and add the
# constant column in a single expression.
X = sm.add_constant(
    pd.DataFrame(data=X0, columns=[f"f{i}" for i in range(1, 6)])
)

In [79]:
# Wrap the target column vector in a DataFrame with a single column 'y'.
y = pd.DataFrame(data=y, columns=['y'])

In [85]:
# Build the statsmodels OLS model; note the argument order is (y, X) and
# X must already include the constant column.
model = sm.OLS(y, X)

In [86]:
# Fit the model; the returned results object provides predict, summary,
# params and resid, used in the cells below.
rlt = model.fit()

In [87]:
# Summary of the linear regression results.
rlt.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.984
Model:,OLS,Adj. R-squared:,0.983
Method:,Least Squares,F-statistic:,1140.0
Date:,"Sat, 04 Sep 2021",Prob (F-statistic):,1.8299999999999998e-82
Time:,17:45:01,Log-Likelihood:,-364.67
No. Observations:,100,AIC:,741.3
Df Residuals:,94,BIC:,757.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,100.9720,0.987,102.337,0.000,99.013,102.931
f1,8.2316,0.956,8.610,0.000,6.333,10.130
f2,51.6267,0.942,54.801,0.000,49.756,53.497
f3,35.6849,0.991,36.014,0.000,33.718,37.652
f4,26.4637,1.036,25.542,0.000,24.407,28.521
f5,22.4930,1.067,21.087,0.000,20.375,24.611

0,1,2,3
Omnibus:,1.74,Durbin-Watson:,1.683
Prob(Omnibus):,0.419,Jarque-Bera (JB):,1.492
Skew:,-0.15,Prob(JB):,0.474
Kurtosis:,2.482,Cond. No.,1.48


In [90]:
# Predictions for the test data. Each row starts with 1 for the constant
# column, followed by the five feature values.
rlt.predict([[1, 1, 2, 3, 4, 5], [1, 6, 7, 8, 9, 10]])

array([ 537.83157578, 1260.33098567])

In [91]:
# Weight vector: the constant term followed by the coefficients of f1..f5.
rlt.params

const    100.971990
f1         8.231558
f2        51.626690
f3        35.684898
f4        26.463727
f5        22.493009
dtype: float64

In [93]:
# Residual vector, one entry per observation.
rlt.resid

0      1.893406
1    -24.325148
2      2.593069
3     -3.426937
4     10.712642
        ...    
95     6.912101
96    -4.243664
97    -2.649401
98     1.568349
99    -5.582025
Length: 100, dtype: float64

In [95]:
# The mean of the residuals is zero up to floating-point error.
rlt.resid.mean()

-1.4420464822251234e-13

In [None]:
# X의 평균에 대한 예측값은 y의 평균과 같다.

In [109]:
# Predict at the column-wise mean of X (the constant column is included,
# and its mean is 1).
rlt.predict(X.mean().values)

array([110.3267573])

In [110]:
# Mean of the target, for comparison with the prediction at the mean of X.
y.mean()

y    110.326757
dtype: float64

In [None]:
'''
선형회귀 part4 - 보스턴 집값 예측
'''

In [None]:
'''
1번 방법: 직접 OLS 구현하기.
'''

In [111]:
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
# in 1.2, so this cell needs an older scikit-learn — confirm the installed
# version or switch to another data source.
from sklearn.datasets import load_boston
boston = load_boston()

In [116]:
# Raw feature matrix, used as-is.
X0 = boston.data
# Target reshaped into a column vector for the matrix algebra below.
y = boston.target.reshape(-1, 1)
# Design matrix with the constant column added.
X = sm.add_constant(X0)

In [119]:
# OLS weights from the normal equations (X^T X) w = X^T y.
# The system is solved directly rather than via an explicit inverse; this
# design matrix is poorly conditioned, so avoiding inv() reduces the
# floating-point error in the estimated weights.
w = np.linalg.solve(X.T @ X, X.T @ y)

In [120]:
# Display the fitted weights: the constant term first, then one
# coefficient per feature.
w

array([[ 3.64594884e+01],
       [-1.08011358e-01],
       [ 4.64204584e-02],
       [ 2.05586264e-02],
       [ 2.68673382e+00],
       [-1.77666112e+01],
       [ 3.80986521e+00],
       [ 6.92224640e-04],
       [-1.47556685e+00],
       [ 3.06049479e-01],
       [-1.23345939e-02],
       [-9.52747232e-01],
       [ 9.31168327e-03],
       [-5.24758378e-01]])

In [122]:
# Residual vector: target minus the fitted values X @ w.
e = y - X @ w

In [123]:
# Residual sum of squares, e^T e (a 1x1 array because e is a column vector).
RSS = e.T @ e

In [124]:
# Display the residual sum of squares.
RSS

array([[11078.78457795]])

In [129]:
# Two synthetic test rows: a leading 1 for the constant column followed by
# 13 consecutive integers (0..12 and 13..25) as feature values.
test = [[1, *range(start, start + 13)] for start in (0, 13)]

In [131]:
# Predictions for the test rows: matrix product of the rows with the
# weight vector, then display the result.
pred = np.matmul(test, w)
pred

array([[ -31.11946575],
       [-212.60464153]])

In [None]:
'''
2번 방법: sklearn의 LinearRegression 사용하기.
'''

In [135]:
from sklearn.linear_model import LinearRegression

In [136]:
# Linear regression model with default settings.
model = LinearRegression()

In [138]:
# Fit on the raw features X0 (no constant column) and the target y.
model = model.fit(X0, y)

In [139]:
# Show the fitted feature weights (coef_) and the constant term (intercept_).
model.coef_, model.intercept_

(array([[-1.08011358e-01,  4.64204584e-02,  2.05586264e-02,
          2.68673382e+00, -1.77666112e+01,  3.80986521e+00,
          6.92224640e-04, -1.47556685e+00,  3.06049479e-01,
         -1.23345939e-02, -9.52747232e-01,  9.31168327e-03,
         -5.24758378e-01]]),
 array([36.45948839]))

In [141]:
# Two synthetic test rows of 13 feature values each (0..12 and 13..25);
# no constant column, since the sklearn model handles the intercept.
test = [list(range(start, start + 13)) for start in (0, 13)]

In [142]:
# Predictions for the two test rows.
model.predict(test)

array([[ -31.11946574],
       [-212.60464153]])

In [None]:
'''
3번 방법: statsmodels.api의 OLS 사용하기.
'''

In [143]:
import statsmodels.api as sm

In [146]:
# Wrap the raw features in a DataFrame (default integer column labels)
# and add the constant column in a single expression.
X = sm.add_constant(pd.DataFrame(data=X0))

In [148]:
# Wrap the target in a DataFrame (the column gets the default label 0).
y = pd.DataFrame(data=y)

In [150]:
# Build the statsmodels OLS model; argument order is (y, X), and X already
# includes the constant column.
model = sm.OLS(y, X)

In [151]:
# Fit the model and keep the results object.
rlt = model.fit()

In [152]:
# Summary of the regression results.
rlt.summary()

0,1,2,3
Dep. Variable:,0,R-squared:,0.741
Model:,OLS,Adj. R-squared:,0.734
Method:,Least Squares,F-statistic:,108.1
Date:,"Sat, 04 Sep 2021",Prob (F-statistic):,6.72e-135
Time:,18:12:05,Log-Likelihood:,-1498.8
No. Observations:,506,AIC:,3026.0
Df Residuals:,492,BIC:,3085.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,36.4595,5.103,7.144,0.000,26.432,46.487
0,-0.1080,0.033,-3.287,0.001,-0.173,-0.043
1,0.0464,0.014,3.382,0.001,0.019,0.073
2,0.0206,0.061,0.334,0.738,-0.100,0.141
3,2.6867,0.862,3.118,0.002,0.994,4.380
4,-17.7666,3.820,-4.651,0.000,-25.272,-10.262
5,3.8099,0.418,9.116,0.000,2.989,4.631
6,0.0007,0.013,0.052,0.958,-0.025,0.027
7,-1.4756,0.199,-7.398,0.000,-1.867,-1.084

0,1,2,3
Omnibus:,178.041,Durbin-Watson:,1.078
Prob(Omnibus):,0.0,Jarque-Bera (JB):,783.126
Skew:,1.521,Prob(JB):,8.84e-171
Kurtosis:,8.281,Cond. No.,15100.0


In [153]:
# Test rows for the statsmodels model: the constant 1 comes first, then
# 13 consecutive integers (0..12 for the first row, 13..25 for the second).
test = [[1, *range(13)], [1, *range(13, 26)]]

In [154]:
# Predictions for the two test rows (each row includes the leading constant 1).
rlt.predict(test)

array([ -31.11946574, -212.60464153])