In [2]:
import pandas as pd
import numpy as np

# 그래프 라이브러리
import matplotlib.pyplot as plt
import seaborn as sns

# 회귀 라이브러리
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# 워닝 무시
import warnings
warnings.filterwarnings('ignore')

# 1. 선형회귀 모델의 이해

In [3]:
df_heights = pd.read_csv('data/heights.csv')
df_heights.shape

(1078, 2)

In [4]:
df_ins = pd.read_csv('data/insurance.csv')
df_ins.shape

(1338, 7)

### 1-1. 단순 선형회귀 적합(fit)

In [5]:
# 아들키를 아빠키로 설명하는 선형회귀 모형 적합(fit, 데이터 학습)

In [6]:
model = LinearRegression(fit_intercept=True)  # intercept : 절편

In [7]:
# 선형회귀 모형에 데이터를 주고 적합(fit) 시키기
model.fit(df_heights[ ['father'] ], df_heights['son'])

In [8]:
# 회귀계수와 절편 확인
print(model.intercept_)  # 절편
print(model.coef_)  # 회귀계수

86.071975059358
[0.51409304]


In [13]:
# y = 0.51*X + 86.07

# 모델 예측 -> 아빠키가 173이라면 예상되는 아들키는?
X_test = 173
model.predict([[X_test]])

array([175.01007074])

### 1.2 다중 선형 회귀 모형의 적합(fit)

In [10]:
# X : 설명변수/독립변수 ,   y: 관심변수/종속변수
X = df_ins[ ['age', 'bmi', 'children'] ]
y = df_ins['charges']

In [85]:
model = LinearRegression(fit_intercept=True)

In [86]:
# 선형회귀 모형 적합(fit)
model.fit(X, y)

In [87]:
print(model.intercept_)  # 절편
print(model.coef_)      # 회귀계수

-6916.243347787044
[239.99447429 332.0833645  542.86465225]


# 2. 변수 선택과 모델 성능

### 2-1. 범주형 변수의 수치화

In [88]:
# 범주형 변수(sex, smoker, region)들도 회귀 모형에 추가하기

In [89]:
# pd.get_dummies() 함수 활용
df_dummies = pd.get_dummies(data=df_ins, drop_first=True)
df_dummies.head(1)

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0,1,0,0,1


In [90]:
# 독립변수 선언
X = df_dummies.drop(columns=['charges'])
y = df_dummies['charges']

In [91]:
# 선형회귀 모형 적합(fit)
linear_model = LinearRegression(fit_intercept=True)
linear_model.fit(X, y)

In [92]:
print(linear_model.intercept_)
print(linear_model.coef_)

-11938.538576167175
[  256.85635254   339.19345361   475.50054515  -131.3143594
 23848.53454191  -352.96389942 -1035.02204939  -960.0509913 ]


In [93]:
# R2 score(설명력) : 값이 1에 가까울수록 모형이 데이터를 잘 설명한다고 볼 수 있음
from sklearn.metrics import r2_score

y_pred = linear_model.predict(X)  # 모델 예측값
r2_score(y, y_pred)

0.7509130345985207

### 2-2. 선형회귀 모델의 R2 score와 회귀계수

In [94]:
import statsmodels.api as sm
# 상수항 추가 => 모델 성능이 좋아짐.
X_sm = sm.add_constant(X)
X_sm

Unnamed: 0,const,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,1.0,19,27.900,0,0,1,0,0,1
1,1.0,18,33.770,1,1,0,0,1,0
2,1.0,28,33.000,3,1,0,0,1,0
3,1.0,33,22.705,0,1,0,1,0,0
4,1.0,32,28.880,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,1.0,50,30.970,3,1,0,1,0,0
1334,1.0,18,31.920,0,0,0,0,0,0
1335,1.0,18,36.850,0,0,0,0,1,0
1336,1.0,21,25.800,0,0,0,0,0,1


In [95]:
# 선형회귀 모델 적합(fit)
ls = sm.OLS(y, X_sm).fit()
ls.summary()

0,1,2,3
Dep. Variable:,charges,R-squared:,0.751
Model:,OLS,Adj. R-squared:,0.749
Method:,Least Squares,F-statistic:,500.8
Date:,"Sat, 08 Jul 2023",Prob (F-statistic):,0.0
Time:,17:36:18,Log-Likelihood:,-13548.0
No. Observations:,1338,AIC:,27110.0
Df Residuals:,1329,BIC:,27160.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.194e+04,987.819,-12.086,0.000,-1.39e+04,-1e+04
age,256.8564,11.899,21.587,0.000,233.514,280.199
bmi,339.1935,28.599,11.860,0.000,283.088,395.298
children,475.5005,137.804,3.451,0.001,205.163,745.838
sex_male,-131.3144,332.945,-0.394,0.693,-784.470,521.842
smoker_yes,2.385e+04,413.153,57.723,0.000,2.3e+04,2.47e+04
region_northwest,-352.9639,476.276,-0.741,0.459,-1287.298,581.370
region_southeast,-1035.0220,478.692,-2.162,0.031,-1974.097,-95.947
region_southwest,-960.0510,477.933,-2.009,0.045,-1897.636,-22.466

0,1,2,3
Omnibus:,300.366,Durbin-Watson:,2.088
Prob(Omnibus):,0.0,Jarque-Bera (JB):,718.887
Skew:,1.211,Prob(JB):,7.860000000000001e-157
Kurtosis:,5.651,Cond. No.,311.0
