#  선형회귀 모델 작성, 예측 , 평가

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./data/premium.csv')
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
df['charges'].describe()

count     1338.000000
mean     13270.422265
std      12110.011237
min       1121.873900
25%       4740.287150
50%       9382.033000
75%      16639.912515
max      63770.428010
Name: charges, dtype: float64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1333 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
# 결측치 대체하기
df['bmi'] = df['bmi'].fillna(
    df.groupby('sex')['bmi'].transform('mean')
)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [7]:
# 데이터 분리하기
X1 = df.drop(columns = 'charges')
y = df['charges']

In [8]:
# 데이터 전처리
from sklearn.preprocessing import LabelEncoder
scaler = LabelEncoder()
X1['sex_scaler'] = scaler.fit_transform(X1['sex'])
X1['smoker_scaler'] = scaler.fit_transform(X1['smoker'])
X1['region_scaler'] = scaler.fit_transform(X1['region'])

In [9]:
X = X1.drop(columns=['sex','region','smoker'])

In [10]:
X['region_scaler'].value_counts()

region_scaler
2    364
3    325
1    325
0    324
Name: count, dtype: int64

In [11]:
# 데이터 분할하기
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3
                                                    ,random_state=42)

In [12]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)

from sklearn.metrics import mean_squared_error,r2_score
import numpy as np

# 평가
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse) # 실제 오차의 크기
print(rmse, mse)

from sklearn.model_selection import cross_val_score # 교차검증
neg_mse_scores = cross_val_score(lr, X, y, scoring='neg_mean_squared_error', cv=5)
print(-neg_mse_scores.mean())

5813.161660256295 33792848.48827373
37013814.60147809


In [13]:
from sklearn.ensemble import RandomForestRegressor
Forest = RandomForestRegressor()
Forest.fit(X_train, y_train)
y_pred  = Forest.predict(X_test)

from sklearn.metrics import mean_squared_error
Forest_mse = mean_squared_error(y_test, y_pred)
Forest_rmse = np.sqrt(Forest_mse)
print(Forest_rmse,Forest_mse)

from sklearn.model_selection import cross_val_score # 교차검증
neg_mse_scores = cross_val_score(Forest, X, y, scoring='neg_mean_squared_error', cv=5)
print(-neg_mse_scores.mean())

4605.579680861358 21211364.196763
24452986.23224957


In [14]:
# R2: 분산기반의 예측성능, 1에 가까울수록 예측 정확도가 높다.(결정게수)
r2_score = cross_val_score(lr, X,y, scoring = 'r2',cv = 5)
r2_score , np.mean(r2_score)

(array([0.7596674 , 0.70737562, 0.77529175, 0.73351251, 0.75524776]),
 np.float64(0.7462190093290715))

In [15]:
# R2: 분산기반의 예측성능, 1에 가까울수록 예측 정확도가 높다.(결정게수)
r2_score = cross_val_score(Forest, X,y, scoring = 'r2',cv = 5)
r2_score , np.mean(r2_score)

(array([0.84815306, 0.77211988, 0.85651434, 0.83251564, 0.847426  ]),
 np.float64(0.8313457825033055))

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

results = []

for  degree in range(1,5):
    model_poly = Pipeline([
        ("poly", PolynomialFeatures(degree = degree, include_bias=False)),
        # poly: 기존 입력 데이터를 다항식 형태로 확장해주는 전처리 도구
        ("linear", LinearRegression())
    ])

    model_poly.fit(X_train, y_train)
    pred_poly = model_poly.predict(X_test)
    mse = mean_squared_error(y_test,pred_poly)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, pred_poly)
    
    results.append({'degree':degree,
                'MSE':mse,
                'RMSE':rmse,
                'r2':r2})
pd.DataFrame(results)

Unnamed: 0,degree,MSE,RMSE,r2
0,1,33792850.0,5813.16166,0.769528
1,2,20173700.0,4491.513766,0.862412
2,3,21294300.0,4614.575082,0.85477
3,4,32377370.0,5690.111448,0.779181


💡 수학과 머신러닝의 다른 점

의문. 현재 위의 모델의 차수가 늘어난 상태(비선형) 근데 선형모델인 Linear를 쓴 이유?

1️⃣ 수학적으로는
다항회귀 함수는 
![image.png](attachment:image.png)
x에 대해 비선형 함수(곡선) 맞아요.

2️⃣ 머신러닝 관점에서는
모델 파라미터인 가중치 ![image-2.png](attachment:image-2.png) 들에 대해서는 선형 결합 구조이기 때문에

‘선형 모델’ 범주에 포함합니다.

3️⃣ 그래서
다항회귀는 입력 변수 기준으로는 비선형 함수

하지만 가중치 기준으로는 선형 모델

따라서, ‘곡선 형태’이면서 ‘선형 모델’입니다!

즉, 다항회귀는 수학적으로 비선형 곡선이 맞고, 머신러닝에서도 비선형 곡선이 맞으나 가중치를 기준으로 선형이라 선형 모델 범주에 포함된다.

In [17]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
ridge = Ridge(alpha= 1.0)
ridge.fit(X_train, y_train)
pred_ridge = ridge.predict(X_test)

from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, pred_ridge)
r2  = r2_score(y_test, pred_ridge)
mse, r2

ridge = Lasso(alpha= 1.0)
ridge.fit(X_train, y_train)
pred_lasso = ridge.predict(X_test)

from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, pred_ridge)
r2  = r2_score(y_test, pred_ridge)
mse, r2

enet = ElasticNet(alpha= 0.1, l1_ratio= 0.5)
enet.fit(X_train, y_train)

enet_pred = enet.predict(X_test)
print(mean_squared_error(y_test, enet_pred))
print(r2_score(y_test, enet_pred))


39661505.72502192
0.7295025204575183


In [18]:
results = pd.DataFrame({
    '모델': ['다항회귀', '릿지회귀', '라쏘회귀', '엘라스틱넷회귀'],
    'MSE': [mean_squared_error(y_test, pred_poly),
            mean_squared_error(y_test, pred_ridge),
            mean_squared_error(y_test, pred_lasso),
            mean_squared_error(y_test, enet_pred)
    ],
    'R2': [r2_score(y_test, pred_poly),
           r2_score(y_test, pred_ridge),
           r2_score(y_test, pred_lasso),
           r2_score(y_test, enet_pred),]
})

results

Unnamed: 0,모델,MSE,R2
0,다항회귀,32377370.0,0.779181
1,릿지회귀,33826530.0,0.769298
2,라쏘회귀,33792920.0,0.769527
3,엘라스틱넷회귀,39661510.0,0.729503
