In [1]:
# 필요한 모듈 임포트 및 필요한 설정
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn  # scikit-learn 모듈 임포트
from sklearn.linear_model import Ridge #Ridge 회귀 사용
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler # 특성 스케일링: Min-Mas 스케일링 , 정규화
from sklearn.preprocessing import StandardScaler # 특성 스케일링: 표준화
from sklearn.model_selection import train_test_split # 테스트 집합 분리
from sklearn.preprocessing import PolynomialFeatures


# 랜덤하게 실행되는 부분들에 대해 다음에 실행해도 동일한 결과를 얻기 위해 설정
seed = 11
rng = np.random.default_rng(seed)

In [2]:
df = pd.read_csv('bodyfat.csv', sep=',')  # 체지방률 데이터셋

In [3]:
df.info() # 체지방률 데이터셋에 대한 전반적인 정보 나타냄

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 15 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Density  252 non-null    float64
 1   Wrist    252 non-null    float64
 2   Age      252 non-null    int64  
 3   Weight   252 non-null    float64
 4   Height   252 non-null    float64
 5   Neck     252 non-null    float64
 6   Chest    252 non-null    float64
 7   Abdomen  252 non-null    float64
 8   Hip      252 non-null    float64
 9   Thigh    252 non-null    float64
 10  Knee     252 non-null    float64
 11  Ankle    252 non-null    float64
 12  Biceps   252 non-null    float64
 13  Forearm  252 non-null    float64
 14  BodyFat  252 non-null    float64
dtypes: float64(14), int64(1)
memory usage: 29.7 KB


##### 각 특성 별 의미 (총 15개, 모두 수치형) 
Density   : 신체 밀도 측정 값 
Wrist    : 손목 둘레
Age      : 나이
Weight   : 몸무게
Height   : 키
Neck     : 목 둘레
Chest    : 가슴 둘레
Abdomen  : 복부 둘레
Hip      : 엉덩이 둘레
Thigh    : 허벅지 둘레
Knee     : 무릎 둘레
Ankle    : 발목 둘레
Biceps   : 이두 둘레
Forearm  : 전완 둘레
BodyFat  : 체지방 률 (예측값)



In [4]:
df.describe()  # 체지방률 데이터셋에 대한 요약 통게량을 나타냄, 키 처럼 표준 편차가 큰 데이터 들이 존재 -> 정규화나 표준화가 필요

Unnamed: 0,Density,Wrist,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,Biceps,Forearm,BodyFat
count,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0
mean,1.055574,18.229762,44.884921,178.924405,70.14881,37.992063,100.824206,92.555952,99.904762,59.405952,38.590476,23.102381,32.273413,28.663889,19.150794
std,0.019031,0.933585,12.60204,29.38916,3.662856,2.430913,8.430476,10.783077,7.164058,5.249952,2.411805,1.694893,3.021274,2.020691,8.36874
min,0.995,15.8,22.0,118.5,29.5,31.1,79.3,69.4,85.0,47.2,33.0,19.1,24.8,21.0,0.0
25%,1.0414,17.6,35.75,159.0,68.25,36.4,94.35,84.575,95.5,56.0,36.975,22.0,30.2,27.3,12.475
50%,1.0549,18.3,43.0,176.5,70.0,38.0,99.65,90.95,99.3,59.0,38.5,22.8,32.05,28.7,19.2
75%,1.0704,18.8,54.0,197.0,72.25,39.425,105.375,99.325,103.525,62.35,39.925,24.0,34.325,30.0,25.3
max,1.1089,21.4,81.0,363.15,77.75,51.2,136.2,148.1,147.7,87.3,49.1,33.9,45.0,34.9,47.5


In [5]:
df.head() # 체지방률 데이터셋의 대한 상위 5행을 나타냄

Unnamed: 0,Density,Wrist,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,Biceps,Forearm,BodyFat
0,1.0708,17.1,23,154.25,67.75,36.2,93.1,85.2,94.5,59.0,37.3,21.9,32.0,27.4,12.3
1,1.0853,18.2,22,173.25,72.25,38.5,93.6,83.0,98.7,58.7,37.3,23.4,30.5,28.9,6.1
2,1.0414,16.6,22,154.0,66.25,34.0,95.8,87.9,99.2,59.6,38.9,24.0,28.8,25.2,25.3
3,1.0751,18.2,26,184.75,72.25,37.4,101.8,86.4,101.2,60.1,37.3,22.8,32.4,29.4,10.4
4,1.034,17.7,24,184.25,71.25,34.4,97.3,100.0,101.9,63.2,42.2,24.0,32.2,27.7,28.7


In [6]:
X = df.loc[:, 'Density':'Forearm'] #X에 Density 부터 Forearm 값을 넣음
X # X에 잘 들어갔는지 확인 하려 출력

Unnamed: 0,Density,Wrist,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,Biceps,Forearm
0,1.0708,17.1,23,154.25,67.75,36.2,93.1,85.2,94.5,59.0,37.3,21.9,32.0,27.4
1,1.0853,18.2,22,173.25,72.25,38.5,93.6,83.0,98.7,58.7,37.3,23.4,30.5,28.9
2,1.0414,16.6,22,154.00,66.25,34.0,95.8,87.9,99.2,59.6,38.9,24.0,28.8,25.2
3,1.0751,18.2,26,184.75,72.25,37.4,101.8,86.4,101.2,60.1,37.3,22.8,32.4,29.4
4,1.0340,17.7,24,184.25,71.25,34.4,97.3,100.0,101.9,63.2,42.2,24.0,32.2,27.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,1.0736,18.5,70,134.25,67.00,34.9,89.2,83.6,88.8,49.6,34.8,21.5,25.6,25.7
248,1.0236,20.1,72,201.00,69.75,40.9,108.5,105.0,104.5,59.6,40.8,23.2,35.2,28.6
249,1.0328,18.0,72,186.75,66.00,38.9,111.1,111.5,101.7,60.3,37.3,21.5,31.3,27.2
250,1.0399,19.8,72,190.75,70.50,38.9,108.3,101.3,97.8,56.0,41.6,22.7,30.5,29.4


In [7]:
y = df['BodyFat'] #목표값 y=체지방률
y

0      12.3
1       6.1
2      25.3
3      10.4
4      28.7
       ... 
247    11.0
248    33.6
249    29.3
250    26.0
251    31.9
Name: BodyFat, Length: 252, dtype: float64

In [8]:
# 정규화
minmax_scaler = MinMaxScaler()
minmax_scaled = minmax_scaler.fit_transform(X)

minmax_scaled

array([[0.66549605, 0.23214286, 0.01694915, ..., 0.18918919, 0.35643564,
        0.46043165],
       [0.7928007 , 0.42857143, 0.        , ..., 0.29054054, 0.28217822,
        0.56834532],
       [0.40737489, 0.14285714, 0.        , ..., 0.33108108, 0.1980198 ,
        0.30215827],
       ...,
       [0.33187006, 0.39285714, 0.84745763, ..., 0.16216216, 0.32178218,
        0.44604317],
       [0.39420544, 0.71428571, 0.84745763, ..., 0.24324324, 0.28217822,
        0.60431655],
       [0.28182616, 0.91071429, 0.88135593, ..., 0.37162162, 0.44059406,
        0.64748201]])

In [9]:
#표준화
standard_scaler = StandardScaler()
standard_scaled = standard_scaler.fit_transform(X)

standard_scaled

array([[ 0.80164696, -1.2125412 , -1.74007329, ..., -0.71082568,
        -0.09067593, -0.62671828],
       [ 1.56506057, -0.0319426 , -1.81958344, ...,  0.17594695,
        -0.58814329,  0.11707924],
       [-0.74623993, -1.74917692, -1.81958344, ...,  0.530656  ,
        -1.15193963, -1.71762131],
       ...,
       [-1.19902317, -0.24659689,  2.15592399, ..., -0.94729838,
        -0.32282736, -0.72589128],
       [-0.82521375,  1.68529172,  2.15592399, ..., -0.23788028,
        -0.58814329,  0.36501175],
       [-1.49912369,  2.86589032,  2.31494429, ...,  0.88536506,
         0.47312041,  0.66253075]])

In [10]:
# 전체 데이터를 훈련 데이터, 테스트 데이터으로 나눔
X_train, X_test, y_train, y_test = train_test_split(standard_scaled, y, test_size= 0.2, random_state=1234) # 표준화된 데이터셋 중 테스트, 훈련 데이터 무작위 분리 ,random_state : 난수 시드
print('X 훈련 데이터 :')
print(X_train)
print('X 테스트 데이터 :')
print(X_test)

X 훈련 데이터 :
[[ 1.22284067  0.07538454  0.80425146 ... -0.88818021 -0.65447227
  -0.67630478]
 [ 0.99644905 -1.42719549  0.96327176 ... -1.06553473 -0.1901694
  -0.67630478]
 [ 1.2544302  -0.0319426  -0.38840077 ... -0.29699845  0.20780449
   0.31542524]
 ...
 [-1.86240327  3.40252604  0.08866013 ...  3.84127382  4.22070786
   0.16666574]
 [-0.96210171  0.71934741  0.32719057 ...  0.0577106   1.50121962
   1.15839577]
 [ 0.57525534 -0.35392404 -0.46791091 ... -0.71082568 -1.15193963
  -0.92423729]]
X 테스트 데이터 :
[[ 1.13333701e+00 -5.68578328e-01  1.68170275e-01 -6.28159170e-01
   2.76810927e-02 -8.21102034e-01 -1.21730331e-01 -8.04332412e-01
  -1.13356022e+00 -1.22262080e+00 -9.93129118e-01 -3.56116628e-01
  -2.89662873e-01 -5.77131780e-01]
 [ 1.20704591e+00 -4.61251183e-01  1.68170275e-01  1.81570125e-01
   1.19028698e+00 -2.85259352e-01 -1.45500996e-01 -3.49012345e-01
   2.09129207e-01 -3.82850037e-01  4.19409109e-01  8.85365055e-01
  -6.54472270e-01 -3.78785774e-01]
 [ 1.61244486e+00 -3

릿지 회귀 모델의 가중치가 가능한 적게
=> 각 특성들의 쏠림 현상을 방지
라쏘 회귀 가중치가 높이짐
=> 상대적으로 가중치가 낮다 생각되면 0이됨
엘라스틱 넷
절충

릿지 회귀 규제 계수가 1일때

In [11]:
from sklearn.linear_model import Ridge

ridge_reg = Ridge(alpha=1.0, random_state=seed) # alpha: 규제 파라미터
ridge_reg.fit(X_train, y_train)
score = ridge_reg.score(X_train, y_train)
print(f"결정계수(Ridge): {score:.4f}")

결정계수(Ridge): 0.9919


In [12]:
score = ridge_reg.score(X_test, y_test)
print(f"결정계수(Ridge): {score:.4f}")

결정계수(Ridge): 0.8902


릿지 회귀 규제 계수가 1이하

In [13]:
from sklearn.linear_model import Ridge

ridge_reg = Ridge(alpha=5.0, random_state=seed) # alpha: 규제 파라미터
ridge_reg.fit(X_train, y_train)
score = ridge_reg.score(X_train, y_train)
print(f"결정계수(Ridge): {score:.4f}")

결정계수(Ridge): 0.9905


In [14]:
score = ridge_reg.score(X_test, y_test)
print(f"결정계수(Ridge): {score:.4f}")

결정계수(Ridge): 0.9013


라쏘 회귀 1일때

In [15]:
from sklearn.linear_model import Lasso

lasso_reg = Lasso(alpha=1.0, random_state=seed)  # alpha: 규제 파라미터
lasso_reg.fit(X_train, y_train)
score = lasso_reg.score(X_train, y_train)
print(f"결정계수(Lasso): {score:.4f}")

결정계수(Lasso): 0.9783


In [17]:
score = lasso_reg.score(X_test, y_test)
print(f"결정계수(Lasso): {score:.4f}")

결정계수(Lasso): 0.8958


라쏘 회귀 1이하

In [18]:
from sklearn.linear_model import Lasso

lasso_reg = Lasso(alpha=0.5, random_state=seed)  # alpha: 규제 파라미터
lasso_reg.fit(X_train, y_train)
score = lasso_reg.score(X_train, y_train)
print(f"결정계수(Lasso): {score:.4f}")

결정계수(Lasso): 0.9883


In [19]:
score = lasso_reg.score(X_test, y_test)
print(f"결정계수(Lasso): {score:.4f}")

결정계수(Lasso): 0.8958


엘라스틱 넷
alpha: 규제 파라미터. 기본값=1.0
l1_ratio: 혼합 비율(r). 기본값=1.0

규제 파라미터를 바꿔가면서 더 나은 결정계수를 찾기 위한 시도를 했다.

In [20]:
from sklearn.linear_model import ElasticNet

elastic_net = ElasticNet(alpha=1.0, l1_ratio=1.0, random_state=seed)
elastic_net.fit(X_train, y_train)
score = elastic_net.score(X_train, y_train)
print(f"결정계수(ElasticNet): {score:.4f}")

결정계수(ElasticNet): 0.9783


In [21]:
score = elastic_net.score(X_test, y_test)
print(f"결정계수(ElasticNet): {score:.4f}")

결정계수(ElasticNet): 0.8978


alpha: 규제 파라미터. 기본값=0.5
l1_ratio: 혼합 비율(r). 기본값=0.5

In [22]:
from sklearn.linear_model import ElasticNet

elastic_net = ElasticNet(alpha=0.5, l1_ratio=0.5, random_state=seed)
elastic_net.fit(X_train, y_train)
score = elastic_net.score(X_train, y_train)
print(f"결정계수(ElasticNet): {score:.4f}")

결정계수(ElasticNet): 0.9418


In [23]:
score = elastic_net.score(X_test, y_test)
print(f"결정계수(ElasticNet): {score:.4f}")

결정계수(ElasticNet): 0.9189


alpha: 규제 파라미터. 기본값=1.0
l1_ratio: 혼합 비율(r). 기본값=0.5

In [24]:
from sklearn.linear_model import ElasticNet

elastic_net = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=seed)
elastic_net.fit(X_train, y_train)
score = elastic_net.score(X_train, y_train)
print(f"결정계수(ElasticNet): {score:.4f}")

결정계수(ElasticNet): 0.8800


In [25]:
score = elastic_net.score(X_test, y_test)
print(f"결정계수(ElasticNet): {score:.4f}")

결정계수(ElasticNet): 0.8739


릿지 다항

In [26]:
model = Pipeline([ # 다항 특성 생성 -> 특성 스케일링 -> Ridge
        ('poly_features', PolynomialFeatures(degree=7)),
        ('feat_scaling', StandardScaler()), 
        ('reg', ridge_reg)
    ])
model.fit(X_train, y_train)
score = model.score(X_train, y_train)
print(f"결정계수(PolyFeatures-StandardScaling-Ridge): {score:.4f}")

결정계수(PolyFeatures-StandardScaling-Ridge): 0.9976


In [27]:
score = model.score(X_test, y_test)
print(f"결정계수(PolyFeatures-StandardScaling-Ridge): {score:.4f}")

결정계수(PolyFeatures-StandardScaling-Ridge): -1.3873


라쏘 다항

In [28]:
model = Pipeline([ # 다항 특성 생성 -> 특성 스케일링 -> Ridge
        ('poly_features', PolynomialFeatures(degree=7)),
        ('feat_scaling', StandardScaler()), 
        ('las', lasso_reg)
    ])
model.fit(X_train, y_train)
score = model.score(X_train, y_train)
print(f"결정계수(PolyFeatures-StandardScaling-Ridge): {score:.4f}")

결정계수(PolyFeatures-StandardScaling-Ridge): 0.9882


In [29]:
score = model.score(X_test, y_test)
print(f"결정계수(PolyFeatures-StandardScaling-Ridge): {score:.4f}")

결정계수(PolyFeatures-StandardScaling-Ridge): 0.8959


엘라스틱 다항

In [30]:
model = Pipeline([ # 다항 특성 생성 -> 특성 스케일링 -> Ridge
        ('poly_features', PolynomialFeatures(degree=7)),
        ('feat_scaling', StandardScaler()), 
        ('ela', elastic_net)
    ])
model.fit(X_train, y_train)
score = model.score(X_train, y_train)
print(f"결정계수(PolyFeatures-StandardScaling-Ridge): {score:.4f}")

결정계수(PolyFeatures-StandardScaling-Ridge): 0.8934


In [31]:
score = model.score(X_test, y_test)
print(f"결정계수(PolyFeatures-StandardScaling-Ridge): {score:.4f}")

결정계수(PolyFeatures-StandardScaling-Ridge): 0.8411


릿지 회귀 규제 계수가 1일때
결정계수(Ridge): 0.9919
결정계수(Ridge): 0.8902
릿지 회귀 규제 계수가 1이하
결정계수(Ridge): 0.9905
결정계수(Ridge): 0.9013
라쏘 회귀 1일때
결정계수(Lasso): 0.9783
결정계수(Lasso): 0.8978
라쏘 회귀 1이하
결정계수(Lasso): 0.9883
결정계수(Lasso): 0.8958
엘라스틱 넷 alpha: 규제 파라미터. 기본값=1.0 l1_ratio: 혼합 비율(r). 기본값=1.0
결정계수(ElasticNet): 0.9783
결정계수(ElasticNet): 0.8978
alpha: 규제 파라미터. 기본값=0.5 l1_ratio: 혼합 비율(r). 기본값=0.5
결정계수(ElasticNet): 0.9418
결정계수(ElasticNet): 0.9189
alpha: 규제 파라미터. 기본값=1.0 l1_ratio: 혼합 비율(r). 기본값=0.5
결정계수(ElasticNet): 0.8800
결정계수(ElasticNet): 0.8739
릿지 다항
결정계수(PolyFeatures-StandardScaling-Ridge): 0.9976
결정계수(PolyFeatures-StandardScaling-Ridge): -7.8531
라쏘 다항
결정계수(PolyFeatures-StandardScaling-Ridge): 0.9882
결정계수(PolyFeatures-StandardScaling-Ridge): 0.8959
엘라스틱 다항
결정계수(PolyFeatures-StandardScaling-Ridge): 0.8934
결정계수(PolyFeatures-StandardScaling-Ridge): 0.8411