In [2]:
from sklearn.datasets import load_diabetes

dibt = load_diabetes()
print(dibt.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

:Number of Instances: 442

:Number of Attributes: First 10 columns are numeric predictive values

:Target: Column 11 is a quantitative measure of disease progression one year after baseline

:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level

Note: Each of these 10 feature variables have bee

In [4]:
dibt.data.shape

(442, 10)

In [5]:
dibt.target

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

In [6]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(dibt.data,
                                                    dibt.target,
                                                    random_state=3)

print(x_train.shape, y_test.shape)

(331, 10) (111,)


In [8]:
from sklearn.linear_model import LinearRegression

dibt_lr = LinearRegression()
dibt_lr.fit(x_train, y_train)
pred = dibt_lr.predict(x_test)

print(pred[:10])
print(y_test[:10])

[ 96.93071611 180.50531222 104.99346721 123.06727745 185.6971835
  80.55682061 126.1862135  233.66341201 156.82400764 170.55080528]
[ 53.  85.  47.  84. 174.  42.  44. 195. 150. 121.]


In [9]:
dibt_lr.coef_, dibt_lr.intercept_ # coef_: 각 변수의 가중치, intercept_: bias

(array([  -1.28406745, -245.942366  ,  542.4812886 ,  332.41309796,
        -869.22384677,  471.38808685,  172.22381203,  334.86353979,
         697.42259397,   66.83610853]),
 np.float64(153.47873564418862))

In [11]:
x_test[0]

array([-0.02730979,  0.05068012, -0.01590626, -0.02977038,  0.00393485,
       -0.00068758,  0.04127682, -0.03949338, -0.02364686,  0.01134862])

In [12]:
x_test[0]*dibt_lr.coef_

array([  0.03506761, -12.46438831,  -8.62884994,  -9.89606395,
        -3.42026686,  -0.32411726,   7.10885195, -13.22489399,
       -16.4918566 ,   0.75849781])

In [14]:
import numpy as np

np.sum(x_test[0]*dibt_lr.coef_) + dibt_lr.intercept_

np.float64(96.9307161084729)

**선형 회귀(Linear Regression)**

✅ 한계

- 입력과 출력 간의 관계가 **선형이 아닐 경우 성능이 낮음**
- **이상치(outlier)에 민감**
- 변수 간 **다중공선성(multicollinearity)** 문제 발생 가능

✅ 개선하려면?

- **Ridge** (L2 정규화)
- **Lasso** (L1 정규화)
- **ElasticNet** (L1 + L2 혼합)
- **Polynomial Regression** (비선형 확장)

In [15]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

dibt_knn = KNeighborsRegressor(n_neighbors=5)
dibt_knn.fit(x_train, y_train)
pred = dibt_knn.predict(x_test)
print(pred[:10])
print(y_test[:10])

[ 64.8 158.   66.6 141.6 218.2  80.6 126.6 212.2 137.4 141.6]
[ 53.  85.  47.  84. 174.  42.  44. 195. 150. 121.]


In [16]:
#  회귀 모델의 성능을 평가하기 위한 지표 예측값 맞는지 비교
from sklearn.metrics import mean_squared_error, r2_score
# 회귀 모델의 성능을 평가하는 오차지표 : 모델이 얼마나 정확한지 평가할 수 있는 수치
# mean_squared_error(MSE)는 0에 가까울 수록 r2_score는 1에 가까울수록 예측이 잘 된 것

print(mean_squared_error(y_test, pred))
print(r2_score(y_test, pred))

3630.461261261261
0.3134084132690441


In [18]:
for k in range(1, 100, 2):
  dibt_knn = KNeighborsRegressor(n_neighbors=k)
  dibt_knn.fit(x_train, y_train)
  pred = dibt_knn.predict(x_test)
  print(k, mean_squared_error(y_test, pred), r2_score(y_test, pred))

1 5549.594594594595 -0.04953742354284163
3 3689.027027027027 0.3023324757637197
5 3630.461261261261 0.3134084132690441
7 3629.025556168413 0.3136799333230643
9 3459.5618952285618 0.34572884267226023
11 3416.5057702330428 0.35387160224240066
13 3308.716989178528 0.3742565794919044
15 3393.451371371372 0.3582316422363446
17 3447.8297016739916 0.34794763108750093
19 3367.0896658431284 0.36321715893106976
21 3343.6002737431313 0.3676594646968334
23 3338.926054599023 0.3685434514158562
25 3330.335365765766 0.37016811953725315
27 3240.1145095713 0.3872306568714935
29 3233.864489935834 0.3884126584381824
31 3167.7123491858138 0.4009233285747442
33 3213.1239090329996 0.3923351099745064
35 3251.5159845559847 0.38507441383859353
37 3258.1943616370213 0.38381140145893144
39 3290.5732181886033 0.37768792322945677
41 3282.395383485806 0.3792345125195091
43 3258.68414872417 0.3837187730932431
45 3268.7812256701145 0.3818092173693921
47 3309.9243593978767 0.37402824205073126
49 3311.3831774298246 0.3

In [21]:
from sklearn.tree import DecisionTreeRegressor

dibt_dt = DecisionTreeRegressor()
dibt_dt.fit(x_train, y_train)
pred = dibt_dt.predict(x_test)
print(mean_squared_error(y_test, pred))

6368.603603603604


In [22]:
for dept in range(1, 10):
  dibt_dt = DecisionTreeRegressor(max_depth=dept)
  dibt_dt.fit(x_train, y_train)
  pred = dibt_dt.predict(x_test)
  print(dept,mean_squared_error(y_test, pred))
  # 결과를 보면 depth=2 이후부터 오차가 증가하므로 검증 성능 기준으로는 오버피팅이 발생한 것으로 판단

1 3885.946828612115
2 3360.0386656653122
3 3399.2215752562183
4 3852.973241168999
5 4380.904367476516
6 4717.027068881024
7 4826.638277447085
8 5257.697381220506
9 5402.901382832359


In [24]:
from sklearn.linear_model import SGDRegressor

dibt_sgd = SGDRegressor()
dibt_sgd.fit(x_train, y_train)
pred = dibt_sgd.predict(x_test)
print(mean_squared_error(y_test, pred))

3519.3284993393722




In [25]:
dibt_sgd = SGDRegressor(max_iter=10000)
dibt_sgd.fit(x_train, y_train)
pred = dibt_sgd.predict(x_test)
print(mean_squared_error(y_test, pred))

3069.5312864556386


선형 회귀에서 예측 에러가 크다면 비선형 항을 추가하거나 모델 복잡도를 높여 개선할 수 있다.

하지만 **너무 많은 항을 추가하면 과적합(overfitting)**이 발생할 수 있다.

이를 방지하기 위해 **규제(Regularization)**를 추가하여
모델이 과하게 복잡해지는 것을 막는다.

규제항이 추가된 회귀 모델
1. 릿지 (Ridge)
  - 계수의 제곱합에 패널티 부여
  - 계수 크기를 비교적 부드럽게 줄임
2. 라쏘 (Rasso)
  - 계수의 절댓값 합에 패널티 부여
  - 불필요한 계수를 0으로 만들어 제거
3. 엘라스틱넷 (ElassticNet) : 릿지 + 라쏘

In [26]:
from sklearn.linear_model import Lasso

dibt_las = Lasso()
dibt_las.fit(x_train, y_train)
pred = dibt_las.predict(x_test)
print(mean_squared_error(y_test, pred))

3479.8733814922875


In [27]:
# Lasso 회귀와 Ridge 회귀의 alpha 값(규제 강도)에 따라 모델의 성능(MSE)이 어떻게 달라지는지를 비교
for alp in np.arange(0, 1, 0.1):
  dibt_las = Lasso(alpha=alp)
  dibt_las.fit(x_train, y_train)
  pred = dibt_las.predict(x_test)
  print(alp, mean_squared_error(y_test, pred))

0.0 3033.490110937061
0.1 2975.7133689791117
0.2 3010.6480740096567
0.30000000000000004 3057.849561254514
0.4 3077.3369538599177
0.5 3108.364369563612
0.6000000000000001 3154.4014925140523
0.7000000000000001 3214.531496782593
0.8 3288.753410451201
0.9 3377.067487054882


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [28]:
from sklearn.linear_model import Ridge
for alp in np.arange(0, 1, 0.1):
  dibt_rid = Ridge(alpha=alp)
  dibt_rid.fit(x_train, y_train)
  pred = dibt_rid.predict(x_test)
  print(alp, mean_squared_error(y_test, pred))

0.0 3033.490111389726
0.1 3034.835572273501
0.2 3063.631156761543
0.30000000000000004 3103.8202906354472
0.4 3148.0234251552124
0.5 3192.966535364407
0.6000000000000001 3237.1485902742334
0.7000000000000001 3279.8969553505267
0.8 3320.943515257563
0.9 3360.2213075847703


In [38]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

for nest in range(50, 300, 30):
  dibt_rf = RandomForestRegressor(n_estimators=nest)
  dibt_rf.fit(x_train, y_train)
  pred = dibt_rf.predict(x_test)
  print(nest, mean_squared_error(y_test, pred))

50 3226.176162162162
80 3369.568375563062
110 3210.7999940436302
140 3182.1558544769264
170 3314.6512450512796
200 3237.1856563063066
230 3195.485241403975
260 3303.261141718642
290 3242.3423458773887


In [40]:
for nest in range(50, 300, 30):
  dibt_ab = AdaBoostRegressor(n_estimators=nest)
  dibt_ab.fit(x_train, y_train)
  pred = dibt_ab.predict(x_test)
  print(nest, mean_squared_error(y_test, pred))

50 3565.530279830909
80 3895.4791549744887
110 3514.1227801568516
140 3741.3358493976557
170 3442.420261607135
200 3521.6501329523253
230 3611.121457600679
260 3648.514585101995
290 3609.4744059659733


## ✅ 수치예측 모델 vs 범주예측 모델

| 구분 | 수치예측 모델 (Regression) | 범주예측 모델 (Classification) |
| --- | --- | --- |
| 🎯 목표 | 연속된 수치 예측 (예: 가격, 점수) | 범주(class, label) 예측 (예: 스팸/햄, 질병 유/무) |
| 📈 대표 알고리즘 | Linear Regression, Lasso, Ridge, RandomForestRegressor 등 | Logistic Regression, RandomForestClassifier, SVM, KNN 등 |
| 🎓 예시 | - 주가 예측- 온도 예측- 집값 예측 | - 이메일이 스팸인지- 고객이 이탈할지- 암 진단 결과 |
| 🧪 평가 지표 | MSE, RMSE, MAE, R² 등 | Accuracy, Precision, Recall, F1-score, AUC 등 |
| 💬 예측 결과 | 실수 값 (예: 3125.4) | 클래스 라벨 (예: 0, 1) |
| 🔄 사용 시점 | 결과가 **수치형**일 때 | 결과가 **범주형(클래스)**일 때 |