In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# -----------------------------
# 1) Data preparation (regression)
# -----------------------------
# Example: the diabetes regression dataset bundled with scikit-learn
# (no download required).
# NOTE: by default load_diabetes returns mean-centred, rescaled feature
# columns, so `bmi` is a standardized value rather than a raw body-mass index.
diab = load_diabetes(as_frame=True)

# Put features and the disease-progression target in one frame, then drop
# any row that has a missing value.
target_column = diab.target.rename("target")
df = pd.concat([diab.data, target_column], axis=1)
df = df.dropna()

# BMI is the quantity being predicted; neither the original outcome column
# nor BMI itself may remain among the input features.
excluded_columns = ["target", "bmi"]
y = df["bmi"]
X = df.drop(columns=excluded_columns)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# -----------------------------
# 2) Model definitions (regressors)
# -----------------------------
# Tree-based models: a single decision tree and a 300-tree random forest that
# uses every available core. Both are seeded for reproducibility.
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
)

# Linear regression is wrapped in a pipeline so the features are standardized
# (centred and scaled to unit variance) before the fit.
lr = make_pipeline(
    StandardScaler(with_mean=True, with_std=True),
    LinearRegression(),
)

# -----------------------------
# 3) Model training
# -----------------------------
# Fit every model on the same training split, in the order
# decision tree -> random forest -> linear regression.
for model in (dt, rf, lr):
    model.fit(X_train, y_train)

# -----------------------------
# 4) Model evaluation helper
# -----------------------------
def eval_reg(y_true, y_pred):
    """Score regression predictions against the ground truth.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        A ``(mae, rmse, r2)`` tuple: mean absolute error, root mean squared
        error, and the coefficient of determination.
    """
    abs_error = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    root_sq_error = np.sqrt(mean_squared_error(y_true, y_pred))
    determination = r2_score(y_true, y_pred)
    return abs_error, root_sq_error, determination

# -----------------------------
# 5) Prediction and evaluation
# -----------------------------
# Score each fitted model on the held-out test split.
dt_mae, dt_rmse, dt_r2 = eval_reg(y_test, dt.predict(X_test))
rf_mae, rf_rmse, rf_r2 = eval_reg(y_test, rf.predict(X_test))
lr_mae, lr_rmse, lr_r2 = eval_reg(y_test, lr.predict(X_test))

print("=== Test Metrics (Predicting BMI) ===")
for title, scores in (
    ("[Decision Tree]", (dt_mae, dt_rmse, dt_r2)),
    ("[Random Forest]", (rf_mae, rf_rmse, rf_r2)),
    ("[Linear Regression]", (lr_mae, lr_rmse, lr_r2)),
):
    mae, rmse, r2 = scores
    print(title)
    print(f"MAE: {mae:.3f} | RMSE: {rmse:.3f} | R^2: {r2:.3f}")

# -----------------------------
# 6) Sample BMI predictions (illustration)
# -----------------------------
# Show each model's prediction for the first five test rows next to the
# actual values.
print("\n=== Sample BMI Predictions ===")
first_rows = X_test.iloc[:5]
for caption, fitted in (
    ("Decision Tree Predicted BMI:", dt),
    ("Random Forest Predicted BMI:", rf),
    ("Linear Regression Predicted BMI:", lr),
):
    print(caption, fitted.predict(first_rows))
print("Actual BMI:", y_test.iloc[:5].values)

=== Test Metrics (Predicting BMI) ===
[Decision Tree]
MAE: 0.041 | RMSE: 0.056 | R^2: -0.329
[Random Forest]
MAE: 0.035 | RMSE: 0.045 | R^2: 0.124
[Linear Regression]
MAE: 0.034 | RMSE: 0.043 | R^2: 0.196

=== Sample BMI Predictions ===
Decision Tree Predicted BMI: [-0.00620595  0.00672779 -0.02560657  0.05630715  0.02073935]
Random Forest Predicted BMI: [ 0.01323059  0.00240217 -0.0020923   0.0538677  -0.00219649]
Linear Regression Predicted BMI: [ 0.01414963  0.0033373  -0.01068099  0.07626833 -0.00175662]
Actual BMI: [-0.00620595  0.03690653 -0.00405033  0.0519959  -0.02021751]
