In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the diabetes dataset directly from the course repository
DIABETES_CSV_URL = 'https://github.com/MyungKyuYi/AI-class/raw/refs/heads/main/diabetes.csv'
df = pd.read_csv(DIABETES_CSV_URL)

# Count missing values per column
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [45]:
# Remove the Outcome column before building the BMI regression data.
# errors='ignore' keeps this cell safe to re-run: `df` is reassigned here, so a
# second execution would otherwise raise KeyError because the column is already gone.
df = df.drop(columns='Outcome', errors='ignore')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [46]:
# Separate the regression target (BMI) from the remaining feature columns
y = df['BMI']
X = df.drop('BMI', axis=1)

In [47]:
# Display the feature matrix (every column of df except BMI)
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,0.627,50
1,1,85,66,29,0,0.351,31
2,8,183,64,0,0,0.672,32
3,1,89,66,23,94,0.167,21
4,0,137,40,35,168,2.288,33
...,...,...,...,...,...,...,...
763,10,101,76,48,180,0.171,63
764,2,122,70,27,0,0.340,27
765,5,121,72,23,112,0.245,30
766,1,126,60,0,0,0.349,47


In [48]:
# Display the target series (the BMI column of df)
y

0      33.6
1      26.6
2      23.3
3      28.1
4      43.1
       ... 
763    32.9
764    36.8
765    26.2
766    30.1
767    30.4
Name: BMI, Length: 768, dtype: float64

In [49]:
# Hold out 20% of the rows for testing; the fixed seed makes the shuffled split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=1557,
)
print("학습 데이터 개수:", len(X_train))
print("테스트 데이터 개수:", len(X_test))

학습 데이터 개수: 614
테스트 데이터 개수: 154


In [50]:
# Sanity-check the dimensions of each split: features first, then targets
for train_part, test_part in ((X_train, X_test), (y_train, y_test)):
    print(train_part.shape, test_part.shape)

(614, 7) (154, 7)
(614,) (154,)


In [51]:
# Train a Decision Tree Regressor and predict on the held-out set.
# random_state pins the estimator's internal randomness so a re-run reproduces the
# same score (the seed value mirrors the one used for the train/test split).
dt_model = DecisionTreeRegressor(random_state=1557)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

# Mean squared error (MSE) via scikit-learn.
# The metric's signature is (y_true, y_pred), so the ground truth goes first.
# The label says MSE because no square root is taken (it is not RMSE).
print('평균제곱오차', mean_squared_error(y_test, dt_pred))

# NumPy implementation
def mse_np(actual, pred):
    """Return the mean squared error between two equally shaped array-likes.

    Args:
        actual: Ground-truth values (list, NumPy array, pandas Series, ...).
        pred: Predicted values with the same shape as ``actual``.

    Returns:
        The mean of the squared element-wise differences as a NumPy float.

    Raises:
        ValueError: If the inputs are empty or their shapes differ.
    """
    # Convert to float up front so integer / unsigned inputs cannot wrap around
    # during the subtraction or the squaring.
    actual_arr = np.asarray(actual, dtype=float)
    pred_arr = np.asarray(pred, dtype=float)

    # Refuse mismatched shapes instead of letting broadcasting silently compute
    # an (n, n) difference matrix and a meaningless average.
    if actual_arr.shape != pred_arr.shape:
        raise ValueError(
            f"shape mismatch: actual {actual_arr.shape} vs pred {pred_arr.shape}"
        )
    if actual_arr.size == 0:
        raise ValueError("mse_np requires at least one sample")

    return np.mean((actual_arr - pred_arr) ** 2)
# Ground truth goes first to match mse_np(actual, pred); the label says MSE
# because the value is not square-rooted (it is not RMSE).
print('평균제곱오차', mse_np(y_test, dt_pred))

# Pure-Python implementation
def mse(actual, pred):
    """Return the mean squared error between two equal-length iterables.

    Args:
        actual: Iterable of ground-truth values.
        pred: Iterable of predicted values, same length as ``actual``.

    Returns:
        The sum of squared differences divided by the number of samples.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    # Materialise both inputs so generators/iterators work and lengths can be compared.
    actual_values = list(actual)
    pred_values = list(pred)

    # zip() would silently truncate to the shorter input while the divisor stayed
    # len(actual), producing a wrong average — reject the mismatch instead.
    if len(actual_values) != len(pred_values):
        raise ValueError(
            f"length mismatch: actual has {len(actual_values)} items, "
            f"pred has {len(pred_values)}"
        )
    if not actual_values:
        raise ValueError("mse requires at least one sample")

    squared_error_total = sum(
        (a - p) ** 2 for a, p in zip(actual_values, pred_values)
    )
    return squared_error_total / len(actual_values)
# Ground truth goes first to match mse(actual, pred); the label says MSE
# because the value is not square-rooted (it is not RMSE).
print('평균제곱오차', mse(y_test, dt_pred))

평균제곱근오차 79.43474025974025
평균제곱근오차 79.43474025974025
평균제곱근오차 79.43474025974027


In [52]:
# Train a Random Forest Regressor and predict on the held-out set.
# random_state fixes the bootstrap sampling and feature selection so the reported
# score is reproducible (the seed value mirrors the one used for the train/test split).
rf_model = RandomForestRegressor(random_state=1557)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# mean_squared_error expects (y_true, y_pred); the value is MSE, not RMSE.
print('평균제곱오차', mean_squared_error(y_test, rf_pred))

평균제곱근오차 39.290574772727254


In [53]:
# Train a Support Vector Regressor with a linear kernel and predict on the held-out set
svr_model = SVR(kernel='linear')
svr_model.fit(X_train, y_train)
svr_pred = svr_model.predict(X_test)

# mean_squared_error expects (y_true, y_pred); the value is MSE, not RMSE.
print('평균제곱오차', mean_squared_error(y_test, svr_pred))

평균제곱근오차 52.41629955996878


In [54]:
# Train an ordinary Linear Regression model and predict on the held-out set
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

# mean_squared_error expects (y_true, y_pred); the value is MSE, not RMSE.
print('평균제곱오차', mean_squared_error(y_test, lr_pred))

평균제곱근오차 52.03446984858313
