In [20]:
import os

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [21]:
# Location of the diabetes CSV. The original absolute path is kept as the
# default, but it can be overridden through the DIABETES_CSV environment
# variable so the notebook also runs on machines without a D:/AI_data folder.
file_path = os.environ.get("DIABETES_CSV", "D:/AI_data/diabetes.csv")

# Load the dataset and show the first rows plus the column names as a sanity check.
df = pd.read_csv(file_path)
print(df.head())
print(df.columns)

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [22]:
# Split the data: 'Outcome' is the target column, every other column is a feature.
# 20% of the rows are held out for testing; the seed keeps the split repeatable.
y = df['Outcome']
X = df.drop('Outcome', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Standardize the features. The scaler is fitted on the training split only,
# and that same fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [24]:
# Train and evaluate a decision tree.
# random_state is pinned because the tree permutes features at each split and
# breaks ties between equally good splits at random; without a seed the
# reported accuracy changes from run to run.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)  # fit on the training split
dt_pred = dt_model.predict(X_test)  # predicted labels for the test split
dt_accuracy = accuracy_score(y_test, dt_pred)  # fraction of correct predictions
print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")

# Comparison: first five true labels vs. first five predictions.
print("실제 값 (y_test) :\n", y_test.iloc[:5])
print("예측 값 (dt_pred) : ", dt_pred[:5])

Decision Tree Accuracy: 0.7532
실제 값 (y_test) :
 668    0
324    0
624    0
690    0
473    0
Name: Outcome, dtype: int64
예측 값 (dt_pred) :  [0 0 0 0 0]


In [25]:
# Train and evaluate a random forest.
# random_state is pinned because the forest draws bootstrap samples and
# feature subsets at random; without a seed the reported accuracy changes
# from run to run.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)  # fit on the training split
rf_pred = rf_model.predict(X_test)  # predicted labels for the test split
rf_accuracy = accuracy_score(y_test, rf_pred)  # fraction of correct predictions
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")

# Comparison: first five true labels vs. first five predictions.
print("실제 값 (y_test) :\n", y_test.iloc[:5])
print("예측 값 (rf_pred) : ", rf_pred[:5])

Random Forest Accuracy: 0.7273
실제 값 (y_test) :
 668    0
324    0
624    0
690    0
473    0
Name: Outcome, dtype: int64
예측 값 (rf_pred) :  [0 0 0 0 0]


In [26]:
# Train and evaluate a support vector classifier (SVC with default settings).
svm_model = SVC().fit(X_train, y_train)  # fit() returns the fitted estimator
svm_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_pred)
print(f"SVM Accuracy: {svm_accuracy:.4f}")

# Comparison: first five true labels vs. first five predictions.
print("실제 값 (y_test) :\n", y_test.head(5))
print("예측 값 (svm_pred) : ", svm_pred[:5])

SVM Accuracy: 0.7338
실제 값 (y_test) :
 668    0
324    0
624    0
690    0
473    0
Name: Outcome, dtype: int64
예측 값 (svm_pred) :  [0 0 0 0 0]


In [27]:
# Train and evaluate logistic regression (solver capped at 200 iterations).
lr_model = LogisticRegression(max_iter=200).fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")

# Comparison: first five true labels vs. first five predictions.
print("실제 값 (y_test) :\n", y_test.head(5))
print("예측 값 (lr_pred) : ", lr_pred[:5])

Logistic Regression Accuracy: 0.7532
실제 값 (y_test) :
 668    0
324    0
624    0
690    0
473    0
Name: Outcome, dtype: int64
예측 값 (lr_pred) :  [0 0 0 0 0]


In [28]:
# Train and evaluate a k-nearest-neighbours classifier (k = 5).
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)  # fit on the training split
knn_pred = knn_model.predict(X_test)  # predicted labels for the test split
knn_accuracy = accuracy_score(y_test, knn_pred)  # fraction of correct predictions
print(f"KNN Model Accuracy: {knn_accuracy:.4f}")

# Comparison: first five true labels vs. first five predictions.
print("실제 값 (y_test) :\n", y_test.iloc[:5])
# The label now names knn_pred; it previously said lr_pred (copy-paste slip)
# even though the printed values were the KNN predictions.
print("예측 값 (knn_pred) : ", knn_pred[:5])

KNN Model Accuracy: 0.6948
실제 값 (y_test) :
 668    0
324    0
624    0
690    0
473    0
Name: Outcome, dtype: int64
예측 값 (lr_pred) :  [0 0 0 0 1]
