In [5]:
import pandas as pd
import requests
from io import StringIO
from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

url = "https://raw.githubusercontent.com/MyungKyuYi/AI-class/main/diabetes.csv"

response = requests.get(url)

data = StringIO(response.text)
df = pd.read_csv(data)

#내 생각에 skinThickness는 결과값에 영향이 없을것이라고 판단.
df = df.drop(columns=["SkinThickness"])

print(df.head())

   Pregnancies  Glucose  BloodPressure  Insulin   BMI  \
0            6      148             72        0  33.6   
1            1       85             66        0  26.6   
2            8      183             64        0  23.3   
3            1       89             66       94  28.1   
4            0      137             40      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [7]:
df.isnull().sum() #결측치 없음.

Pregnancies                 0
Glucose                     0
BloodPressure               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [9]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'Insulin', 'BMI',
       'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [17]:
X = df.drop(columns=["Outcome"])
Y = df["Outcome"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

models = {
    "KNN": KNeighborsClassifier(n_neighbors=5), 
    "RF": RandomForestClassifier(n_estimators=100, random_state=42),
    "DT": DecisionTreeClassifier(random_state=42), 
    "RL": LogisticRegression(max_iter=1000), 
    "SVM": SVC()
}
from sklearn.metrics import confusion_matrix

for name, model in models.items():
    if name in ["KNN", "SVM"]:
        model.fit(X_train, Y_train)   
        Y_pred = model.predict(X_test)
    else:
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)

    accuracy = accuracy_score(Y_test, Y_pred)
    cm = confusion_matrix(Y_test, Y_pred)
    print("confusion_matrix:\n", cm)
    print(f"{name} 정확도: {accuracy:.4f}\n")

confusion_matrix:
 [[117  34]
 [ 29  51]]
KNN 정확도: 0.7273

confusion_matrix:
 [[124  27]
 [ 29  51]]
RF 정확도: 0.7576

confusion_matrix:
 [[107  44]
 [ 26  54]]
DT 정확도: 0.6970

confusion_matrix:
 [[120  31]
 [ 30  50]]
RL 정확도: 0.7359

confusion_matrix:
 [[131  20]
 [ 40  40]]
SVM 정확도: 0.7403

