In [1]:
import pandas as pd

import sklearn.metrics as mt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

<p style="background-image: linear-gradient(to right, #0aa98f, #68dab2)"> &nbsp; </p>

In [2]:
# Read the diabetes dataset from the project's data folder and preview the first three rows.
data = pd.read_csv(filepath_or_buffer='data/05_diabetes.csv')
data.head(n=3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [3]:
# Print a summary of the frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


<p style="background-image: linear-gradient(#0aa98f, #FFFFFF 10%)"> &nbsp; </p>

In [4]:
# Separate the predictors from the 'Outcome' target column.
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both splits so the test rows never influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

<p style="background-image: linear-gradient(#0aa98f, #FFFFFF 10%)"> &nbsp; </p>

In [5]:
def get_model_accuracy(model):
    """Fit ``model`` on the training split and score it on the test split.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    variables, so the split/scaling cell must have been run first.

    Parameters
    ----------
    model
        Any object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted
        in place by this call.

    Returns
    -------
    The value returned by ``mt.accuracy_score`` for the test-set labels
    versus the model's predictions.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mt.accuracy_score(y_test, y_pred)

In [6]:
# Candidate classifiers, keyed by the label shown in the results table.
# random_state=0 fixes the seed for the estimators that are given one.
models = {
    'Logistic': LogisticRegression(random_state=0), 
    'KNN': KNeighborsClassifier(), 
    'Support Vector': SVC(random_state=0), 
    'Naive Bayesian': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=0),
    'Random Forest': RandomForestClassifier(random_state=0)
}

# Score every model once and collect the rows first. Building the frame in a
# single call avoids repeatedly concatenating/transposing inside the loop and
# keeps 'Accuracy' as a numeric column instead of object dtype, so it can be
# sorted, rounded or plotted directly.
records = [
    {'Model': name, 'Accuracy': get_model_accuracy(model)}
    for name, model in models.items()
]

# Passing `columns` explicitly fixes the column order and still yields a
# well-formed (empty) table if `models` has no entries.
results = pd.DataFrame(records, columns=['Model', 'Accuracy'])
results

Unnamed: 0,Model,Accuracy
0,Logistic,0.824675
1,KNN,0.798701
2,Support Vector,0.792208
3,Naive Bayesian,0.792208
4,Decision Tree,0.766234
5,Random Forest,0.785714


<p style="background-image: linear-gradient(to right, #0aa98f, #68dab2)"> &nbsp; </p>