Step 1 - Import Libray

In [3]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB # import Naive Bayes model Gaussian (asumsi data terdistribusi normal)
from sklearn.svm import SVC # import SVM classifier
from sklearn.ensemble import VotingClassifier # import model Voting
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

Step 2 - Data Preparation

Step 2.1 - Load Data

In [4]:
# Load Data
dbt = pd.read_csv('diabetes.csv')

dbt.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Step 2.2 - Check Columns Name

In [5]:
# Check columns name
dbt.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

Step 2.3 - Check Null Values

In [6]:
# Check null values
dbt.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

Step 2.4 - Data Imputation

In [7]:
# Check column with 0 value
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

for column in feature_columns:
    print("============================================")
    print(f"{column} ==> Missing zeros : {len(dbt.loc[dbt[column] == 0])}")
    
# Impute 0 with mean value
from sklearn.impute import SimpleImputer

fill_values = SimpleImputer(missing_values=0, strategy="mean", copy=False)

dbt[feature_columns] = fill_values.fit_transform(dbt[feature_columns])

Pregnancies ==> Missing zeros : 111
Glucose ==> Missing zeros : 5
BloodPressure ==> Missing zeros : 35
SkinThickness ==> Missing zeros : 227
Insulin ==> Missing zeros : 374
BMI ==> Missing zeros : 11
DiabetesPedigreeFunction ==> Missing zeros : 0
Age ==> Missing zeros : 0


Step 3 - Split Data

In [8]:
X = dbt[feature_columns]
y = dbt.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Step 4 - Build Logistic Regression Model

Step 4.1 - Standarized The Features

In [9]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Standarisasi pada fitur di X_train dan X_test
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

Step 4.2 - Train and Evaluate The Model

In [10]:
from sklearn.linear_model import LogisticRegression

# Create a Logistic Regression model
logistic_regression = LogisticRegression()

# Fit the model with standardized features
logistic_regression.fit(X_train_std, y_train)

# Predict the test set
y_pred_lr = logistic_regression.predict(X_test_std)

# Evaluate the test set
acc_lr = accuracy_score(y_test, y_pred_lr)

# Print the result
print("Test set accuracy using Logistic Regression: {:.2f}".format(acc_lr))
print(f"Test set accuracy using Logistic Regression: {acc_lr}")


Test set accuracy using Logistic Regression: 0.74
Test set accuracy using Logistic Regression: 0.7359307359307359


Step 5 - Build SVM Polynomial Model

In [12]:
svm_poly = SVC(kernel='poly')

# Fitting
svm_poly.fit(X_train_std, y_train)

# Predict
y_pred_svm_poly = svm_poly.predict(X_test_std)

# Evaluate test set
acc_svm_poly = accuracy_score(y_test, y_pred_svm_poly)

# Show the result
print("Test set accuracy: {:.2f}".format(acc_svm_poly))
print(f"Test set accuracy: {acc_svm_poly}")

Test set accuracy: 0.70
Test set accuracy: 0.696969696969697


Step 6 - Build SVM RBF Model

In [None]:
svm_rbf = SVC(kernel='rbf')

# Fitting
svm_rbf.fit(X_train_std, y_train)

# Predict
y_pred_svm_rbf = svm_rbf.predict(X_test_std)

# Evaluate test set
acc_svm_rbf = accuracy_score(y_test, y_pred_svm_rbf)

# Print
print("Test set accuracy: {:.2f}".format(acc_svm_rbf))
print(f"Test set accuracy: {acc_svm_rbf}")

Test set accuracy: 0.72
Test set accuracy: 0.7229437229437229


Step 7 - Decision Tree

In [13]:
# By default, DT in scikit-learn will use "Gini" as split criteria
# Read the documentation for more detail
from sklearn.tree import DecisionTreeClassifier


dt = DecisionTreeClassifier()

# Fitting / train DT model
dt.fit(X_train, y_train)

# Predict test set
y_pred_dt = dt.predict(X_test)

#  Calculate the accuracy
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Test set accuracy: 0.72
Test set accuracy: 0.7229437229437229


Step 8 - Build Voting Model

In [16]:
from sklearn.pipeline import Pipeline

# Define classifiers
clf1 = ('logistic_regression', LogisticRegression())
clf2 = ('svm_poly', SVC(kernel='poly'))
clf3 = ('decision_tree', DecisionTreeClassifier())

# Create a pipeline with StandardScaler and the classifier
pipeline_lr = Pipeline([
    ('scaler', StandardScaler()),
    clf1  # Classifier
])

# Create the Voting Classifier with 'hard' voting
voting = VotingClassifier(estimators=[clf1, clf2, clf3], voting='hard')

# Fit the ensemble on the training data
voting.fit(X_train_std, y_train)

# Predict on the test set
y_pred_vt1 = voting.predict(X_test_std)

# Evaluate the ensemble on the test data
acc_vt1 = accuracy_score(y_test, y_pred_vt1)

# Print the results
print('Voting Hard')
print("Test set accuracy: {:.2f}".format(acc_vt1))
print(f"Test set accuracy: {acc_vt1}")


Voting Hard
Test set accuracy: 0.74
Test set accuracy: 0.7402597402597403
