In [78]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [79]:
data_path = 'datasets-jadi.csv'
df = pd.read_csv(data_path)

In [80]:
df.head()

Unnamed: 0,NO,NAMA,USIA,PARITAS,JARAK KELAHIRAN,PE/Non PE,RIW HIPERTENSI_Tidak,RIW HIPERTENSI_Ya,RIW PE_Ada,RIW PE_Tidak,OBESITAS_Tidak,OBESITAS_Ya,RIW DM_Tidak,RIW DM_Ya,RIW HIPERTENSI/PE DALAM KELUARGA_Ada,RIW HIPERTENSI/PE DALAM KELUARGA_Tidak,SOSEK RENDAH_<UMR,SOSEK RENDAH_>UMR
0,1,NAMA 1,23.0,3,0,0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
1,2,NAMA 2,29.0,2,0,1,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,3,NAMA 3,20.0,1,2,0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,4,NAMA 4,18.0,1,2,0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
4,5,NAMA 5,34.0,3,1,0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [81]:
df.isnull().any()
df.isnull().sum()

NO                                        0
NAMA                                      0
USIA                                      0
PARITAS                                   0
JARAK KELAHIRAN                           0
PE/Non PE                                 0
RIW HIPERTENSI_Tidak                      0
RIW HIPERTENSI_Ya                         0
RIW PE_Ada                                0
RIW PE_Tidak                              0
OBESITAS_Tidak                            0
OBESITAS_Ya                               0
RIW DM_Tidak                              0
RIW DM_Ya                                 0
RIW HIPERTENSI/PE DALAM KELUARGA_Ada      0
RIW HIPERTENSI/PE DALAM KELUARGA_Tidak    0
SOSEK RENDAH_<UMR                         0
SOSEK RENDAH_>UMR                         0
dtype: int64

In [82]:
# Drop irrelevant columns (if present) and define features and target
df_cleaned = df.drop(columns=['NO', 'NAMA', 'Unnamed: 12'], errors='ignore')
X = df_cleaned.drop(columns=['PE/Non PE'], errors='ignore')
y = df_cleaned['PE/Non PE']

In [83]:
# Convert categorical columns to numerical using one-hot encoding
X = pd.get_dummies(X, drop_first=True)

In [84]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [85]:
# Initialize the models
nb_model = GaussianNB()
knn_model = KNeighborsClassifier(n_neighbors=5)
dt_model = DecisionTreeClassifier(random_state=42)

In [86]:
# Train and evaluate Naive Bayes
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

In [87]:
# Train and evaluate K-Nearest Neighbors
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

In [88]:
# Train and evaluate Decision Tree
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

{'Naive Bayes': array([[96,  6],
       [ 2,  9]]), 'K-Nearest Neighbors': array([[101,   1],
       [ 11,   0]]), 'Decision Tree': array([[98,  4],
       [ 6,  5]])}


In [123]:
# Generate confusion matrices for each model
confusion_matrices = {
    'Naive Bayes': confusion_matrix(y_test, y_pred_nb),
    'K-Nearest Neighbors': confusion_matrix(y_test, y_pred_knn),
    'Decision Tree': confusion_matrix(y_test, y_pred_dt)
}

In [124]:
for model_name, cm in confusion_matrices.items():
    # Calculate accuracy for each model
    if model_name == 'Naive Bayes':
        accuracy = accuracy_score(y_test, y_pred_nb)
    elif model_name == 'K-Nearest Neighbors':
        accuracy = accuracy_score(y_test, y_pred_knn)
    else:
        accuracy = accuracy_score(y_test, y_pred_dt)
    
    accuracy_results[model_name] = accuracy
    

In [125]:
accuracy_df = pd.DataFrame(list(accuracy_results.items()), columns=['Model', 'Accuracy (70/30 Split)'])
print(accuracy_df)

                 Model  Accuracy (70/30 Split)
0          Naive Bayes                0.929204
1  K-Nearest Neighbors                0.893805
2        Decision Tree                0.911504


{'Naive Bayes': array([[96,  6],
       [ 2,  9]]), 'K-Nearest Neighbors': array([[101,   1],
       [ 11,   0]]), 'Decision Tree': array([[98,  4],
       [ 6,  5]])}


In [76]:
# Calculate accuracy for each model with 70/30 split
nb_accuracy_70_30 = accuracy_score(y_test, y_pred_nb)
knn_accuracy_70_30 = accuracy_score(y_test, y_pred_knn)
dt_accuracy_70_30 = accuracy_score(y_test, y_pred_dt)

# Compile accuracy results into a DataFrame for comparison
accuracy_results_70_30 = {
    'Model': ['Naive Bayes', 'K-Nearest Neighbors', 'Decision Tree'],
    'Accuracy (70/30 Split)': [nb_accuracy_70_30, knn_accuracy_70_30, dt_accuracy_70_30]
}

# Display the results
accuracy_df = pd.DataFrame(accuracy_results_70_30)

In [77]:
accuracy_df

Unnamed: 0,Model,Accuracy (70/30 Split)
0,Naive Bayes,0.929204
1,K-Nearest Neighbors,0.893805
2,Decision Tree,0.911504


In [126]:
classification_reports = {
    'Naive Bayes': classification_report(y_test, y_pred_nb),
    'K-Nearest Neighbors': classification_report(y_test, y_pred_knn),
    'Decision Tree': classification_report(y_test, y_pred_dt)
}

In [127]:
# Display classification report for each model
for model_name, report in classification_reports.items():
    print(f"Classification Report for {model_name} (70/30 Split):\n")
    print(report)
    print("\n" + "="*60 + "\n")

Classification Report for Naive Bayes (70/30 Split):

              precision    recall  f1-score   support

           0       0.98      0.94      0.96       102
           1       0.60      0.82      0.69        11

    accuracy                           0.93       113
   macro avg       0.79      0.88      0.83       113
weighted avg       0.94      0.93      0.93       113



Classification Report for K-Nearest Neighbors (70/30 Split):

              precision    recall  f1-score   support

           0       0.90      0.99      0.94       102
           1       0.00      0.00      0.00        11

    accuracy                           0.89       113
   macro avg       0.45      0.50      0.47       113
weighted avg       0.81      0.89      0.85       113



Classification Report for Decision Tree (70/30 Split):

              precision    recall  f1-score   support

           0       0.94      0.96      0.95       102
           1       0.56      0.45      0.50        11

    ac