In [1]:
# Step 1: Load necessary libraries
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Step 2: Load the dataset into a Pandas DataFrame
# The CSV is resolved against a configurable data directory instead of a
# machine-specific absolute path, so the notebook can be re-run on any machine.
# Set the DIABETES_DATA_DIR environment variable to point somewhere else; by
# default the file is expected in the current working directory.
DATA_DIR = Path(os.environ.get("DIABETES_DATA_DIR", "."))
file_path = DATA_DIR / "diabetes.csv"
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame to verify that it has been loaded correctly
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Count NaN entries per column (isna performs the same check as isnull)
# NOTE(review): this only detects NaN. The df.head() output shows 0 for
# Insulin and SkinThickness in some rows; confirm whether such zeros stand for
# missing measurements, since they would not be counted here.
missing_values = df.isna().sum()
print(f"Missing Values:\n {missing_values}")

# Tally the rows per Outcome class to see how balanced the target is
class_counts = df.loc[:, 'Outcome'].value_counts()
print(f"Class Counts:\n {class_counts}")

Missing Values:
 Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
Class Counts:
 Outcome
0    500
1    268
Name: count, dtype: int64


In [3]:
# The label is the 'Outcome' column; every other column is used as a feature
y = df['Outcome']
X = df.drop('Outcome', axis=1)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
# NOTE(review): the split is not stratified even though the classes are
# imbalanced (500 vs 268). Consider stratify=y, but confirm first, because it
# would change the metrics reported further down.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Learn the scaling parameters from the training split only, so that no
# information from the test split leaks into preprocessing
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the training-derived scaling to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Build a k-nearest-neighbours classifier that uses the 5 closest training points
knn_classifier = KNeighborsClassifier(n_neighbors=5)

# Train on the scaled training features and their labels
knn_classifier.fit(X=X_train_scaled, y=y_train)

In [6]:
# Score the held-out split with the fitted classifier
y_pred = knn_classifier.predict(X_test_scaled)

# Fraction of test labels that were predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Per-class precision, recall, F1-score and support
print(f"\nClassification Report:\n {classification_report(y_test, y_pred)}")

# Counts of actual versus predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)
print(f"\nConfusion Matrix:\n {conf_matrix}")

Accuracy: 0.6948051948051948

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.80      0.77        99
           1       0.58      0.51      0.54        55

    accuracy                           0.69       154
   macro avg       0.66      0.65      0.66       154
weighted avg       0.69      0.69      0.69       154


Confusion Matrix:
 [[79 20]
 [27 28]]


# The output shows the evaluation metrics for the KNN classifier:

# Accuracy: 0.695 (or 69.5%)
# Precision: Precision is higher for class 0 (non-diabetic) at 0.75 compared to class 1 (diabetic) at 0.58. This means that when the classifier predicts a sample as diabetic (class 1), it is correct about 58% of the time.
# Recall: Recall is higher for class 0 at 0.80 compared to class 1 at 0.51. This means that the classifier is better at identifying non-diabetic samples than diabetic samples: it finds only 28 of the 55 diabetic samples in the test set and misses the other 27.
# F1-score: The F1-score is a balance between precision and recall. It's higher for class 0 at 0.77 compared to class 1 at 0.54.
# Support: Indicates the number of samples for each class in the testing set.
# Confusion Matrix: Rows are the actual classes and columns are the predicted classes, so the matrix shows 79 true negatives, 20 false positives, 27 false negatives, and 28 true positives.
# Overall, the classifier has an accuracy of approximately 69.5%. However, it's important to note that the precision, recall, and F1-score for class 1 (diabetic) are lower compared to class 0 (non-diabetic), indicating that the classifier may not perform as well for predicting diabetic samples.