In [1]:
import pandas as pd
import numpy as np

#All methods we need to use to implement our model. 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score


In [7]:
#Step 1: Import our data
df = pd.read_csv("diabetes.csv")

#Step 2: We will create a list of columns which cannot have zeros

zeros_not_accepted = ["Glucose","BloodPressure","SkinThickness","BMI","Insulin"]
for column in zeros_not_accepted:
    df[column] = df[column].replace(0, np.NaN)
    mean = int(df[column].mean(skipna = True))
    df[column] = df[column].replace(np.NaN, mean)

#Step 3: Split dataset
X = df.iloc[:, :-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.2)

#Step 4: Feature scaling the labeled data

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

#Step 5: Define the model and train the model

classifier = KNeighborsClassifier(n_neighbors = 11, p=2, metric = "euclidean")
classifier.fit(X_train, y_train)

#Step 6: Evaluate the model (confusion, accuracy, f1-Score)

y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

#F1-Score: more telling than the accurracy
print(f1_score(y_test, y_pred))

[[95 12]
 [18 29]]
0.6590909090909092
