In [32]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Train and evaluate a k-nearest-neighbours classifier on the diabetes data:
# load -> impute -> split -> tune k with cross-validation -> evaluate on the
# held-out test set -> predict two hand-written samples.
df = pd.read_csv('diabetes.csv')

# Handle missing values (if any) by filling each numeric column with its mean.
# numeric_only=True keeps this from raising if a non-numeric column is present.
# NOTE(review): the mean is computed over all rows, including those that later
# end up in the test set; if the file really contains NaNs, consider imputing
# after the split using training-set statistics only.
df.fillna(df.mean(numeric_only=True), inplace=True)

features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
X = df[features].values
y = df['Outcome'].values

# Split data into training and testing sets (70% / 30%, fixed seed).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Hyperparameter tuning using GridSearchCV.
# The scaler lives inside the pipeline so that, within each CV split, it is
# fitted only on that split's training part. Scaling the whole training set up
# front would let validation-fold statistics leak into the model selection.
param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}
knn = KNeighborsClassifier()
pipeline = Pipeline([('scaler', StandardScaler()), ('knn', knn)])
# Pipeline parameters are addressed as '<step name>__<parameter name>'.
pipeline_grid = {'knn__' + name: values for name, values in param_grid.items()}
grid_search = GridSearchCV(pipeline, pipeline_grid, cv=5)
grid_search.fit(X_train_raw, y_train)

# After the search, the best pipeline is refitted on the full training set, so
# its steps are a scaler fitted on all training rows and a KNN fitted on the
# correspondingly scaled training rows.
best_pipeline = grid_search.best_estimator_
scaler = best_pipeline.named_steps['scaler']
best_knn = best_pipeline.named_steps['knn']

# Feature scaling: expose the scaled matrices under their usual names.
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Make predictions
y_pred = best_knn.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

acc = accuracy_score(y_test, y_pred)
print("Accuracy Score:", acc)

f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)

# Class predictions for new data (columns in the same order as `features`);
# new samples must go through the same fitted scaler as the training data.
new_data = np.array([[5, 110, 90, 30, 2, 36, 0.17, 40],
                     [0, 137, 40, 35, 168, 43.1, 2.288, 33]])
new_data_scaled = scaler.transform(new_data)
class_predictions = best_knn.predict(new_data_scaled)
print("Class Predictions for New Data:", class_predictions)


Confusion Matrix:
[[121  30]
 [ 40  40]]
Accuracy Score: 0.696969696969697
F1 Score: 0.5333333333333333
Class Predictions for New Data: [0 1]
