In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff

In [None]:
# Location of the diabetes dataset (CSV served over HTTPS)
data_url = "https://storage.googleapis.com/ai-experts/diabetes.csv"

# Read the CSV straight from the URL into a DataFrame
df = pd.read_csv(filepath_or_buffer=data_url)

In [46]:
# Show the column names of the loaded frame
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [None]:
# Discard rows with BMI below 15 or with a zero Insulin / SkinThickness reading
# (presumably zeros stand in for missing measurements — TODO confirm).
invalid_rows = (df['BMI'] < 15) | (df['Insulin'] == 0) | (df['SkinThickness'] == 0)
df = df.loc[~invalid_rows]

In [None]:
# Target column first, then everything else as the feature matrix
y = df["Outcome"]
X = df.drop(columns="Outcome")

In [None]:
# Number of rows per Outcome value (class balance check)
y.value_counts()

In [None]:
# Standardize the features (zero mean, unit variance per column).
# NOTE(review): the scaler is fitted on every row of X, including rows that the
# later train/test split assigns to the test set.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Train-test split for final evaluation.
# Split the *raw* features first and fit the scaler on the training rows only,
# so that test-set means/variances never leak into the standardization.
# (Any scaler fitted earlier on the whole dataset is replaced here.)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)  # ndarray, as downstream cells expect
X_test = scaler.transform(X_test_raw)    # scaled with training statistics only

In [None]:
# Baseline model: 5 nearest neighbours with Euclidean distance, fitted on the training split
knn = KNeighborsClassifier(metric='euclidean', n_neighbors=5)
knn.fit(X=X_train, y=y_train)

In [None]:
# Predicted class label for every row of the test split
knn.predict(X_test)

In [None]:
# Feature vector (in scaled units) of the first test sample
X_test[0]

In [None]:
# Ask the fitted model for the 5 nearest training points of the first test sample.
# Slicing with [:1] keeps the 2-D shape that kneighbors() is given here.
first_test_sample = X_test[:1]
distances, indices = knn.kneighbors(X=first_test_sample, n_neighbors=5)

In [None]:
# Distances from the first test sample to its 5 nearest training neighbours
distances

In [None]:
# Row positions of those neighbours within X_train
indices

In [None]:
# The query point once more, for comparison with a neighbour's feature vector
X_test[0]

In [None]:
# Scaled features of the closest training neighbour of X_test[0].
# The row position is read from the kneighbors() result (first column = nearest)
# instead of a hard-coded number, so it stays correct if the data or split changes.
X_train[indices[0, 0]]

In [None]:
# Map the first test sample back from scaled units to the original feature units
scaler.inverse_transform(X_test[:1])

In [None]:
# Closest training neighbour of X_test[0], mapped back to original feature units.
# The row position comes from the kneighbors() result rather than a hard-coded
# number; the one-element slice keeps the 2-D shape inverse_transform() is given.
nearest_idx = indices[0, 0]
scaler.inverse_transform(X_train[nearest_idx:nearest_idx + 1])

In [None]:
# Confusion matrix of the baseline KNN on the test split (rows: actual, columns: predicted)
metrics.confusion_matrix(y_true=y_test, y_pred=knn.predict(X_test))

In [None]:
# Per-class precision / recall / F1 of the baseline KNN on the test split
baseline_report = metrics.classification_report(y_true=y_test, y_pred=knn.predict(X_test))
print(baseline_report)

In [None]:
# Hyper-parameter search: sweep K (1..30) against four distance metrics.
knn = KNeighborsClassifier()

param_grid = dict(
    n_neighbors=list(range(1, 31)),
    metric=['euclidean', 'manhattan', 'minkowski', 'cosine'],
    algorithm=['brute'],  # brute-force search, needed so 'cosine' can be used
)

# 5-fold cross-validated grid search, scored by accuracy, using all CPU cores
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# One row per parameter combination
results = pd.DataFrame(grid_search.cv_results_)

# Mean CV accuracy as a function of K, one line per distance metric
axis_labels = {
    'param_n_neighbors': 'K (Number of Neighbors)',
    'mean_test_score': 'CV Accuracy',
}
fig = px.line(
    results,
    x='param_n_neighbors',
    y='mean_test_score',
    color='param_metric',
    markers=True,
    title='KNN: Accuracy vs. K by Distance Metric',
    labels=axis_labels,
)
fig.update_layout(template='plotly_white')
fig.show()

# Winning combination and its cross-validated score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best CV Accuracy: {grid_search.best_score_:.4f}")

# Score the refitted best estimator on the held-out test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_acc = metrics.accuracy_score(y_test, y_pred)
print(f"Test Accuracy with Best Model: {test_acc:.4f}")

In [None]:
# Refit a standalone KNN with the hyper-parameters chosen by the grid search
# instead of hard-coding them, so this cell cannot drift out of sync with the
# search result when the data, split or grid changes.
best_knn = KNeighborsClassifier(**grid_search.best_params_)
best_knn.fit(X_train, y_train)

# Predict once and reuse the result for both reports
best_knn_pred = best_knn.predict(X_test)
print(metrics.confusion_matrix(y_test, best_knn_pred))
print(metrics.classification_report(y_test, best_knn_pred))

## Adding weights

In [None]:
# Hyper-parameter search: K (1..30) x distance metric x neighbour weighting.
knn = KNeighborsClassifier()

param_grid = {
    'n_neighbors': list(range(1, 31)),
    'metric': ['euclidean', 'manhattan', 'minkowski', 'cosine'],
    'weights': ['uniform', 'distance'],
    'algorithm': ['brute']  # brute-force search, needed so 'cosine' can be used
}

# 5-fold cross-validated grid search, scored by accuracy, using all CPU cores
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# One row per parameter combination
results = pd.DataFrame(grid_search.cv_results_)

# Accuracy vs K: colour encodes the metric, dash style encodes the weighting
fig = px.line(
    results,
    x='param_n_neighbors',
    y='mean_test_score',
    color='param_metric',
    line_dash='param_weights',
    markers=True,
    title='KNN: Accuracy vs. K by Metric and Weight Type',
    labels={
        'param_n_neighbors': 'K (Number of Neighbors)',
        'mean_test_score': 'Cross-Validated Accuracy',
        'param_metric': 'Distance Metric',
        'param_weights': 'Weight Type'
    }
)
fig.update_layout(template='plotly_white')
fig.show()

# Winning combination and its cross-validated score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best CV Accuracy: {grid_search.best_score_:.4f}")

# Score the refitted best estimator on the held-out test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_acc = metrics.accuracy_score(y_test, y_pred)
print(f"Test Accuracy with Best Model: {test_acc:.4f}")

# Confusion matrix (rows: actual class, columns: predicted class)
cm = metrics.confusion_matrix(y_test, y_pred)
labels = ['No Diabetes', 'Diabetes']

fig_cm = ff.create_annotated_heatmap(
    z=cm,
    x=labels,
    y=labels,
    colorscale='Blues',
    showscale=True,
    hoverinfo="z",
    annotation_text=cm.astype(str),
)
fig_cm.update_layout(
    title='Confusion Matrix',
    xaxis_title='Predicted',
    yaxis_title='Actual',
    template='plotly_white'
)
# Heatmaps draw the first row of z at the bottom by default; flip the y axis so
# the figure has the same top-to-bottom row order as the confusion_matrix array.
fig_cm.update_yaxes(autorange='reversed')
fig_cm.show()

**Exercise: Repeat the grid search CV, but with a different scoring metric — one more suitable for the diabetes problem**

- select a more suitable metric (or try several different metrics)
- also try different values of the **p** parameter (for the Minkowski distance)