In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [2]:
# Read the raw table from disk (path is machine-specific).
data = pd.read_csv(
    r"C:\Dump\DataSets\Classification\KNNAlgorithmDataset.csv"
)

# Every column except the target is kept as a candidate feature.
X_raw = data.drop(columns=['diagnosis'])

# Force each feature column to a numeric dtype; any value that cannot be
# parsed as a number becomes NaN instead of raising.
X_numeric = X_raw.apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [3]:
# Replace missing values with the per-column mean.
# NOTE(review): with default settings SimpleImputer is documented to discard
# columns that are entirely missing, so column positions in X may not line up
# with X_numeric.columns -- confirm before indexing X by position.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_numeric)
X = imputer.transform(X_numeric)



In [4]:
# Sanity check: abort early if any entry of X is NaN or +/-inf.
if np.any(~np.isfinite(X)):
    raise ValueError("Non-numeric or infinite values still exist in X!")

In [5]:
# Target labels as strings in a plain NumPy array.
# `.to_numpy()` always yields an ndarray, whereas `.values` may hand back a
# pandas extension array depending on the column dtype / pandas version; the
# boolean-mask indexing done later (e.g. `X_train[y_train == label, 0]`)
# expects a real ndarray.
y = data['diagnosis'].astype(str).to_numpy()

In [6]:
# Keep only the first two feature columns so the data can be drawn in 2-D.
# NOTE(review): these are simply the first two columns of X in file order; if
# the CSV starts with an identifier column it is being used as a feature here
# -- confirm against the dataset's column list.
X_vis = X[:, 0:2]


In [7]:
# Standardise both plotted features (zero mean, unit variance) so that
# neither dominates the distance computation used by KNN.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# that follows, so test rows influence the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_vis)
X_vis = scaler.transform(X_vis)

In [8]:
# Hold out 20% of the rows for testing; the fixed seed makes the shuffled
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_vis,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
def plot_knn(X_train, y_train, X_test, y_test, k, step=0.05):
    """Fit a k-nearest-neighbours classifier and plot its 2-D decision regions.

    The classifier is trained on ``X_train``/``y_train``; its predictions over
    a regular grid are drawn as a filled background, with the training points
    (circles) and test points (stars) overlaid.

    Parameters
    ----------
    X_train, X_test : array-like of shape (n_samples, 2)
        Feature matrices; only columns 0 and 1 are used for plotting.
    y_train, y_test : array-like of shape (n_samples,)
        Class labels. Labels 'B' and 'M' get dedicated colours; any other
        label falls back to grey.
    k : int
        Number of neighbours passed to ``KNeighborsClassifier``.
    step : float, default 0.05
        Spacing of the background prediction grid, in feature units.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)
    y_train = np.asarray(y_train)
    y_test = np.asarray(y_test)

    # contourf needs numeric heights, but the labels may be strings. Train on
    # integer codes instead: `classes` is sorted, and `y_codes[i]` is the
    # index of y_train[i] inside `classes`.
    classes, y_codes = np.unique(y_train, return_inverse=True)

    # Train KNN classifier
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_codes)

    # Create meshgrid for background (one unit of padding around the data)
    x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))

    # Predict the (integer) class code for each point on the grid
    Z = knn.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # Define colors: the light background shade of a class matches the colour
    # of that class's points (blue for 'B', red for 'M').
    point_colors = {'B': 'blue', 'M': 'red'}
    region_colors = {'B': '#AAAAFF', 'M': '#FFAAAA'}
    cmap_light = ListedColormap(
        [region_colors.get(label, '#DDDDDD') for label in classes]
    )
    # Band edges at -0.5, 0.5, 1.5, ... so each integer code fills exactly
    # one band and therefore maps to exactly one colormap entry.
    levels = np.arange(len(classes) + 1) - 0.5

    # Plot decision boundary
    plt.figure(figsize=(10, 6))
    plt.contourf(xx, yy, Z, levels=levels, cmap=cmap_light, alpha=0.5)

    # Plot training points
    for code, label in enumerate(classes):
        mask = y_codes == code
        plt.scatter(
            X_train[mask, 0], X_train[mask, 1],
            c=point_colors.get(label, 'gray'),
            label=f'Train {label}', edgecolor='k', s=80
        )

    # Plot test points (labels taken from y_test itself, so a class missing
    # from the training split is still drawn)
    for label in np.unique(y_test):
        mask = y_test == label
        plt.scatter(
            X_test[mask, 0], X_test[mask, 1],
            c=point_colors.get(label, 'gray'),
            marker='*', label=f'Test {label}', s=150
        )

    plt.title(f'KNN Decision Boundary (k={k})')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
# Draw one decision-boundary figure per neighbourhood size to show how the
# boundary changes as k grows.
for k in (1, 3, 5, 10):
    plot_knn(X_train, y_train, X_test, y_test, k=k)