# KNN Classifier for Breast Cancer Wisconsin Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Set style for plots
plt.style.use('ggplot')
sns.set(font_scale=1.2)
colors = ["#3498db", "#e74c3c"]

In [3]:
# Load the dataset
print("\nLoading dataset...")
try:
    # Try to load from scikit-learn first 
    from sklearn.datasets import load_breast_cancer
    dataset = load_breast_cancer()
    df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    df['target'] = dataset.target
    print("Dataset loaded from scikit-learn.")
except:
    # If that fails, try loading from local path
    try:
        df = pd.read_csv(r'C:\Course\Breast Cancer Wisconsin Dataset\data.csv')
        print("Dataset loaded from local file 'data.csv'.")
    except:
        print("Error loading dataset. Please ensure 'data.csv' is in the current directory.")
        import sys
        sys.exit(1)


Loading dataset...
Dataset loaded from scikit-learn.


In [4]:
# Check if 'diagnosis' column exists (local CSV format) and convert to target
if 'diagnosis' in df.columns:
    # Some versions of the dataset use 'M' for malignant and 'B' for benign
    if df['diagnosis'].dtype == 'object':
        df['target'] = df['diagnosis'].map({'M': 0, 'B': 1})
    df = df.drop('diagnosis', axis=1)

In [5]:
# Remove any ID column if present
if 'id' in map(str.lower, df.columns):
    id_col = [col for col in df.columns if col.lower() == 'id'][0]
    df = df.drop(id_col, axis=1)
elif 'Unnamed: 0' in df.columns:
    df = df.drop('Unnamed: 0', axis=1)
elif df.columns[0].lower().startswith('id'):
    df = df.drop(df.columns[0], axis=1)

In [6]:
# Display basic information
print("\n1. Dataset Information:")
print(f"   • Number of samples: {df.shape[0]}")
print(f"   • Number of features: {df.shape[1] - 1}")  # Excluding target column
print(f"   • Target distribution:")
target_counts = df['target'].value_counts()
for label, count in target_counts.items():
    label_name = "Benign" if label == 1 else "Malignant"
    print(f"     - {label_name}: {count} ({count/len(df)*100:.1f}%)")


1. Dataset Information:
   • Number of samples: 569
   • Number of features: 30
   • Target distribution:
     - Benign: 357 (62.7%)
     - Malignant: 212 (37.3%)


In [7]:
# Split dataset into features and target
X = df.drop('target', axis=1)
y = df['target']

In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("\n2. Data Split:")
print(f"   • Training set: {X_train.shape[0]} samples")
print(f"   • Testing set: {X_test.shape[0]} samples")


2. Data Split:
   • Training set: 455 samples
   • Testing set: 114 samples


In [9]:
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Visualize feature importance using mean values per class
plt.figure(figsize=(14, 8))
feature_means = df.groupby('target').mean()
for feature in X.columns[:5]:  # Only showing top 5 features to avoid overcrowding
    plt.plot([0, 1], feature_means.loc[:, feature], 'o-', label=feature)
plt.xticks([0, 1], ['Malignant', 'Benign'])
plt.ylabel('Standardized Feature Value')
plt.legend(loc='best')
plt.title('Mean Feature Values by Diagnosis (Top 5 Features)')
plt.grid(True)
plt.savefig('knn_feature_means.png')
plt.close()

In [12]:
# PCA for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(scaler.transform(X))

plt.figure(figsize=(10, 8))
for target, color in zip([0, 1], colors):
    plt.scatter(X_pca[y == target, 0], X_pca[y == target, 1], 
                c=color, label="Malignant" if target == 0 else "Benign", alpha=0.7)
plt.legend()
plt.title('PCA of Breast Cancer Dataset')
plt.xlabel(f'First Principal Component (Explained Variance: {pca.explained_variance_ratio_[0]:.2f})')
plt.ylabel(f'Second Principal Component (Explained Variance: {pca.explained_variance_ratio_[1]:.2f})')
plt.grid(True)
plt.savefig('knn_pca_visualization.png')
plt.close()

In [13]:
print("\n3. Finding optimal K value for KNN classifier...")
# Find optimal k value
k_range = list(range(1, 31, 2))
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    scores.append(knn.score(X_test_scaled, y_test))

plt.figure(figsize=(10, 6))
plt.plot(k_range, scores, 'bo-')
plt.xlabel('Value of K')
plt.ylabel('Testing Accuracy')
plt.title('KNN: Accuracy vs. K Value')
plt.grid(True)
plt.savefig('knn_k_values.png')
plt.close()


3. Finding optimal K value for KNN classifier...


In [14]:
# Find best k value
best_k = k_range[np.argmax(scores)]
print(f"   • Optimal K value: {best_k}")

   • Optimal K value: 3


In [15]:
# Train KNN model with optimal K
print(f"\n4. Training KNN model with K={best_k}...")
best_knn = KNeighborsClassifier(n_neighbors=best_k)
best_knn.fit(X_train_scaled, y_train)


4. Training KNN model with K=3...


In [16]:
# Make predictions
y_pred = best_knn.predict(X_test_scaled)
y_prob = best_knn.predict_proba(X_test_scaled)[:, 1]  # Probability for class 1

In [17]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"   • Test Accuracy: {accuracy:.4f}")

   • Test Accuracy: 0.9825


In [18]:
# Classification Report
print("\n5. Classification Report:")
report = classification_report(y_test, y_pred, target_names=['Malignant', 'Benign'])
print(report)


5. Classification Report:
              precision    recall  f1-score   support

   Malignant       1.00      0.95      0.98        42
      Benign       0.97      1.00      0.99        72

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114



In [19]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Malignant', 'Benign'], 
            yticklabels=['Malignant', 'Benign'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for KNN')
plt.savefig('knn_confusion_matrix.png')
plt.close()

In [20]:
# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve - KNN')
plt.legend(loc='lower right')
plt.grid(True)
plt.savefig('knn_roc_curve.png')
plt.close()

In [21]:
print("\n6. Model Performance:")
print(f"   • Accuracy: {accuracy:.4f}")
print(f"   • ROC AUC: {roc_auc:.4f}")
tn, fp, fn, tp = cm.ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
print(f"   • Sensitivity: {sensitivity:.4f}")
print(f"   • Specificity: {specificity:.4f}")


6. Model Performance:
   • Accuracy: 0.9825
   • ROC AUC: 0.9835
   • Sensitivity: 1.0000
   • Specificity: 0.9524


In [22]:
print("\nKNN Model Summary:")
print(f"• Best K value: {best_k}")
print(f"• Test Accuracy: {accuracy:.4f}")
print(f"• ROC AUC Score: {roc_auc:.4f}")
print("\nAll visualizations have been saved as PNG files.")


KNN Model Summary:
• Best K value: 3
• Test Accuracy: 0.9825
• ROC AUC Score: 0.9835

All visualizations have been saved as PNG files.


In [23]:
# Save the model (optional)
from joblib import dump
dump(best_knn, 'knn_model.joblib')
print("Model saved as 'knn_model.joblib'")

Model saved as 'knn_model.joblib'
