# Lab 10: K-Nearest Neighbors (KNN) Classifier

**Student Name:** Muhammad Haadhee Sheeraz Mian  
**Reg No:** 478359

This notebook implements the K-Nearest Neighbors algorithm following the lab manual.

## 3) Preprocessing in Code
### Feature Scaling and Optional Dimensionality Reduction

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
import pandas as pd

# Toy dataset: two numeric columns whose magnitudes differ by several orders
data = {
    'Age': [25, 35, 45, 55, 40, 60],
    'Income': [35000, 70000, 90000, 120000, 45000, 100000]
}
df = pd.DataFrame(data)

# Step 1: Feature Scaling
# Learn per-column mean/std first, then standardize the same frame with them
# (MinMaxScaler() could be dropped in here instead).
scaler = StandardScaler().fit(df)
df_scaled = scaler.transform(df)

# Step 2: Dimensionality Reduction (Optional)
# NOTE: the input has only two columns, so keeping two components re-expresses
# the data on its principal axes without actually lowering the dimension.
pca = PCA(n_components=2)
df_reduced = pca.fit_transform(df_scaled)

print("Scaled Data:\n", df_scaled)
print("\nReduced Data:\n", df_reduced)

## 4) Implementing KNN from Scratch
### Step 1 – Calculate Distance

In [None]:
import math

# Function to calculate Euclidean distance
def euclidean_distance(point1, point2):
    """Return the straight-line (L2) distance between two points.

    Args:
        point1: Iterable of numeric coordinates.
        point2: Iterable of numeric coordinates with the same number of
            entries as ``point1``.

    Returns:
        float: Square root of the summed squared coordinate differences
        (``0.0`` for two empty points).

    Raises:
        ValueError: If the two points have a different number of coordinates.
    """
    # Materialize so that lengths can be compared even for generators/arrays.
    point1, point2 = list(point1), list(point2)

    # zip() would silently stop at the shorter point and return a plausible
    # but wrong distance, so reject mismatched dimensionality explicitly.
    if len(point1) != len(point2):
        raise ValueError(
            f"points must have the same number of coordinates, "
            f"got {len(point1)} and {len(point2)}"
        )

    return math.sqrt(sum((p1 - p2) ** 2 for p1, p2 in zip(point1, point2)))

### Step 2 – Find the K Nearest Neighbors

In [None]:
# Function to find k nearest neighbors
def get_k_nearest_neighbors(training_data, new_point, k):
    """Return the ``k`` training rows closest to ``new_point``.

    Every row in ``training_data`` and ``new_point`` itself are sliced with
    ``[:-1]`` before the distance is computed, i.e. the last element of each
    is treated as a label slot and ignored. ``new_point`` must therefore use
    the same layout as a training row (features followed by one trailing
    label/placeholder element).

    Args:
        training_data: Iterable of rows shaped ``[feature..., label]``.
        new_point: Query row in the same layout; its last element is ignored.
        k: Number of neighbors to return; must be at least 1.

    Returns:
        list: Up to ``k`` rows of ``training_data`` (labels included), ordered
        from nearest to farthest. Rows at equal distance keep their original
        relative order. Fewer than ``k`` rows are returned when
        ``training_data`` is smaller than ``k``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    # A negative k would otherwise slice "all but the last |k|" rows and
    # silently return the wrong neighbor set.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Strip the label slot from the query once instead of once per row.
    query_features = new_point[:-1]

    # Pair every training row with its distance to the query.
    distances = [
        (euclidean_distance(query_features, row[:-1]), row)
        for row in training_data
    ]

    # Stable sort on distance only, then keep the k closest rows.
    distances.sort(key=lambda pair: pair[0])
    return [row for _, row in distances[:k]]

### Step 3 – Make the Prediction

In [None]:
from collections import Counter

# Function to predict class label based on k nearest neighbors
def predict_classification(training_data, new_point, k):
    """Predict the label of ``new_point`` by majority vote of its neighbors.

    Args:
        training_data: Iterable of rows whose last element is the class label.
        new_point: Query row in the same layout as a training row; it is
            passed unchanged to ``get_k_nearest_neighbors``.
        k: Number of neighbors that take part in the vote.

    Returns:
        The most frequent label among the ``k`` nearest rows. When several
        labels share the top count, ``Counter.most_common`` keeps first-seen
        order, so the tied label belonging to the closest neighbor wins.

    Raises:
        ValueError: If no neighbors are available to vote (for example when
            ``training_data`` is empty).
    """
    neighbors = get_k_nearest_neighbors(training_data, new_point, k)

    # Without this guard an empty neighbor list surfaces as an opaque
    # IndexError from most_common(1)[0] below.
    if not neighbors:
        raise ValueError(
            "cannot classify: no neighbors found "
            "(training_data is empty or k is smaller than 1)"
        )

    # Extract labels and find the most common class among neighbors
    labels = [neighbor[-1] for neighbor in neighbors]
    most_common = Counter(labels).most_common(1)
    return most_common[0][0]

### Full KNN Function for Classification

In [None]:
# Full KNN Function for Classification
def knn_classify(training_data, new_point, k):
    """Classify ``new_point`` with K-Nearest Neighbors (end-to-end entry point).

    This is a convenience alias for ``predict_classification``: it finds the
    ``k`` nearest training rows and returns the majority label among them.
    Delegating keeps the two functions from drifting apart, since they
    previously contained the same statements copied verbatim.

    Args:
        training_data: Iterable of rows whose last element is the class label.
        new_point: Query row in the same layout as a training row.
        k: Number of neighbors that take part in the vote.

    Returns:
        The label chosen by ``predict_classification`` for the same arguments.
    """
    return predict_classification(training_data, new_point, k)

## 5) Using Scikit-Learn for KNN
### Loading a Sample Dataset

In [None]:
from sklearn.datasets import load_iris
import pandas as pd

# Fetch the bundled Iris dataset
iris = load_iris()

# One column per measurement, plus a 'target' column in which the integer
# class codes are replaced by readable species names in a single step
data = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    target=pd.Series(iris.target).map({0: 'Setosa', 1: 'Versicolor', 2: 'Virginica'})
)

# Preview the first rows (rendered as the cell's output)
data.head()

### Preprocessing Data with Scikit-Learn

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation.
# Note: the statistics are computed over every row of iris.data, because no
# train/test split has been made at this point in the notebook.
scaler = StandardScaler().fit(iris.data)

# Standardize the same rows with the statistics learned above
scaled_features = scaler.transform(iris.data)

# Report the shape of the standardized feature matrix
print("Scaled Features shape:", scaled_features.shape)

### Implementing KNN with Scikit-Learn

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Split the RAW features first, so that the test rows play no part in any
# preprocessing statistics. The split fraction and random_state are unchanged,
# so the same rows land in each partition as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both partitions.
# Fitting it on the full dataset before splitting would leak test-set
# information (its mean/std) into the model's inputs.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Initialize the KNN classifier with k=3
knn = KNeighborsClassifier(n_neighbors=3)

# Fit the model
knn.fit(X_train, y_train)

print("Model trained successfully!")

### Making Predictions

In [None]:
# Predict a class label for every row of the test split using the classifier
# that was fitted in the previous cell
y_pred = knn.predict(X_test)

# Print predictions and ground truth on consecutive lines; the padding in the
# second caption keeps both arrays starting in the same column for comparison
print("Predicted labels:", y_pred)
print("Actual labels:   ", y_test)

### Evaluating Model Accuracy

In [None]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report as a percentage with two decimals
print(f"Model Accuracy: {accuracy * 100:.2f}%")

### Visualizing the Decision Boundaries

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Select only petal length and petal width for simplicity (columns 2 and 3),
# so the classifier can be drawn in a plane.
# Note: this rebinds the notebook-level names X and y.
X = iris.data[:, 2:4]
y = iris.target

# Fit KNN model on the simplified dataset.
# The model sees every sample (no train/test split) and the raw, unscaled
# measurements; it is used here for illustration only.
knn_2d = KNeighborsClassifier(n_neighbors=3)
knn_2d.fit(X, y)

# Create a mesh grid for plotting: the data range padded by 1 unit on each
# side, sampled every 0.02 units along both axes
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))

# Predict classes for each point in the mesh grid: flatten the grid into
# (x, y) pairs, classify them, then restore the grid shape for contouring
Z = knn_2d.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Plot decision boundary and data points. The calls below all draw on
# pyplot's implicit current figure, so their order matters: filled regions
# first, the samples on top, then labels, and finally show().
plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.coolwarm)
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolor='k', cmap=plt.cm.coolwarm)
plt.xlabel('Petal length')
plt.ylabel('Petal width')
plt.title("Decision Boundary of KNN Classifier")
plt.show()

## 6) Hyperparameter Tuning
### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Unfitted estimator that serves as the template for the search
knn = KNeighborsClassifier()

# Candidate settings: odd neighbor counts from 1 to 9, crossed with both
# neighbor-weighting schemes
param_grid = {
    'n_neighbors': list(range(1, 10, 2)),
    'weights': ['uniform', 'distance'],
}

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validation score
print("Best parameters:", grid_search.best_params_)
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

### Randomized Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Search space to sample from
param_dist = {
    # scipy's randint(low, high) excludes `high`, so k is drawn from 1..19
    'n_neighbors': randint(1, 20),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}

# Draw 100 candidate settings with a fixed seed and score each with 5-fold CV.
# `knn` is the unfitted KNeighborsClassifier() created in the grid-search cell.
# NOTE(review): the space holds only 19 * 2 * 2 = 76 distinct combinations, so
# 100 draws will necessarily repeat some of them - confirm n_iter is intended.
random_search = RandomizedSearchCV(
    estimator=knn,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    random_state=42,
)

# Run the search on the training split only
random_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validation score
print("Best parameters:", random_search.best_params_)
print(f"Best cross-validation score: {random_search.best_score_:.4f}")

## 7) Mini Challenge
### Iris Dataset Classification with KNN

**Task Overview:**
- Classify flowers into three species: Setosa, Versicolor, and Virginica
- Preprocess the data
- Implement KNN model
- Perform hyperparameter tuning with Grid Search
- Visualize results
- **Bonus:** Apply PCA for dimensionality reduction

### Step 1: Load and Explore the Dataset

In [None]:
from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the dataset and expose the feature matrix / integer targets under the
# short names the rest of the mini challenge uses
iris = load_iris()
X, y = iris.data, iris.target

# Exploration frame: the measurements plus a categorical 'species' column
# built from the integer codes and their names
df_iris = pd.DataFrame(X, columns=iris.feature_names).assign(
    species=pd.Categorical.from_codes(y, iris.target_names)
)

# Text summary: size, a preview, per-column statistics and class balance
print("Dataset Shape:", df_iris.shape)
print("\nFirst 5 rows:")
print(df_iris.head())
print("\nDataset Description:")
print(df_iris.describe())
print("\nClass Distribution:")
print(df_iris['species'].value_counts())

### Step 2: Visualize the Data

In [None]:
# Pairplot: scatter plots for every pair of features, colored by species.
# suptitle's y=1.02 places the title slightly above the top of the figure.
sns.pairplot(df_iris, hue='species', height=2.5)
plt.suptitle('Iris Dataset Pairplot', y=1.02)
plt.show()

# Correlation heatmap of the numeric features.
# iloc[:, :-1] drops the last column ('species', added last in Step 1) so that
# corr() only receives the measurements; center=0 anchors the colormap at 0.
plt.figure(figsize=(10, 6))
sns.heatmap(df_iris.iloc[:, :-1].corr(), annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlation Heatmap')
plt.tight_layout()
plt.show()

### Step 3: Preprocess the Data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature Scaling: learn mean/std from the training rows only, then apply the
# same transformation to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))
print("\nFeatures scaled successfully!")

### Step 4: Implement KNN and Test Different K Values

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

from sklearn.model_selection import cross_val_score

# Candidate neighborhood sizes to compare
k_values = range(1, 31)
train_scores = []
test_scores = []
cv_scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)

    # Model-selection signal: mean 5-fold cross-validated accuracy computed on
    # the training split only, so the test split never influences the choice.
    cv_scores.append(
        cross_val_score(knn, X_train_scaled, y_train, cv=5, scoring='accuracy').mean()
    )

    # Train/test accuracy are still recorded for the accuracy-vs-K plot, but
    # they are no longer used to pick K.
    knn.fit(X_train_scaled, y_train)
    train_scores.append(knn.score(X_train_scaled, y_train))
    test_scores.append(knn.score(X_test_scaled, y_test))

# Find the best K.
# Picking the K that maximizes *test* accuracy and then reporting that same
# test accuracy gives an optimistically biased estimate, so K is chosen by
# cross-validation and the test accuracy is reported only for that K.
best_index = int(np.argmax(cv_scores))
best_k = k_values[best_index]
print(f"Best K value: {best_k}")
print(f"Best cross-validation accuracy: {cv_scores[best_index]:.4f}")
print(f"Test accuracy at best K: {test_scores[best_index]:.4f}")

### Step 5: Visualize Accuracy vs K

In [None]:
# Plot training and testing accuracy against K.
# All calls draw on the figure opened by plt.figure(), so their order matters.
plt.figure(figsize=(12, 6))
# Blue circles = accuracy on the training split, red squares = test split
plt.plot(k_values, train_scores, 'bo-', label='Training Accuracy', linewidth=2)
plt.plot(k_values, test_scores, 'rs-', label='Testing Accuracy', linewidth=2)
# Dashed vertical marker at the K selected in the previous step
plt.axvline(x=best_k, color='green', linestyle='--', label=f'Best K={best_k}')
plt.xlabel('Number of Neighbors (K)', fontsize=12)
plt.ylabel('Accuracy', fontsize=12)
plt.title('KNN Accuracy vs K Value', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

### Step 6: Train Final Model and Evaluate

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Fit the final classifier on the scaled training split using the selected K
knn_final = KNeighborsClassifier(n_neighbors=best_k).fit(X_train_scaled, y_train)

# Label every row of the scaled test split
y_pred = knn_final.predict(X_test_scaled)

# Overall accuracy, followed by per-class precision / recall / F1
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Final Model Accuracy: {accuracy:.4f}")
print(f"\nClassification Report:\n")
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=iris.target_names))

### Step 7: Confusion Matrix

In [None]:
# Create and visualize confusion matrix for the final model's test-set
# predictions (y_pred comes from Step 6)
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap with integer cell counts (fmt='d'); species names label
# both axes. Rows are the true class, columns the predicted class, matching
# the axis labels set below.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=iris.target_names,
            yticklabels=iris.target_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title(f'Confusion Matrix (K={best_k})')
plt.tight_layout()
plt.show()

### Step 8: Hyperparameter Tuning with Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid.
# 'minkowski' is intentionally not listed: with KNeighborsClassifier's default
# p=2 it is exactly the Euclidean distance, so including it re-evaluates every
# 'euclidean' candidate a second time (60 duplicate settings x 5 folds) without
# ever being able to change the selected best parameters.
param_grid = {
    'n_neighbors': range(1, 31),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Perform Grid Search: 5-fold cross-validated accuracy on the scaled training
# split, using all available CPU cores
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

print("Best Parameters:", grid_search.best_params_)
print(f"Best Cross-Validation Score: {grid_search.best_score_:.4f}")

# Test the best model: best_estimator_ is the winning configuration refit on
# the whole training split; it is scored once on the held-out test split
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_scaled)
best_accuracy = accuracy_score(y_test, y_pred_best)
print(f"Test Accuracy with Best Parameters: {best_accuracy:.4f}")

### Step 9: Visualize Decision Boundaries (2D)

In [None]:
# Keep just two measurements so the classifier can be drawn in a plane
X_2d = X[:, [2, 3]]  # Petal length and petal width
X_train_2d, X_test_2d, y_train_2d, y_test_2d = train_test_split(
    X_2d, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize using statistics learned from the 2D training rows only
scaler_2d = StandardScaler().fit(X_train_2d)
X_train_2d_scaled = scaler_2d.transform(X_train_2d)
X_test_2d_scaled = scaler_2d.transform(X_test_2d)

# Refit KNN with the previously selected K on the two scaled features
knn_2d = KNeighborsClassifier(n_neighbors=best_k).fit(X_train_2d_scaled, y_train_2d)

# Grid over the scaled training range, padded by 1 on each side, sampled
# every h units along both axes
h = 0.02
x_min, x_max = X_train_2d_scaled[:, 0].min() - 1, X_train_2d_scaled[:, 0].max() + 1
y_min, y_max = X_train_2d_scaled[:, 1].min() - 1, X_train_2d_scaled[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Classify every grid node, then fold the flat predictions back into grid shape
Z = knn_2d.predict(np.column_stack((xx.ravel(), yy.ravel()))).reshape(xx.shape)

# Filled class regions first, training samples drawn on top
plt.figure(figsize=(10, 8))
plt.contourf(xx, yy, Z, alpha=0.4, cmap='viridis')
scatter = plt.scatter(
    X_train_2d_scaled[:, 0],
    X_train_2d_scaled[:, 1],
    c=y_train_2d,
    cmap='viridis',
    edgecolors='black',
    s=50,
)
plt.xlabel('Petal Length (scaled)')
plt.ylabel('Petal Width (scaled)')
plt.title(f'KNN Decision Boundary (K={best_k})')
plt.colorbar(scatter)
plt.tight_layout()
plt.show()

# Accuracy of the two-feature model on its own held-out split
print(f"2D Model Accuracy: {knn_2d.score(X_test_2d_scaled, y_test_2d):.4f}")

### BONUS: Apply PCA for Dimensionality Reduction

In [None]:
from sklearn.decomposition import PCA

# Project the four scaled features onto their first two principal components.
# The projection is learned from the training split and then reused unchanged
# for the test split.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# How much of the original variance each retained component carries
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total variance explained: {sum(pca.explained_variance_ratio_):.4f}")

# Same K as the full-feature model, fitted on the 2-component representation
knn_pca = KNeighborsClassifier(n_neighbors=best_k).fit(X_train_pca, y_train)

# Score the reduced model on the projected test split
y_pred_pca = knn_pca.predict(X_test_pca)
accuracy_pca = accuracy_score(y_true=y_test, y_pred=y_pred_pca)

# `accuracy` is the full-feature test accuracy computed in Step 6
print(f"\nAccuracy WITHOUT PCA: {accuracy:.4f}")
print(f"Accuracy WITH PCA: {accuracy_pca:.4f}")
print(f"\nClassification Report (PCA):\n")
print(classification_report(y_true=y_test, y_pred=y_pred_pca, target_names=iris.target_names))

### Visualize PCA Results

In [None]:
# Plot PCA decision boundary.
# Build a grid over the projected training data, padded by 1 on each side and
# sampled every h units along both principal-component axes.
h = 0.02
x_min, x_max = X_train_pca[:, 0].min() - 1, X_train_pca[:, 0].max() + 1
y_min, y_max = X_train_pca[:, 1].min() - 1, X_train_pca[:, 1].max() + 1
xx_pca, yy_pca = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Classify every grid node with the PCA-space model, then restore the grid
# shape so the predictions can be contoured
Z_pca = knn_pca.predict(np.c_[xx_pca.ravel(), yy_pca.ravel()])
Z_pca = Z_pca.reshape(xx_pca.shape)

# Filled class regions first, then the projected training samples on top.
# All calls draw on the figure opened by plt.figure(), so their order matters.
plt.figure(figsize=(10, 8))
plt.contourf(xx_pca, yy_pca, Z_pca, alpha=0.4, cmap='viridis')
scatter = plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1],
                     c=y_train, cmap='viridis', edgecolors='black', s=50)
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title(f'KNN Decision Boundary with PCA (K={best_k})')
# Colorbar keyed to the scatter's class codes
plt.colorbar(scatter)
plt.tight_layout()
plt.show()