In [4]:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import random

class KNN:
    """K-Nearest Neighbors classifier (Euclidean distance, majority vote).

    The model is "lazy": ``fit`` only stores the training data, and all
    distance computations happen at prediction time.

    Tie handling:
        * Training points at the same distance keep their training order
          (the sort is keyed on distance only and ``list.sort`` is stable).
        * When several labels receive the same number of votes, the label
          that appears first among the neighbors (i.e. the closest one)
          wins, because ``Counter.most_common`` keeps first-seen order.
    """

    def __init__(self, k=3):
        # Number of neighbors that take part in each vote
        self.k = k
        # Training data; populated by fit()
        self.X_train = None
        self.y_train = None

    def euclidean_distance(self, point1, point2):
        """Return the Euclidean distance between two points.

        Both points are converted to float arrays first, so plain sequences
        are accepted and unsigned-integer inputs cannot wrap around when
        subtracted or squared.
        """
        diff = np.asarray(point1, dtype=float) - np.asarray(point2, dtype=float)
        return np.sqrt(np.sum(diff ** 2))

    def fit(self, X, y):
        """Store the training data (lazy learning).

        Args:
            X: Training samples, one row per sample.
            y: Training labels, one per sample.

        Raises:
            ValueError: If X and y do not contain the same number of samples.
        """
        features = np.array(X)
        labels = np.array(y)
        if features.shape[:1] != labels.shape[:1]:
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {features.shape[:1]} and {labels.shape[:1]}"
            )
        self.X_train = features
        self.y_train = labels

    def _predict_single_point(self, point):
        """Predict the class label for a single data point.

        Returns:
            A ``(prediction, k_neighbors)`` tuple where ``k_neighbors`` is a
            list of ``(distance, label)`` pairs sorted by ascending distance.
            If ``k`` exceeds the number of training samples, all training
            samples are used.

        Raises:
            RuntimeError: If the model has not been fitted.
            ValueError: If ``k`` is smaller than 1 or the training set is empty.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNN model has not been fitted; call fit() first.")
        if self.k < 1:
            # A negative k would otherwise silently slice off the *farthest*
            # points instead of selecting the nearest ones.
            raise ValueError(f"k must be a positive integer, got {self.k!r}")

        # Distance from the query point to every training point
        distances = [
            (self.euclidean_distance(point, training_point), label)
            for training_point, label in zip(self.X_train, self.y_train)
        ]
        if not distances:
            raise ValueError("Cannot predict from an empty training set.")

        # Sort on distance only: labels are never compared, so ties do not
        # depend on label ordering and non-orderable labels are supported.
        distances.sort(key=lambda pair: pair[0])
        k_neighbors = distances[:self.k]

        # Majority vote among the k nearest labels
        votes = Counter(label for _, label in k_neighbors)
        prediction = votes.most_common(1)[0][0]

        # The neighbors are returned too (used for visualization/details)
        return prediction, k_neighbors

    def predict(self, X):
        """Predict class labels for every point in X; returns a NumPy array."""
        return np.array([self._predict_single_point(point)[0] for point in X])

    def calculate_accuracy(self, X_test, y_test):
        """Return the fraction of points in X_test classified as in y_test.

        Raises:
            ValueError: If y_test is empty or its shape does not match the
                predictions (which would otherwise broadcast or compare
                incorrectly).
        """
        expected = np.asarray(y_test)
        predictions = self.predict(X_test)
        if predictions.shape != expected.shape:
            raise ValueError(
                f"Shape mismatch: predictions {predictions.shape} "
                f"vs y_test {expected.shape}"
            )
        if expected.size == 0:
            raise ValueError("Cannot compute accuracy on an empty test set.")
        correct_predictions = np.sum(predictions == expected)
        return correct_predictions / len(expected)

# --- AUXILIARY AND EXAMPLE FUNCTIONS ---

def generate_sample_data(n_per_class=30, seed=42):
    """Generate a 2-feature, 2-class synthetic dataset.

    Each class is drawn from independent normal distributions per feature:
    class 0 around (2, 1) with std (0.5, 0.3), class 1 around (5, 3) with
    std (0.7, 0.5).

    Args:
        n_per_class: Number of samples drawn for each class (default 30).
        seed: Value passed to ``np.random.seed`` before sampling (default 42).
            Note that this reseeds NumPy's global random state.

    Returns:
        A tuple ``(X, y)`` where ``X`` has shape ``(2 * n_per_class, 2)`` and
        ``y`` holds ``n_per_class`` zeros followed by ``n_per_class`` ones.
    """
    np.random.seed(seed)

    # Class 0 data (x feature is drawn before y feature; order matters for
    # reproducing the same values from a given seed)
    class_0_x = np.random.normal(2, 0.5, n_per_class)
    class_0_y = np.random.normal(1, 0.3, n_per_class)
    class_0 = np.column_stack((class_0_x, class_0_y))
    labels_0 = np.zeros(n_per_class)

    # Class 1 data
    class_1_x = np.random.normal(5, 0.7, n_per_class)
    class_1_y = np.random.normal(3, 0.5, n_per_class)
    class_1 = np.column_stack((class_1_x, class_1_y))
    labels_1 = np.ones(n_per_class)

    X = np.vstack((class_0, class_1))
    y = np.hstack((labels_0, labels_1))
    return X, y

def split_data(X, y, train_percentage=0.8, seed=None):
    """Split data into shuffled training and testing sets.

    Args:
        X: Samples, one row per sample (array-like).
        y: Labels, one per sample (array-like).
        train_percentage: Fraction of samples placed in the training set;
            must lie in [0, 1].
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as NumPy arrays.

    Raises:
        ValueError: If X and y differ in length or ``train_percentage`` is
            outside [0, 1].
    """
    # Index-list selection below needs NumPy arrays, not plain lists
    X = np.asarray(X)
    y = np.asarray(y)

    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )
    if not 0.0 <= train_percentage <= 1.0:
        # A negative fraction would otherwise slice from the end of the list
        raise ValueError(
            f"train_percentage must be between 0 and 1, got {train_percentage!r}"
        )

    indices = list(range(len(X)))
    if seed is None:
        random.shuffle(indices)
    else:
        # Private generator: reproducible without touching global RNG state
        random.Random(seed).shuffle(indices)

    limit = int(len(X) * train_percentage)
    train_indices = indices[:limit]
    test_indices = indices[limit:]

    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

def visualize_results(X_train, y_train, X_test, y_test, predictions, knn, new_point=None):
    """Placeholder for the results plot.

    Opens an empty 12x5 Matplotlib figure and prints a notice. None of the
    arguments are used yet; they are accepted so callers can keep passing
    the data a full plot would need.
    """
    canvas_size = (12, 5)
    plt.figure(figsize=canvas_size)

    # The actual drawing code (training data, test data, neighbor lines) is
    # intentionally left out; only the figure is created so that the overall
    # program flow stays the same.
    print("Plotting results... (Requires a graphical environment)")
    # Once real plotting exists, finish with plt.tight_layout() and plt.show().


def run_detailed_example():
    """Run the demo end to end: data, k selection, one prediction, plot."""
    print("=== K-NEAREST NEIGHBORS (KNN) ANALYSIS ===\n")

    # Printed step 1: build the synthetic dataset and split it
    X, y = generate_sample_data()
    print(f"1. Data Generated. Total Samples: {len(X)}")
    X_train, X_test, y_train, y_test = split_data(X, y)
    print(f"   Training Set: {len(X_train)}, Testing Set: {len(X_test)}")

    # Printed step 2: score k = 1..10 and keep the first k with top accuracy.
    # NOTE: k is chosen using the test split itself, so the reported accuracy
    # is measured on the same data that selected k.
    def score_for(k):
        candidate = KNN(k=k)
        candidate.fit(X_train, y_train)
        return candidate.calculate_accuracy(X_test, y_test)

    scores = [(k, score_for(k)) for k in range(1, 11)]
    chosen_k, chosen_accuracy = max(scores, key=lambda entry: entry[1])
    print(f"\n2. Best k found: {chosen_k} (Accuracy: {chosen_accuracy:.3f})")

    # Final model trained with the selected k
    knn_final = KNN(k=chosen_k)
    knn_final.fit(X_train, y_train)
    predictions = knn_final.predict(X_test)

    # Printed step 3: classify one hand-picked point and list its neighbors
    print("\n3. Individual Prediction Test:")
    new_point = np.array([3.5, 2.0])
    prediction, neighbors = knn_final._predict_single_point(new_point)
    length, width = new_point

    print(f"   New Point: [Length={length:.2f}, Width={width:.2f}] -> Prediction: Class {int(prediction)}")
    print(f"   {chosen_k} Nearest Neighbors:")
    for rank, (dist, label) in enumerate(neighbors, start=1):
        print(f"     {rank}. Distance: {dist:.3f}, Class: {int(label)}")

    # Printed step 4: hand everything to the plotting routine
    print("\n4. Running Visualisation...")
    visualize_results(X_train, y_train, X_test, y_test, predictions, knn_final, new_point)

# Run the demo only when this file is executed directly, not when imported.
if __name__ == "__main__":
    run_detailed_example()


=== K-NEAREST NEIGHBORS (KNN) ANALYSIS ===

1. Data Generated. Total Samples: 60
   Training Set: 48, Testing Set: 12

2. Best k found: 1 (Accuracy: 1.000)

3. Individual Prediction Test:
   New Point: [Length=3.50, Width=2.00] -> Prediction: Class 1
   1 Nearest Neighbors:
     1. Distance: 0.969, Class: 1

4. Running Visualisation...
Plotting results... (Requires a graphical environment)


<Figure size 1200x500 with 0 Axes>