<a href="https://colab.research.google.com/github/edgeemer/hillel_ml_2025/blob/main/HW_3_KNN" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [208]:
from typing import List, Tuple

import numpy as np
import pandas as pd

!pip install -r /content/requirements.txt



In [209]:
def euclidean_distance(point1: np.ndarray, point2: np.ndarray) -> float:
    """Compute Euclidean distance between two points.

    Both inputs are converted to ``float64`` before they are subtracted, so
    unsigned or narrow integer inputs (for example ``uint8`` pixel values)
    cannot wrap around in the difference or overflow when squared.

    Args:
        point1 (np.ndarray): First point.
        point2 (np.ndarray): Second point; must broadcast against ``point1``.

    Returns:
        float: Euclidean distance between two points.
    """
    diff = np.asarray(point1, dtype=np.float64) - np.asarray(point2, dtype=np.float64)
    return float(np.sqrt(np.sum(diff * diff)))

In [210]:
class KNN:
    """K Nearest Neighbors classifier.

    Predictions are made by majority vote among the ``k`` training samples
    with the smallest Euclidean distance to the query sample. When several
    labels are tied for the most votes, the smallest label (in sorted order)
    wins.
    """

    def __init__(self, k: int) -> None:
        """Initialize KNN with the number of neighbors to consider (k).

        Args:
            k (int): Number of neighbors to consider.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k}")
        self._X_train = None
        self._y_train = None
        self.k = k  # number of neighbors to consider

    def fit(self, X_train: np.ndarray, y_train: np.ndarray) -> None:
        """Fit the KNN model with training data.

        KNN is a lazy learner: fitting only stores the training set.

        Args:
            X_train (np.ndarray): Training data features, one sample per row.
            y_train (np.ndarray): Training data target, one label per sample.

        Raises:
            ValueError: If the number of samples and labels differ.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if X_train.shape[0] != y_train.shape[0]:
            raise ValueError(
                f"X_train has {X_train.shape[0]} samples but y_train has "
                f"{y_train.shape[0]} labels"
            )
        self._X_train = X_train
        self._y_train = y_train

    def predict(self, X_test: np.ndarray, verbose: bool = False) -> np.ndarray:
        """Predict target values for test data.

        Args:
            X_test (np.ndarray): Test data features, one sample per row.
            verbose (bool, optional): Print progress during prediction. Defaults to False.

        Returns:
            np.ndarray: Predicted target values, with the dtype of the training labels.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self._X_train is None or self._y_train is None:
            raise RuntimeError("KNN.predict() called before fit()")

        # Work in float64 so integer features (e.g. uint8 pixels) cannot wrap
        # around when subtracted or overflow when squared.
        train_features = np.asarray(self._X_train, dtype=np.float64)
        train_labels = np.asarray(self._y_train)
        queries = np.asarray(X_test, dtype=np.float64)

        n = queries.shape[0]
        y_pred = np.empty(n, dtype=train_labels.dtype)
        # Sum over every axis except the sample axis.
        feature_axes = tuple(range(1, train_features.ndim))

        for i in range(n):
            diff = train_features - queries[i]
            # Squared distances rank neighbours exactly like true distances,
            # so the square root is skipped.
            sq_distances = np.sum(diff * diff, axis=feature_axes)
            # A stable sort breaks distance ties by training order.
            k_indices = np.argsort(sq_distances, kind="stable")[:self.k]
            k_nearest_labels = train_labels[k_indices]
            # np.unique returns sorted labels, so argmax picks the smallest
            # label among vote ties; unlike np.bincount it also accepts
            # negative and non-integer labels.
            labels, votes = np.unique(k_nearest_labels, return_counts=True)
            y_pred[i] = labels[np.argmax(votes)]

            if verbose:
                print(f"Predicted {i+1}/{n} samples", end="\r")

        if verbose:
            print("")
        return y_pred

In [211]:
def kfold_cross_validation(X: np.ndarray, y: np.ndarray, k: int) -> List[Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]:
    """Split dataset into k folds for cross-validation.

    The samples are split, in their original order (no shuffling), into ``k``
    contiguous validation blocks. Every sample appears in exactly one
    validation block: when ``n_samples`` is not divisible by ``k``, the first
    ``n_samples % k`` folds receive one extra sample. For each fold the
    remaining samples form the training set. The shapes of every fold are
    printed before returning.

    Args:
        X (np.ndarray): Dataset features.
        y (np.ndarray): Dataset target.
        k (int): Number of folds.

    Returns:
        List[Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]: List of tuples (X_train, y_train, X_test, y_test).

    Raises:
        ValueError: If ``k`` is not in ``[1, n_samples]`` or if ``X`` and ``y``
            contain a different number of samples.
    """
    n_samples = X.shape[0]
    if len(y) != n_samples:
        raise ValueError(f"X has {n_samples} samples but y has {len(y)}")
    if k < 1 or k > n_samples:
        raise ValueError(
            f"k must be between 1 and the number of samples ({n_samples}), got {k}"
        )

    # Example: 1000 samples and k=5 give 200 validation and 800 training
    # points per fold. The remainder is spread over the first folds so that no
    # sample is left out of validation.
    fold_size, remainder = divmod(n_samples, k)

    folds = []  # Container to store the results of each fold
    val_start = 0
    for fold_id in range(k):
        val_end = val_start + fold_size + (1 if fold_id < remainder else 0)
        folds.append((
            np.concatenate((X[:val_start], X[val_end:]), axis=0),
            np.concatenate((y[:val_start], y[val_end:]), axis=0),
            X[val_start:val_end],
            y[val_start:val_end],
        ))
        val_start = val_end

    for fold_id, fold in enumerate(folds):
        print(f"\nfold #: {fold_id+1}")
        print(f"Train features shape: {fold[0].shape}")
        print(f"Train target shape: {fold[1].shape}")
        print(f"Test features shape: {fold[2].shape}")
        print(f"Test target shape: {fold[3].shape}")
        print("\n")

    # NOTE: sklearn.model_selection.KFold(n_splits=k, shuffle=True,
    # random_state=42) is an alternative that also shuffles the data; it is not
    # used here because scikit-learn is not among the included libraries.
    return folds

In [212]:
def evaluate_accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Compute accuracy score.

    Accuracy is the fraction of positions where ``y_pred`` equals ``y_true``.

    Args:
        y_true (np.ndarray): True target values.
        y_pred (np.ndarray): Predicted target values; must have the same shape
            as ``y_true``.

    Returns:
        float: Accuracy score in ``[0, 1]``.

    Raises:
        ValueError: If the shapes differ or the inputs are empty.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Without this check a (n,) vs (n, 1) pair would silently broadcast to an
    # n x n comparison and produce a meaningless score.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"Shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("Accuracy is undefined for empty inputs")

    return float(np.mean(y_true == y_pred))

In [213]:
def writing_to_readme(test_k_fold_values: dict, path: str = '/content/README.md') -> None:
    """Write a Markdown table of accuracy per ``k`` to a README file.

    The file is overwritten. It contains a heading followed by a two-column
    table with one row per dictionary entry, in the dictionary's iteration
    order; each accuracy is rounded to two decimals.

    Args:
        test_k_fold_values (dict): Mapping from ``k`` (number of neighbors) to
            the accuracy obtained with that ``k``.
        path (str, optional): Destination file. Defaults to
            '/content/README.md'.
    """
    lines = [
        '# Accuracy Results for Different Values of k in KNN\n\n',
        '| k | Accuracy |\n',
        '|:--:|:--:|\n',
    ]
    lines.extend(
        f'| {k} | {round(accuracy, 2)} |\n'
        for k, accuracy in test_k_fold_values.items()
    )

    # The content is fully built before the file is opened, so a formatting
    # error cannot leave a truncated README behind.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(lines)

In [214]:
def main() -> None:
    """Main function to demonstrate the KNN classifier and k-fold cross-validation.

    Steps:
        1. Load the first 1000 rows of the training CSV and the whole test CSV
           (first column is the target, remaining columns are features).
        2. Run 5-fold cross-validation on the training data with k = 4 and
           print the accuracy of each fold and their mean.
        3. Train on the full training subset for several values of k, print
           the accuracy on the test data, and write the results to the README
           via ``writing_to_readme``.
    """
    # Read training and testing data from CSV files.
    # NOTE: the paths are absolute and match the Google Colab workspace
    # (/content); adjust them when running elsewhere.
    training_data = pd.read_csv("/content/train.csv")[:1000]
    testing_data = pd.read_csv("/content/test.csv")

    # Extract features and target from the training data
    X = training_data.iloc[:, 1:].values
    y = training_data.iloc[:, 0].values
    print("Training data:", X.shape, y.shape)

    # Extract features and target from the testing data
    X_test = testing_data.iloc[:, 1:].values
    y_test = testing_data.iloc[:, 0].values
    print("Test data:", X_test.shape, y_test.shape)

    k = 4
    print(f" KNN with k = {k}")

    num_folds = 5
    cv_accuracies = []
    # Perform k-fold cross-validation
    for X_train, y_train, X_val, y_val in kfold_cross_validation(X, y, k=num_folds):
        model = KNN(k=k)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val, verbose=True)
        accuracy = evaluate_accuracy(y_val, y_pred)
        cv_accuracies.append(accuracy)
        print(f"Accuracy: {round(accuracy, 2)}\n")
    # A single aggregate number makes the comparison with the test-set
    # accuracies below straightforward.
    print(f"Mean cross-validation accuracy: {round(float(np.mean(cv_accuracies)), 2)}\n")

    # Evaluate on the held-out test data for a range of neighbor counts.
    # A separate loop variable avoids shadowing the cross-validation ``k``.
    accuracy_k_fold_test = {}
    for n_neighbors in [3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 21, 40, 41]:
        model = KNN(k=n_neighbors)
        model.fit(X, y)
        y_pred = model.predict(X_test, verbose=True)
        accuracy = evaluate_accuracy(y_test, y_pred)
        print(f"KNN with k = {n_neighbors}")
        print(f"Accuracy: {round(accuracy, 2)}\n")

        accuracy_k_fold_test[n_neighbors] = accuracy

    # Persist the results; previously the dictionary was built and discarded.
    writing_to_readme(accuracy_k_fold_test)

In [215]:
# Run the experiment only when this file is executed directly (as a script or
# a notebook cell), not when it is imported as a module.
# TODO: compare the test-set accuracy with the cross-validation scores
# (main() currently only prints both).
if __name__ == "__main__":
  main()

Training data: (1000, 784) (1000,)
Test data: (400, 784) (400,)
 KNN with k = 4

fold #: 1
Train features shape: (800, 784)
Train target shape: (800,)
Test features shape: (200, 784)
Test target shape: (200,)



fold #: 2
Train features shape: (800, 784)
Train target shape: (800,)
Test features shape: (200, 784)
Test target shape: (200,)



fold #: 3
Train features shape: (800, 784)
Train target shape: (800,)
Test features shape: (200, 784)
Test target shape: (200,)



fold #: 4
Train features shape: (800, 784)
Train target shape: (800,)
Test features shape: (200, 784)
Test target shape: (200,)



fold #: 5
Train features shape: (800, 784)
Train target shape: (800,)
Test features shape: (200, 784)
Test target shape: (200,)


Predicted 200/200 samples
Accuracy: 0.87

Predicted 200/200 samples
Accuracy: 0.89

Predicted 200/200 samples
Accuracy: 0.85

Predicted 200/200 samples
Accuracy: 0.85

Predicted 200/200 samples
Accuracy: 0.86

Predicted 400/400 samples
KNN with k = 3
Accuracy: 0.82