In [2]:
import math, copy
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import PrecisionRecallDisplay, f1_score, mean_absolute_error, mean_squared_error

my_ID = 400132290
np.random.seed(my_ID)
np.set_printoptions(precision=2)# reduced display precision on numpy arrays

In [3]:
data = pd.read_csv(r"C:\Users\zhang\Downloads\BostonHousing.csv")
X = data.iloc[:, :-1]
y = data.iloc[:,-1].to_numpy()
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=my_ID)
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
print(f"X Shape: {x_train.shape}, X Type:{type(x_train)})")
print(f"y Shape: {y_train.shape}, y Type:{type(y_train)})")

X Shape: (404, 13), X Type:<class 'numpy.ndarray'>)
y Shape: (404,), y Type:<class 'numpy.ndarray'>)


data = load_breast_cancer()
x_data = pd.DataFrame(data.data, columns=data.feature_names)
y_data = data.target
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=my_ID)

sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

print(f"X Shape: {x_train.shape}, X Type:{type(x_train)})")
#print(x_train)
print(f"y Shape: {y_train.shape}, y Type:{type(y_train)})")
#print(y_train)

In [5]:
def predict_knn(k, x_train_mat, x_test_mat, y_train_mat):
    predictions = []

    # For each row in the test set, calculate the distance between it and each row in the training set,
    # and pick the K rows with the smallest distances between them and the test row
    for x in x_test_mat:
        distances = np.linalg.norm(x_train_mat - x, axis=1)
        neighbours = np.argsort(distances, kind='stable')[0:k]
        classes = list(y_train_mat[neighbours])
        predictions.append(max(set(classes), key=classes.count))

    return predictions


def perform_cross_validation(k, kf, x_train, y_train):
    cross_valid_score = 0.0

    # Run through all num K_FOLDS cross-validation
    for train, test in kf.split(x_train):
        x_train_mat, x_test_mat = x_train[train], x_train[test]
        y_train_mat, y_test_mat = y_train[train], y_train[test]

        predictions = predict_knn(k, x_train_mat, x_test_mat, y_train_mat)

        cross_valid_score += mean_squared_error(predictions, y_test_mat)

    # Average final cross-validation error
    return cross_valid_score / kf.n_splits


def main():
    K_FOLD = 4
    kf = KFold(n_splits= K_FOLD)
    """k-nearest neighbour classifier"""
    cv_scores = []
    for k in range(1, K_FOLD+1):  # number of neighbours
        # My implementation
        cv_score = perform_cross_validation(k, kf, x_train, y_train)
        #cv_scores.append(cv_score)
        print(f"For k={k}, the cross-validation error was: {cv_score}")

        # Scikit learn implementation
        knn_clf = KNeighborsRegressor(n_neighbors=k)
        skl_cv_score = -cross_val_score(knn_clf, x_train, y_train, cv=7, scoring="neg_mean_squared_error").mean()
        cv_scores.append(skl_cv_score)
        print(f"The sklearn cv error was: {skl_cv_score}")

    print()
    best_k = np.argmin(cv_scores) + 1
    print(f"The best model was at k={best_k}")

    # My implementation
    predictions = predict_knn(best_k, x_train, x_test, y_train)

    best_misclass_rate = mean_squared_error(predictions, y_test)
    print(f"The test error for my implementation is: {best_misclass_rate}")

    # Scikit learn implementation
    knn_clf = KNeighborsRegressor(n_neighbors=best_k)
    knn_clf.fit(x_train, y_train)

    predictions = knn_clf.predict(x_test)

    best_misclass_rate = mean_squared_error(y_test, predictions)
    print(f"The test error for the sklearn implementation is: {best_misclass_rate}")


if __name__ == '__main__':
    main()

For k=1, the cross-validation error was: 22.37542079207921
The sklearn cv error was: 20.726271713767176
For k=2, the cross-validation error was: 25.917252475247523
The sklearn cv error was: 16.75681639443436
For k=3, the cross-validation error was: 37.25212871287128
The sklearn cv error was: 17.628196352951342
For k=4, the cross-validation error was: 38.06576732673267
The sklearn cv error was: 19.559277396076396

The best model was at k=2
The test error for my implementation is: 26.24931372549019
The test error for the sklearn implementation is: 19.325539215686273
