In [1]:
import math, copy
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split

# Seed value shared by NumPy's global RNG (seeded just below) and by the
# random_state of train_test_split in the data-loading cell.
my_ID = 400132290
np.random.seed(my_ID)  # seeds NumPy's legacy global random generator
np.set_printoptions(precision=2)  # reduced display precision on numpy arrays

In [2]:
# Load the scikit-learn breast-cancer dataset: the feature matrix is wrapped in
# a DataFrame with named columns, the labels are kept as returned in data.target.
data = load_breast_cancer()
x_data = pd.DataFrame(data.data, columns=data.feature_names)
y_data = data.target
# Hold out 20% of the rows for testing; random_state=my_ID keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=my_ID)

# Fit the scaler on the training rows only, then reuse that fitted scaler for
# the test rows so no test-set information enters the scaling step.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Sanity check of shapes and container types after scaling (the scaled x_train
# prints as a numpy.ndarray).
# NOTE(review): both f-strings end with an unbalanced ')' that shows up in the
# printed output; left unchanged here so the recorded output still matches.
print(f"X Shape: {x_train.shape}, X Type:{type(x_train)})")
#print(x_train)
print(f"y Shape: {y_train.shape}, y Type:{type(y_train)})")
#print(y_train)

X Shape: (455, 30), X Type:<class 'numpy.ndarray'>)
y Shape: (455,), y Type:<class 'numpy.ndarray'>)


In [None]:
def predict_knn(k, x_train_mat, x_test_mat, y_train_mat):
    """Classify each test row by majority vote among its k nearest training rows.

    Distances are Euclidean (L2 norm of the row difference). Neighbours at the
    same distance are ranked by their position in the training set (stable
    sort). When several labels tie for the most votes, the smallest label (in
    sorted order) wins, so the result does not depend on hash/iteration order.

    Args:
        k: Number of neighbours that vote; must be at least 1. A value larger
            than the number of training rows uses every training row.
        x_train_mat: 2-D array-like of training features, one row per sample.
        x_test_mat: 2-D array-like of rows to classify, with the same number
            of columns as x_train_mat.
        y_train_mat: 1-D array-like of labels aligned with x_train_mat's rows.

    Returns:
        A list holding one predicted label per row of x_test_mat.

    Raises:
        ValueError: If k < 1, if the training set is empty, or if the number
            of training rows and training labels differ.
    """
    # A non-positive k would either leave no voters (k == 0) or, through
    # negative slicing, silently vote with "all but the farthest |k|" rows.
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # Coerce to arrays so DataFrame/Series/list inputs are indexed by position.
    x_train_mat = np.asarray(x_train_mat)
    x_test_mat = np.asarray(x_test_mat)
    y_train_mat = np.asarray(y_train_mat)

    if len(x_train_mat) == 0:
        raise ValueError("x_train_mat must contain at least one row")
    if len(x_train_mat) != len(y_train_mat):
        raise ValueError(
            f"x_train_mat has {len(x_train_mat)} rows but y_train_mat has "
            f"{len(y_train_mat)} labels"
        )

    predictions = []
    for row in x_test_mat:
        # Distance from this test row to every training row.
        distances = np.linalg.norm(x_train_mat - row, axis=1)
        # Indices of the k closest training rows; 'stable' keeps original order on equal distances.
        nearest = np.argsort(distances, kind='stable')[:k]
        # np.unique returns labels sorted, and argmax picks the first maximum,
        # so vote ties resolve to the smallest label deterministically.
        labels, votes = np.unique(y_train_mat[nearest], return_counts=True)
        predictions.append(labels[np.argmax(votes)])

    return predictions


def perform_cross_validation(k, kf, x_train, y_train):
    """Estimate the k-NN misclassification rate by cross-validation.

    For every (train, test) index pair produced by ``kf.split``, the model is
    evaluated with ``predict_knn`` on the held-out fold and scored with
    ``calc_misclassification_rate``; the per-fold error rates are averaged.

    Args:
        k: Number of neighbours forwarded to ``predict_knn``.
        kf: Splitter object exposing ``split(X)`` that yields pairs of
            positional index arrays (for example a scikit-learn ``KFold``).
        x_train: 2-D array-like of features, one row per sample.
        y_train: 1-D array-like of labels aligned with ``x_train``'s rows.

    Returns:
        The mean misclassification rate over all folds.

    Raises:
        ValueError: If the splitter yields no folds.
    """
    # Coerce once so the positional indices from the splitter work for
    # DataFrame/Series/list inputs as well as NumPy arrays.
    features = np.asarray(x_train)
    labels = np.asarray(y_train)

    error_total = 0.0
    fold_count = 0

    for train_idx, test_idx in kf.split(features):
        fold_predictions = predict_knn(
            k, features[train_idx], features[test_idx], labels[train_idx]
        )
        error_total += calc_misclassification_rate(fold_predictions, labels[test_idx])
        fold_count += 1

    # Average over the folds actually produced rather than relying on the
    # splitter exposing an `n_splits` attribute.
    if fold_count == 0:
        raise ValueError("the splitter produced no folds")
    return error_total / fold_count

def calc_misclassification_rate(predictions, y_test_mat):
    """Return the fraction of predictions that differ from the true labels.

    Both inputs are converted to NumPy arrays before comparing, so plain
    Python lists are compared element by element instead of as whole objects.

    Args:
        predictions: Array-like of predicted labels.
        y_test_mat: Array-like of true labels, same shape as ``predictions``.

    Returns:
        The mean of the element-wise "prediction != truth" comparison, i.e. a
        value between 0.0 (all correct) and 1.0 (all wrong).

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    predicted = np.asarray(predictions)
    actual = np.asarray(y_test_mat)

    # Refuse mismatched inputs outright; otherwise broadcasting could compare
    # every prediction against a single label without any error.
    if predicted.shape != actual.shape:
        raise ValueError(
            f"shape mismatch: predictions {predicted.shape} vs labels {actual.shape}"
        )

    return np.mean(predicted != actual)

def main():
    """Build the K-fold splitter used for cross-validation.

    NOTE(review): the splitter is created but nothing in this function uses it
    yet; no cross-validation is run from here.
    """
    K_FOLD = 5
    # Pass the fold count explicitly so K_FOLD actually controls the splitter
    # instead of relying on the library default happening to match.
    kf = KFold(n_splits=K_FOLD)

if __name__ == '__main__':
    main()