In [1]:
# Import libraries
import heapq
from math import dist
from statistics import mode

from sklearn import datasets  #datasets from Sci-kit Learn's library
from sklearn import neighbors #Sci-kit Learn's knn implementation
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

# My Implementation

In [2]:
def KNearestNeighbor(train, trainLabels, test, k):
    """
    Classify one query point by plurality vote of its k nearest training points.

    train: training data with feature values; train[i] is one point, given as
        a sequence of coordinates that math.dist accepts
    trainLabels: corresponding class labels for training data; trainLabels[i]
        is the label of train[i]
    test: a novel query point that will be classified
    k: number of neighbors (an integer >= 1); when k is larger than
        len(train), every training point votes

    returns: a class label based on the plurality of the k closest points.
        When several points are equally far away, the ones with the lower
        training index are preferred. When several labels receive the same
        number of votes, statistics.mode picks the one it meets first, and the
        votes are presented in training-index order.
    raises: ValueError if k < 1 or if train is empty
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    if len(train) == 0:
        raise ValueError("train must contain at least one point")

    # nsmallest is stable (equivalent to sorted(...)[:k]), so among equally
    # distant points the lower index wins, and each distance is computed once.
    nearest = heapq.nsmallest(
        k, range(len(train)), key=lambda idx: dist(train[idx], test)
    )

    # Vote in training-index order so that mode() breaks vote ties the same
    # way regardless of how far away each neighbor is.
    votes = [trainLabels[idx] for idx in sorted(nearest)]
    return mode(votes)

In [3]:
def testKNearestNeighbor(train, trainLabels, test, testLabels, k):
    """
    Measure how often KNearestNeighbor predicts the right label on a test set.

    train: training data with feature values
    trainLabels: corresponding class labels for training data
    test: query points to classify; test[i] is passed to KNearestNeighbor
    testLabels: corresponding class labels for test data; testLabels[i] is
        the expected label of test[i]
    k: number of neighbors

    returns: the fraction of test points classified correctly
    """
    total = len(test)

    # Count one hit for every query whose predicted label equals its true label.
    hits = sum(
        1
        for idx in range(total)
        if KNearestNeighbor(train, trainLabels, test[idx], k) == testLabels[idx]
    )

    return hits / total

# Load in data and evaluate models

In [4]:
# Load the iris dataset bundled with scikit-learn. Later cells read the
# feature values from iris.data and the class labels from iris.target.
iris = datasets.load_iris()

In [5]:
#Split data into train and test points
# The split is seeded so that the scores printed by this cell are the same on
# every Restart & Run All. NOTE: output saved before the seed was added will
# not match until the cell is re-run.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=SPLIT_SEED
)

# Score scikit-learn's classifier and the hand-written one on the same split
# for each odd neighbor count from 1 to 11.
for i in range(1,12,2):
    clf = neighbors.KNeighborsClassifier(metric="euclidean",n_neighbors=i)
    clf.fit(X_train, y_train)
    print(f"Number of neighbors: {i}")
    print(f"Scikit implementation: {clf.score(X_test, y_test)}")
    print(f"My implementation: {testKNearestNeighbor(X_train, y_train, X_test, y_test,i)} \n")

Number of neighbors: 1
Scikit implementation: 0.9111111111111111
My implementation: 0.9111111111111111 

Number of neighbors: 3
Scikit implementation: 0.9111111111111111
My implementation: 0.9111111111111111 

Number of neighbors: 5
Scikit implementation: 0.9111111111111111
My implementation: 0.9111111111111111 

Number of neighbors: 7
Scikit implementation: 0.9555555555555556
My implementation: 0.9555555555555556 

Number of neighbors: 9
Scikit implementation: 0.9555555555555556
My implementation: 0.9555555555555556 

Number of neighbors: 11
Scikit implementation: 0.9555555555555556
My implementation: 0.9555555555555556 



# Hyperparameter Grid Search

In [6]:
def runTuneTest(learner, parameters, X,y, random_state=None):
    """
    This function takes a base learner, parameters, and a data set to create and tune a model

    The data is divided into 5 shuffled, stratified folds. For each fold a
    GridSearchCV (3-fold inner cross-validation) is fit on the training part,
    scored on the held-out part, and its best parameters and best inner
    cross-validation score are reported through printFold.

    Input: base learner object, dictionary of hyper parameters to tune, data, target
        (X and y are indexed with the index arrays yielded by
        StratifiedKFold.split), and an optional random_state that is passed to
        StratifiedKFold so the shuffled folds can be reproduced
    Return: list holding the held-out score of each fold, in fold order
    """

    #Divide data into training/test splits using StratifiedKFold
    skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=random_state)
    accuracy = []
    for fold,(train,test) in enumerate(skf.split(X,y), start=1):
        clf = GridSearchCV(learner,parameters,cv=3)
        clf.fit(X[train],y[train])
        # Score on the fold that the grid search never saw
        accuracy.append(clf.score(X[test],y[test]))
        printFold(fold,clf.best_params_,clf.best_score_)

    return accuracy

In [7]:
def printFold(num,params,score):
    """
    Print a three-line report for one fold, followed by a blank line.

    num: fold number shown on the first line
    params: best parameters shown on the second line
    score: numeric score shown on the third line, rounded to three decimals
    """
    print(f"Fold: {num}", f"Best parameters: {params}", sep="\n")
    print(f"Tuning Set Score: {score:.3f}", end="\n\n")

In [8]:
# Hyperparameter grid handed to the grid search: two vote-weighting schemes
# crossed with six odd neighbor counts.
knn_params = {"weights":["uniform","distance"], "n_neighbors":[1,3,5,7,9,11]}
# Base estimator with default settings; runTuneTest wraps it in GridSearchCV.
clf = neighbors.KNeighborsClassifier()
runTuneTest(clf,knn_params, iris.data, iris.target)

Fold: 1
Best parameters: {'n_neighbors': 5, 'weights': 'uniform'}
Tuning Set Score: 0.967

Fold: 2
Best parameters: {'n_neighbors': 3, 'weights': 'uniform'}
Tuning Set Score: 0.992

Fold: 3
Best parameters: {'n_neighbors': 9, 'weights': 'distance'}
Tuning Set Score: 0.992

Fold: 4
Best parameters: {'n_neighbors': 3, 'weights': 'uniform'}
Tuning Set Score: 0.975

Fold: 5
Best parameters: {'n_neighbors': 3, 'weights': 'uniform'}
Tuning Set Score: 0.975

