Load the libraries and the Iris dataset, then split the dataset into feature data and class labels.

In [1]:
%matplotlib inline
import numpy as np
from sklearn import datasets
from collections import Counter

# Fetch the Iris dataset bundled with scikit-learn and keep the feature
# values and the class labels under separate names.
iris = datasets.load_iris()
iris_data, iris_labels = iris.data, iris.target

Defining the functions: Euclidean distance, nearest-neighbor search, and majority voting for a result (with and without the winner's share of the votes)

In [2]:
def distance(instance1, instance2):
    """Return the Euclidean distance between two feature vectors.

    Parameters
    ----------
    instance1, instance2 : array-like
        Numeric sequences (lists, tuples, NumPy arrays) with shapes that
        NumPy can broadcast against each other.

    Returns
    -------
    float
        ``np.linalg.norm`` (default arguments) of the element-wise
        difference; for 1-D inputs this is the Euclidean distance.
    """
    first = np.asarray(instance1)
    second = np.asarray(instance2)
    # Do the subtraction in at least float64. Subtracting unsigned-integer
    # arrays in their own dtype wraps around (uint8: 1 - 4 == 253) and would
    # silently produce a wrong distance.
    common_dtype = np.result_type(first, second, np.float64)
    return np.linalg.norm(np.subtract(first, second, dtype=common_dtype))

def get_neighbors(training_set, labels, test_instance, k, distance=distance):
    """Find the ``k`` training samples closest to ``test_instance``.

    Parameters
    ----------
    training_set : indexable sequence of samples
    labels : indexable sequence, ``labels[i]`` belongs to ``training_set[i]``
    test_instance : the sample to compare against
    k : int, number of neighbors to keep
    distance : callable ``distance(a, b)`` returning a sortable value

    Returns
    -------
    list of ``(sample, dist, label)`` tuples, ordered by ascending ``dist``
    and cut off after the first ``k`` entries. Samples with equal distance
    keep their relative order from ``training_set`` (the sort is stable).
    """
    scored = [
        (training_set[i], distance(test_instance, training_set[i]), labels[i])
        for i in range(len(training_set))
    ]
    ranked = sorted(scored, key=lambda entry: entry[1])
    return ranked[:k]

def vote(neighbors):
    """Return the label that occurs most often among ``neighbors``.

    Each neighbor is a sequence whose element at index 2 is its class label
    (the tuples produced by ``get_neighbors``). When several labels share
    the highest count, the one encountered first wins.
    """
    tally = Counter(entry[2] for entry in neighbors)
    top_label, _ = tally.most_common(1)[0]
    return top_label

def vote_prob(neighbors):
    """Return the majority label among ``neighbors`` and its share of the votes.

    Parameters
    ----------
    neighbors : iterable of sequences
        Each item carries its class label at index 2 (the tuples produced
        by ``get_neighbors``).

    Returns
    -------
    tuple ``(winner, share)``
        ``winner`` is the most frequent label (first encountered on ties);
        ``share`` is the winner's vote count divided by the total number of
        votes, i.e. a float in (0, 1].

    Raises
    ------
    ValueError
        If ``neighbors`` is empty, since there is nothing to vote on.
    """
    class_counter = Counter(neighbor[2] for neighbor in neighbors)
    if not class_counter:
        raise ValueError("vote_prob() needs at least one neighbor to vote")
    # A single most_common() call yields both the winner and its vote count.
    winner, votes4winner = class_counter.most_common(1)[0]
    return winner, votes4winner / sum(class_counter.values())

Classify the test data [4.8,2.5,5.3,2.4] by majority vote among its 5 nearest neighbors

In [3]:
# Sample to classify.
test_data = [4.8, 2.5, 5.3, 2.4]

# Look up the 5 closest training samples and let them vote on the class.
neighbors = get_neighbors(
    iris_data,
    iris_labels,
    test_data,
    k=5,
    distance=distance,
)
vote(neighbors)

2

Classify the test data [4.8,2.5,5.3,2.4] again, this time also reporting the share of the 5 neighbors' votes that went to the winning class

In [4]:
# Look up the 5 closest training samples; vote_prob returns the winning
# label together with the fraction of votes it received.
neighbors = get_neighbors(
    iris_data,
    iris_labels,
    test_data,
    k=5,
    distance=distance,
)
vote_prob(neighbors)

(2, 1.0)

Alternative: a significantly shorter variant that relies on scikit-learn's KNeighborsClassifier and additionally applies min-max normalization to the features

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

X_train = iris_data
y_train = iris_labels
# A single sample wrapped in a list so it forms a one-row, 2-D input.
X_test = [test_data]

# Fit the scaler on the training data only and scale it in the same call;
# the test sample is then scaled with the ranges learned from training.
# (Previously the fit_transform result was thrown away and the training
# data was transformed a second time.)
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
y_pred

array([2])