# Метод k-ближайших соседей.

In [472]:
from sklearn import datasets as dts
from sklearn.model_selection import train_test_split
import numpy as np
import random
from collections import Counter
import timeit
import numba

*Разбиваем датасет на две части: для теста и для данных проверки*

In [473]:
@numba.njit
def split_dataset(data):
    len_mass = len(data.data) # длина массива данных
    mass_sort = [i for i in range(len_mass)] # массив индексов
    random.shuffle(mass_sort) # Рандомно сортируем индексы

    len_determine = int(len_mass * 0.7)
    # list_determine = [len_mass - len_determine, len_determine]
    datasets_learn = [[data.data[i], data.target[i]] for i in mass_sort[:len_determine]] # Первые 70 процентов добавляем для обучение,
    datasets_test = [[data.data[i], data.target[i]] for i in mass_sort[len_determine:]] # остальные для теста.
    return datasets_learn, datasets_test


*Разбиваем датасет на две части: для теста и для данных проверки*

*Нормализуем данные*

In [474]:
#@numba.njit
def standartization(X):
    return (X - X.min(axis = 0)) / (X.max(axis = 0) - X.min(axis = 0))
    # return (X - X.min()) / (X.max() - X.min())


*Метрики*

In [475]:
# Вход el1 = [el1, el2, ... , eln]
# def l1(el1, el2):
#     return sum([abs(k - v) for k, v in list(zip(el1, el2))])

# def l2(el1, el2):
#     return sum([(k - v)**2 for k, v in list(zip(el1, el2))]) ** 0.5
@numba.njit
def l1(el1, el2): # manhattan
    sum_el = 0
    for k, v in list(zip(el1, el2)):
        sum_el += abs(k - v)
    return sum_el

@numba.njit
def l2(el1, el2): # euclidean
    sum_el = 0
    for k, v in list(zip(el1, el2)):
        sum_el += (k - v) ** 2
    return sum_el ** 0.5

@numba.njit
def l_infinity(el1, el2): # chebyshev
    max_len = 0
    for k, v in list(zip(el1, el2)):
        if abs(k - v) > max_len:
            max_len = abs(k - v)
    return max_len
    # return max([abs(k - v) for k, v in list(zip(el1, el2))])

*Метод k-ближайших соседей*

In [476]:
def choice_class(mass_n):
    count = Counter(mass_n)
    return max(count, key = count.get)

def k_nearest_neightbors(x_train, x, y_train, metric = l1, n = 4):
    neightbors = np.array([metric(x, y) for y in x_train]).argsort()[:n]
    rez = choice_class([y_train[j] for j in neightbors])
    return rez

In [477]:
@numba.njit
def check_forecast(y_test, forecast):
    target_rez = len(y_test)
    forecast_rez = 0
    for i in range(len(y_test)):
        if y_test[i] == forecast[i]:
            forecast_rez += 1

    return forecast_rez / target_rez


In [478]:
def main():
    for i, name_dts in [(dts.load_iris, "Iris plants dataset"),
                        (dts.load_digits, "Optical recognition of handwritten digits dataset"),
                        (dts.load_wine,"Wine recognition dataset"),
                        (dts.load_breast_cancer, "Breast cancer wisconsin (diagnostic) dataset")]:

        data = i()

        x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, train_size = 0.7, random_state = 42)

        x_train = standartization(x_train)
        x_test = standartization(x_test)

        greatest_rez = [0, 0, 0]

        for n in range(4, 10):
            for m in [l1, l2, l_infinity]:
                rez = np.array(range(x_test.shape[0]))
                for i in range(x_test.shape[0]):
                    rez[i] = k_nearest_neightbors(x_train, x_test[i], y_train, m, n) == y_test[i] if 1 else 0

                answer = sum(rez) / y_test.shape[0] # scheck_forecast(y_test, rez)
                if greatest_rez[0] < answer:
                    greatest_rez = [answer, m, n]
                # logreg.predict(X_test) # Прогнозируемые данные
        print(f"Обучение на данных {name_dts} с метрикой {greatest_rez[1].__name__} для {greatest_rez[2]} ближайших соседей: {greatest_rez[0]}")

In [479]:
if __name__ == "__main__":
    main()

Обучение на данных Iris plants dataset с метрикой l2 для 7 ближайших соседей: 1.0


  return (X - X.min(axis = 0)) / (X.max(axis = 0) - X.min(axis = 0))


Обучение на данных Optical recognition of handwritten digits dataset с метрикой l_infinity для 5 ближайших соседей: 0.9777777777777777
Обучение на данных Wine recognition dataset с метрикой l1 для 4 ближайших соседей: 0.9814814814814815
Обучение на данных Breast cancer wisconsin (diagnostic) dataset с метрикой l2 для 7 ближайших соседей: 0.9649122807017544
