In [2]:
import numpy as np
from numpy.linalg import norm
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import sklearn.datasets as dts
import numba
# Modules
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.datasets as dts
from sklearn import preprocessing

In [3]:
class KMeansForClassters:
    """Lloyd-style k-means clustering with a selectable distance norm.

    Points are assigned to the centroid with the smallest distance under the
    chosen norm, and every centroid is then moved to the arithmetic mean of
    the points assigned to it.

    Parameters
    ----------
    n_clusters : int
        Number of clusters to form; must satisfy 1 <= n_clusters <= n_samples
        when ``fit`` is called.
    max_iter : int, default 300
        Upper bound on the number of assign/update iterations.
    random_state : int, default 42
        Seed for the private ``numpy.random.RandomState`` used to pick the
        initial centroids.
    metric : {"l1", "l2", "infinity"}, default "l1"
        Norm used to measure point-to-centroid distance. Any other value
        falls back to the Euclidean ("l2") norm.

    Attributes (set by ``fit``)
    ---------------------------
    centroids : ndarray of shape (n_clusters, n_features)
    labels : ndarray of shape (n_samples,)
        Cluster index of every training point.
    sse : float
        Sum of squared point-to-assigned-centroid distances.

    Notes
    -----
    The centroid update always uses the mean, which minimises the squared
    Euclidean distance only. With "l1" or "infinity" the procedure is a
    heuristic and is not guaranteed to decrease ``sse`` monotonically.
    """

    # Public metric name -> `ord` argument of numpy.linalg.norm.
    _METRIC_ORDERS = {"l1": 1, "l2": 2, "infinity": np.inf}

    def __init__(self, n_clusters, max_iter=300, random_state=42, metric="l1"):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = np.random.RandomState(random_state)
        # Unrecognised metric names keep the historical fallback to L2.
        self.order = self._METRIC_ORDERS.get(metric, 2)

    @staticmethod
    def _as_matrix(X):
        """Return X as a 2-D float ndarray, rejecting any other rank."""
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                "X must be 2-dimensional (n_samples, n_features), got ndim=%d" % X.ndim
            )
        return X

    def _init_centroids(self, X):
        """Pick ``n_clusters`` distinct rows of X at random as starting centroids."""
        n_samples = X.shape[0]
        if not 1 <= self.n_clusters <= n_samples:
            raise ValueError(
                "n_clusters=%r must be between 1 and the number of samples (%d)"
                % (self.n_clusters, n_samples)
            )
        # A permutation guarantees that the chosen row indices are distinct.
        random_idx = self.random_state.permutation(n_samples)
        return X[random_idx[:self.n_clusters]]

    def _find_centroids(self, X, labels):
        """Recompute each centroid as the mean of the points labelled with it.

        A cluster that received no points keeps its previous centroid (taken
        from ``self.centroids`` when available). Averaging an empty slice
        would yield NaN, and because ``argmin`` selects NaN entries, a single
        NaN centroid would absorb every point on the next assignment step.
        """
        previous = getattr(self, "centroids", None)
        centroids = np.zeros((self.n_clusters, X.shape[1]))
        for k in range(self.n_clusters):
            members = X[labels == k]
            if len(members):
                centroids[k, :] = members.mean(axis=0)
            elif previous is not None:
                centroids[k, :] = previous[k]
            else:
                # No earlier position to fall back on: mark as undefined.
                centroids[k, :] = np.nan
        return centroids

    def _find_distance(self, X, centroids):
        """Return the (n_samples, n_clusters) matrix of squared distances."""
        distance = np.zeros((X.shape[0], self.n_clusters))
        for k in range(self.n_clusters):
            distance[:, k] = np.square(norm(X - centroids[k, :], axis=1, ord=self.order))
        return distance

    def _find_sse(self, X, labels, centroids):
        """Sum of squared distances from every point to its assigned centroid."""
        distance = np.zeros(X.shape[0])
        for k in range(self.n_clusters):
            members = labels == k
            distance[members] = norm(X[members] - centroids[k], axis=1, ord=self.order)
        return np.sum(np.square(distance))

    def fit(self, X):
        """Cluster X and store ``centroids``, ``labels`` and ``sse``.

        Iterates until the centroids stop changing or ``max_iter`` is reached.
        Returns ``self`` so calls can be chained.
        """
        X = self._as_matrix(X)
        self.centroids = self._init_centroids(X)
        for _ in range(self.max_iter):
            old_centroids = self.centroids
            distance = self._find_distance(X, old_centroids)
            self.labels = np.argmin(distance, axis=1)
            self.centroids = self._find_centroids(X, self.labels)
            # Identical assignments reproduce bit-identical means, so exact
            # equality is a valid convergence test here.
            if np.array_equal(old_centroids, self.centroids):
                break
        self.sse = self._find_sse(X, self.labels, self.centroids)
        return self

    def predict(self, X):
        """Return the index of the nearest fitted centroid for every row of X."""
        X = self._as_matrix(X)
        distance = self._find_distance(X, self.centroids)
        return np.argmin(distance, axis=1)

In [4]:
# Cluster four reference datasets and report how well the clusters line up
# with the ground-truth classes.
for load_dts, name_dts in [(dts.load_iris, "Iris plants dataset"),
                            (dts.load_digits, "Optical recognition of handwritten digits dataset"),
                            (dts.load_wine, "Wine recognition dataset"),
                            (dts.load_breast_cancer, "Breast cancer wisconsin (diagnostic) dataset")]:

    print(name_dts, ":", sep="")

    data = load_dts()  # dataset bundle exposing .data (features) and .target (classes)

    # Feature matrix only; the targets are used solely for scoring below
    X = data.data

    # Rescale every feature to the [0, 1] range (min-max normalisation)
    X = preprocessing.MinMaxScaler().fit_transform(X)
    df = pd.DataFrame(X, columns = range(0, len(X[0])))

    # One cluster per ground-truth class. This yields the values that used to
    # be hard-coded per dataset name (3 for iris/wine, 10 for digits, 2 for
    # breast cancer).
    k = len(np.unique(data.target))

    # Fit the model
    model = KMeansForClassters(n_clusters=k, max_iter=300, metric="l2")
    model.fit(X)

    clusters = model.predict(X)
    # print(model.sse)

    # Cluster indices are arbitrary, so relabel each cluster with the most
    # frequent true class among its members before measuring accuracy.
    labels = np.zeros_like(clusters)
    for i in range(k):
        mask = (clusters == i)
        if mask.any():  # an empty cluster has no majority class to assign
            # bincount(...).argmax() returns the smallest of the most frequent
            # values, matching the tie-breaking of scipy.stats.mode.
            labels[mask] = np.bincount(data.target[mask]).argmax()

    # Fraction of samples whose mapped cluster label equals the true class
    print("accuracy_score: ", float(np.mean(labels == data.target)))


Iris plants dataset:
accuracy_score:  0.8866666666666667
Optical recognition of handwritten digits dataset:
accuracy_score:  0.6499721758486366
Wine recognition dataset:
accuracy_score:  0.9438202247191011
Breast cancer wisconsin (diagnostic) dataset:
accuracy_score:  0.9279437609841827


In [5]:
def test():
    """Scratch helper: build a list of the items of the global ``X`` that pass a ``min > x`` filter.

    NOTE(review): as far as the visible code shows, the resulting list is
    neither stored nor returned.
    """
    # NOTE(review): `min` is not assigned in this function or anywhere in the
    # visible notebook, so it resolves to the builtin function `min`; comparing
    # a builtin function with a data value is presumably not what was intended
    # and is expected to raise TypeError once X is non-empty. Confirm the
    # intended threshold.
    # `X` is read from the notebook's global scope (last assigned in the
    # dataset loop of the previous cell).
    [x for x in X if min > x ]