In [2]:
import os
import numpy as np
import pandas as pd

In [3]:
from time import time

In [4]:
os.chdir('..')

In [5]:
from miscellaneous import centering
from clustering import Kmeans, RandomSwap
from experiment import AlgorithmPipeline

In [6]:
from generation import generate_set

In [7]:
%%time
X, Y = generate_set(1000, 10, 5, 100, 0.6)

Wall time: 0 ns


In [8]:
%%time
km_pipe = AlgorithmPipeline(data = centering(X),
                            algorithm=Kmeans)

Wall time: 0 ns


In [10]:
%%time
L = km_pipe.run(k_range = np.arange(2,31),
                exec_number=30,
                max_iter = 100)

Wall time: 11.5 s


In [11]:
from metrics import bss_matrix, wss_matrix
%time SSW = wss_matrix(centering(X), L, 'conventional')
%time SSB = bss_matrix(centering(X), L, 'conventional')

Wall time: 611 ms
Wall time: 956 ms


In [32]:
from metrics import calinski_harabasz_matrix, elbow, hartigan, wb_index_matrix, xu_index_matrix, silhouette_matrix, silhouette

In [34]:
def silhouette_wss(X: np.ndarray, L: np.ndarray, SSW: np.ndarray, method: str = 'mean', aggregation=np.argmin):
    assert aggregation is not None
    chosen_partitions = L[(np.arange(0, L.shape[0]), aggregation(SSW, axis=1))]
    silhouettes = np.zeros((L.shape[0],))
    for i in range(L.shape[0]):
        silhouettes[i] = silhouette(X, chosen_partitions[i])
    return silhouettes

In [36]:
%time silhouette_wss(X, L, SSW)

Wall time: 8.25 s


array([0.49729098, 0.58480058, 0.66156021, 0.72061407, 0.39431064,
       0.38957412, 0.24722469, 0.31098415, 0.26112217, 0.16465123,
       0.10697926, 0.10274163, 0.10470095, 0.10370832, 0.17426479,
       0.09687367, 0.09763388, 0.09293794, 0.09550588, 0.09495478,
       0.09347024, 0.09479708, 0.090288  , 0.08933027, 0.09782478,
       0.08957662, 0.09480254, 0.09866428, 0.09613704])

In [61]:
from scipy.spatial.distance import cdist

def silhouette(X: np.ndarray, Y: np.ndarray = None, centers: np.ndarray = None,
               result: str = 'mean'):
    if Y is None:
        assert centers is not None
        labels = cdist(X, centers).argmin(axis=1)
    elif centers is None:
        assert (Y is not None) & (Y.shape[0] == X.shape[0])
        labels = Y.copy()
    else:
        print("Error, no partition passed")
        return None

    # intermediate step
    bool_matrix = np.tile(labels, (labels.shape[0], 1)) == np.tile(labels, (labels.shape[0], 1)).T
    np.fill_diagonal(bool_matrix, False)
    distances = cdist(X, X)
    unique, count = np.unique(labels, return_counts=True)
    # array of a(i)
    a = np.sum(distances * bool_matrix, axis=1) / np.sum(bool_matrix, axis=1)
    # array of b(i)
    b = np.zeros(labels.shape[0])
    for obs in range(labels.shape[0]):
        distance = np.zeros(unique.shape[0])
        for label in range(unique.shape[0]):
            distance[label] = np.sum(distances[obs, np.where(labels == unique[label])], axis=1) / count[label]
        distance[int(labels[obs])] = np.inf
        b[obs] = np.min(distance)

    return np.nan_to_num((b - a) / np.max([a, b], axis=0))

def silhouette_axis(Y: np.ndarray, X: np.ndarray, method: str = 'mean'):
    return silhouette(X=X, Y=Y, centers=None, result=method)


def silhouette_matrix(X: np.ndarray, L: np.ndarray, method: str = 'mean', aggregation=None):
    silhouettes = np.zeros((L.shape[0], L.shape[1]))
    for i in range(L.shape[0]):
        silhouettes[i] = np.apply_along_axis(silhouette_axis, 1, L[i], X, method).reshape(-1)
    if aggregation is None:
        return silhouettes
    else:
        return aggregation(silhouettes, axis=1)

In [43]:
silhouettes = np.zeros((L.shape[0], L.shape[1]))
for i in range(L.shape[0]):
    %time res = np.apply_along_axis(silhouette_axis, 1, L[i], X, 'mean').reshape(-1)
    print(res)
    silhouettes[i] = res

Wall time: 1.82 s
[0.49729098 0.49729098 0.49729098 0.49729098 0.49729098 0.49729098
 0.49729098 0.40247204 0.49729098 0.49729098 0.49729098 0.49729098
 0.49729098 0.49729098 0.49729098 0.49729098 0.49729098 0.40247204
 0.49729098 0.49729098 0.49729098 0.49729098 0.49729098 0.49729098
 0.49729098 0.49729098 0.40247204 0.49729098 0.49729098 0.49729098]
Wall time: 2.23 s
[0.58480058 0.55004296 0.58480058 0.55004296 0.58480058 0.58480058
 0.58480058 0.55004296 0.52758758 0.55004296 0.55004296 0.52758758
 0.57300023 0.58480058 0.55004296 0.52758758 0.58480058 0.55004296
 0.52758758 0.55004296 0.52758758 0.52758758 0.55004296 0.52758758
 0.55004296 0.58480058 0.58480058 0.52758758 0.55004296 0.58480058]
Wall time: 2.64 s
[0.66156021 0.66156021 0.64426733 0.59170932 0.64426733 0.59170932
 0.66156021 0.59170932 0.66156021 0.64426733 0.66156021 0.59170932
 0.66156021 0.64426733 0.64426733 0.59170932 0.64426733 0.66156021
 0.64426733 0.64426733 0.59170932 0.64426733 0.64426733 0.66156021
 0.644

  a = np.sum(distances * bool_matrix, axis=1) / np.sum(bool_matrix, axis=1)


Wall time: 8.88 s
[0.09836578 0.09416901 0.09329862 0.09000824 0.09550019 0.09293794
 0.09094661 0.09578626 0.09724312 0.09289456 0.09783043        nan
 0.0921336  0.09550187 0.09662656 0.09021766 0.08853723 0.09168554
 0.08984439 0.09488306 0.09250643 0.09365933 0.09868817 0.09295382
 0.09388518 0.09472891 0.0910891  0.09521963 0.09036701 0.09133763]
Wall time: 9.37 s
[0.09181355 0.09662495 0.0958886  0.09336816 0.09444552 0.09574704
 0.09535505 0.09453307 0.0997734  0.0949264  0.09608895 0.09079186
 0.09314967 0.09046047 0.09097189 0.09666019 0.09259317 0.09271152
 0.0988557  0.09000433 0.09687849 0.0957567  0.09750196 0.09416739
 0.09096434 0.09550588 0.09535505 0.09143983 0.09667431 0.09140474]
Wall time: 9.66 s
[0.0971403  0.08872195 0.09271581 0.09894296 0.09208815 0.09258096
 0.10527905 0.09534812 0.09106757 0.09547338 0.08594181 0.08906896
 0.09578564 0.09216463 0.09654279 0.09086943 0.09053464 0.09473276
 0.09210248 0.09954933 0.09052043 0.09247623 0.09890372 0.08780478
 0.084

In [63]:
silhouette(X, L[-1,-5,:]).mean()

  a = np.sum(distances * bool_matrix, axis=1) / np.sum(bool_matrix, axis=1)


0.09205880909846935