In [1]:
import numpy as np
import pandas as pd

# Clustering

- Algorithm Introduction
    - [wiki](https://en.wikipedia.org/wiki/Cluster_analysis)
    - [scikit-learn documentation](https://scikit-learn.org/stable/modules/clustering.html)
- [Clustering Evaluation and Assessment](https://blog.csdn.net/darkrabbit/article/details/80378597)
    - External Evaluation (Compare with ground truth)
        - [Rand Index](https://en.wikipedia.org/wiki/Rand_index) (RI) & [Adjusted Rand Index](https://en.wikipedia.org/wiki/Rand_index#Adjusted_Rand_index) (ARI)
        - [Mutual Information](https://blog.csdn.net/bbbeoy/article/details/72571890) (MI) & [Adjusted Mutual Information](https://en.wikipedia.org/wiki/Adjusted_mutual_information) (AMI)
        - Homogeneity, Completeness, V-measure
    - Internal Evaluation
        - [Silhouette coefficient](https://en.wikipedia.org/wiki/Silhouette_(clustering)) [ˌsɪluˈet]
            - a(i): for point i, the mean distance to the other points in the same cluster.
            - b(i): for point i, the smallest mean distance to the points of any other cluster (i.e. the mean distance to the nearest neighbouring cluster).
            - s(i) = [b(i) - a(i)] / max{a(i), b(i)}, -1 <= s(i) <= 1

# K-Means 

- [scikit-learn demo](https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py)

In [46]:
from sklearn.datasets import load_iris
from sklearn.metrics import silhouette_score, adjusted_rand_score, adjusted_mutual_info_score

from k_means import KMeans 

In [50]:
# Load the Iris data set. return_X_y=True makes load_iris hand back the
# (data, target) pair directly, unpacked here as the feature matrix X and the
# ground-truth class labels y (y is used only for the external metrics below).
X, y = load_iris(return_X_y=True)

In [68]:
# Cluster the Iris features with the project-local KMeans (imported from k_means).
# k=3 presumably matches the number of classes in y (Iris has three species) — confirm.
# NOTE(review): early_stop=10000 looks like an iteration cap or stopping threshold;
# its exact meaning is defined by k_means.KMeans.fit — verify there.
# NOTE(review): no random seed is set anywhere in this notebook; if KMeans picks its
# initial centroids randomly, the scores reported below will vary between runs.
k_means = KMeans()
k_means.fit(X, k=3, early_stop=10000)

In [75]:
# Score the fitted clustering.
# - silhouette_score is an internal measure: it needs only X and the cluster labels.
# - adjusted_rand_score / adjusted_mutual_info_score are external measures that
#   compare the cluster labels against the ground truth y. Both are symmetric in
#   their two arguments, so the (predicted, true) order used here does not affect
#   the result.
metric_report = (
    ('silhouette_score:', silhouette_score(X, k_means.labels, metric='euclidean')),
    ('adjusted_rand_score:', adjusted_rand_score(k_means.labels, y)),
    ('adjusted_mutual_info_score:', adjusted_mutual_info_score(k_means.labels, y)),
)
for label, value in metric_report:
    print(label, value)

silhouette_score: 0.5099855375938831
adjusted_rand_score: 0.6358670046140765
adjusted_mutual_info_score: 0.6726237400611907
