In [14]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from scipy.spatial.distance import cdist
from scipy.linalg import norm

In [4]:
# Size of the synthetic dataset and the seed used to generate it reproducibly.
n_samples, seed = 500, 30

In [5]:
# Draw the synthetic blob dataset; the result is later unpacked as (X, y).
blobs = datasets.make_blobs(
    n_samples=n_samples,
    random_state=seed,
)

In [7]:
kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto").fit(X)

In [9]:
X, y = blobs
# normalize dataset for easier parameter selection
X = StandardScaler().fit_transform(X)

In [10]:
# Reference model: scikit-learn K-means with three clusters fitted on X.
kmeans = KMeans(
    n_clusters=3,
    random_state=0,
    n_init="auto",
).fit(X)

In [11]:
# Cluster index that the fitted scikit-learn model assigned to each sample.
kmeans.labels_

array([0, 1, 1, 0, 2, 0, 2, 0, 1, 2, 1, 1, 1, 2, 0, 2, 1, 1, 1, 2, 0, 2,
       1, 2, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 2, 2, 0, 1, 0, 1, 1, 2, 2, 2,
       1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 0, 2, 2, 1, 2, 1, 2, 0, 0, 0,
       1, 2, 0, 2, 2, 0, 2, 1, 1, 1, 0, 2, 0, 2, 1, 2, 0, 0, 1, 0, 1, 0,
       1, 0, 2, 0, 1, 2, 1, 1, 1, 0, 1, 0, 2, 2, 1, 1, 2, 2, 1, 1, 2, 1,
       1, 2, 0, 1, 2, 1, 0, 0, 2, 0, 0, 2, 2, 2, 0, 0, 0, 1, 1, 2, 2, 1,
       2, 1, 2, 0, 0, 2, 1, 0, 1, 1, 0, 2, 2, 0, 1, 1, 2, 2, 1, 0, 1, 0,
       2, 0, 0, 0, 1, 1, 2, 1, 1, 2, 2, 2, 1, 0, 1, 1, 2, 0, 0, 0, 2, 0,
       2, 0, 0, 2, 0, 2, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 1, 2, 2, 0, 2, 0,
       2, 0, 2, 2, 2, 0, 1, 0, 2, 2, 2, 0, 0, 1, 2, 1, 0, 0, 1, 0, 2, 1,
       0, 2, 1, 1, 2, 0, 1, 1, 0, 2, 1, 1, 2, 0, 1, 0, 2, 2, 1, 1, 2, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 2, 0, 2, 2, 2, 0, 2, 1, 1, 1, 2, 2, 2, 0,
       1, 1, 1, 2, 2, 1, 0, 1, 2, 2, 0, 2, 2, 2, 2, 2, 0, 1, 1, 2, 2, 0,
       1, 0, 0, 2, 1, 1, 1, 0, 2, 2, 0, 1, 1, 1, 1,

In [19]:
# Centre coordinates of each cluster found by the scikit-learn model (one row per cluster).
kmeans.cluster_centers_

array([[-0.72291674,  0.74936531],
       [ 1.34657245,  0.49510722],
       [-0.6142915 , -1.26563961]])

# K-means from scratch

In [41]:
class hKmeans:
    """Minimal K-means clustering (Lloyd's algorithm) written with NumPy/SciPy.

    After ``fit`` the following attributes are available:
        cluster_centers_: ndarray of shape (n_clusters, n_features)
        labels_: ndarray of shape (n_samples,), cluster index of each sample
        n_iter_: int, number of centre updates that were applied
    """

    def __init__(self, n_clusters, random_state, max_iter=300, tol=1e-4):
        """Args:
        n_clusters: int, number of clusters
        random_state: int, seed controlling the random initialization
        max_iter: int, maximum number of centre updates
        tol: float, convergence threshold on the Frobenius norm of the
            difference between the centres of two consecutive iterations
        """
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.max_iter = max_iter
        self.tol = tol

    def fit(self, X):
        """Cluster the rows of ``X`` and return their cluster indices.

        Args:
            X: array-like of shape (n_samples, n_features)

        Returns:
            ndarray of shape (n_samples,) holding, for every sample, the index
            of the nearest centre in ``self.cluster_centers_``.

        Raises:
            ValueError: if ``X`` is not 2-D, or if ``n_clusters`` is not
                between 1 and the number of samples.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        N = X.shape[0]
        if not 1 <= self.n_clusters <= N:
            raise ValueError(
                "n_clusters must be between 1 and the number of samples "
                "(got n_clusters=%r, n_samples=%d)" % (self.n_clusters, N)
            )

        # random initialization
        # A private generator yields the same stream as np.random.seed(seed)
        # without reseeding the process-wide RNG as a side effect.
        rng = np.random.RandomState(self.random_state)
        init = rng.choice(N, self.n_clusters)
        # The draw above samples with replacement (kept so results for a given
        # seed are unchanged). If it picked the same sample twice, two centres
        # would coincide and one cluster would stay empty, so redraw distinct
        # indices in that case.
        if np.unique(init).size < self.n_clusters:
            init = rng.choice(N, self.n_clusters, replace=False)

        # initial cluster coordinates
        centers = X[init, :]
        n_iter = 0
        while n_iter < self.max_iter:
            # assign every point to its nearest centre
            labels = np.argmin(cdist(X, centers, metric='euclidean'), axis=1)

            # new cluster centers; an empty cluster keeps its previous centre
            # instead of becoming NaN (mean of an empty slice), which would
            # otherwise poison every later distance computation
            new_centers = centers.copy()
            for idx in range(self.n_clusters):
                members = X[labels == idx, :]
                if members.shape[0] > 0:
                    new_centers[idx] = np.mean(members, axis=0)

            # exit condition: the centres moved less than tol (Frobenius norm)
            gap = norm(new_centers - centers, 'fro')
            if gap < self.tol:
                break

            n_iter += 1
            centers = new_centers

        self.cluster_centers_ = centers
        self.n_iter_ = n_iter
        # final assignment against the centres that are actually stored
        self.labels_ = np.argmin(cdist(X, centers, metric='euclidean'), axis=1)
        return self.labels_

In [42]:
# From-scratch model configured like the scikit-learn reference (3 clusters, seed 0).
hkmeans = hKmeans(
    n_clusters=3,
    random_state=0,
)

In [43]:
# hKmeans.fit returns the per-sample cluster indices (not the estimator itself).
clusters = hkmeans.fit(X)

In [44]:
# Cluster index assigned to each sample by the from-scratch implementation.
clusters

array([2, 0, 0, 2, 1, 2, 1, 2, 0, 1, 0, 0, 0, 1, 2, 1, 0, 0, 0, 1, 2, 1,
       0, 1, 0, 2, 2, 2, 0, 0, 0, 2, 2, 2, 1, 1, 2, 0, 2, 0, 0, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 2, 1, 1, 0, 1, 0, 1, 2, 2, 2,
       0, 1, 2, 1, 1, 2, 1, 0, 0, 0, 2, 1, 2, 1, 0, 1, 2, 2, 0, 2, 0, 2,
       0, 2, 1, 2, 0, 1, 0, 0, 0, 2, 0, 2, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 2, 0, 1, 0, 2, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 0, 0, 1, 1, 0,
       1, 0, 1, 2, 2, 1, 0, 2, 0, 0, 2, 1, 1, 2, 0, 0, 1, 1, 0, 2, 0, 2,
       1, 2, 2, 2, 0, 0, 1, 0, 0, 1, 1, 1, 0, 2, 0, 0, 1, 2, 2, 2, 1, 2,
       1, 2, 2, 1, 2, 1, 2, 0, 1, 2, 1, 2, 2, 2, 2, 2, 0, 1, 1, 2, 1, 2,
       1, 2, 1, 1, 1, 2, 0, 2, 1, 1, 1, 2, 2, 0, 1, 0, 2, 2, 0, 2, 1, 0,
       2, 1, 0, 0, 1, 2, 0, 0, 2, 1, 0, 0, 1, 2, 0, 2, 1, 1, 0, 0, 1, 2,
       0, 2, 2, 0, 0, 0, 2, 2, 1, 2, 1, 1, 1, 2, 1, 0, 0, 0, 1, 1, 1, 2,
       0, 0, 0, 1, 1, 0, 2, 0, 1, 1, 2, 1, 1, 1, 1, 1, 2, 0, 0, 1, 1, 2,
       0, 2, 2, 1, 0, 0, 0, 2, 1, 1, 2, 0, 0, 0, 0,

In [45]:
# Centres found by the from-scratch implementation (one row per cluster).
hkmeans.cluster_centers_

array([[ 1.34657245,  0.49510722],
       [-0.61290806, -1.27173436],
       [-0.72361238,  0.74339198]])