# K-Means from scratch using numpy

Unsupervised learning method used for clustering of data.  

$\text{Centroid}_i = \dfrac{1}{n_i} \sum_{X_j \in \text{Cluster}_i} X_j$  
$X_j$ represents the coordinates of a data point and $n_i$ is the number of points in $\text{Cluster}_i$.  

Steps:
 - Initialize Centroids randomly
 - Assign each data point to the cluster of its nearest centroid
 - Update centroids as per cluster data points
 - Repeat the last 2 steps until the centroids converge

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def euclideanDistance(x1, x2):
  """Return the Euclidean (L2) distance between two points.

  Args:
    x1: Coordinates of the first point (array-like of numbers).
    x2: Coordinates of the second point, same shape as ``x1``.

  Returns:
    ``sqrt(sum((x1 - x2) ** 2))`` as a NumPy float.
  """
  # Cast to float first so unsigned-integer inputs cannot wrap around when
  # subtracted, and so plain Python sequences are accepted too.
  diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
  # Square each component *before* summing; squaring the sum instead lets
  # positive and negative differences cancel out.
  return np.sqrt(np.sum(diff ** 2))

In [None]:
class KMeans:
  """K-Means clustering implemented with NumPy.

  The algorithm picks ``k`` samples as initial centroids, then alternates
  between assigning every sample to its nearest centroid and moving each
  centroid to the mean of its assigned samples, until the centroids stop
  moving or ``maxIters`` iterations have run.

  Attributes set by ``predict``:
    X: the data passed to ``predict`` as an ndarray.
    nSamples, nFeatures: the two dimensions of ``X``.
    clusters: list of ``k`` lists; ``clusters[i]`` holds the row indices of
      the samples assigned to centroid ``i`` in the last assignment pass.
    centroids: float ndarray of shape ``(k, nFeatures)``.
  """

  def __init__(self, k=5, maxIters=100, plotSteps=False):
    """Store the configuration.

    Args:
      k: Number of clusters.
      maxIters: Upper bound on assignment/update iterations.
      plotSteps: When true, draw a scatter plot after every assignment pass.
    """
    self.k = k
    self.maxIters = maxIters
    self.plotSteps = plotSteps

    self.clusters = [[] for _ in range(k)]
    self.centroids = []

  def predict(self, X):
    """Cluster ``X`` and return one label per sample.

    Args:
      X: 2-D array-like of shape ``(nSamples, nFeatures)``.

    Returns:
      Float ndarray of length ``nSamples``; entry ``i`` is the index of the
      cluster sample ``i`` was assigned to in the last assignment pass.

    Raises:
      ValueError: If ``X`` is not 2-D, or ``k`` exceeds the number of
        samples (raised by the sampling call).
    """
    self.X = np.asarray(X)
    if self.X.ndim != 2:
      raise ValueError("X must be a 2-D array of shape (nSamples, nFeatures)")
    self.nSamples, self.nFeatures = self.X.shape

    # A private, fixed-seed generator keeps the initialisation reproducible
    # without reseeding NumPy's global random state for the caller.
    rng = np.random.RandomState(5)
    sampleIdxs = rng.choice(self.nSamples, self.k, replace=False)
    # Float copy, so integer input data does not truncate the centroid means.
    self.centroids = np.array([self.X[idx] for idx in sampleIdxs], dtype=float)

    oldCentroids = None
    for _ in range(self.maxIters):
      self._createClusters()

      if self.plotSteps:
        self._plotSteps(oldCentroids)

      oldCentroids = np.copy(self.centroids)
      self._getCentroids()

      if self._isConverged(oldCentroids):
        break

    return self._getClusterLabels()

  def _createClusters(self):
    """Assign every sample index to the cluster of its nearest centroid."""
    # Start from empty clusters on every pass; otherwise assignments from
    # earlier iterations (or earlier predict calls) would accumulate.
    self.clusters = [[] for _ in range(self.k)]
    for idx, sample in enumerate(self.X):
      self.clusters[self._closestCentroid(sample)].append(idx)

  def _closestCentroid(self, sample):
    """Return the index of the centroid closest (L2) to ``sample``."""
    centroids = np.asarray(self.centroids, dtype=float)
    distances = np.linalg.norm(centroids - sample, axis=1)
    return int(np.argmin(distances))

  def _getCentroids(self):
    """Move each centroid to the mean of the samples assigned to it."""
    # np.array copies, so the previous centroids stay untouched for callers
    # that still hold a reference to them.
    updated = np.array(self.centroids, dtype=float)
    for idx, cluster in enumerate(self.clusters):
      # An empty cluster has no mean (it would be NaN); keep its centroid.
      if cluster:
        updated[idx] = np.mean(self.X[cluster], axis=0)
    self.centroids = updated

  def _isConverged(self, oldCentroids):
    """Return True when no centroid moved more than the tolerance."""
    shifts = np.linalg.norm(
        np.asarray(oldCentroids, dtype=float)
        - np.asarray(self.centroids, dtype=float),
        axis=1)
    # 10e-3 == 0.01: absolute tolerance on each centroid's displacement.
    return bool(np.allclose(shifts, 0, atol=10e-3))

  def _getClusterLabels(self):
    """Flatten ``self.clusters`` into a per-sample array of cluster indices."""
    labels = np.empty(self.nSamples)
    for idx, cluster in enumerate(self.clusters):
      for sampleIdx in cluster:
        labels[sampleIdx] = idx

    return labels

  def _plotSteps(self, oldCentroids=None):
    """Scatter-plot the current clusters, old centroids and new centroids.

    Args:
      oldCentroids: Centroids of the previous iteration, drawn as lime '+'
        markers when given. Current centroids are drawn as black 'x'.
    """
    fig, ax = plt.subplots()

    for cluster in self.clusters:
      # Transpose so each feature becomes one positional argument of scatter.
      point = self.X[cluster].T
      ax.scatter(*point)

    if oldCentroids is not None:
      for point in oldCentroids:
        ax.scatter(*point, marker='+', color="lime", linewidth=2)

    for point in self.centroids:
      ax.scatter(*point, marker='x', color="black", linewidth=2)

    plt.show()
    plt.close(fig)

In [None]:
from sklearn import datasets

# Build a reproducible toy data set: 500 two-dimensional points drawn around
# 3 blob centres. `y` holds the generating blob of each point.
X, y = datasets.make_blobs(
    n_samples=500,
    n_features=2,
    centers=3,
    shuffle=True,
    random_state=40,
)

# Cluster the points into 3 groups, plotting every iteration (at most 20).
classifier = KMeans(
    k=3,
    maxIters=20,
    plotSteps=True,
)
predictions = classifier.predict(X)