<a href="https://colab.research.google.com/github/farahelmashad/ml-from-scratch/blob/main/KMeans/KMeans_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# KMeans From Scratch

In [2]:
import numpy as np
import matplotlib.pyplot as plt


In [7]:
class KMeans:
  """K-means clustering (Lloyd's algorithm) with random-sample initialisation.

  Parameters
  ----------
  K : int
      Number of clusters.
  max_iter : int
      Upper bound on the number of assign/update iterations.
  tol : float
      Convergence threshold: iteration stops once the summed Euclidean
      movement of all centroids between two iterations is <= ``tol``.

  Attributes
  ----------
  clusters : list of lists of sample indices, one inner list per cluster.
  centroids : array of shape (K, n_features) after ``predict`` has run.
  """

  def __init__(self, K=5, max_iter=100, tol=1e-3):
    self.K = K
    self.max_iter = max_iter
    self.tol = tol

    # One list of sample indices per cluster.
    self.clusters = [[] for _ in range(self.K)]

    # Filled in by predict(); empty until then.
    self.centroids = []

  def predict(self, X):
    """Cluster the rows of ``X`` and return one integer label per row.

    ``X`` must be 2-D (n_samples, n_features); anything ``np.asarray``
    accepts is allowed. Initial centroids are ``K`` distinct rows drawn with
    ``np.random.choice``, so results depend on NumPy's global random state.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, or if ``K`` exceeds the number of samples
        (raised by ``np.random.choice`` with ``replace=False``).
    """
    X = np.asarray(X)
    self.X = X
    self.n_samples, self.n_features = X.shape

    # Initialisation: pick K distinct samples as the starting centroids.
    random_sample_idxs = np.random.choice(self.n_samples, self.K, replace=False)
    self.centroids = np.array(X[random_sample_idxs], dtype=float)

    for _ in range(self.max_iter):
      # Assign every sample to its closest centroid (builds the clusters).
      self.clusters = self._create_clusters(self.centroids)

      # Move each centroid to the mean of its cluster.
      centroids_old = self.centroids
      self.centroids = self._get_centroids(self.clusters)

      if self._is_converged(centroids_old, self.centroids):
        break

    return self._get_cluster_labels(self.clusters)

  def _get_cluster_labels(self, clusters):
    """Return an int array where entry ``i`` is the cluster index of sample ``i``."""
    # -1 marks "unassigned"; every index present in `clusters` is overwritten.
    labels = np.full(self.n_samples, -1, dtype=int)
    for cluster_idx, cluster in enumerate(clusters):
      for sample_idx in cluster:
        labels[sample_idx] = cluster_idx
    return labels

  def _create_clusters(self, centroids):
    """Group sample indices of ``self.X`` by their nearest centroid."""
    clusters = [[] for _ in range(self.K)]
    for idx, sample in enumerate(self.X):
      centroid_idx = self._closest_centroid(sample, centroids)
      clusters[centroid_idx].append(idx)
    return clusters

  def _closest_centroid(self, sample, centroids):
    """Return the index of the centroid nearest to ``sample`` (first on ties)."""
    distances = [self._euclidean_distance(sample, point) for point in centroids]
    return np.argmin(distances)

  def _euclidean_distance(self, x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``."""
    return np.sqrt(np.sum((x1 - x2) ** 2))

  def _get_centroids(self, clusters):
    """Return a (K, n_features) array holding the mean of each cluster.

    A cluster with no members keeps its previous centroid from
    ``self.centroids`` when one is available (otherwise it stays at zero),
    instead of becoming NaN from the mean of an empty selection.
    """
    centroids = np.zeros((self.K, self.n_features))
    previous = self.centroids
    for cluster_idx, cluster in enumerate(clusters):
      if len(cluster) > 0:
        centroids[cluster_idx] = np.mean(self.X[cluster], axis=0)
      elif len(previous) > cluster_idx:
        centroids[cluster_idx] = previous[cluster_idx]
    return centroids

  def _is_converged(self, centroids, old_centroids):
    """Return True when total centroid movement is within ``self.tol``."""
    distances = [
      self._euclidean_distance(centroids[i], old_centroids[i])
      for i in range(self.K)
    ]
    return bool(sum(distances) <= self.tol)








