In [1]:
# See Here: https://medium.com/@avijit.bhattacharjee1996/implementing-k-means-clustering-from-scratch-in-python-a277c23563ac

In [2]:
# Understanding the K-Means Algorithm
# The K-Means algorithm is simple yet effective. Its basic steps are:

# 1. Choose the number of clusters, k, that you want to create.
# 2. Initialize k cluster centroids randomly.
# 3. Assign each data point to the nearest centroid, creating k clusters.
# 4. Recalculate each centroid as the mean of all data points in its cluster.
# 5. Repeat steps 3 and 4 until convergence (centroids no longer change significantly) or for a specified number of iterations.

In [52]:
import numpy as np  
from sklearn.datasets import make_blobs

class KMeans:
    """Minimal K-Means clustering (Lloyd's algorithm) built on NumPy.

    Parameters
    ----------
    n_clusters : int
        Number of clusters (and centroids) to form.
    max_iters : int, default=100
        Upper bound on the number of assign/update iterations run by ``fit``.
    tol : float, default=0.0
        Absolute tolerance on per-coordinate centroid movement used to declare
        convergence. The default of 0.0 stops only when no centroid
        coordinate changes at all (exact equality).
    random_state : int or None, default=None
        Seed for the centroid initialisation. ``None`` draws from NumPy's
        global random state, so ``np.random.seed`` still controls it.

    Attributes
    ----------
    centroids : numpy.ndarray of shape (n_clusters, n_features)
        Centroid coordinates; set by ``fit``.
    """

    def __init__(self, n_clusters, max_iters=100, tol=0.0, random_state=None):
        """
        Initialize the KMeans instance with number of clusters and max iterations.
        """
        self.n_clusters = n_clusters
        self.max_iters = max_iters
        self.tol = tol
        self.random_state = random_state

    def fit(self, X):
        """
        Fit the KMeans model to the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to cluster.

        Raises
        ------
        ValueError
            Raised by NumPy's ``choice`` when ``n_clusters`` exceeds the
            number of rows in ``X`` (sampling is without replacement).
        """
        X = np.asarray(X)

        # Use a dedicated generator only when a seed was requested; otherwise
        # keep drawing from the global NumPy RNG as before.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.default_rng(self.random_state)

        # Randomly select distinct rows of X as the initial centroids
        seed_rows = rng.choice(X.shape[0], self.n_clusters, replace=False)
        self.centroids = X[seed_rows]

        for _ in range(self.max_iters):
            # Assign each data point to the nearest centroid
            labels = self._assign_labels(X)

            # Compute new centroids as the mean of assigned points
            new_centroids = self._update_centroids(X, labels)

            # With rtol=0 and atol=tol this is "every coordinate moved by at
            # most tol"; at tol=0.0 that is plain equality.
            converged = np.allclose(
                self.centroids, new_centroids, rtol=0.0, atol=self.tol
            )

            # Always keep the freshest centroids, then stop if they settled
            self.centroids = new_centroids
            if converged:
                break

    def predict(self, X):
        """
        Return the index of the nearest fitted centroid for each row of X.

        Requires ``fit`` to have been called first, since it reads
        ``self.centroids``.
        """
        return self._assign_labels(np.asarray(X))

    def _assign_labels(self, X):
        """
        Assign each data point to the closest centroid.

        Returns a 1-D integer array with one centroid index per row of X.
        """
        # Broadcasting (n_samples, 1, n_features) against
        # (n_clusters, n_features) yields every point-to-centroid difference;
        # the norm over the last axis is the Euclidean distance.
        # See: https://numpy.org/doc/2.2/reference/generated/numpy.linalg.norm.html
        distances = np.linalg.norm(X[:, np.newaxis] - self.centroids, axis=2)

        # Return the index (label) of the closest centroid for each data point
        return np.argmin(distances, axis=1)

    def _update_centroids(self, X, labels):
        """
        Calculate new centroids as the mean of all points assigned to each cluster.

        A cluster that received no points keeps its previous centroid.
        Averaging an empty slice would otherwise produce NaN, which poisons
        the distance matrix on the next assignment step.
        """
        # Start from a float copy of the current centroids so empty clusters
        # are carried over unchanged.
        new_centroids = np.array(self.centroids, dtype=float)
        for cluster in range(self.n_clusters):
            members = X[labels == cluster]
            if len(members) > 0:
                new_centroids[cluster] = members.mean(axis=0)
        return new_centroids

# Seed NumPy's global RNG so the random centroid initialisation inside
# KMeans.fit (np.random.choice) gives the same result on every
# Restart & Run All.
np.random.seed(42)

# Generate synthetic 2D data with 3 centers using sklearn's make_blobs.
# The ground-truth labels go to a real name rather than `_`, which IPython
# uses for the previous cell's output.
X, y_true = make_blobs(n_samples=300, centers=3, random_state=42)

# Create a KMeans instance with 3 clusters
kmeans = KMeans(n_clusters=3)

# Fit the model to the synthetic data
kmeans.fit(X)

# Assign every point in the dataset to its nearest fitted centroid
labels = kmeans._assign_labels(X)

# Print the resulting cluster labels and final centroid coordinates
# print("Cluster Assignments:", labels) 
# print("Final Centroids:", kmeans.centroids)

[[15.05706878 14.73334688 17.4763239 ]
 [15.04968649 14.7478382  17.12687466]
 [ 7.60630036  8.11477728  2.09026491]
 [ 0.91196179  1.34277884  9.80098548]
 [16.35378404 16.06066028 18.06696481]
 [ 1.87315653  1.65079934 11.43301758]
 [ 9.22080621  9.75209582  0.86682661]
 [ 1.6834525   1.28635349 11.37326493]
 [ 9.26203609  9.7627254   0.46367221]
 [10.35884613 10.86295881  0.6644068 ]
 [10.84157473 11.3826113   1.62191648]
 [ 1.72624566  1.28674696 11.42252682]
 [ 7.5872976   8.04949898  2.36530177]
 [ 9.81464231 10.2842531   0.98336712]
 [14.95749236 14.64798176 17.17155223]
 [ 9.32993803  9.82080176  0.560349  ]
 [15.37742987 15.10491356 16.93519409]
 [ 1.16740078  1.26888632  9.16025716]
 [ 9.71517119 10.25614609  1.07058866]
 [10.41442348 10.98375642  2.33228307]
 [ 7.66453011  8.19470413  2.11877296]
 [ 9.45836827  9.98170334  0.54644709]
 [ 1.97950049  1.76793594 11.51421081]
 [14.3776863  14.06284384 16.78408249]
 [ 9.06021194  9.55352212  0.73082112]
 [12.81801238 12.53907235