In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn import cluster
from sklearn import preprocessing
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import plotly.express as px
plt.style.use('dark_background')

In [17]:
class KMeansClustering:
    """K-means clustering (Lloyd's algorithm) over a 2-D NumPy array.

    Typical use::

        model = KMeansClustering(X, num_clusters, max_iterations)
        labels = model.fit(X)

    Attributes:
        K: number of clusters.
        max_iterations: upper bound on assignment/update rounds in ``fit``.
        num_samples, num_features: taken from the shape of the ``X`` passed
            to the constructor.
        centroids: ``(K, num_features)`` array set by ``fit``.
    """

    def __init__(self, X, num_clusters, max_iterations):
        """Record the cluster count, the iteration cap and the shape of ``X``.

        Args:
            X: array with a two-element ``.shape`` (samples, features).
            num_clusters: number of clusters ``K``.
            max_iterations: maximum number of refinement rounds.
        """
        self.K = num_clusters
        self.max_iterations = max_iterations
        self.num_samples, self.num_features = X.shape

    def initialize_random_centroids(self, X):
        """Pick ``K`` rows of ``X`` as starting centroids.

        Rows are drawn *without* replacement so the same row cannot be chosen
        twice (which would start two clusters on the same point and leave one
        of them empty). Only when ``K`` exceeds the number of samples does the
        draw fall back to sampling with replacement.

        Returns:
            Float array of shape ``(K, num_features)``.
        """
        chosen = np.random.choice(
            self.num_samples,
            size=self.K,
            replace=self.K > self.num_samples,
        )
        # Fancy indexing copies, so later centroid updates never touch X.
        return np.array(X[chosen], dtype=float)

    def create_clusters(self, X, centroids):
        """Assign every row of ``X`` to its nearest centroid.

        Returns:
            A list of ``K`` lists; entry ``k`` holds, in ascending order, the
            row indices of ``X`` whose closest centroid is ``k``.
        """
        points = np.asarray(X, dtype=float)
        centres = np.asarray(centroids, dtype=float)
        # Squared Euclidean distance of every point to every centroid; the
        # square root is monotonic, so it is not needed for the argmin.
        sq_dist = ((points[:, None, :] - centres[None, :, :]) ** 2).sum(axis=2)
        nearest = sq_dist.argmin(axis=1)
        return [np.flatnonzero(nearest == k).tolist() for k in range(self.K)]

    def find_new_centroids(self, X, clusters, prev_centroids=None):
        """Recompute each centroid as the mean of its cluster's members.

        An empty cluster has no mean (it would be NaN), so its centroid is
        kept at its previous position when ``prev_centroids`` is supplied and
        otherwise placed at the mean of all rows of ``X``.

        Args:
            X: data array.
            clusters: list of ``K`` index lists as built by ``create_clusters``.
            prev_centroids: optional ``(K, num_features)`` array used as the
                fallback position for empty clusters.

        Returns:
            Float array of shape ``(K, num_features)``.
        """
        if prev_centroids is None:
            centroids = np.tile(np.mean(X, axis=0), (self.K, 1)).astype(float)
        else:
            centroids = np.array(prev_centroids, dtype=float)  # copy
        for idx, members in enumerate(clusters):
            if len(members) > 0:
                centroids[idx] = np.mean(X[members], axis=0)
        return centroids

    def predict(self, X, clusters):
        """Flatten ``clusters`` into a per-sample label array.

        Returns:
            Float array of length ``num_samples`` where position ``i`` holds
            the index of the cluster containing sample ``i``.
        """
        y_pred = np.zeros(self.num_samples)
        for idx, members in enumerate(clusters):
            if len(members) > 0:
                y_pred[members] = idx
        return y_pred

    def plot_fig(self, X, y):
        """Show a Plotly scatter of the first two features coloured by ``y``."""
        # Keywords are required: px.scatter's first positional parameter is
        # ``data_frame``, not ``x``.
        fig = px.scatter(x=X[:, 0], y=X[:, 1], color=y)
        fig.show()

    def fit(self, X, plot=True):
        """Run k-means on ``X`` and return the cluster label of every sample.

        Alternates assignment and centroid update until the centroids stop
        changing or ``max_iterations`` rounds have run. The final centroids
        are stored on ``self.centroids``.

        Args:
            X: data array of shape ``(num_samples, num_features)``.
            plot: when true (the default) the result is also drawn with
                ``plot_fig``.

        Returns:
            Float array of length ``num_samples`` with cluster indices.
        """
        centroids = self.initialize_random_centroids(X)
        for _ in range(self.max_iterations):
            clusters = self.create_clusters(X, centroids)
            updated = self.find_new_centroids(X, clusters, centroids)
            converged = np.array_equal(updated, centroids)
            centroids = updated
            if converged:
                break
        # One last assignment so the labels match the final centroids; this
        # also covers max_iterations == 0, where the loop body never runs.
        clusters = self.create_clusters(X, centroids)
        self.centroids = centroids
        y_pred = self.predict(X, clusters)
        if plot:
            self.plot_fig(X, y_pred)
        return y_pred

In [18]:
# Seed NumPy's global RNG first: the centroid initialisation in
# KMeansClustering draws from np.random. make_blobs is given no random_state
# here, so it presumably relies on this seed as well -- verify.
np.random.seed(10)
num_clusters = 3      # passed both as make_blobs `centers` and as K for the model
max_iterations = 100  # cap on k-means refinement rounds
# Only the feature matrix is kept; the second return value is discarded.
X, _ = make_blobs(n_samples=1000, n_features=2, centers=num_clusters) # create dataset using make_blobs from sklearn datasets
Kmeans = KMeansClustering(X, num_clusters, max_iterations)
# fit() also draws the scatter plot; y_pred holds whatever fit() returns.
y_pred = Kmeans.fit(X)