<a href="https://colab.research.google.com/github/foyez-hub/MeanKmeans/blob/main/kmeans_IN_Bigdata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import pandas as pd


Data load

In [None]:
# Load scikit-learn's bundled Iris dataset. Only the feature array
# (`iris.data`) is kept in `X`; every later cell clusters this array.
iris = load_iris()
X = iris.data


Elbow Method

In [None]:
class ElbowMethod:
    """Choose a K-means cluster count from the inertia curve.

    K-means is fitted once for every k in ``1..max_clusters`` and the
    resulting inertia values are stored in ``inertia_values`` (index 0
    corresponds to k=1).

    Args:
        max_clusters: Largest k to evaluate (inclusive).
        n_init: Forwarded to ``KMeans(n_init=...)``.
        random_state: Optional seed forwarded to ``KMeans`` so the curve is
            reproducible. ``None`` keeps scikit-learn's default behaviour.
    """

    def __init__(self, max_clusters, n_init, random_state=None):
        self.max_clusters = max_clusters
        self.n_init = n_init
        self.random_state = random_state
        self.inertia_values = []

    def calculate_inertia(self, data):
        """Fit K-means on *data* for each k and record the inertia of each fit."""
        self.inertia_values = []
        for k in range(1, self.max_clusters + 1):
            kmeans = KMeans(
                n_clusters=k,
                n_init=self.n_init,
                random_state=self.random_state,
            )
            kmeans.fit(data)
            self.inertia_values.append(kmeans.inertia_)

    def _require_inertia(self):
        """Raise ValueError if no inertia values have been computed yet."""
        if not self.inertia_values:
            raise ValueError(
                "No inertia values available; call calculate_inertia() first."
            )

    def plot_elbow_curve(self):
        """Plot inertia against k for the values computed so far.

        Raises:
            ValueError: If ``calculate_inertia`` has not produced any values.
        """
        self._require_inertia()
        # Derive the x-axis from the data actually stored so x and y always
        # have the same length.
        k_values = range(1, len(self.inertia_values) + 1)
        plt.plot(k_values, self.inertia_values, marker='o')
        plt.xlabel('Number of Clusters (k)')
        plt.ylabel('Inertia')
        plt.title('Elbow Method for Optimal k')
        plt.show()

    def find_best_k(self):
        """Return the k reached by the largest single drop in inertia.

        The drop between consecutive k values is computed and the k on the
        far side of the largest drop is returned; ties resolve to the
        smallest such k. With a single stored value (only k=1 evaluated)
        there is nothing to compare, so 1 is returned.

        Raises:
            ValueError: If ``calculate_inertia`` has not produced any values.
        """
        self._require_inertia()
        values = self.inertia_values
        if len(values) == 1:
            return 1
        drops = [before - after for before, after in zip(values, values[1:])]
        # drops[0] is the step from k=1 to k=2, hence the +2 offset.
        return drops.index(max(drops)) + 2



Call the elbow method on the Iris dataset





In [None]:

# Evaluate k = 1..10 on the Iris features (n_init=100 is forwarded to KMeans
# for every k), show the inertia curve, then report the k chosen by
# ElbowMethod.find_best_k().
elbow = ElbowMethod(max_clusters=10, n_init=100)
elbow.calculate_inertia(X)
elbow.plot_elbow_curve()
best_k = elbow.find_best_k()
print("Best k:", best_k)


Normal_Kmeans

In [None]:
class NormalKmeans:
    """Thin wrapper around scikit-learn's ``KMeans`` used as the baseline.

    Args:
        X: Data to cluster; ``visualize`` indexes it as a 2-D array.
        max_iters: Iteration cap for a single K-means run (``KMeans(max_iter=...)``).
        max_clusters: Largest k tried by the elbow heuristic when *k* is None.
        k: Number of clusters. ``None`` selects k with ``ElbowMethod``.
        n_init: Number of K-means restarts (``KMeans(n_init=...)``).

    After ``fit`` the attributes ``labels``, ``centers`` and ``iterations``
    hold the fitted model's ``labels_``, ``cluster_centers_`` and ``n_iter_``.
    """

    def __init__(self, X, max_iters, max_clusters, k=None, n_init=10):
        self.max_iters = max_iters
        self.n_init = n_init
        self.data = X

        if k is None:
            elbow = ElbowMethod(max_clusters=max_clusters, n_init=n_init)
            elbow.calculate_inertia(self.data)
            self.k = elbow.find_best_k()
        else:
            self.k = k

        # Populated by fit().
        self.kmeans = None
        self.labels = None
        self.centers = None
        self.iterations = None

    def fit(self):
        """Fit K-means on the stored data and cache labels, centers and n_iter_."""
        # max_iters bounds the iterations of each run; n_init is the separate
        # restart count. The two must not be conflated.
        self.kmeans = KMeans(
            n_clusters=self.k,
            max_iter=self.max_iters,
            n_init=self.n_init,
        )
        self.kmeans.fit(self.data)
        self.labels = self.kmeans.labels_
        self.centers = self.kmeans.cluster_centers_
        self.iterations = self.kmeans.n_iter_

    def visualize(self):
        """Scatter the first two columns coloured by cluster, with centers marked."""
        plt.scatter(self.data[:, 0], self.data[:, 1], c=self.labels, cmap='viridis')
        plt.scatter(self.centers[:, 0], self.centers[:, 1], marker='X', s=200, c='red')
        # NOTE(review): the axis labels are hard-coded and say "(scaled)", but
        # nothing in this class scales the data - confirm against the caller.
        plt.xlabel('Sepal Length (scaled)')
        plt.ylabel('Sepal Width (scaled)')
        plt.title('K-means Clustering')
        plt.show()

    def silhouette_score(self):
        """Return the mean silhouette coefficient of the fitted labelling."""
        # Inside a method the bare name resolves to the module-level sklearn
        # function, not to this method.
        return silhouette_score(self.data, self.labels)

    def get_iterations(self):
        """Return the iteration count recorded by fit() (None before fitting)."""
        return self.iterations

Calling normal Kmeans

In [None]:

# Baseline run: scikit-learn K-means on the Iris features with k fixed to 2
# (max_clusters is only consulted when k is None).
normalkmeans = NormalKmeans(X,max_iters=100,max_clusters=10,k=2)
normalkmeans.fit()
normalkmeans.visualize()

# Report clustering quality and the iteration count recorded by fit().
silhouette_avg = normalkmeans.silhouette_score()
print("Silhouette Score:", silhouette_avg)


print("Iterations:", normalkmeans.get_iterations())


The MeanKMeans class


In [None]:

class MeanKMeans:
    """K-means variant that assigns points against running cluster means.

    Within one pass the points are visited in order. Each point is assigned
    to the nearest "current mean": the mean of the points already assigned
    to that cluster during this pass, or the cluster's previous centroid if
    it has received no point yet. After the pass every centroid becomes the
    mean of its members, and passes repeat until the centroids stop changing
    or ``max_iters`` passes have run.

    Args:
        X: Data to cluster, one row per sample.
        max_iters: Maximum number of passes. It is also reused as the
            ``n_init`` of ``ElbowMethod`` when *k* is None.
        max_clusters: Largest k tried by the elbow heuristic when *k* is None.
        k: Number of clusters. ``None`` selects k with ``ElbowMethod``.
        random_state: Seed for the initial centroid draw.

    After ``fit`` the attributes ``centroids`` (k x d array), ``labels``
    (integer array, one entry per sample) and ``iterations`` are set.
    """

    def __init__(self, X, max_iters, max_clusters, k=None, random_state=42):
        self.X = X
        self.max_iters = max_iters
        self.random_state = random_state

        if k is None:
            elbow = ElbowMethod(max_clusters=max_clusters, n_init=max_iters)
            elbow.calculate_inertia(self.X)
            self.k = elbow.find_best_k()
        else:
            self.k = k

        # Populated by fit().
        self.centroids = None
        self.labels = None
        self.iterations = None

    def meanOf_K_Current_Clusters(self, labels, X, centroids):
        """Return one mean per cluster for a (possibly partial) labelling.

        Args:
            labels: Cluster index for each of the first ``len(labels)`` rows of X.
            X: Data rows; rows beyond ``len(labels)`` are ignored.
            centroids: Fallback value for clusters that have no member yet.

        Returns:
            A list with one entry per centroid: the mean of the cluster's
            members, or the corresponding centroid if the cluster is empty.
        """
        assigned = np.asarray(labels)
        points = np.asarray(X)[:len(assigned)]
        mean_list = []
        for i, centroid in enumerate(centroids):
            members = points[assigned == i]
            mean_list.append(members.mean(axis=0) if len(members) else centroid)
        return mean_list

    def _assign(self, data, centroids):
        """Run one assignment pass and return an integer label per row of data."""
        # Running sums and counts give the mean of the points assigned so far
        # without rescanning every earlier point for each new point.
        sums = np.zeros(centroids.shape, dtype=float)
        counts = np.zeros(len(centroids), dtype=int)
        labels = np.empty(len(data), dtype=int)
        for idx, point in enumerate(data):
            # Clusters with no member yet fall back to their previous centroid;
            # np.maximum avoids a 0/0 in rows that np.where discards anyway.
            means = np.where(
                (counts > 0)[:, None],
                sums / np.maximum(counts, 1)[:, None],
                centroids,
            )
            distances = np.sqrt(((point - means) ** 2).sum(axis=1))
            nearest = np.argmin(distances)
            labels[idx] = nearest
            sums[nearest] += point
            counts[nearest] += 1
        return labels

    def fit(self):
        """Cluster ``self.X`` and store centroids, labels and the pass count.

        Raises:
            ValueError: From numpy if ``k`` exceeds the number of samples.
        """
        data = np.asarray(self.X)
        # A private generator gives the same draw as seeding the global one
        # without disturbing other code that uses numpy's global RNG.
        rng = np.random.RandomState(self.random_state)
        centroids = data[rng.choice(data.shape[0], self.k, replace=False)]
        self.centroids = centroids

        iterations = 0
        for _ in range(self.max_iters):
            iterations += 1
            labels = self._assign(data, centroids)
            # With a complete labelling the per-cluster means are the new
            # centroids (empty clusters keep their old centroid).
            new_centroids = np.array(
                self.meanOf_K_Current_Clusters(labels, data, centroids)
            )
            converged = np.array_equal(centroids, new_centroids)
            centroids = new_centroids
            # Record the result before the convergence check so the attributes
            # exist even when the very first pass already converges.
            self.centroids = centroids
            self.labels = labels
            if converged:
                break
        self.iterations = iterations

    def visualize(self):
        """Scatter the first two columns coloured by cluster, with centroids marked."""
        plt.scatter(self.X[:, 0], self.X[:, 1], c=self.labels, cmap='viridis')
        plt.scatter(self.centroids[:, 0], self.centroids[:, 1], c='red', marker='x', s=100)
        plt.xlabel('Sepal Length')
        plt.ylabel('Sepal Width')
        plt.title('KMeans Clustering')
        plt.show()

    def silhouette_score(self):
        """Return the mean silhouette coefficient of the fitted labelling."""
        # Inside a method the bare name resolves to the module-level sklearn
        # function, not to this method.
        return silhouette_score(self.X, self.labels)

    def iterations_use(self):
        """Return the number of passes performed by fit() (None before fitting)."""
        return self.iterations










Apply MeanKMeans to the Iris dataset






In [None]:
# Cluster the full Iris feature matrix; k=None lets the elbow heuristic
# (evaluated up to max_clusters) choose the number of clusters.
meankmeans = MeanKMeans(X, max_iters=100, max_clusters=10, k=None)
meankmeans.fit()
meankmeans.visualize()
# The silhouette score is computed once, directly where it is printed.
print("silhouette score->", meankmeans.silhouette_score())
print("Iterations->", meankmeans.iterations_use())

NormalKmeans VS MeanKmeans

In [None]:
# Compare how many iterations each implementation needs for k = 2..10.
normalkmeans_iterations = []
Meankmeans_iterations = []
y = []  # cluster counts tried; used as the x-axis of the plot below
for i in range(2, 11):
    y.append(i)

    normalkmeans = NormalKmeans(X, max_iters=100, max_clusters=10, k=i)
    normalkmeans.fit()
    normalkmeans_iterations.append(normalkmeans.get_iterations())

    meankmeans = MeanKMeans(X, max_iters=100, max_clusters=10, k=i)
    meankmeans.fit()
    Meankmeans_iterations.append(meankmeans.iterations_use())


plt.plot(y, normalkmeans_iterations, label='NormalKMeans')
plt.plot(y, Meankmeans_iterations, label='MeanKMeans')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Iterations')
plt.title('Iteration Comparison between NormalKMeans and MeanKMeans')
plt.legend()
plt.grid(True)
plt.show()



Divide the Iris dataset into parts

In [None]:


# Split X into n_parts consecutive chunks keyed 'part_1' .. 'part_<n_parts>'.
n_parts = 5
# Size of the smallest chunk; when len(X) is not divisible by n_parts the
# leading chunks produced by np.array_split hold one extra row each.
samples_per_part = len(X) // n_parts

# np.array_split keeps every row, whereas fixed-size slicing would silently
# drop the trailing len(X) % n_parts samples.
X_divided = {
    f'part_{i + 1}': chunk
    for i, chunk in enumerate(np.array_split(X, n_parts))
}

for part, data in X_divided.items():
    print(f'{part}: Shape {data.shape}')


Apply MeanKMeans to each part

In [None]:
# Cluster every part separately. Iterating .values() yields the data arrays
# themselves, and each part - not the full X - is handed to MeanKMeans.
for data in X_divided.values():
    meankmeans = MeanKMeans(data, max_iters=100, max_clusters=10, k=None)
    meankmeans.fit()
    meankmeans.visualize()
    print("silhouette score of meankmeans ->", meankmeans.silhouette_score())
    print("Iterations of meankmeans->", meankmeans.iterations_use())
