In [2]:
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

import numpy as np
class MyKmeans:
    """K-means-style clustering that assigns points by cosine similarity.

    Every point is assigned to the centroid it is most cosine-similar to, and
    each centroid is then moved to the arithmetic mean of its members.  Once
    clustering finishes, each cluster is tagged with the majority label of the
    training points it contains, so the model can act as a simple classifier.

    Attributes:
        k: Number of clusters.
        max_iters: Upper bound on assignment/update rounds performed by
            ``fit``.
        centroids: Array with one row per cluster once fitted, else ``None``.
        cluster_assignments: Cluster index for each row of the data most
            recently passed to ``fit`` or ``predict``, else ``None``.
        cluster_labels: List of length ``k`` holding each cluster's majority
            label (``None`` for a cluster with no training points), or
            ``None`` before ``fit``.
    """

    def __init__(self, k, max_iters=100):
        self.k = k
        self.max_iters = max_iters
        self.centroids = None
        self.cluster_assignments = None
        self.cluster_labels = None

    def _initialize_centroids(self, X):
        """Pick ``k`` distinct rows of ``X`` as the starting centroids.

        A private, fixed-seed generator keeps the selection reproducible
        (identical to seeding the legacy global generator with 42) without
        resetting the process-wide NumPy random state as a side effect.
        """
        rng = np.random.RandomState(42)
        indices = rng.choice(len(X), self.k, replace=False)
        self.centroids = X[indices]

    def _compute_similarities(self, X):
        """Return the cosine similarity of every row of ``X`` to every centroid."""
        return cosine_similarity(X, self.centroids)

    def _assign_clusters(self, X):
        """Return, for each row of ``X``, the index of its most similar centroid."""
        return np.argmax(self._compute_similarities(X), axis=1)

    def _updated_centroids(self, X):
        """Return centroids recomputed as the mean of each cluster's members.

        A cluster that currently has no members keeps its previous centroid;
        averaging an empty selection would yield NaN, which breaks both the
        similarity computation and the convergence test.
        """
        updated = np.array(self.centroids, dtype=float)  # always a fresh copy
        for i in range(self.k):
            members = X[self.cluster_assignments == i]
            if len(members):
                updated[i] = members.mean(axis=0)
        return updated

    def fit(self, X, labels):
        """Cluster ``X`` and tag each cluster with its majority label.

        Args:
            X: 2-D array-like of samples, one row per sample.
            labels: Sequence with one ground-truth label per row of ``X``.

        Raises:
            ValueError: If ``k`` exceeds the number of rows in ``X`` (raised
                by the sampling-without-replacement call).
        """
        X = np.asarray(X)
        labels = np.asarray(labels)

        self._initialize_centroids(X)

        for _ in range(self.max_iters):
            self.cluster_assignments = self._assign_clusters(X)
            new_centroids = self._updated_centroids(X)

            # Identical centroids mean the assignments have reached a fixed
            # point; the stored assignments already match them.
            if np.array_equal(self.centroids, new_centroids):
                break

            self.centroids = new_centroids
        else:
            # Iteration budget exhausted (or zero): the centroids moved after
            # the last assignment pass, so refresh the assignments to keep
            # them, and the labels derived below, consistent with the model.
            self.cluster_assignments = self._assign_clusters(X)

        self.cluster_labels = [None] * self.k
        for i in range(self.k):
            member_labels = labels[self.cluster_assignments == i]
            majority = Counter(member_labels).most_common(1)
            if majority:
                self.cluster_labels[i] = majority[0][0]

    def predict(self, X_new):
        """Return the majority label of the nearest cluster for each sample.

        The per-sample cluster indices are also stored on
        ``self.cluster_assignments``, replacing those of any earlier call.

        Args:
            X_new: 2-D array-like of samples, one row per sample.

        Returns:
            List of labels converted with ``str``; a sample that lands in a
            cluster without a majority label yields ``"None"``.
        """
        self.cluster_assignments = self._assign_clusters(X_new)
        return [str(self.cluster_labels[i]) for i in self.cluster_assignments]

    def get_inertia(self, X):
        """Return the summed squared Euclidean distance of ``X`` to its centroids.

        Cluster membership is derived from ``X`` itself using the fitted
        centroids, so the result does not depend on which data was last passed
        to ``fit`` or ``predict``, and ``self.cluster_assignments`` is left
        untouched.

        Args:
            X: 2-D array-like of samples, one row per sample.

        Returns:
            The inertia as a float (``0.0`` when ``X`` has no rows).
        """
        X = np.asarray(X, dtype=float)
        if X.shape[0] == 0:
            return 0.0
        assignments = self._assign_clusters(X)
        offsets = X - self.centroids[assignments]
        return np.sum(offsets ** 2)