# K-means Clustering

In [33]:
import numpy as np
class Kmeans():
    def __init__(self, num_iter = 1000, k = 5, tol = 1e-4):
        self.num_iter = num_iter
        self.k = k
        self.tol = tol
        self.centroids = None
        self.clusters = None
        
    def fit(self, X):
        #Here, X is a (nxd) dimensioned matrix, where n is the number of points and d being the dimension
        (n,d) = X.shape
        centroid_indices = np.random.choice(n, self.k, replace=False)
        centroids = X[centroid_indices]
        new_centroids = np.copy(centroids)
        clusters = [None]*n
        
        for z in range(self.num_iter):
            for i in range(n):
                distances = np.array([np.linalg.norm(X[i]-centroids[j]) for j in range(self.k)])
                cluster = np.argmin(distances)
                clusters[i] = cluster
                
            for j in range(self.k):
                cluster_points = X[np.where(clusters == j)]
                if len(cluster_points)>0:
                    new_centroids[j] = np.mean(cluster_points, axis = 0)
            
            if z>0 and np.linalg.norm(centroids - new_centroids) < self.tol:
                break
            
            centroids = np.copy(new_centroids)
            
        self.centroids = centroids
        self.clusters = clusters
        
    def predict(self, X):
        (m,d) = X.shape
        
        clusters = [None]*m
        for i in range(m):
            distances = np.array([np.linalg.norm(X[i]-self.centroids[j]) for j in range(self.k)])
            cluster = np.argmin(distances)
            clusters[i] = cluster
        return clusters
        

In [45]:
x1 = np.random.randn(5,2) + 20
x2 = np.random.randn(5,2) - 20
X = np.concatenate([x1,x2], axis=0)

# Initialize the KMeans object with k=2
kmeans = Kmeans(k=2)

# Fit the k-means model to the dataset
kmeans.fit(X)

# Get the cluster assignments for the input dataset
cluster_assignments = kmeans.predict(X)

# Print the cluster assignments
print(cluster_assignments)

# Print the learned centroids
print(kmeans.centroids) 

[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
[[ 20.92488671  21.69584693]
 [-20.15791465 -20.69651454]]


  cluster_points = X[np.where(clusters == j)]
