# k-means clustering

In [279]:
import scipy as sp
import numpy as np 
import matplotlib

# Synthetic 2-D dataset: three Gaussian blobs (100, 150 and 200 points) drawn
# in this order from one seeded generator, so the sample is reproducible.
rng = np.random.default_rng(seed=1234)
cl1, cl2, cl3 = (
    rng.multivariate_normal(mean=mu, cov=sigma, size=count)
    for mu, sigma, count in (
        ([-2, -2], [[1, -0.5], [-0.5, 1]], 100),
        ([1, 0], [[1, 0], [0, 1]], 150),
        ([3, 2], [[1, -0.7], [-0.7, 1]], 200),
    )
)
# stack the three samples row-wise into a single (450, 2) array
pts = np.vstack((cl1, cl2, cl3))


In [302]:
""" 
    function naive_kmeans: 
        input: 
                X - dataset of points in R^d
                K - number of clusters 
                iterations - number of iterations (default 100) 
        output: 
                clusters - the labels assigning each point to a cluster in (0,...,K-1)
                centers - coordinates of the K cluster centers 
"""


def _nearest_center(X, centers):
    """Return, for each row of X, the index of the closest row of centers."""
    # one column of Euclidean distances per center -> shape (n_points, K)
    distances = np.stack(
        [np.linalg.norm(X - center, axis=1) for center in centers], axis=1
    )
    # ties are resolved in favour of the lowest center index (argmin rule)
    return np.argmin(distances, axis=1)


def naive_kmeans(X, K, iterations=100, rng=None):
    """Cluster the rows of X into K groups with Lloyd's k-means iteration.

    Parameters
    ----------
    X : array_like, shape (n_points, d)
        Dataset; each row is one point in R^d.
    K : int
        Number of clusters, with 1 <= K <= n_points.
    iterations : int, optional
        Maximum number of assignment/update rounds (default 100). The loop
        stops early once an update leaves every center unchanged.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness for picking the initial centers. When omitted,
        the global ``np.random`` state is used.

    Returns
    -------
    clusters : ndarray of int, shape (n_points,)
        Label in 0, ..., K-1 assigning each point to a cluster.
    centers : ndarray, shape (K, d)
        Coordinates of the K cluster centers.

    Raises
    ------
    ValueError
        If X is not 2-dimensional or K is outside 1..n_points.
    """
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_points, d)")
    n_points = X.shape[0]
    if not 1 <= K <= n_points:
        raise ValueError("K must satisfy 1 <= K <= number of points in X")

    # step 1 - initialise: K distinct data points serve as starting centers
    sampler = np.random if rng is None else rng
    centers = X[sampler.choice(n_points, size=K, replace=False)]

    clusters = None
    for _ in range(iterations):
        # step 2 - assignment step
        clusters = _nearest_center(X, centers)

        # step 3 - update step; a cluster that lost all of its points keeps
        # its previous center instead of becoming NaN (mean of an empty slice)
        updated_centers = np.array([
            X[clusters == k].mean(axis=0) if np.any(clusters == k) else centers[k]
            for k in range(K)
        ])
        converged = np.array_equal(updated_centers, centers)
        centers = updated_centers
        if converged:
            # further rounds would reproduce exactly the same labels/centers
            break

    if clusters is None:
        # iterations <= 0: still return labels for the initial centers
        clusters = _nearest_center(X, centers)

    return clusters, centers

In [303]:
# cluster the synthetic points into 3 groups: x receives the labels, y the centers
x, y = naive_kmeans(X=pts, K=3)