# Imports and Defaults

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
sns.set_theme()
seed = 1
rng = np.random.default_rng(seed)

# Utility Functions

In [66]:
def gaussian_sample(dim, num_gaussians, means, widths, num_data_pts=1000):
    # TODO: code currently ignores dimensions
    
    X = np.zeros(shape=(num_data_pts, 2))
    for i in range(num_gaussians):
        X += rng.multivariate_normal(means[i], widths[i] * np.eye(2), size=(num_data_pts))
    return X

def euclidean_dist(u,v):
    return np.power(np.power(u - v, 2).sum(), 1/2)

# K Means

In [85]:
class KMeans:
    def __init__(self, X, k, metric):
        """ Class for K means clustering

        Args:
            X ([m x n]): m data points in n dimensional space
            k (int): num clusters
            metric (callable): function to compute distance
        """
        
        self.X = X # data to cluster
        self.k = k # num clusters
        self.metric = metric # callable metric
        self.m = self.X.shape[0] # num data points
        
    def init_means(self):
        """ Select k data points to initialize k means (Forgy method)
        """
        
        idx = rng.integers(low=0, high=self.m, size=self.k) 
        print(idx)
        self.means = self.X[idx]
        
    def assign_clusters(self):
        self.assignments = [None] * self.m
        # iterate over all m datapoints
        for i in range(self.m):
            x_i = self.X[i,:] # data point
            min_dist = np.array([0.]) # init min dist
            min_assignement = None # init min assignment
            
            # iterate over all means
            for j in range(self.k):
                x_j = self.means[j,:] # cluster mean
                cur_dist = self.metric(x_i, x_j) # distance btwn data point and cluster mean
                
                # update min distance and cluster assignement
                if cur_dist < min_dist:
                    min_dist = cur_dist
                    min_assignement = j
                    
            self.assignments[i] = (min_assignement, min_dist) # save cluster assignement         

    def update_clusters(self):
        # iterate over all k means
        for i in range(self.k):
            total_dist, num_pts = 0, 0
            
            # iterate over all data points in k-th mean
            for (assignment, dist) in self.assignments:
                if assignment == i:
                    total_dist += dist
                    num_pts += 1
            
            # update k-th mean
            print(total_dist, num_pts)
            self.means[self.k, :] = total_dist / num_pts
      
    def fit(self):
        self.init_means()
        prev_assignments = self.assign_clusters()
        
        self.update_clusters()
        self.assign_clusters()
        
        while self.assignments != prev_assignments:
            self.update_clusters()
            prev_assignments = self.assignments
            self.assign_clusters()

In [86]:
means = [np.array([0,0]), np.array([5,5])]
widths = [1, 1]
num_gaussians = len(means)
num_data_pts = 1000
dim = None # TODO: currently not relevant to sampling code

X = gaussian_sample(dim, num_gaussians, means, widths, num_data_pts=num_data_pts)

In [87]:
kmeans = KMeans(X, 2, euclidean_dist)
kmeans.fit()

[141 203]
0 0


ZeroDivisionError: division by zero