K Means Clustering from Scratch

CS Practical - K-Means:

- Step 1 - Pick K random points as cluster centers called centroids.
- Step 2 - Assign each x to the nearest cluster by calculating its distance to each centroid (start with Euclidean).
- Step 3 - Find the new cluster center by taking the average of the assigned points.
- Step 4 - Repeat Step 2 and 3 for a given number of iterations until none of the cluster assignments change.

In [145]:
import numpy as np
import math

In [146]:
r = np.random.rand(4,5)

In [147]:
r

array([[0.5714131 , 0.83991245, 0.56718153, 0.32830356, 0.62643243],
       [0.74186119, 0.47090061, 0.4200386 , 0.77292202, 0.54461888],
       [0.33687512, 0.87775599, 0.70633325, 0.30498082, 0.29570091],
       [0.31984869, 0.27680505, 0.48817518, 0.28765394, 0.67515381]])

In [206]:
class KMeans:
    def __init__(self, x, clusters, iters):
        self.clusters = clusters
        self.iters = iters
        self.x = np.array(x)
    
    def normalize(x):
        return (x-np.mean(x))/x.std()

    
    def distance(self,x1,x2):
        assert x1.shape == x2.shape, f"shapes don't match {x1.shape} != {x2.shape} "
        return math.sqrt(sum([(x1[i]-x2[i])**2 for i in range(len(x1))]))
    
    def generate_random_points(self):
        return np.random.randn(self.clusters,self.x.shape[1])
    
    def get_labels(self):
        labels = []
        for i,point in enumerate(x): 
            min_dist = float('inf')
            best_cluster = None
            for ic, center in enumerate(self.centers):
                dist = self.distance(point,center)
                print(dist,min_dist)
                print(dist<min_dist)
                if dist < min_dist:
                    min_dist = dist
                    best_cluster = ic
            labels.append(best_cluster)
            print(best_cluster)
            print("\n")
        return labels
    
    def set_centers(self,i):
        if i == 0: 
            return self.generate_random_points()
        centers = []
        for i in range(self.clusters):
            centers.append(self.x[self.labels==i].mean(0))
        return np.array(centers)
            
    def fit(self):
        for i in range(self.iters):
            self.centers = self.set_centers(i)
            self.labels = self.get_labels()        

In [207]:
x = np.random.randn(10,50)

In [208]:
k = KMeans(x,2,20)

In [209]:
k.fit()

8.679359388713499 inf
True
9.14697058946264 8.679359388713499
False
0


9.548346564457933 inf
True
7.9450798404186544 9.548346564457933
True
1


10.898341636766535 inf
True
9.073325163798009 10.898341636766535
True
1


9.516644912737023 inf
True
8.857302292288407 9.516644912737023
True
1


9.77468488977105 inf
True
10.902175069888944 9.77468488977105
False
0


9.565347939168449 inf
True
8.716685440876565 9.565347939168449
True
1


9.318864138809277 inf
True
8.501119403306653 9.318864138809277
True
1


8.81737980326537 inf
True
9.356769591488753 8.81737980326537
False
0


10.570424219572168 inf
True
10.790298732649008 10.570424219572168
False
0


9.514211378161075 inf
True
9.98269294622092 9.514211378161075
False
0






AssertionError: shapes don't match (50,) != (10, 50) 

In [211]:
%debug

> [0;32m<ipython-input-206-f7b6bee145a8>[0m(12)[0;36mdistance[0;34m()[0m
[0;32m     10 [0;31m[0;34m[0m[0m
[0m[0;32m     11 [0;31m    [0;32mdef[0m [0mdistance[0m[0;34m([0m[0mself[0m[0;34m,[0m[0mx1[0m[0;34m,[0m[0mx2[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 12 [0;31m        [0;32massert[0m [0mx1[0m[0;34m.[0m[0mshape[0m [0;34m==[0m [0mx2[0m[0;34m.[0m[0mshape[0m[0;34m,[0m [0;34mf"shapes don't match {x1.shape} != {x2.shape} "[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     13 [0;31m        [0;32mreturn[0m [0mmath[0m[0;34m.[0m[0msqrt[0m[0;34m([0m[0msum[0m[0;34m([0m[0;34m[[0m[0;34m([0m[0mx1[0m[0;34m[[0m[0mi[0m[0;34m][0m[0;34m-[0m[0mx2[0m[0;34m[[0m[0mi[0m[0;34m][0m[0;34m)[0m[0;34m**[0m[0;36m2[0m [0;32mfor[0m [0mi[0m [0;32min[0m [0mrange[0m[0;34m([0m[0mlen[0m[0;34m([0m[0mx1[0m[0;34m)[0m[0;34m)[0m[0;34m][0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[

In [210]:
k.labels

[0, 1, 1, 1, 0, 1, 1, 0, 0, 0]