# Learning notes
https://gist.github.com/jakevdp/256c3ad937af9ec7d4c65a29e5b6d454
    

In [11]:

import numpy as np
# make_blobs is imported from the public sklearn.datasets namespace; the
# private sklearn.datasets.samples_generator module is no longer importable
# in current scikit-learn releases.
from sklearn.datasets import make_blobs

# Synthetic dataset: 5000 points drawn around 4 blob centers. random_state is
# fixed so the notebook reproduces the same points on every run.
points, labels_true = make_blobs(n_samples=5000, centers=4,
                                 random_state=0, cluster_std=0.60)

def dist(x, y):
    """Return the squared Euclidean distance between ``x`` and ``y``.

    Coordinates are paired positionally with ``zip``, so surplus trailing
    coordinates of the longer argument are ignored. No square root is taken;
    the squared value is enough for comparing which center is closest.
    """
    squared_gaps = [(a - b) ** 2 for a, b in zip(x, y)]
    return sum(squared_gaps)


def find_labels(points, centers):
    """Assign every point to its nearest center.

    For each point, the distance to every center is computed with ``dist``
    and the index of the closest center becomes that point's label. When
    several centers are equally close, the lowest index wins.

    Returns a list holding one label per input point, in input order.
    """
    def nearest_center(point):
        gaps = [dist(point, center) for center in centers]
        # min() keeps the first smallest element, matching list.index(min(...)).
        return min(range(len(gaps)), key=gaps.__getitem__)

    return [nearest_center(point) for point in points]


def compute_centers(points, labels):
    """Compute the mean point (centroid) of every labelled group.

    The coordinates of all points sharing a label are summed and then divided
    by the number of points carrying that label.

    Parameters:
        points: sequence of coordinate sequences, e.g. [[x1, y1], [x2, y2], ...]
        labels: sequence with one label per point, aligned with ``points``

    Return: a list of centers ([[x, y], ...]), one per distinct label, ordered
    by ascending label. For the usual contiguous labels 0..k-1 the center for
    label ``i`` is at index ``i``. An empty input yields an empty list.
    """
    pairs = list(zip(labels, points))
    if not pairs:
        return []

    # Map each distinct label to its rank. Indexing the accumulators by the
    # raw label value fails when a label is missing from the middle of the
    # range (e.g. labels {0, 2}), because there are only as many slots as
    # distinct labels.
    distinct = sorted({label for label, _ in pairs})
    slot_of = {label: slot for slot, label in enumerate(distinct)}

    n_dims = len(pairs[0][1])

    # Running coordinate sums: [[sum_x1, sum_y1], [sum_x2, sum_y2], ...]
    sums = [[0] * n_dims for _ in distinct]

    # Number of points accumulated into each slot.
    counts = [0] * len(distinct)

    for label, point in pairs:
        slot = slot_of[label]
        counts[slot] += 1
        sums[slot] = [a + b for a, b in zip(sums[slot], point)]

    return [[x / count for x in total] for total, count in zip(sums, counts)]


def _recompute_centers(points, labels, old_centers):
    """Return updated centers, keeping the old position of any empty cluster."""
    used = sorted(set(labels))
    if len(used) == len(old_centers):
        # Every cluster received at least one point: labels are exactly 0..k-1.
        return compute_centers(points, labels)

    # At least one cluster is empty. Renumber the labels in use to 0..m-1
    # before averaging, then put each mean back at its original index and
    # leave the empty clusters where they were.
    dense = {label: slot for slot, label in enumerate(used)}
    means = compute_centers(points, [dense[label] for label in labels])
    centers = [list(center) for center in old_centers]
    for label, mean in zip(used, means):
        centers[label] = mean
    return centers


def kmeans(points, n_clusters, max_iter=None):
    """Cluster ``points`` into ``n_clusters`` groups with Lloyd's k-means.

    The last ``n_clusters`` points seed the centers. The assignment step
    (``find_labels``) and the update step (``compute_centers``) then alternate
    until the centers stop changing.

    Parameters:
        points: NumPy array or sequence of coordinate sequences
        n_clusters: number of clusters, between 1 and ``len(points)``
        max_iter: optional upper bound on the number of iterations; ``None``
            (the default) iterates until the centers no longer change

    Return: a list with one cluster index per point.

    Raises:
        ValueError: if ``n_clusters`` is outside ``1..len(points)`` or
            ``max_iter`` is given and smaller than 1.
    """
    n_points = len(points)
    # points[-0:] would select every point and negative values select an
    # arbitrary tail, so reject anything outside 1..n_points up front.
    if not 1 <= n_clusters <= n_points:
        raise ValueError(
            "n_clusters must be between 1 and len(points) (%d), got %r"
            % (n_points, n_clusters)
        )
    if max_iter is not None and max_iter < 1:
        raise ValueError("max_iter must be at least 1, got %r" % (max_iter,))

    seeds = points[-n_clusters:]
    # NumPy arrays are converted with tolist(); plain sequences are copied.
    if hasattr(seeds, "tolist"):
        centers = seeds.tolist()
    else:
        centers = [list(seed) for seed in seeds]

    iterations = 0
    while True:
        old_centers = centers
        labels = find_labels(points, centers)
        centers = _recompute_centers(points, labels, old_centers)
        iterations += 1
        if centers == old_centers:
            break
        if max_iter is not None and iterations >= max_iter:
            break
    return labels


In [10]:
# Distinct ground-truth cluster ids returned by make_blobs.
np.unique(labels_true)

array([0, 1, 2, 3])

# Run it

In [21]:
# Cluster the points into 10 groups (note: the data was generated with centers=4).
labels = kmeans(points, 10)

In [23]:
# Sanity check: kmeans returns one label per input point.
len(labels)

5000

In [25]:
# Pair up the coordinates of the last two points. zip() is lazy in Python 3,
# so the cell displays a zip object rather than the pairs themselves.
x = points[-1]
y = points[-2]
zip(x,y)

<zip at 0x1a1705cbc8>

In [28]:
# set() removes duplicates, so this counts the distinct values.
len(set([1,2,3,2]))

3

In [18]:
# The last ten points: the rows kmeans(points, 10) takes as its initial centers.
points[-10:]

array([[-2.46162605e-03,  4.12290394e+00],
       [-1.24949922e+00,  2.99579490e+00],
       [-1.16304687e+00,  7.11496573e+00],
       [-2.84376389e+00,  2.46595248e+00],
       [-1.47737487e+00,  3.76160270e+00],
       [ 1.09416295e+00,  8.74919423e-01],
       [-9.17999891e-01,  7.58545949e+00],
       [-1.46457066e+00,  7.87822740e+00],
       [-1.31351503e+00,  8.10814501e+00],
       [-2.28284911e+00,  6.81834065e+00]])

In [19]:
# The same rows converted from a NumPy array to nested Python lists.
points[-10:].tolist()

[[-0.0024616260456247208, 4.122903938531374],
 [-1.2494992244151415, 2.995794895136798],
 [-1.1630468701356098, 7.114965730867699],
 [-2.8437638868074897, 2.465952478731856],
 [-1.4773748668637057, 3.76160270240154],
 [1.0941629501881676, 0.8749194232274551],
 [-0.9179998912300769, 7.585459489593624],
 [-1.4645706600572181, 7.878227403860206],
 [-1.3135150298753686, 8.108145010115154],
 [-2.2828491081262037, 6.8183406522026075]]

# numpy approach