In [1]:
import numpy as np
import time
import math
from sklearn.cluster import KMeans

In [2]:
class Cluster:
    """Cluster feature (N, LS, SS) summarising the data points absorbed so far."""

    def __init__(self, N, LS, SS):
        """
        N : int
            number of data points
        LS : numpy array
            linear sum of data points in each attribute; stored as given.
            update_cluster_feature adds a (d, 1) column to it, so a (d, 1)
            array is the shape that keeps the sum element-wise.
        SS : numpy array
            squared sum of data points in each attribute; same shape
            expectation as LS.
        """
        self.N = N
        self.LS = LS
        self.SS = SS

    def update_cluster_feature(self, x):
        """Absorb one data point.

        x : numpy array
            reshaped to a (d, 1) column before being added to LS (and its
            element-wise square to SS); N is incremented by one.
        """
        column = x.reshape((-1, 1))
        self.LS = self.LS + column
        self.SS = self.SS + np.power(column, 2)
        self.N += 1

    def get_centroid(self):
        """Return LS / N, the per-attribute mean of the absorbed points."""
        return self.LS / self.N

    def reset_cluster_feature(self):
        """Reset N to 0 and LS / SS to (d, 1) integer zero columns."""
        # The dimensionality is taken from the stored linear sum rather than
        # from a free variable `d`, which is not defined inside this class.
        d = np.size(self.LS)
        self.N = 0
        self.LS = np.zeros((d, 1), dtype=int)
        self.SS = np.zeros((d, 1), dtype=int)

class ClusterKMeans():
    """Cluster tracked as a running mean: a centroid plus a point count."""

    def __init__(self, centroid, N):
        """
        centroid : numpy array
            initial centroid of the cluster
        N : int
            number of data points already represented by `centroid`
        """
        self.centroid = centroid
        # Store the caller-supplied count instead of discarding it, so a
        # cluster created with N prior points weights the next update by
        # 1 / (N + 1).
        self.N = N

    def update_cluster_feature(self, x):
        """Fold data point x into the centroid with an incremental-mean step."""
        self.N += 1
        self.centroid = self.centroid + (x - self.centroid) / self.N

    def reset_cluster_feature(self):
        """Reset the point count to 0; the centroid is left unchanged."""
        self.N = 0

class ClusterKMeansB(ClusterKMeans):
    """ClusterKMeans that additionally buffers its absorbed points and carries a label."""

    def __init__(self, centroid, N, label):
        """
        centroid, N : forwarded to ClusterKMeans.__init__
        label : stored as-is on the instance
        """
        self.points, self.label = [], label
        super().__init__(centroid, N)

    def update_cluster_feature(self, x):
        """Append x to the point buffer, then apply the parent's update."""
        self.points.append(x)
        super().update_cluster_feature(x)

    def reset_cluster_feature(self):
        """Replace the point buffer with an empty list, then apply the parent's reset."""
        self.points = []
        super().reset_cluster_feature()

## Approach 1 - Sequential k-means
Reference: <https://www.cs.princeton.edu/courses/archive/fall08/cos436/Duda/C/sk_means.htm>

In [3]:
class SequentialKMeans:
    """Sequential k-means: each incoming point updates its nearest cluster."""

    def __init__(self, k, d, init_data=None):
        """
        k : number of clusters
        d : number of attributes
        init_data : when it is not None, it is a numpy array of size n x d;
            KMeans (random_state=0) is fitted on it and its cluster centers
            become the initial centroids. Otherwise each initial centroid is
            drawn with np.random.rand(d).
        """
        self.k = k
        self.d = d
        if init_data is not None:
            fitted = KMeans(n_clusters=k, random_state=0).fit(init_data)
            centers = fitted.cluster_centers_
            starts = [centers[j] for j in range(k)]
        else:
            starts = [np.random.rand(d) for _ in range(k)]
        self.clusters = [ClusterKMeans(start, 0) for start in starts]
        # Indices of the chosen clusters, recorded only on request.
        self.y = []

    def absorb_incoming_datum(self, x, label=False):
        """Update the cluster whose centroid is closest (Euclidean) to x.

        When `label` is truthy, the index of that cluster is appended to
        self.y. Ties go to the lowest cluster index.
        """
        gaps = [np.linalg.norm(self.clusters[0].centroid - x)]
        gaps.extend(
            np.linalg.norm(self.clusters[j].centroid - x)
            for j in range(1, self.k)
        )
        # min() keeps the first of equal candidates, i.e. the lowest index.
        nearest = min(range(len(gaps)), key=gaps.__getitem__)
        self.clusters[nearest].update_cluster_feature(x)
        if label:
            self.y.append(nearest)

## Approach 2 - Sequential k-means with Temporary Buffers

In [4]:
class SequentialKMeansB():
    """Sequential k-means whose clusters also buffer the points they absorb."""

    def __init__(self, k, d, init_data=None):
        """
        k : number of clusters
        d : number of attributes
        init_data : when it is not None, it is a numpy array of size n x d;
            KMeans (random_state=0) is fitted on it and its cluster centers
            become the initial centroids. Otherwise each initial centroid is
            drawn with np.random.rand(d).
        """
        self.k = k
        self.d = d
        if init_data is not None:
            fitted = KMeans(n_clusters=k, random_state=0).fit(init_data)
            centers = fitted.cluster_centers_
            starts = [centers[j] for j in range(k)]
        else:
            starts = [np.random.rand(d) for _ in range(k)]
        # Each cluster is labelled with its position in the list.
        self.clusters = [
            ClusterKMeansB(start, 0, j) for j, start in enumerate(starts)
        ]

    def absorb_incoming_datum(self, x):
        """Update the cluster whose centroid is closest (Euclidean) to x.

        Ties go to the lowest cluster index.
        """
        gaps = [np.linalg.norm(self.clusters[0].centroid - x)]
        gaps.extend(
            np.linalg.norm(self.clusters[j].centroid - x)
            for j in range(1, self.k)
        )
        # min() keeps the first of equal candidates, i.e. the lowest index.
        nearest = min(range(len(gaps)), key=gaps.__getitem__)
        self.clusters[nearest].update_cluster_feature(x)

    def reset_buffer(self):
        """Call reset_cluster_feature() on each of the first k clusters."""
        for slot in range(self.k):
            cluster = self.clusters[slot]
            cluster.reset_cluster_feature()