In [1]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets.samples_generator import make_blobs
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean as euc

I've decided to make a visualization of K-Means clustering. We'll store the centers of all clusters in a tree structure, so we need a `Node` class to hold the values: a simple class in which each node has a value and a list of its children.

In [2]:
class Node(object):
    """Minimal tree node: a payload plus an ordered list of child nodes."""

    def __init__(self, data):
        # The payload is stored untouched; every node starts without children.
        self.children = list()
        self.data = data

    def add_child(self, obj):
        """Attach ``obj`` as the last child of this node."""
        self.children.append(obj)

A recursive algorithm that counts the number of nodes in the tree (not including the head, i.e. the root node).

In [3]:
def count_nodes(tree):
    """Return the number of nodes below ``tree``; ``tree`` itself is not counted."""
    # Every child contributes itself (+1) plus everything beneath it.
    # A node without children yields an empty sum, i.e. 0, which is the
    # base case that ends the recursion.
    return sum(count_nodes(kid) + 1 for kid in tree.children)

The following code constructs the tree for 2D data.

In [4]:
def construct2d_tree(cluster, number_of_clusters, center,  model, max_depth, depth):
    """Recursively split ``cluster`` with ``model`` and record the result as two parallel trees.

    Parameters
    ----------
    cluster : array supporting ``len()`` and boolean-mask indexing
        The points that belong to the current node
        (presumably an ``(n_samples, n_features)`` numpy array -- confirm with caller).
    number_of_clusters : int
        How many sub-clusters are read back from the model for each node.
        It is expected to equal the model's own cluster count.
    center : object
        Center of ``cluster``; stored as-is in the returned center tree.
    model : estimator
        Object exposing ``fit(X)`` and, after fitting, ``cluster_centers_``
        and ``labels_``. The same instance is refitted at every node.
    max_depth : int
        Depth at which splitting stops.
    depth : int
        Depth of the current node (the initial call passes 0).

    Returns
    -------
    (tree, data) : tuple of Node
        ``tree`` holds the cluster centers and ``data`` holds the matching
        point sets; both trees have the same shape.
    """
    # One node for the center and one for the points it represents.
    tree = Node(center)
    data = Node(cluster)

    # Stop at the depth limit. Also stop when the cluster holds fewer points
    # than the number of requested sub-clusters (this includes an empty
    # cluster): it cannot be split that many ways, and scikit-learn's k-means
    # estimators raise on such input, so the node simply becomes a leaf.
    if depth >= max_depth or len(cluster) < number_of_clusters:
        return tree, data

    # Fit the model on the points of this node only.
    model.fit(cluster)
    # Take the results now: the recursive calls below refit the very same
    # model object. NOTE(review): this relies on fit() assigning fresh arrays
    # rather than overwriting these in place -- confirm for other estimators.
    centers = model.cluster_centers_
    labels = model.labels_

    for i in range(number_of_clusters):
        # ``labels == i`` already is the boolean mask selecting cluster i.
        new_cluster = cluster[labels == i]

        # Split the sub-cluster in turn, one level deeper.
        child, data_child = construct2d_tree(new_cluster, number_of_clusters,
                                             centers[i],  model, max_depth, depth+1)

        # Keep the center tree and the data tree in lock-step.
        tree.add_child(child)
        data.add_child(data_child)

    return tree, data

The function draws one scatter plot per level of the tree for the given branch factor; within each plot every cluster has its own color and the cluster centers are marked with black triangles.

In [6]:
def my_plot(tree, data, max_depth, branch_factor, figsize=(100, 100)):
    """Plot the clustering level by level, one subplot per tree level.

    Parameters
    ----------
    tree : Node
        Root of the center tree; each node's ``data`` is a cluster center.
        A center may be 1-D or a one-row 2-D array; both are accepted.
    data : Node
        Root of the data tree with the same shape as ``tree``; each node's
        ``data`` is a 2-D array whose first two columns are plotted.
    max_depth : int
        Number of tree levels to draw below the root.
    branch_factor : int
        Only used in the subplot titles.
    figsize : tuple of float, optional
        Figure size in inches, passed to ``plt.figure``. The default is the
        value this function has always used.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    # A "layer" is one level of the tree. Collect the nodes of levels
    # 1..max_depth with plain list flattening, which also copes with nodes
    # that have differing numbers of children (or none at all).
    layers_tree = []
    layers_data = []
    level_tree = list(tree.children)
    level_data = list(data.children)
    for _ in range(max_depth):
        layers_tree.append(level_tree)
        layers_data.append(level_data)
        level_tree = [child for node in level_tree for child in node.children]
        level_data = [child for node in level_data for child in node.children]

    # One panel for the unclustered data plus one per level. The grid keeps
    # the original 2x2 layout for up to four panels and grows by rows beyond
    # that, so a max_depth above 3 no longer runs out of subplot slots.
    n_panels = max_depth + 1
    n_cols = 2
    n_rows = max(2, -(-n_panels // n_cols))  # ceiling division

    fig = plt.figure(figsize=figsize)

    # First panel: all the data with no K-Means split applied.
    ax = fig.add_subplot(n_rows, n_cols, 1)
    ax.scatter(data.data[:, 0], data.data[:, 1])
    root_center = np.ravel(tree.data)  # accept both (2,) and (1, 2) centers
    ax.scatter(root_center[0], root_center[1], marker='^', color='black')
    ax.set_title('Iteration: 0 \n Branch Factor: {}'.format(branch_factor))

    # Remaining panels: one per tree level.
    for num_layers in range(max_depth):
        ax = fig.add_subplot(n_rows, n_cols, num_layers + 2)
        ax.set_title('Iteration: {} \n Branch Factor: {}'.format(num_layers+1, branch_factor))

        for data_item, tree_item in zip(layers_data[num_layers], layers_tree[num_layers]):
            # No explicit color: each scatter call takes the next color of
            # the axes' cycle, which is what tells the clusters apart.
            ax.scatter(data_item.data[:, 0], data_item.data[:, 1], s=20)
            # Mark the cluster's center as well.
            item_center = np.ravel(tree_item.data)
            ax.scatter(item_center[0], item_center[1], marker='^', color='black')

    fig.suptitle('Hierarchical K-Means for 2D')
    plt.show()

In [12]:
def main():
    """Build a hierarchical K-Means tree on synthetic 2-D data, report its size and plot it."""

    # Settings of the experiment.
    branch_factor = 3
    max_depth = 3
    # Fixed seed so the generated points and the clustering are the same on
    # every run (e.g. after Restart Kernel -> Run All).
    seed = 0

    # Get random points; the blob labels are not needed.
    X, _ = make_blobs(n_samples=10000, centers=1, n_features=2, random_state=seed)

    # A one-cluster fit gives the center of the whole data set. It is kept
    # as returned by the model, i.e. as a 2-D array with a single row.
    model1 = MiniBatchKMeans(n_clusters=1, random_state=seed)
    model1.fit(X)
    first_center = model1.cluster_centers_

    # Model that is refitted at every node while constructing the 2D tree.
    model = MiniBatchKMeans(n_clusters=branch_factor, max_iter=100, random_state=seed)
    tree, data = construct2d_tree(X, branch_factor,  first_center,  model, max_depth, depth=0)

    # Get the number of nodes in the tree (the root is not counted).
    number_of_nodes = count_nodes(tree)
    print('Tree has {} nodes'.format(number_of_nodes))

    # Plot the results.
    my_plot(tree, data, max_depth, branch_factor)