In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Global seaborn appearance for every figure in this notebook:
# enlarge all font elements by a factor of 1.75 ...
sns.set(font_scale=1.75)
# ... and then switch to the "white" style preset.
sns.set_style("white")

from graspologic.cluster import DivisiveCluster

Hierarchical clustering is similar to the clustering algorithms introduced above, such as AutoGMM and K-Means, but it produces a hierarchy of clusters. The two major types of hierarchical clustering algorithms are agglomerative and divisive. The former starts with every data point in its own cluster and gradually merges clusters in a "bottom-up" fashion; the latter starts with all data points in a single cluster and gradually divides it in a "top-down" fashion.

The DivisiveCluster algorithm implements hierarchical clustering with a "divisive" approach built on a chosen clustering algorithm such as AutoGMM. It first obtains predictions on the full dataset from the chosen clustering algorithm, say AutoGMM, and then passes each subset of the data corresponding to a predicted cluster to AutoGMM again, this time specifying min_components=1. If the best model computed by AutoGMM for a predicted cluster contains more than one subcluster, each of those subclusters is clustered recursively as described above; otherwise, that predicted cluster becomes a leaf cluster. The algorithm terminates when every branch of the recursion has ended in a leaf cluster.

## Using DivisiveCluster on Synthetic Data

Consider the following synthetic hierarchical data made up of four Gaussian distributions in 2D, organized into two levels. The four distributions share the same standard deviation, and their means are chosen so that the difference between the first two is the same as that between the last two. Hence, this dataset can be partitioned into 2 clusters, each a Gaussian mixture of 2 components, or into 4 clusters of 1 Gaussian component each. These two partitions form the two levels of the clustering hierarchy, in order of increasing granularity: 2 coarse clusters at level 1 and 4 finer clusters at level 2.

In [None]:
# generate synthetic data

np.random.seed(3)

n = 100  # number of data points drawn per Gaussian component
d = 2  # number of dimensions

# Component means, listed in the order the samples are drawn; every
# component uses the same standard deviation.
means = (-4.5, -3, 3, 4.5)
sigma = 0.5

# Xij denotes the ith Gaussian mixture component in the jth level-1 cluster;
# each Xij is one cluster at the lowest hierarchy, i.e., level 2.
X11, X21, X12, X22 = (np.random.normal(mu, sigma, size=(n, d)) for mu in means)
X = np.concatenate((X11, X21, X12, X22), axis=0)

# True labels as column vectors: level 1 pairs up consecutive components,
# level 2 gives every component its own label.
y_lvl1 = np.repeat([0, 1], 2 * n)[:, np.newaxis]
y_lvl2 = np.repeat([0, 1, 2, 3], n)[:, np.newaxis]
y = np.concatenate((y_lvl1, y_lvl2), axis=1)

In [None]:
# scatter-plot helper for visualising a two-level clustering
def plot(X, y, title):
    """Scatter-plot points with two label columns shown as marker style and colour.

    Parameters
    ----------
    X : array-like
        Point coordinates. Stacked side by side with ``y`` it must give
        exactly four columns; the first two are plotted as "dim1" (x-axis)
        and "dim2" (y-axis).
    y : array-like
        Label columns. After stacking, the third column ("level1") selects
        the marker style and the fourth ("level2") selects the hue. Both are
        cast to int.
    title : str
        Title placed on the axes.
    """
    labelled = pd.DataFrame(
        np.hstack((X, y)), columns=["dim1", "dim2", "level1", "level2"]
    ).astype({"level1": int, "level2": int})

    _fig, ax = plt.subplots(1, figsize=(10, 10))
    sns.scatterplot(
        data=labelled,
        x="dim1",
        y="dim2",
        style="level1",
        hue="level2",
        palette="deep",
        legend=False,
        ax=ax,
    )
    # Tick marks are removed from both axes; only the title is kept.
    ax.set(xticks=[], yticks=[], title=title)
    plt.show()

# Show the generated points with their true level-1 / level-2 labels.
plot(X=X, y=y, title="True Clustering")

In [None]:
from sklearn.metrics import adjusted_rand_score

# Seed NumPy's global RNG before the model is fitted.
np.random.seed(1)

# Fit the model and predict on the data; fcluster=True chooses to return
# a set of flat clusterings (indexed below as one column per level).
dc = DivisiveCluster(cluster_method="gmm", max_components=2, max_level=2)
pred = dc.fit_predict(X, fcluster=True)

# Evaluate the clustering performance in terms of ARI
# (used before in evaluating other clustering methods),
# comparing each level's predictions with the true labels of that level.
ari_lvl1 = adjusted_rand_score(y[:, 0], pred[:, 0])
ari_lvl2 = adjusted_rand_score(y[:, 1], pred[:, 1])
print("ARI score for model at level 1: %.2f" % ari_lvl1)
print("ARI score for model at level 2: %.2f" % ari_lvl2)

plot(X, pred, "DivisiveCluster Assignments")