## General idea
1. Overcluster data using a GMM
2. Construct a weighted undirected graph whose nodes are the cluster centers. A low edge weight means that the two clusters are more disconnected. As Mara did, we project the samples of two clusters onto the line connecting the two cluster means and apply the dip test to the projected values. We use the p-value of the dip test as the edge weight in the graph.
So far, no edges are cut and no clusters are merged; we only overcluster and plot the edges with line opacity (alpha) = p-value.
3. The plots show a t-SNE embedding of the samples together with the GMM cluster means.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from openTSNE import TSNE
import colorcet as cc
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import kneighbors_graph
import matplotlib.pylab as plt
import diptest

%matplotlib inline

In [None]:
# from Mara's code

def get_knn_dict(means, k=3, thresh=np.inf):
    """Map every cluster index to the indices of its nearest neighbouring clusters.

    A k-nearest-neighbour distance graph is built over the rows of ``means``
    (a row is never its own neighbour) and edges longer than ``thresh`` are
    dropped.

    Parameters
    ----------
    means : array-like
        One row per cluster centre.
    k : int
        Number of neighbours requested per centre.
    thresh : float
        Largest neighbour distance that is kept; the default keeps all of them.

    Returns
    -------
    dict
        ``{row index: array of neighbour row indices}`` with one entry per row
        of ``means``.
    """
    distances = kneighbors_graph(
        means, k, mode='distance', include_self=False
    ).toarray()

    # In the dense matrix a zero means "no edge", so neighbours beyond the
    # threshold are removed by zeroing their distance. Neighbours at distance
    # exactly zero are therefore not reported either.
    distances[distances > thresh] = 0

    return {
        row: np.where(distances[row])[0]
        for row in range(len(means))
    }

def get_pvalue_dict(means, samples, labels, knn_dict):
    """Compute the dip-test p-value for every (cluster, neighbour) pair.

    For each pair listed in ``knn_dict`` the samples of both clusters are
    projected with ``compute_projection``, the two projections are
    concatenated, and the dip test is run on the result.

    Parameters
    ----------
    means : array-like
        Cluster centres, indexed by cluster id.
    samples : array-like
        Data points.
    labels : array-like
        Cluster id of every entry in ``samples``.
    knn_dict : dict
        ``{cluster id: iterable of neighbour cluster ids}``.

    Returns
    -------
    dict
        ``{cluster id: list of p-values}``, each list ordered like the
        corresponding neighbour list in ``knn_dict``.
    """

    def edge_pvalue(center, neighbour):
        # Only the p-value is needed; the dip statistic itself is discarded.
        projections = compute_projection(center, neighbour, means, samples, labels)
        _, pvalue = diptest.diptest(np.concatenate(projections))
        return pvalue

    return {
        center: [edge_pvalue(center, neighbour) for neighbour in neighbours]
        for center, neighbours in knn_dict.items()
    }

def compute_projection(cluster1, cluster2, means, latents, predictions):
    """Project the points of two clusters onto the axis joining their means.

    Parameters
    ----------
    cluster1, cluster2 : int
        Ids of the two clusters; they index ``means`` and are compared against
        ``predictions``.
    means : array
        Cluster centres, one row per cluster.
    latents : array
        Data points, one row per point.
    predictions : array
        Cluster id of every row in ``latents``.

    Returns
    -------
    tuple of two 1-D arrays
        Scalar projections of the points of ``cluster1`` and of ``cluster2``
        onto the unit vector pointing from ``means[cluster2]`` to
        ``means[cluster1]``, shifted so that the midpoint between the two
        projected cluster averages sits at zero.
    """
    direction = means[cluster1] - means[cluster2]
    direction = direction / np.linalg.norm(direction)

    first_proj, second_proj = (
        np.dot(latents[predictions == cluster], direction)
        for cluster in (cluster1, cluster2)
    )

    # Centre both projections on the midpoint of the two projected averages.
    midpoint = (np.mean(first_proj) + np.mean(second_proj)) / 2

    return first_proj - midpoint, second_proj - midpoint

In [None]:
# Seed NumPy's global random number generator before the toy data is generated.
np.random.seed(114)

## generate toy data

In [None]:
# Toy-data settings: total number of samples and number of mixture components.
n_total = 10000
n_components = 5
# Alternative (variance, n_dim) settings, currently disabled:
# variance = 0.7
# n_dim = 8

# variance = 1
# n_dim = 8


# Active setting: variance used for every component and dimensionality of the data.
variance = 5
n_dim = 32

In [None]:
# Build the generating mixture by assigning its parameters directly instead of
# fitting it to data.
gmm = GaussianMixture(n_components=n_components, covariance_type='diag')
# Component means are drawn from a standard normal distribution.
cluster_means = np.random.randn(n_components, n_dim)
gmm.means_ = cluster_means
# One variance value per component (1-D array of length n_components).
# NOTE(review): sklearn documents 'diag' covariances as shape
# (n_components, n_features); this presumably relies on broadcasting inside
# sample() -- confirm this is intended.
gmm.covariances_ = np.ones(n_components) * variance
# Random integer weights from 1 to 9, normalised so that they sum to one.
w = np.random.randint(1, 10, size=n_components)
gmm.weights_ = w / sum(w)

# Draw the toy data set; gt_labels is presumably the generating component of
# each sample (see the GaussianMixture.sample documentation).
samples, gt_labels = gmm.sample(n_samples=n_total)

In [None]:
# Fit a full-covariance GMM with n_components_fit components to the samples.
n_components_fit = 7
gmm_fit = GaussianMixture(
    covariance_type="full", n_components=n_components_fit
).fit(samples)

# Assign every sample to one of the fitted components.
labels = gmm_fit.predict(samples)

In [None]:
# Show how many component means the fitted GMM holds.
len(gmm_fit.means_)

In [None]:
# Neighbour lists for the fitted means (k=3). thresh keeps its default of
# np.inf, so no neighbour is discarded because of its distance.
knn_dict = get_knn_dict(gmm_fit.means_, k=3)

In [None]:
tsne = TSNE(perplexity=100, metric='euclidean', n_jobs=8, random_state=42, verbose=False)

# Samples and fitted means are embedded in one run so that both end up in the
# same embedding; the means are appended last and split off again afterwards.
n_means = len(gmm_fit.means_)
joint_emb = tsne.fit(np.vstack([samples, gmm_fit.means_]))

tsne_cluster_means = joint_emb[-n_means:]
tsne_emb = joint_emb[:-n_means]

We thresholded the neighbor selection by the average distance of all clusters to their third-nearest neighbor.

In [None]:
# Dip-test p-value for every (cluster, neighbour) pair listed in knn_dict.
pvalue_dict = get_pvalue_dict(gmm_fit.means_, samples, labels, knn_dict)

In [None]:
# Uncoloured scatter of the embedded samples with the cluster graph on top.
# No legend is requested: nothing in this figure carries a label, so
# plt.legend() would only warn that there are no labelled artists and add an
# empty legend box.
plt.scatter(*tsne_emb.T, alpha=0.4, rasterized=True)

# Embedded GMM means as small black dots.
plt.plot(
    tsne_cluster_means[:, 0],
    tsne_cluster_means[:, 1],
    "o",
    c="black",
    markersize=2,
)

# One line per (cluster, neighbour) pair. Despite its name, `dip` holds the
# dip-test p-value from pvalue_dict; it sets the opacity of the line and of the
# text label placed at the middle of the line.
for (cm, neighs), (_, dips) in zip(knn_dict.items(), pvalue_dict.items()):
    for n, dip in zip(neighs, dips):
        plt.plot(
            [tsne_cluster_means[cm][0], tsne_cluster_means[n][0]],
            [tsne_cluster_means[cm][1], tsne_cluster_means[n][1]],
            alpha=dip,
            c="black",
        )

        print(f"{cm=}", f"{n=}", f"{dip:.3f}")

        plt.text(
            (tsne_cluster_means[cm][0] + tsne_cluster_means[n][0]) / 2,
            (tsne_cluster_means[cm][1] + tsne_cluster_means[n][1]) / 2,
            f"{dip:.3f}",
            fontsize=8,
            alpha=dip,
        )


In [None]:
palette = sns.color_palette(cc.glasbey, n_colors=n_components_fit)

# One labelled scatter per fitted component so that each gets its own colour
# and its own legend entry.
for c in range(n_components_fit):
    in_cluster = labels == c
    plt.scatter(
        *tsne_emb[in_cluster].T,
        s=3,
        color=palette[c],
        alpha=0.4,
        rasterized=True,
        label=c,
    )
plt.legend(bbox_to_anchor=(1, 1), markerscale=3)

# Embedded GMM means as small black dots.
plt.plot(tsne_cluster_means[:, 0], tsne_cluster_means[:, 1], "o", c="black", markersize=2)

# Edges of the neighbour graph; `dip` is the dip-test p-value of the pair and
# sets the opacity of the line and of its label at the middle of the line.
for (cm, neighs), (_, dips) in zip(knn_dict.items(), pvalue_dict.items()):
    x_cm, y_cm = tsne_cluster_means[cm]
    for n, dip in zip(neighs, dips):
        x_n, y_n = tsne_cluster_means[n]

        plt.plot([x_cm, x_n], [y_cm, y_n], alpha=dip, c="black")

        print(f"{cm=}", f"{n=}", f"{dip:.3f}")

        plt.text(
            (x_cm + x_n) / 2,
            (y_cm + y_n) / 2,
            f"{dip:.3f}",
            fontsize=8,
            alpha=dip,
        )


## More toy data

In [None]:
def _draw_cluster_graph(centers_emb, neighbour_map, pvalue_map):
    """Draw the embedded cluster means and their neighbour edges on the current axes.

    Every edge, and the three-decimal p-value label at its midpoint, is drawn
    with an opacity equal to the p-value of that edge.
    """
    plt.plot(centers_emb[:, 0], centers_emb[:, 1], "o", c="black", markersize=2)

    for (src, nbrs), (_, pvals) in zip(neighbour_map.items(), pvalue_map.items()):
        x_src, y_src = centers_emb[src]
        for dst, pval in zip(nbrs, pvals):
            x_dst, y_dst = centers_emb[dst]
            plt.plot([x_src, x_dst], [y_src, y_dst], alpha=pval, c="black")
            plt.text(
                (x_src + x_dst) / 2,
                (y_src + y_dst) / 2,
                f"{pval:.3f}",
                fontsize=8,
                alpha=pval,
            )


# settings for the second toy data set
n_total = 1000
n_components = 5

variance = 0.5
n_dim = 8

# generate data: the mixture parameters are assigned directly, not fitted
gmm = GaussianMixture(n_components=n_components, covariance_type='diag')
cluster_means = np.random.randn(n_components, n_dim)
gmm.means_ = cluster_means
gmm.covariances_ = variance * np.ones(n_components)
w = np.random.randint(1, 10, size=n_components)
gmm.weights_ = w / sum(w)

samples, gt_labels = gmm.sample(n_samples=n_total)

# fit a gmm with more components than were used to generate the data
n_components_fit = 10
gmm_fit = GaussianMixture(
    covariance_type="full", n_components=n_components_fit
).fit(samples)

labels = gmm_fit.predict(samples)

# construct graph
knn_dict = get_knn_dict(gmm_fit.means_, k=3)
pvalue_dict = get_pvalue_dict(gmm_fit.means_, samples, labels, knn_dict)

# fit tsne on samples and fitted means together, then split the means off again
tsne = TSNE(perplexity=100, metric='euclidean', n_jobs=8, random_state=42, verbose=False)
n_means = len(gmm_fit.means_)
joint_emb = tsne.fit(np.vstack([samples, gmm_fit.means_]))

tsne_cluster_means = joint_emb[-n_means:]
tsne_emb = joint_emb[:-n_means]

# first figure: uncoloured samples plus graph
plt.scatter(*tsne_emb.T, alpha=0.4, rasterized=True)
_draw_cluster_graph(tsne_cluster_means, knn_dict, pvalue_dict)
plt.show()
plt.clf()


# second figure: samples coloured by fitted component plus graph
palette = sns.color_palette(cc.glasbey, n_colors=n_components_fit)
for c in range(n_components_fit):
    in_cluster = labels == c
    plt.scatter(
        *tsne_emb[in_cluster].T,
        s=3,
        color=palette[c],
        alpha=0.4,
        rasterized=True,
        label=c,
    )
plt.legend(bbox_to_anchor=(1, 1), markerscale=3)

_draw_cluster_graph(tsne_cluster_means, knn_dict, pvalue_dict)


## fit GMM with 20 components

In [None]:
# Refit on the same samples, now with 20 full-covariance components.
n_components_fit = 20

gmm_fit = GaussianMixture(
    covariance_type="full", n_components=n_components_fit
).fit(samples)

# Assign every sample to one of the fitted components.
labels = gmm_fit.predict(samples)

In [None]:
# Neighbour lists for the refitted means (k=3, default thresh of np.inf).
knn_dict = get_knn_dict(gmm_fit.means_, k=3)

In [None]:
tsne = TSNE(perplexity=100, metric='euclidean', n_jobs=8, random_state=42, verbose=False)

# Samples and fitted means are embedded in one run so that both end up in the
# same embedding; the means are appended last and split off again afterwards.
n_means = len(gmm_fit.means_)
joint_emb = tsne.fit(np.vstack([samples, gmm_fit.means_]))

tsne_cluster_means = joint_emb[-n_means:]
tsne_emb = joint_emb[:-n_means]

In [None]:
# Dip-test p-value for every (cluster, neighbour) pair listed in knn_dict.
pvalue_dict = get_pvalue_dict(gmm_fit.means_, samples, labels, knn_dict)

In [None]:
# Uncoloured scatter of the embedded samples with the cluster graph on top.
# No legend is requested: nothing in this figure carries a label, so
# plt.legend() would only warn that there are no labelled artists and add an
# empty legend box.
plt.scatter(*tsne_emb.T, alpha=0.4, rasterized=True, s=3)

# Embedded GMM means as small black dots.
plt.plot(
    tsne_cluster_means[:, 0],
    tsne_cluster_means[:, 1],
    "o",
    c="black",
    markersize=2,
)

# One line per (cluster, neighbour) pair. Despite its name, `dip` holds the
# dip-test p-value from pvalue_dict; it sets the opacity of the line and of the
# text label placed at the middle of the line.
for (cm, neighs), (_, dips) in zip(knn_dict.items(), pvalue_dict.items()):
    for n, dip in zip(neighs, dips):
        plt.plot(
            [tsne_cluster_means[cm][0], tsne_cluster_means[n][0]],
            [tsne_cluster_means[cm][1], tsne_cluster_means[n][1]],
            alpha=dip,
            c="black",
        )

        print(f"{cm=}", f"{n=}", f"{dip:.3f}")

        plt.text(
            (tsne_cluster_means[cm][0] + tsne_cluster_means[n][0]) / 2,
            (tsne_cluster_means[cm][1] + tsne_cluster_means[n][1]) / 2,
            f"{dip:.3f}",
            fontsize=8,
            alpha=dip,
        )


In [None]:
palette = sns.color_palette(cc.glasbey, n_colors=n_components_fit)

# One labelled scatter per fitted component so that each gets its own colour
# and its own legend entry.
for c in range(n_components_fit):
    in_cluster = labels == c
    plt.scatter(
        *tsne_emb[in_cluster].T,
        s=3,
        color=palette[c],
        alpha=0.4,
        rasterized=True,
        label=c,
    )
plt.legend(bbox_to_anchor=(1, 1), markerscale=3)

# Embedded GMM means as small black dots.
plt.plot(tsne_cluster_means[:, 0], tsne_cluster_means[:, 1], "o", c="black", markersize=2)

# Edges of the neighbour graph; `dip` is the dip-test p-value of the pair and
# sets the opacity of the line and of its label at the middle of the line.
for (cm, neighs), (_, dips) in zip(knn_dict.items(), pvalue_dict.items()):
    x_cm, y_cm = tsne_cluster_means[cm]
    for n, dip in zip(neighs, dips):
        x_n, y_n = tsne_cluster_means[n]

        plt.plot([x_cm, x_n], [y_cm, y_n], alpha=dip, c="black")

        print(f"{cm=}", f"{n=}", f"{dip:.3f}")

        plt.text(
            (x_cm + x_n) / 2,
            (y_cm + y_n) / 2,
            f"{dip:.3f}",
            fontsize=8,
            alpha=dip,
        )


## fit GMM with 40 components

In [None]:
def _draw_cluster_graph(centers_emb, neighbour_map, pvalue_map):
    """Draw the embedded cluster means and their neighbour edges on the current axes.

    Every edge, and the three-decimal p-value label at its midpoint, is drawn
    with an opacity equal to the p-value of that edge.
    """
    plt.plot(centers_emb[:, 0], centers_emb[:, 1], "o", c="black", markersize=2)

    for (src, nbrs), (_, pvals) in zip(neighbour_map.items(), pvalue_map.items()):
        x_src, y_src = centers_emb[src]
        for dst, pval in zip(nbrs, pvals):
            x_dst, y_dst = centers_emb[dst]
            plt.plot([x_src, x_dst], [y_src, y_dst], alpha=pval, c="black")
            plt.text(
                (x_src + x_dst) / 2,
                (y_src + y_dst) / 2,
                f"{pval:.3f}",
                fontsize=8,
                alpha=pval,
            )


# Reseed NumPy's global RNG before refitting.
np.random.seed(42)

# fit a gmm with 40 full-covariance components on the existing samples
n_components_fit = 40
gmm_fit = GaussianMixture(
    covariance_type="full", n_components=n_components_fit
).fit(samples)

labels = gmm_fit.predict(samples)

# construct graph
knn_dict = get_knn_dict(gmm_fit.means_, k=3)
pvalue_dict = get_pvalue_dict(gmm_fit.means_, samples, labels, knn_dict)

# fit tsne on samples and fitted means together, then split the means off again
tsne = TSNE(perplexity=100, metric='euclidean', n_jobs=8, random_state=42, verbose=False)
n_means = len(gmm_fit.means_)
joint_emb = tsne.fit(np.vstack([samples, gmm_fit.means_]))

tsne_cluster_means = joint_emb[-n_means:]
tsne_emb = joint_emb[:-n_means]

# first figure: uncoloured samples plus graph
plt.scatter(*tsne_emb.T, alpha=0.4, rasterized=True, s=3)
_draw_cluster_graph(tsne_cluster_means, knn_dict, pvalue_dict)
plt.show()
plt.clf()


# second figure: samples coloured by fitted component plus graph
# (no legend is drawn in this cell)
palette = sns.color_palette(cc.glasbey, n_colors=n_components_fit)
for c in range(n_components_fit):
    in_cluster = labels == c
    plt.scatter(
        *tsne_emb[in_cluster].T,
        s=3,
        color=palette[c],
        alpha=0.4,
        rasterized=True,
        label=c,
    )

_draw_cluster_graph(tsne_cluster_means, knn_dict, pvalue_dict)


## fit GMM with 10 components

In [None]:
def _draw_cluster_graph(centers_emb, neighbour_map, pvalue_map):
    """Draw the embedded cluster means and their neighbour edges on the current axes.

    Every edge, and the three-decimal p-value label at its midpoint, is drawn
    with an opacity equal to the p-value of that edge.
    """
    plt.plot(centers_emb[:, 0], centers_emb[:, 1], "o", c="black", markersize=2)

    for (src, nbrs), (_, pvals) in zip(neighbour_map.items(), pvalue_map.items()):
        x_src, y_src = centers_emb[src]
        for dst, pval in zip(nbrs, pvals):
            x_dst, y_dst = centers_emb[dst]
            plt.plot([x_src, x_dst], [y_src, y_dst], alpha=pval, c="black")
            plt.text(
                (x_src + x_dst) / 2,
                (y_src + y_dst) / 2,
                f"{pval:.3f}",
                fontsize=8,
                alpha=pval,
            )


# Reseed NumPy's global RNG before refitting.
np.random.seed(42)

# fit a gmm with 10 full-covariance components on the existing samples
n_components_fit = 10
gmm_fit = GaussianMixture(
    covariance_type="full", n_components=n_components_fit
).fit(samples)

labels = gmm_fit.predict(samples)

# construct graph
knn_dict = get_knn_dict(gmm_fit.means_, k=3)
pvalue_dict = get_pvalue_dict(gmm_fit.means_, samples, labels, knn_dict)

# fit tsne on samples and fitted means together, then split the means off again
tsne = TSNE(perplexity=100, metric='euclidean', n_jobs=8, random_state=42, verbose=False)
n_means = len(gmm_fit.means_)
joint_emb = tsne.fit(np.vstack([samples, gmm_fit.means_]))

tsne_cluster_means = joint_emb[-n_means:]
tsne_emb = joint_emb[:-n_means]

# first figure: uncoloured samples plus graph
plt.scatter(*tsne_emb.T, alpha=0.4, rasterized=True, s=3)
_draw_cluster_graph(tsne_cluster_means, knn_dict, pvalue_dict)
plt.show()
plt.clf()


# second figure: samples coloured by fitted component plus graph
palette = sns.color_palette(cc.glasbey, n_colors=n_components_fit)
for c in range(n_components_fit):
    in_cluster = labels == c
    plt.scatter(
        *tsne_emb[in_cluster].T,
        s=3,
        color=palette[c],
        alpha=0.4,
        rasterized=True,
        label=c,
    )
plt.legend(bbox_to_anchor=(1, 1), markerscale=3)

_draw_cluster_graph(tsne_cluster_means, knn_dict, pvalue_dict)
