In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import HDBSCAN

from corc.generation import GenerationModel

%matplotlib inline

# References

- https://scikit-learn.org/stable/modules/generated/sklearn.cluster.HDBSCAN.html
- https://scikit-learn.org/stable/auto_examples/cluster/plot_hdbscan.html#sphx-glr-auto-examples-cluster-plot-hdbscan-py
- https://hdbscan.readthedocs.io/en/latest/parameter_selection.html
- https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html

# Equidistant triangle

In [None]:
# Keyword arguments for GenerationModel (unpacked with ** when the
# generator is constructed): three centres in an equidistant triangle, 2-D.
params = dict(
    center_structure='equidistant_triangle',
    n_centers=3,
    distance=1,
    n_samples=1000,
    dim=2,
    save_file=False,
    outdir='.',
)

In [None]:
# Build the generator from the settings in `params`.
# NOTE(review): generate() presumably creates the cluster centres that
# sample_embedding() later draws around — confirm in corc.generation.
gen = GenerationModel(**params)
gen.generate()

## std = 0.01

In [None]:
# Draw a sample with std=0.01 (presumably the spread of each cluster —
# confirm in corc.generation) and plot its first two coordinates,
# coloured by the generator's labels.
std = 0.01
data = gen.sample_embedding(std=std)
plt.scatter(data[:,0], data[:,1], c=gen.labels)

In [None]:
# Fit HDBSCAN on the sample; min_cluster_size=10 is the smallest group
# scikit-learn will accept as a cluster (smaller groups become noise).
clustering = HDBSCAN(min_cluster_size=10)
clustering.fit(data)

In [None]:
# Peek at the first entry of probabilities_ (per scikit-learn: how strongly
# each sample belongs to its assigned cluster).
clustering.probabilities_[0]

In [None]:
# HDBSCAN marks noise/outliers with negative labels (-1), so counting every
# unique label over-reports the number of clusters whenever noise exists.
# Count only the non-negative labels.
n_clusters = np.count_nonzero(np.unique(clustering.labels_) >= 0)
print(f'n clusters = {n_clusters}')
plt.scatter(data[:,0], data[:,1], c=clustering.labels_)

## std = 0.1

In [None]:
# Draw a sample with std=0.1 and plot its first two coordinates,
# coloured by the generator's labels.
std = 0.1
data = gen.sample_embedding(std=std)
plt.scatter(data[:,0], data[:,1], c=gen.labels)

In [None]:
# Cluster the current sample with HDBSCAN.
clustering = HDBSCAN(min_cluster_size=10)
clustering.fit(data)

# Negative labels (-1) mark noise, not a cluster: exclude them from the count.
n_clusters = np.count_nonzero(np.unique(clustering.labels_) >= 0)
print(f'n clusters = {n_clusters}')

# Points coloured by assigned cluster label (noise keeps its own colour).
plt.scatter(data[:,0], data[:,1], c=clustering.labels_)

## std = 0.2

In [None]:
# Draw a sample with std=0.2 and plot its first two coordinates,
# coloured by the generator's labels.
std = 0.2
data = gen.sample_embedding(std=std)
plt.scatter(data[:,0], data[:,1], c=gen.labels)

In [None]:
# Cluster the current sample with HDBSCAN.
clustering = HDBSCAN(min_cluster_size=10)
clustering.fit(data)

# Negative labels (-1) mark noise, not a cluster: exclude them from the count.
n_clusters = np.count_nonzero(np.unique(clustering.labels_) >= 0)
print(f'n clusters = {n_clusters}')

# Points coloured by assigned cluster label (noise keeps its own colour).
plt.scatter(data[:,0], data[:,1], c=clustering.labels_)

## std = 0.4

In [None]:
# Draw a sample with std=0.4 and plot its first two coordinates,
# coloured by the generator's labels.
std = 0.4
data = gen.sample_embedding(std=std)
plt.scatter(data[:,0], data[:,1], c=gen.labels)

In [None]:
# Cluster the current sample with HDBSCAN.
clustering = HDBSCAN(min_cluster_size=10)
clustering.fit(data)

# Negative labels (-1) mark noise, not a cluster: exclude them from the count.
n_clusters = np.count_nonzero(np.unique(clustering.labels_) >= 0)
print(f'n clusters = {n_clusters}')

# Points coloured by assigned cluster label (noise keeps its own colour).
plt.scatter(data[:,0], data[:,1], c=clustering.labels_)

# More than 3 clusters

In [None]:
# Keyword arguments for GenerationModel (unpacked with ** when the
# generator is constructed): seven centres, 'uniform' structure, 2-D.
params = dict(
    center_structure='uniform',
    n_centers=7,
    distance=1,
    n_samples=1000,
    dim=2,
    save_file=False,
    outdir='.',
)

In [None]:
# Rebuild the generator from the current `params` (replaces the previous `gen`).
# NOTE(review): generate() presumably creates the cluster centres that
# sample_embedding() later draws around — confirm in corc.generation.
gen = GenerationModel(**params)
gen.generate()

## std = 0.01

In [None]:
# Draw a sample with std=0.01 and plot its first two coordinates,
# coloured by the generator's labels.
std = 0.01
data = gen.sample_embedding(std=std)
plt.scatter(data[:,0], data[:,1], c=gen.labels)

In [None]:
# Cluster the current sample with HDBSCAN.
clustering = HDBSCAN(min_cluster_size=10)
clustering.fit(data)

# Negative labels (-1) mark noise, not a cluster: exclude them from the count.
n_clusters = np.count_nonzero(np.unique(clustering.labels_) >= 0)
print(f'n clusters = {n_clusters}')

# Points coloured by assigned cluster label (noise keeps its own colour).
plt.scatter(data[:,0], data[:,1], c=clustering.labels_)

## std = 0.05

In [None]:
# Draw a sample with std=0.05 and plot its first two coordinates,
# coloured by the generator's labels.
std = 0.05
data = gen.sample_embedding(std=std)
plt.scatter(data[:,0], data[:,1], c=gen.labels)

In [None]:
# Cluster the current sample with a small minimum cluster size (3).
clustering = HDBSCAN(min_cluster_size=3)
clustering.fit(data)

# Negative labels (-1) mark noise, not a cluster: exclude them from the count.
n_clusters = np.count_nonzero(np.unique(clustering.labels_) >= 0)
print(f'n clusters = {n_clusters}')

# Points coloured by assigned cluster label (noise keeps its own colour).
plt.scatter(data[:,0], data[:,1], c=clustering.labels_)

In [None]:
# Cluster the current sample with min_cluster_size=4.
clustering = HDBSCAN(min_cluster_size=4)
clustering.fit(data)

# Negative labels (-1) mark noise, not a cluster: exclude them from the count.
n_clusters = np.count_nonzero(np.unique(clustering.labels_) >= 0)
print(f'n clusters = {n_clusters}')

# Points coloured by assigned cluster label (noise keeps its own colour).
plt.scatter(data[:,0], data[:,1], c=clustering.labels_)

## std = 0.1

In [None]:
# Draw a sample with std=0.1 and plot its first two coordinates,
# coloured by the generator's labels.
std = 0.1
data = gen.sample_embedding(std=std)
plt.scatter(data[:,0], data[:,1], c=gen.labels)

In [None]:
# Cluster the current sample with min_cluster_size=7.
clustering = HDBSCAN(min_cluster_size=7)
clustering.fit(data)

# Negative labels (-1) mark noise, not a cluster: exclude them from the count.
n_clusters = np.count_nonzero(np.unique(clustering.labels_) >= 0)
print(f'n clusters = {n_clusters}')

# Points coloured by assigned cluster label (noise keeps its own colour).
plt.scatter(data[:,0], data[:,1], c=clustering.labels_)

In [None]:
# Cluster the current sample with min_cluster_size=6.
clustering = HDBSCAN(min_cluster_size=6)
clustering.fit(data)

# Negative labels (-1) mark noise, not a cluster: exclude them from the count.
n_clusters = np.count_nonzero(np.unique(clustering.labels_) >= 0)
print(f'n clusters = {n_clusters}')

# Points coloured by assigned cluster label (noise keeps its own colour).
plt.scatter(data[:,0], data[:,1], c=clustering.labels_)

# More than 2 dimensions

In [None]:
from sklearn.manifold import trustworthiness, TSNE

In [None]:
# Keyword arguments for GenerationModel (unpacked with ** when the
# generator is constructed): three centres, 'uniform' structure, dim=32.
params = dict(
    center_structure='uniform',
    n_centers=3,
    distance=1,
    n_samples=1000,
    dim=32,
    save_file=False,
    outdir='.',
)

In [None]:
# Rebuild the generator from the current `params` (replaces the previous `gen`).
# NOTE(review): generate() presumably creates the cluster centres that
# sample_embedding() later draws around — confirm in corc.generation.
gen = GenerationModel(**params)
gen.generate()

## std = 0.2

In [None]:
# Draw a sample with std=0.2. Only columns 0 and 1 are plotted; with
# params['dim'] = 32 this is presumably just a 2-D slice of the data.
std = 0.2
data = gen.sample_embedding(std=std)
plt.scatter(data[:,0], data[:,1], c=gen.labels)

In [None]:
# Cluster the current sample with HDBSCAN.
clustering = HDBSCAN(min_cluster_size=10)
clustering.fit(data)

# Negative labels (-1) mark noise, not a cluster: exclude them from the count.
n_clusters = np.count_nonzero(np.unique(clustering.labels_) >= 0)
print(f'n clusters = {n_clusters}')

# First two coordinates only, coloured by assigned cluster label.
plt.scatter(data[:,0], data[:,1], c=clustering.labels_)

In [None]:
# Project the sample to 2-D with t-SNE for plotting.
# NOTE(review): no random_state is passed, so the layout differs between runs.
tsne = TSNE(n_components=2, perplexity=30).fit_transform(data)

In [None]:
# t-SNE projection coloured by the generator's labels.
plt.scatter(tsne[:,0], tsne[:,1], c=gen.labels)

In [None]:
# Same projection coloured by the HDBSCAN labels (noise is labelled -1).
plt.scatter(tsne[:,0], tsne[:,1], c=clustering.labels_)

## std = 0.4

In [None]:
# Draw a sample with std=0.4. Only columns 0 and 1 are plotted; with
# params['dim'] = 32 this is presumably just a 2-D slice of the data.
std = 0.4
data = gen.sample_embedding(std=std)
plt.scatter(data[:,0], data[:,1], c=gen.labels)

In [None]:
# Cluster the current sample; min_samples is lowered to 3 here.
clustering = HDBSCAN(min_cluster_size=10, min_samples=3)
clustering.fit(data)

# Negative labels (-1) mark noise, not a cluster: exclude them from the count.
n_clusters = np.count_nonzero(np.unique(clustering.labels_) >= 0)
print(f'n clusters = {n_clusters}')

# plt.scatter(data[:,0], data[:,1], c=clustering.labels_)

In [None]:
# Project the sample to 2-D with t-SNE for plotting.
# NOTE(review): no random_state is passed, so the layout differs between runs.
tsne = TSNE(n_components=2, perplexity=30).fit_transform(data)

In [None]:
# t-SNE projection coloured by the generator's labels.
plt.scatter(tsne[:,0], tsne[:,1], c=gen.labels)

In [None]:
# Same projection coloured by the HDBSCAN labels (noise is labelled -1).
plt.scatter(tsne[:,0], tsne[:,1], c=clustering.labels_)

# More clusters and more than 2 dimensions

In [None]:
# Keyword arguments for GenerationModel (unpacked with ** when the
# generator is constructed): six centres, 'uniform' structure, dim=32.
params = dict(
    center_structure='uniform',
    n_centers=6,
    distance=1,
    n_samples=1000,
    dim=32,
    save_file=False,
    outdir='.',
)

In [None]:
# Rebuild the generator from the current `params` (replaces the previous `gen`).
# NOTE(review): generate() presumably creates the cluster centres that
# sample_embedding() later draws around — confirm in corc.generation.
gen = GenerationModel(**params)
gen.generate()

## std = 0.1

In [None]:
# Draw a sample with std=0.1. Only columns 0 and 1 are plotted; with
# params['dim'] = 32 this is presumably just a 2-D slice of the data.
std = 0.1
data = gen.sample_embedding(std=std)
plt.scatter(data[:,0], data[:,1], c=gen.labels)

In [None]:
# Cluster the current sample with HDBSCAN.
clustering = HDBSCAN(min_cluster_size=10)
clustering.fit(data)

# Negative labels (-1) mark noise, not a cluster: exclude them from the count.
n_clusters = np.count_nonzero(np.unique(clustering.labels_) >= 0)
print(f'n clusters = {n_clusters}')

# First two coordinates only, coloured by assigned cluster label.
plt.scatter(data[:,0], data[:,1], c=clustering.labels_)

In [None]:
# Project the sample to 2-D with t-SNE for plotting.
# NOTE(review): no random_state is passed, so the layout differs between runs.
tsne = TSNE(n_components=2, perplexity=30).fit_transform(data)

In [None]:
# t-SNE projection coloured by the generator's labels.
plt.scatter(tsne[:,0], tsne[:,1], c=gen.labels)

In [None]:
# Same projection coloured by the HDBSCAN labels (noise is labelled -1).
plt.scatter(tsne[:,0], tsne[:,1], c=clustering.labels_)

## std = 0.3

In [None]:
# Draw a sample with std=0.3. Only columns 0 and 1 are plotted; with
# params['dim'] = 32 this is presumably just a 2-D slice of the data.
std = 0.3
data = gen.sample_embedding(std=std)
plt.scatter(data[:,0], data[:,1], c=gen.labels)

In [None]:
# Cluster the current sample; min_samples is set explicitly to 10.
clustering = HDBSCAN(min_cluster_size=10, min_samples=10)
clustering.fit(data)

# Negative labels (-1) mark noise, not a cluster: exclude them from the count.
n_clusters = np.count_nonzero(np.unique(clustering.labels_) >= 0)
print(f'n clusters = {n_clusters}')

# First two coordinates only, coloured by assigned cluster label.
plt.scatter(data[:,0], data[:,1], c=clustering.labels_)

In [None]:
# Project the sample to 2-D with t-SNE for plotting.
# NOTE(review): no random_state is passed, so the layout differs between runs.
tsne = TSNE(n_components=2, perplexity=30).fit_transform(data)

In [None]:
# t-SNE projection coloured by the generator's labels.
plt.scatter(tsne[:,0], tsne[:,1], c=gen.labels)

In [None]:
# Same projection coloured by the HDBSCAN labels (noise is labelled -1).
plt.scatter(tsne[:,0], tsne[:,1], c=clustering.labels_)

## std = 0.4

In [None]:
# Draw a sample with std=0.4. Only columns 0 and 1 are plotted; with
# params['dim'] = 32 this is presumably just a 2-D slice of the data.
std = 0.4
data = gen.sample_embedding(std=std)
plt.scatter(data[:,0], data[:,1], c=gen.labels)

In [None]:
# Cluster the current sample with smaller min_cluster_size (5) and min_samples (3).
clustering = HDBSCAN(min_cluster_size=5, min_samples=3)
clustering.fit(data)

# Negative labels (-1) mark noise, not a cluster: exclude them from the count.
n_clusters = np.count_nonzero(np.unique(clustering.labels_) >= 0)
print(f'n clusters = {n_clusters}')

# plt.scatter(data[:,0], data[:,1], c=clustering.labels_)

In [None]:
# Project the sample to 2-D with t-SNE for plotting.
# NOTE(review): no random_state is passed, so the layout differs between runs.
tsne = TSNE(n_components=2, perplexity=30).fit_transform(data)

In [None]:
# t-SNE projection coloured by the generator's labels.
plt.scatter(tsne[:,0], tsne[:,1], c=gen.labels)

In [None]:
# Same projection coloured by the HDBSCAN labels (noise is labelled -1).
plt.scatter(tsne[:,0], tsne[:,1], c=clustering.labels_)

# On Mara's embeddings

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the precomputed embeddings and stack the per-row arrays in the
# 'latent_emb' and 'tsne' columns into float matrices (one row per sample).
# NOTE(review): read_pickle can execute arbitrary code while loading — only
# use it on files from a trusted source. The path is relative to the
# notebook's working directory.
df = pd.read_pickle('../../graphdino_morphological_embeddings_tsne.pkl')
latents = np.stack(df['latent_emb'].values).astype(float)
tsne = np.stack(df['tsne'].values).astype(float)

In [None]:
# Precomputed t-SNE coordinates, uncoloured; s=1 keeps the markers small.
plt.scatter(tsne[:,0], tsne[:,1], s=1)

In [None]:
# Cluster the latent embeddings using a ball tree for neighbour queries.
# 'balltree' was deprecated in scikit-learn 1.4 and removed in 1.6; the
# supported spelling is 'ball_tree' (requires scikit-learn >= 1.4).
clustering = HDBSCAN(min_cluster_size=10, min_samples=7, algorithm='ball_tree')
clustering.fit(latents)

# Negative labels (-1) mark noise, not a cluster: exclude them from the count.
n_clusters = np.count_nonzero(np.unique(clustering.labels_) >= 0)
print(f'n clusters = {n_clusters}')

# Precomputed t-SNE coordinates coloured by assigned cluster label.
plt.scatter(tsne[:,0], tsne[:,1], c=clustering.labels_, s=1)

In [None]:
# Same settings plus cluster_selection_epsilon=0.7 (per scikit-learn,
# clusters closer than this distance threshold are merged).
# 'balltree' was deprecated in scikit-learn 1.4 and removed in 1.6; the
# supported spelling is 'ball_tree' (requires scikit-learn >= 1.4).
clustering = HDBSCAN(
    min_cluster_size=10,
    min_samples=7,
    cluster_selection_epsilon=0.7,
    algorithm='ball_tree',
)
clustering.fit(latents)

# Negative labels (-1) mark noise, not a cluster: exclude them from the count.
n_clusters = np.count_nonzero(np.unique(clustering.labels_) >= 0)
print(f'n clusters = {n_clusters}')

# Precomputed t-SNE coordinates coloured by assigned cluster label.
plt.scatter(tsne[:,0], tsne[:,1], c=clustering.labels_, s=1)

In [None]:
# Cluster the latent embeddings with 'leaf' cluster selection and min_samples=100.
clustering = HDBSCAN(min_samples=100, cluster_selection_method='leaf')
clustering.fit(latents)

# Negative labels (-1) mark noise, not a cluster: exclude them from the count.
n_clusters = np.count_nonzero(np.unique(clustering.labels_) >= 0)
print(f'n clusters = {n_clusters}')

# Precomputed t-SNE coordinates coloured by assigned cluster label.
plt.scatter(tsne[:,0], tsne[:,1], c=clustering.labels_, s=1)

In [None]:
# 'leaf' cluster selection with min_samples=300 and min_cluster_size=7.
clustering = HDBSCAN(min_samples=300, min_cluster_size=7, cluster_selection_method='leaf')
clustering.fit(latents)

# Negative labels (-1) mark noise, not a cluster: exclude them from the count.
n_clusters = np.count_nonzero(np.unique(clustering.labels_) >= 0)
print(f'n clusters = {n_clusters}')

# Precomputed t-SNE coordinates coloured by assigned cluster label.
plt.scatter(tsne[:,0], tsne[:,1], c=clustering.labels_, s=1)

In [None]:
# Default cluster selection method with min_samples=300 and min_cluster_size=3.
clustering = HDBSCAN(min_samples=300, min_cluster_size=3)
clustering.fit(latents)

# Negative labels (-1) mark noise, not a cluster: exclude them from the count.
n_clusters = np.count_nonzero(np.unique(clustering.labels_) >= 0)
print(f'n clusters = {n_clusters}')

# Precomputed t-SNE coordinates coloured by assigned cluster label.
plt.scatter(tsne[:,0], tsne[:,1], c=clustering.labels_, s=1)