In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import trustworthiness, TSNE
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import scipy.cluster.hierarchy as shc

from corc.generation import GenerationModel

%matplotlib inline

# References

- https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html
- https://www.geeksforgeeks.org/implementing-agglomerative-clustering-using-sklearn/
- https://stackabuse.com/hierarchical-clustering-with-python-and-scikit-learn/
- https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
- https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.dendrogram.html

# Equidistant triangle

In [None]:
# Generator settings for the first experiment: three centres arranged as an
# equidistant triangle in two dimensions, with a very small spread.
params = dict(
    center_structure='equidistant_triangle',
    n_centers=3,
    distance=1,
    n_samples=1000,
    std=0.01,
    dim=2,
)

# Instantiate the synthetic-data generator from the settings above.
gen = GenerationModel(**params)

## std = 0.01

In [None]:
data = gen.sample_embedding()
plt.scatter(data[:,0], data[:,1], c=gen.labels)

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(data, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 3)
plt.scatter(data[:,0], data[:,1], c = ac2.fit_predict(data), cmap ='rainbow')

## std = 0.1

In [None]:
gen.stds = 0.1
data = gen.sample_embedding()
plt.scatter(data[:,0], data[:,1], c=gen.labels)

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(data, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 3)
plt.scatter(data[:,0], data[:,1], c = ac2.fit_predict(data), cmap ='rainbow')

## std = 0.2

In [None]:
gen.stds = 0.2
data = gen.sample_embedding()
plt.scatter(data[:,0], data[:,1], c=gen.labels)

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(data, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 3)
plt.scatter(data[:,0], data[:,1], c = ac2.fit_predict(data), cmap ='rainbow')

## std = 0.4

In [None]:
gen.stds = 0.4
data = gen.sample_embedding()
plt.scatter(data[:,0], data[:,1], c=gen.labels)

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(data, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 3)
plt.scatter(data[:,0], data[:,1], c = ac2.fit_predict(data), cmap ='rainbow')

# More than 3 clusters

In [None]:
params = {
    'center_structure': 'uniform',
    'n_centers': 6,
    'distance': 1,
    'n_samples': 1000,
    'dim': 2,
    'std': 0.01
}

In [None]:
gen = GenerationModel(**params)

## std = 0.01

In [None]:
data = gen.sample_embedding()
plt.scatter(data[:,0], data[:,1], c=gen.labels)

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(data, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 2)
plt.scatter(data[:,0], data[:,1], c = ac2.fit_predict(data), cmap ='rainbow')

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 7)
plt.scatter(data[:,0], data[:,1], c = ac2.fit_predict(data), cmap ='rainbow')

## std = 0.05

In [None]:
gen.stds = 0.05
data = gen.sample_embedding()
plt.scatter(data[:,0], data[:,1], c=gen.labels)

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(data, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 5)
plt.scatter(data[:,0], data[:,1], c = ac2.fit_predict(data), cmap ='rainbow')

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 6)
plt.scatter(data[:,0], data[:,1], c = ac2.fit_predict(data), cmap ='rainbow')

## std = 0.1

In [None]:
gen.stds = 0.1
data = gen.sample_embedding()
plt.scatter(data[:,0], data[:,1], c=gen.labels)

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(data, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 5)
plt.scatter(data[:,0], data[:,1], c = ac2.fit_predict(data), cmap ='rainbow')

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 6)
plt.scatter(data[:,0], data[:,1], c = ac2.fit_predict(data), cmap ='rainbow')

# More than 2 dimensions

In [None]:
params = {
    'center_structure': 'uniform',
    'n_centers': 3,
    'distance': 1,
    'n_samples': 1000,
    'dim': 32,
    'std': 0.2
}

In [None]:
gen = GenerationModel(**params)

## std = 0.2

In [None]:
data = gen.sample_embedding()

In [None]:
tsne = TSNE(n_components=2, perplexity=30).fit_transform(data)
plt.scatter(tsne[:,0], tsne[:,1], c=gen.labels)

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(data, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 3)
labels = ac2.fit_predict(data)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow')

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(tsne, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 3)
labels = ac2.fit_predict(tsne)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow')

## std = 0.4

In [None]:
gen.stds = 0.4
data = gen.sample_embedding()

In [None]:
tsne = TSNE(n_components=2, perplexity=30).fit_transform(data)
plt.scatter(tsne[:,0], tsne[:,1], c=gen.labels)

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(data, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 3)
labels = ac2.fit_predict(data)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow')

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(tsne, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 3)
labels = ac2.fit_predict(tsne)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow')

# More clusters and more than 2 dimensions

In [None]:
# Generator settings: six centres with the 'uniform' structure in 32 dimensions.
params = dict(
    center_structure='uniform',
    n_centers=6,
    distance=1,
    n_samples=1000,
    dim=32,
    std=0.1,
)

# Rebuild the generator with these settings.
gen = GenerationModel(**params)

## std = 0.1

In [None]:
data = gen.sample_embedding()

In [None]:
tsne = TSNE(n_components=2, perplexity=30).fit_transform(data)
plt.scatter(tsne[:,0], tsne[:,1], c=gen.labels)

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(data, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 6)
labels = ac2.fit_predict(data)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow')

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(tsne, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 6)
labels = ac2.fit_predict(tsne)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow')

## std = 0.3

In [None]:
gen.stds = 0.3
data = gen.sample_embedding()

In [None]:
tsne = TSNE(n_components=2, perplexity=30).fit_transform(data)
plt.scatter(tsne[:,0], tsne[:,1], c=gen.labels)

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(data, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 6)
labels = ac2.fit_predict(data)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow')

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(tsne, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 6)
labels = ac2.fit_predict(tsne)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow')

## std = 0.4

In [None]:
gen.stds = 0.4
data = gen.sample_embedding()

In [None]:
tsne = TSNE(n_components=2, perplexity=30).fit_transform(data)
plt.scatter(tsne[:,0], tsne[:,1], c=gen.labels)

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(data, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 6)
labels = ac2.fit_predict(data)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow')

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(tsne, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 6)
labels = ac2.fit_predict(tsne)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow')

# More clusters and more than 2 dimensions, unequal cluster sizes

In [None]:
# Generator settings: six centres in 32 dimensions, with equal cluster sizes
# switched off.
params = dict(
    center_structure='uniform',
    n_centers=6,
    distance=1,
    n_samples=1000,
    dim=32,
    std=0.4,
    equal_sized_clusters=False,
)

# Rebuild the generator with these settings.
gen = GenerationModel(**params)

## std = 0.4

In [None]:
data = gen.sample_embedding()

In [None]:
tsne = TSNE(n_components=2, perplexity=30).fit_transform(data)
plt.scatter(tsne[:,0], tsne[:,1], c=gen.labels)

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(data, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 6)
labels = ac2.fit_predict(data)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow')

# More clusters and more than 2 dimensions, unequal stds

In [None]:
# Generator settings: six equally sized clusters in 32 dimensions, each with
# its own randomly chosen standard deviation.
params = {
    'center_structure': 'uniform',
    'n_centers': 6,
    'distance': 1,
    'n_samples': 1000,
    'dim': 32,
    'equal_sized_clusters': True
}
# One std per centre, drawn from {0.1, 0.2, ..., 0.5} (randint's upper bound is
# exclusive). The list is sized from this dict's own 'n_centers' instead of
# `gen.n_centers`: `gen` at this point is the generator left over from the
# previous section, so relying on it breaks on a fresh kernel and could
# silently use a stale centre count.
params['std'] = list(np.random.randint(1, 6, size=params['n_centers']) / 10)

gen = GenerationModel(**params)

In [None]:
data = gen.sample_embedding()

In [None]:
tsne = TSNE(n_components=2, perplexity=30).fit_transform(data)
plt.scatter(tsne[:,0], tsne[:,1], c=gen.labels)

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(data, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 6)
labels = ac2.fit_predict(data)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow')

In [None]:
gen.stds = list(np.random.randint(1,6, size=gen.n_centers)/10)
data = gen.sample_embedding()

In [None]:
tsne = TSNE(n_components=2, perplexity=30).fit_transform(data)
plt.scatter(tsne[:,0], tsne[:,1], c=gen.labels)

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(data, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 6)
labels = ac2.fit_predict(data)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow')

# More clusters and more than 2 dimensions, unequal cluster sizes, unequal stds

In [None]:
# Generator settings: six clusters in 32 dimensions with equal cluster sizes
# switched off, each cluster with its own randomly chosen standard deviation.
params = {
    'center_structure': 'uniform',
    'n_centers': 6,
    'distance': 1,
    'n_samples': 1000,
    'dim': 32,
    'equal_sized_clusters': False
}
# One std per centre, drawn from {0.1, 0.2, ..., 0.5} (randint's upper bound is
# exclusive). The list is sized from this dict's own 'n_centers' instead of
# `gen.n_centers`: `gen` at this point is the generator left over from the
# previous section, so relying on it breaks on a fresh kernel and could
# silently use a stale centre count.
params['std'] = list(np.random.randint(1, 6, size=params['n_centers']) / 10)

gen = GenerationModel(**params)

In [None]:
data = gen.sample_embedding()

In [None]:
tsne = TSNE(n_components=2, perplexity=30).fit_transform(data)
plt.scatter(tsne[:,0], tsne[:,1], c=gen.labels)

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(data, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 6)
labels = ac2.fit_predict(data)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow')

# On Mara's embeddings

In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_pickle('/usr/users/pede1/morphology/cluster_vs_continuum/graphdino_morphological_embeddings_tsne.pkl')
df

In [None]:
latents = np.stack(df['latent_emb'].values).astype(float)
tsne = np.stack(df['tsne'].values).astype(float)
colors = np.stack(df['assigned_layer_num'].values)

In [None]:
plt.scatter(tsne[:,0], tsne[:,1], s=1, c=colors)

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(latents, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 4)
labels = ac2.fit_predict(latents)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow', s=1)

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 6)
labels = ac2.fit_predict(latents)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow', s=1)

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 9)
labels = ac2.fit_predict(latents)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow', s=1)

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 12)
labels = ac2.fit_predict(latents)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow', s=1)

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(tsne, method ='ward')))

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(tsne, method ='ward')), color_threshold=2200)

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 3)
labels = ac2.fit_predict(tsne)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow', s=1)

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 4)
labels = ac2.fit_predict(tsne)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow', s=1)

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 8)
labels = ac2.fit_predict(tsne)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow', s=1)

In [None]:
k = [3,4,5,6,7,10, 11, 12]
acs = [AgglomerativeClustering(n_clusters = i) for i in k]

In [None]:
# Appending the silhouette scores of the different models to the list
silhouette_scores = [silhouette_score(tsne, ac.fit_predict(tsne)) for ac in acs]

# Plotting a bar graph to compare the results
plt.bar(k, silhouette_scores)
plt.xlabel('Number of clusters', fontsize = 20)
plt.ylabel('S(i)', fontsize = 20)
plt.show()


In [None]:
silhouette_scores

In [None]:
latents.shape

In [None]:
pca = PCA(n_components=10)
pca.fit_transform(latents.T)
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.cumsum())

In [None]:
pcs = pca.components_.T[:,:6]

In [None]:
Dendrogram = shc.dendrogram((shc.linkage(pcs, method ='ward')))

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 3)
labels = ac2.fit_predict(pcs)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow', s=1)

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 4)
labels = ac2.fit_predict(latents)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow', s=1)

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 5)
labels = ac2.fit_predict(latents)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow', s=1)

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 6)
labels = ac2.fit_predict(latents)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow', s=1)

In [None]:
ac2 = AgglomerativeClustering(n_clusters = 10)
labels = ac2.fit_predict(latents)
plt.scatter(tsne[:,0], tsne[:,1], c = labels, cmap ='rainbow', s=1)