# Clustering of embeddings -> Cluster or continuum?

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE

## Get data

### Sample points from a uniform distribution in 32 dimensions

In [None]:
# Number of points and dimensionality of the uniform baseline sample;
# `dim` is also the dimensionality of the Gaussian clusters below.
n_samples, dim = 1000, 32

# Layout of the synthetic clusters: how many centres are drawn and how
# many points are sampled around each of them.
n_centers, n_samples_around_c = 10, 100

## uniform

In [None]:
# Baseline without any cluster structure: every coordinate is drawn
# independently and uniformly from [0, 1).
latent_emb = np.random.uniform(low=0, high=1, size=(n_samples, dim))
# Last expression of the cell, so the notebook displays the array shape.
latent_emb.shape

In [None]:
# Project the embeddings to two dimensions with t-SNE ...
tsne = TSNE(n_components=2, perplexity=30)
projection = tsne.fit_transform(latent_emb)

# ... and scatter-plot the projected points.
clustering = pd.DataFrame(projection, columns=['x', 'y'])
sns.scatterplot(data=clustering, x='x', y='y')
plt.show()

In [None]:
# Fit k-means on `latent_emb` for several cluster counts and print the
# silhouette score of each resulting assignment.
# NOTE: no random_state is passed to KMeans, so the scores can vary
# slightly from run to run.
for n_clusters in (2, 10, 20):
    KMean = KMeans(n_clusters=n_clusters)
    KMean.fit(latent_emb)
    label = KMean.predict(latent_emb)

    # The cluster count in the message comes from the same variable that
    # configures the model, so the two cannot drift apart.
    print(f'Silhouette Score(n={n_clusters}): {silhouette_score(latent_emb, label)}')

## std = 0.01

In [None]:
# Draw one centre per cluster, uniformly from [0, 1) in every dimension.
cluster_centers = np.random.uniform(0, 1, size=(n_centers, dim))

# Sample Gaussian points (standard deviation 0.01) around each centre,
# one array of shape (n_samples_around_c, dim) per centre.
blobs = [
    np.random.normal(center, 0.01, size=(n_samples_around_c, dim))
    for center in cluster_centers
]
# Tag every sample with the index of the centre it was drawn from
# (stored as floats).
blob_ids = [np.ones(len(blob)) * idx for idx, blob in enumerate(blobs)]

# Stack the per-centre arrays so that each row is one sample; `labels`
# ends up as a single-column array aligned with the rows of `latent_emb`.
total_samples = n_centers * n_samples_around_c
latent_emb = np.array(blobs).reshape(total_samples, -1)
labels = np.array(blob_ids).reshape(total_samples, -1)

In [None]:
# Project the embeddings to two dimensions with t-SNE ...
tsne = TSNE(n_components=2, perplexity=30)
projection = tsne.fit_transform(latent_emb)

# ... and scatter-plot the projected points.
clustering = pd.DataFrame(projection, columns=['x', 'y'])
sns.scatterplot(data=clustering, x='x', y='y')
plt.show()

In [None]:
# Fit k-means on `latent_emb` for several cluster counts and print the
# silhouette score of each resulting assignment.
# NOTE: no random_state is passed to KMeans, so the scores can vary
# slightly from run to run.
for n_clusters in (2, 10, 20):
    KMean = KMeans(n_clusters=n_clusters)
    KMean.fit(latent_emb)
    label = KMean.predict(latent_emb)

    # The cluster count in the message comes from the same variable that
    # configures the model, so the two cannot drift apart.
    print(f'Silhouette Score(n={n_clusters}): {silhouette_score(latent_emb, label)}')

## std = 0.3

In [None]:
# Draw one centre per cluster, uniformly from [0, 1) in every dimension.
cluster_centers = np.random.uniform(0, 1, size=(n_centers, dim))

# Sample Gaussian points (standard deviation 0.3) around each centre,
# one array of shape (n_samples_around_c, dim) per centre.
blobs = [
    np.random.normal(center, 0.3, size=(n_samples_around_c, dim))
    for center in cluster_centers
]
# Tag every sample with the index of the centre it was drawn from
# (stored as floats).
blob_ids = [np.ones(len(blob)) * idx for idx, blob in enumerate(blobs)]

# Stack the per-centre arrays so that each row is one sample; `labels`
# ends up as a single-column array aligned with the rows of `latent_emb`.
total_samples = n_centers * n_samples_around_c
latent_emb = np.array(blobs).reshape(total_samples, -1)
labels = np.array(blob_ids).reshape(total_samples, -1)

In [None]:
# Project the embeddings to two dimensions with t-SNE ...
tsne = TSNE(n_components=2, perplexity=30)
projection = tsne.fit_transform(latent_emb)

# ... and scatter-plot the projected points.
clustering = pd.DataFrame(projection, columns=['x', 'y'])
sns.scatterplot(data=clustering, x='x', y='y')
plt.show()

In [None]:
# Fit k-means on `latent_emb` for several cluster counts and print the
# silhouette score of each resulting assignment.
# NOTE: no random_state is passed to KMeans, so the scores can vary
# slightly from run to run.
for n_clusters in (2, 10, 20):
    KMean = KMeans(n_clusters=n_clusters)
    KMean.fit(latent_emb)
    label = KMean.predict(latent_emb)

    # The cluster count in the message comes from the same variable that
    # configures the model, so the two cannot drift apart.
    print(f'Silhouette Score(n={n_clusters}): {silhouette_score(latent_emb, label)}')

## std = 0.4

In [None]:
# Draw one centre per cluster, uniformly from [0, 1) in every dimension.
cluster_centers = np.random.uniform(0, 1, size=(n_centers, dim))

# Sample Gaussian points (standard deviation 0.4) around each centre,
# one array of shape (n_samples_around_c, dim) per centre.
blobs = [
    np.random.normal(center, 0.4, size=(n_samples_around_c, dim))
    for center in cluster_centers
]
# Tag every sample with the index of the centre it was drawn from
# (stored as floats).
blob_ids = [np.ones(len(blob)) * idx for idx, blob in enumerate(blobs)]

# Stack the per-centre arrays so that each row is one sample; `labels`
# ends up as a single-column array aligned with the rows of `latent_emb`.
total_samples = n_centers * n_samples_around_c
latent_emb = np.array(blobs).reshape(total_samples, -1)
labels = np.array(blob_ids).reshape(total_samples, -1)

In [None]:
# Project the embeddings to two dimensions with t-SNE ...
tsne = TSNE(n_components=2, perplexity=30)
projection = tsne.fit_transform(latent_emb)

# ... and scatter-plot the projected points.
clustering = pd.DataFrame(projection, columns=['x', 'y'])
sns.scatterplot(data=clustering, x='x', y='y')
plt.show()

In [None]:
# Fit k-means on `latent_emb` for several cluster counts and print the
# silhouette score of each resulting assignment.
# NOTE: no random_state is passed to KMeans, so the scores can vary
# slightly from run to run.
for n_clusters in (2, 10, 20):
    KMean = KMeans(n_clusters=n_clusters)
    KMean.fit(latent_emb)
    label = KMean.predict(latent_emb)

    # The cluster count in the message comes from the same variable that
    # configures the model, so the two cannot drift apart.
    print(f'Silhouette Score(n={n_clusters}): {silhouette_score(latent_emb, label)}')

## std = 0.5

In [None]:
# Draw one centre per cluster, uniformly from [0, 1) in every dimension.
cluster_centers = np.random.uniform(0, 1, size=(n_centers, dim))

# Sample Gaussian points (standard deviation 0.5) around each centre,
# one array of shape (n_samples_around_c, dim) per centre.
blobs = [
    np.random.normal(center, 0.5, size=(n_samples_around_c, dim))
    for center in cluster_centers
]
# Tag every sample with the index of the centre it was drawn from
# (stored as floats).
blob_ids = [np.ones(len(blob)) * idx for idx, blob in enumerate(blobs)]

# Stack the per-centre arrays so that each row is one sample; `labels`
# ends up as a single-column array aligned with the rows of `latent_emb`.
total_samples = n_centers * n_samples_around_c
latent_emb = np.array(blobs).reshape(total_samples, -1)
labels = np.array(blob_ids).reshape(total_samples, -1)

In [None]:
# Project the embeddings to two dimensions with t-SNE ...
tsne = TSNE(n_components=2, perplexity=30)
projection = tsne.fit_transform(latent_emb)

# ... and scatter-plot the projected points.
clustering = pd.DataFrame(projection, columns=['x', 'y'])
sns.scatterplot(data=clustering, x='x', y='y')
plt.show()

In [None]:
# Fit k-means on `latent_emb` for several cluster counts and print the
# silhouette score of each resulting assignment.
# NOTE: no random_state is passed to KMeans, so the scores can vary
# slightly from run to run.
for n_clusters in (2, 10, 20):
    KMean = KMeans(n_clusters=n_clusters)
    KMean.fit(latent_emb)
    label = KMean.predict(latent_emb)

    # The cluster count in the message comes from the same variable that
    # configures the model, so the two cannot drift apart.
    print(f'Silhouette Score(n={n_clusters}): {silhouette_score(latent_emb, label)}')

The silhouette score for the less clearly separated distributions is very low, but the score for the correct number of clusters is still higher than the scores for the wrong numbers of clusters.