# Clustering of embeddings -> Cluster or continuum?

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

# improved Visual Assessment of cluster Tendency (iVAT)

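iVAT is basically the same as VAT, but it additionally applies a path-based distance transform to the VAT-ordered dissimilarity matrix. This produces sharper, more precise images at a heavier computing cost.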

In [None]:
from pyclustertend import ivat

## Try on different data

### Sample points from a 32-dimensional uniform distribution

In [None]:
# Uniform baseline: number of points and embedding dimensionality.
n_samples, dim = 1000, 32

# Clustered data: number of Gaussian blobs and points drawn around each one.
n_centers, n_samples_around_c = 10, 100

In [None]:
# Structure-free baseline: n_samples points drawn i.i.d. from the uniform
# distribution on [0, 1) in every one of the dim coordinates.
latent_emb = np.random.uniform(low=0, high=1, size=(n_samples, dim))
latent_emb.shape

In [None]:
# Project the uniform embeddings to 2-D with t-SNE and show them as a scatter
# plot for a quick visual check.
projection = TSNE(n_components=2, perplexity=30).fit_transform(latent_emb)
clustering = pd.DataFrame(projection, columns=['x', 'y'])
sns.scatterplot(data=clustering, x='x', y='y')
plt.show()

In [None]:
# iVAT on the full uniform sample (all n_samples rows) as a structure-free
# baseline for the clustered data sets that follow.
ivat(latent_emb)

### Apply iVAT to a random subsample of the data instead of all points

In [None]:
# Number of rows drawn from latent_emb for every subsampled iVAT run in this notebook.
sample_size = 200

In [None]:
# Draw a random subsample of the uniform data for a cheaper iVAT run.
# Sample WITHOUT replacement: np.random.randint can return the same index
# several times, which puts duplicate rows (zero pairwise distance) into the
# dissimilarity image and shrinks the effective sample size.
n_points = latent_emb.shape[0]
rand_indices = np.random.choice(
    n_points,
    size=min(sample_size, n_points),  # never ask for more rows than exist
    replace=False,
)
X_sampled = latent_emb[rand_indices]

ivat(X_sampled)

## std = 0.01

In [None]:
# Synthetic clustered data: n_centers Gaussian blobs with std = 0.01 whose
# means are drawn uniformly from [0, 1) in every coordinate.
cluster_centers = np.random.uniform(0, 1, size=(n_centers, dim))

# One normal draw per centre, in centre order, so rows stay grouped by cluster.
blobs = [
    np.random.normal(center, 0.01, size=(n_samples_around_c, dim))
    for center in cluster_centers
]
latent_emb = np.vstack(blobs)

# Column vector of float cluster ids (0.0, 1.0, ...), one row per sample.
labels = np.repeat(np.arange(n_centers, dtype=float), n_samples_around_c).reshape(-1, 1)

In [None]:
# 2-D t-SNE view of the std = 0.01 blobs.
projection = TSNE(n_components=2, perplexity=30).fit_transform(latent_emb)
clustering = pd.DataFrame(projection, columns=['x', 'y'])
sns.scatterplot(data=clustering, x='x', y='y')
plt.show()

In [None]:
# iVAT on all n_centers * n_samples_around_c points of the std = 0.01 data set.
ivat(latent_emb)

iVAT (like VAT) gives clear blocks along the diagonal, one per cluster, if the clusters are well separated.

## std = 0.3

In [None]:
# Synthetic clustered data: n_centers Gaussian blobs with std = 0.3 whose
# means are drawn uniformly from [0, 1) in every coordinate.
cluster_centers = np.random.uniform(0, 1, size=(n_centers, dim))

# One normal draw per centre, in centre order, so rows stay grouped by cluster.
blobs = [
    np.random.normal(center, 0.3, size=(n_samples_around_c, dim))
    for center in cluster_centers
]
latent_emb = np.vstack(blobs)

# Column vector of float cluster ids (0.0, 1.0, ...), one row per sample.
labels = np.repeat(np.arange(n_centers, dtype=float), n_samples_around_c).reshape(-1, 1)

In [None]:
# 2-D t-SNE view of the std = 0.3 blobs.
projection = TSNE(n_components=2, perplexity=30).fit_transform(latent_emb)
clustering = pd.DataFrame(projection, columns=['x', 'y'])
sns.scatterplot(data=clustering, x='x', y='y')
plt.show()

In [None]:
# iVAT on all n_centers * n_samples_around_c points of the std = 0.3 data set.
ivat(latent_emb)

In [None]:
# Draw a random subsample of the std = 0.3 data for a cheaper iVAT run.
# Sample WITHOUT replacement: np.random.randint can return the same index
# several times, which puts duplicate rows (zero pairwise distance) into the
# dissimilarity image and shrinks the effective sample size.
n_points = latent_emb.shape[0]
rand_indices = np.random.choice(
    n_points,
    size=min(sample_size, n_points),  # never ask for more rows than exist
    replace=False,
)
X_sampled = latent_emb[rand_indices]

ivat(X_sampled)

## std = 0.5

In [None]:
# Synthetic clustered data: n_centers Gaussian blobs with std = 0.5 whose
# means are drawn uniformly from [0, 1) in every coordinate.
cluster_centers = np.random.uniform(0, 1, size=(n_centers, dim))

# One normal draw per centre, in centre order, so rows stay grouped by cluster.
blobs = [
    np.random.normal(center, 0.5, size=(n_samples_around_c, dim))
    for center in cluster_centers
]
latent_emb = np.vstack(blobs)

# Column vector of float cluster ids (0.0, 1.0, ...), one row per sample.
labels = np.repeat(np.arange(n_centers, dtype=float), n_samples_around_c).reshape(-1, 1)

In [None]:
# 2-D t-SNE view of the std = 0.5 blobs.
projection = TSNE(n_components=2, perplexity=30).fit_transform(latent_emb)
clustering = pd.DataFrame(projection, columns=['x', 'y'])
sns.scatterplot(data=clustering, x='x', y='y')
plt.show()

In [None]:
# iVAT on all n_centers * n_samples_around_c points of the std = 0.5 data set.
ivat(latent_emb)

In [None]:
# Draw a random subsample of the std = 0.5 data for a cheaper iVAT run.
# Sample WITHOUT replacement: np.random.randint can return the same index
# several times, which puts duplicate rows (zero pairwise distance) into the
# dissimilarity image and shrinks the effective sample size.
n_points = latent_emb.shape[0]
rand_indices = np.random.choice(
    n_points,
    size=min(sample_size, n_points),  # never ask for more rows than exist
    replace=False,
)
X_sampled = latent_emb[rand_indices]

ivat(X_sampled)

## std = 0.7

In [None]:
# Synthetic clustered data: n_centers Gaussian blobs with std = 0.7 whose
# means are drawn uniformly from [0, 1) in every coordinate.
cluster_centers = np.random.uniform(0, 1, size=(n_centers, dim))

# One normal draw per centre, in centre order, so rows stay grouped by cluster.
blobs = [
    np.random.normal(center, 0.7, size=(n_samples_around_c, dim))
    for center in cluster_centers
]
latent_emb = np.vstack(blobs)

# Column vector of float cluster ids (0.0, 1.0, ...), one row per sample.
labels = np.repeat(np.arange(n_centers, dtype=float), n_samples_around_c).reshape(-1, 1)

In [None]:
# 2-D t-SNE view of the std = 0.7 blobs.
projection = TSNE(n_components=2, perplexity=30).fit_transform(latent_emb)
clustering = pd.DataFrame(projection, columns=['x', 'y'])
sns.scatterplot(data=clustering, x='x', y='y')
plt.show()

In [None]:
# Draw a random subsample of the std = 0.7 data for a cheaper iVAT run.
# Sample WITHOUT replacement: np.random.randint can return the same index
# several times, which puts duplicate rows (zero pairwise distance) into the
# dissimilarity image and shrinks the effective sample size.
n_points = latent_emb.shape[0]
rand_indices = np.random.choice(
    n_points,
    size=min(sample_size, n_points),  # never ask for more rows than exist
    replace=False,
)
X_sampled = latent_emb[rand_indices]

ivat(X_sampled)

## std = 0.8

In [None]:
# Synthetic clustered data: n_centers Gaussian blobs with std = 0.8 whose
# means are drawn uniformly from [0, 1) in every coordinate.
cluster_centers = np.random.uniform(0, 1, size=(n_centers, dim))

# One normal draw per centre, in centre order, so rows stay grouped by cluster.
blobs = [
    np.random.normal(center, 0.8, size=(n_samples_around_c, dim))
    for center in cluster_centers
]
latent_emb = np.vstack(blobs)

# Column vector of float cluster ids (0.0, 1.0, ...), one row per sample.
labels = np.repeat(np.arange(n_centers, dtype=float), n_samples_around_c).reshape(-1, 1)

In [None]:
# 2-D t-SNE view of the std = 0.8 blobs.
projection = TSNE(n_components=2, perplexity=30).fit_transform(latent_emb)
clustering = pd.DataFrame(projection, columns=['x', 'y'])
sns.scatterplot(data=clustering, x='x', y='y')
plt.show()

In [None]:
# Draw a random subsample of the std = 0.8 data for a cheaper iVAT run.
# Sample WITHOUT replacement: np.random.randint can return the same index
# several times, which puts duplicate rows (zero pairwise distance) into the
# dissimilarity image and shrinks the effective sample size.
n_points = latent_emb.shape[0]
rand_indices = np.random.choice(
    n_points,
    size=min(sample_size, n_points),  # never ask for more rows than exist
    replace=False,
)
X_sampled = latent_emb[rand_indices]

ivat(X_sampled)

## std = 1.0

In [None]:
# Synthetic clustered data: n_centers Gaussian blobs with std = 1.0 whose
# means are drawn uniformly from [0, 1) in every coordinate.
cluster_centers = np.random.uniform(0, 1, size=(n_centers, dim))

# One normal draw per centre, in centre order, so rows stay grouped by cluster.
blobs = [
    np.random.normal(center, 1.0, size=(n_samples_around_c, dim))
    for center in cluster_centers
]
latent_emb = np.vstack(blobs)

# Column vector of float cluster ids (0.0, 1.0, ...), one row per sample.
labels = np.repeat(np.arange(n_centers, dtype=float), n_samples_around_c).reshape(-1, 1)

In [None]:
# 2-D t-SNE view of the std = 1.0 blobs.
projection = TSNE(n_components=2, perplexity=30).fit_transform(latent_emb)
clustering = pd.DataFrame(projection, columns=['x', 'y'])
sns.scatterplot(data=clustering, x='x', y='y')
plt.show()

In [None]:
# Draw a random subsample of the std = 1.0 data for a cheaper iVAT run.
# Sample WITHOUT replacement: np.random.randint can return the same index
# several times, which puts duplicate rows (zero pairwise distance) into the
# dissimilarity image and shrinks the effective sample size.
n_points = latent_emb.shape[0]
rand_indices = np.random.choice(
    n_points,
    size=min(sample_size, n_points),  # never ask for more rows than exist
    replace=False,
)
X_sampled = latent_emb[rand_indices]

ivat(X_sampled)

## std = 3.0

In [None]:
# Synthetic clustered data: n_centers Gaussian blobs with std = 3.0 whose
# means are drawn uniformly from [0, 1) in every coordinate.
cluster_centers = np.random.uniform(0, 1, size=(n_centers, dim))

# One normal draw per centre, in centre order, so rows stay grouped by cluster.
blobs = [
    np.random.normal(center, 3.0, size=(n_samples_around_c, dim))
    for center in cluster_centers
]
latent_emb = np.vstack(blobs)

# Column vector of float cluster ids (0.0, 1.0, ...), one row per sample.
labels = np.repeat(np.arange(n_centers, dtype=float), n_samples_around_c).reshape(-1, 1)

In [None]:
# 2-D t-SNE view of the std = 3.0 blobs.
projection = TSNE(n_components=2, perplexity=30).fit_transform(latent_emb)
clustering = pd.DataFrame(projection, columns=['x', 'y'])
sns.scatterplot(data=clustering, x='x', y='y')
plt.show()

In [None]:
# Draw a random subsample of the std = 3.0 data for a cheaper iVAT run.
# Sample WITHOUT replacement: np.random.randint can return the same index
# several times, which puts duplicate rows (zero pairwise distance) into the
# dissimilarity image and shrinks the effective sample size.
n_points = latent_emb.shape[0]
rand_indices = np.random.choice(
    n_points,
    size=min(sample_size, n_points),  # never ask for more rows than exist
    replace=False,
)
X_sampled = latent_emb[rand_indices]

ivat(X_sampled)