# Clustering of embeddings -> Cluster or continuum?

See this implementation of Hopkins' statistic for the Iris dataset:

https://github.com/prathmachowksey/Hopkins-Statistic-Clustering-Tendency/blob/master/Hopkins-Statistic-Clustering-Tendency.ipynb

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

## Index for Clustering Tendency

Brian Hopkins (1954) developed a method to determine the type of distribution of his plant data.

The statistic is calculated as follows:

- Calculate $\sum P$ from $n$ observations, where $P$ is the distance from a random point (uniform distribution) to the nearest neighbor in $X$.
- Calculate $\sum I$ from $n$ observations, where $I$ is the distance from a  point chosen at random from $X$ to its nearest neighbor in $X$.
- Calculate $A = \sum P / \sum I$
- Calculate $x = A / (1+A)$

---

Lawson and Jurs (1990) describe Hopkins' statistic as well (the same formula, written in a slightly simpler form; see the derivation below):

- Sample $n = 5 \%$ of dataset points
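- $H = \sum P / (\sum P + \sum I)$, which is Hopkins' $x$ with $A$ substituted: $x = \frac{A}{1 + A} = \frac{\sum P / \sum I}{1 + \sum P / \sum I} = \frac{\sum P}{\sum P + \sum I} = H$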

The null hypothesis is that the data is uniformly distributed and therefore there is no organization in the data. Consequently, this statistic merely provides evidence that the data has more structure than uniformly distributed random numbers.

This metric can be interpreted as follows:

- if the data contains little structure, $H \approx 0.5$
- if the data is organized in tight clusters, $H \approx 1.0$
- $H > 0.75$ provides a 90% confidence that the data is more clustered than uniformly distributed random numbers, because of the shape of the Beta distribution.
- $H \approx 0.64$ could mislead researchers into concluding that more than one cluster is present

## calculate Hopkins' statistics

In [None]:
# function to compute Hopkins' statistic for array-like X
def hopkins_statistic(X, sample_fraction=0.2):
    """Compute Hopkins' statistic H = sum(U) / (sum(U) + sum(W)) for X.

    U are the distances from points drawn uniformly at random inside the
    per-feature bounding box of X to their nearest neighbour in X. W are the
    distances from points sampled from X itself to their nearest *other*
    point in X. H close to 0.5 indicates little structure, H close to 1.0
    indicates tightly clustered data.

    Parameters
    ----------
    X : array-like of shape (n_points, n_dims)
        The data set; converted with ``np.asarray``.
    sample_fraction : float, default=0.2
        Fraction of the rows of X used as sample size. Lawson and Jurs (1990)
        suggest 0.05 (5%); the default keeps the value this notebook has
        always used.

    Returns
    -------
    float
        Hopkins' statistic, a value between 0 and 1 (``nan`` if every
        sampled distance is zero, e.g. when all points are identical).

    Raises
    ------
    ValueError
        If X has fewer than two rows or ``sample_fraction`` is not in (0, 1].
    """
    X = np.asarray(X)
    n_points = X.shape[0]
    if n_points < 2:
        raise ValueError("hopkins_statistic needs at least 2 data points")
    if not 0 < sample_fraction <= 1:
        raise ValueError("sample_fraction must be in the interval (0, 1]")

    # at least one sample, so tiny data sets do not end up with an empty query
    sample_size = max(1, int(n_points * sample_fraction))

    # uniform random sample in the original data space (per-feature bounding box)
    X_uniform_random_sample = np.random.uniform(
        X.min(axis=0), X.max(axis=0), (sample_size, X.shape[1])
    )

    # random sample of size sample_size from the original data X, drawn
    # WITHOUT replacement so that no data point is counted more than once
    random_indices = np.random.choice(n_points, size=sample_size, replace=False)
    X_sample = X[random_indices]

    # unsupervised learner used for the nearest-neighbour searches
    nbrs = NearestNeighbors(n_neighbors=2).fit(X)

    # u_distances = nearest-neighbour distances from the uniform random sample;
    # these points are not part of X, so the first neighbour is the one we want
    u_distances, _ = nbrs.kneighbors(X_uniform_random_sample, n_neighbors=1)
    u_sum = np.sum(u_distances[:, 0])

    # w_distances = nearest-neighbour distances from the sample of real points;
    # take the second neighbour, as the first one is the point itself (distance 0)
    w_distances, _ = nbrs.kneighbors(X_sample, n_neighbors=2)
    w_sum = np.sum(w_distances[:, 1])

    # compute and return Hopkins' statistic
    return u_sum / (u_sum + w_sum)
    

## Try on different data

### sample points from uniform distribution with 32 dimensions

In [None]:
# Size of the embedding matrices: number of points and number of dimensions.
n_samples, dim = 1000, 32

# Layout of the synthetic clustered data: number of cluster centres and
# number of points drawn around each centre.
n_centers, n_samples_around_c = 10, 100

In [None]:
# Unstructured reference data: n_samples points drawn uniformly between 0 and 1
# in each of the dim dimensions.
latent_emb = np.random.uniform(low=0, high=1, size=(n_samples, dim))
latent_emb.shape

In [None]:
# Project the embeddings to 2-D with t-SNE and show them as a scatter plot.
tsne_model = TSNE(n_components=2, perplexity=30)
coords_2d = tsne_model.fit_transform(latent_emb)
clustering = pd.DataFrame(coords_2d, columns=['x', 'y'])
sns.scatterplot(x='x', y='y', data=clustering)
plt.show()

In [None]:
# Compute Hopkins' statistic for the current embeddings and report it.
hopkins_value = hopkins_statistic(latent_emb)
print(f'Hopkins statistic: {hopkins_value}')

Hopkins' statistic is approximately 0.5, which indicates a uniform distribution, as expected for this data.

## std = 0.01

In [None]:
# Synthetic clustered data: n_centers Gaussian blobs with std = 0.01 whose
# centres are drawn uniformly between 0 and 1 in each of the dim dimensions.
cluster_centers = np.random.uniform(low=0, high=1, size=(n_centers, dim))

# One (n_samples_around_c, dim) block of points per centre, in centre order.
blobs = [
    np.random.normal(center, 0.01, size=(n_samples_around_c, dim))
    for center in cluster_centers
]
# Cluster index of every point (stored as floats), one array per centre.
blob_labels = [np.full(len(blob), float(idx)) for idx, blob in enumerate(blobs)]

# Stack the per-centre blocks: latent_emb has one row per point and
# labels is a single column holding the matching cluster index.
latent_emb = np.vstack(blobs)
labels = np.concatenate(blob_labels).reshape(-1, 1)

In [None]:
# Project the embeddings to 2-D with t-SNE and show them as a scatter plot.
tsne_model = TSNE(n_components=2, perplexity=30)
coords_2d = tsne_model.fit_transform(latent_emb)
clustering = pd.DataFrame(coords_2d, columns=['x', 'y'])
sns.scatterplot(x='x', y='y', data=clustering)
plt.show()

In [None]:
# Compute Hopkins' statistic for the current embeddings and report it.
hopkins_value = hopkins_statistic(latent_emb)
print(f'Hopkins statistic: {hopkins_value}')

Hopkins' statistic is close to 1, which is exactly what the metric should yield for strongly clustered data such as this.

## std = 0.5

In [None]:
# Synthetic clustered data: n_centers Gaussian blobs with std = 0.5 whose
# centres are drawn uniformly between 0 and 1 in each of the dim dimensions.
cluster_centers = np.random.uniform(low=0, high=1, size=(n_centers, dim))

# One (n_samples_around_c, dim) block of points per centre, in centre order.
blobs = [
    np.random.normal(center, 0.5, size=(n_samples_around_c, dim))
    for center in cluster_centers
]
# Cluster index of every point (stored as floats), one array per centre.
blob_labels = [np.full(len(blob), float(idx)) for idx, blob in enumerate(blobs)]

# Stack the per-centre blocks: latent_emb has one row per point and
# labels is a single column holding the matching cluster index.
latent_emb = np.vstack(blobs)
labels = np.concatenate(blob_labels).reshape(-1, 1)

In [None]:
# Project the embeddings to 2-D with t-SNE and show them as a scatter plot.
tsne_model = TSNE(n_components=2, perplexity=30)
coords_2d = tsne_model.fit_transform(latent_emb)
clustering = pd.DataFrame(coords_2d, columns=['x', 'y'])
sns.scatterplot(x='x', y='y', data=clustering)
plt.show()

In [None]:
# Compute Hopkins' statistic for the current embeddings and report it.
hopkins_value = hopkins_statistic(latent_emb)
print(f'Hopkins statistic: {hopkins_value}')

I would have expected a higher value of Hopkins' statistic here, since the data is visibly clustered; according to Lawson and Jurs (1990), a value of 0.64 does not necessarily indicate clusterable data.

## std = 0.7

In [None]:
# Synthetic clustered data: n_centers Gaussian blobs with std = 0.7 whose
# centres are drawn uniformly between 0 and 1 in each of the dim dimensions.
cluster_centers = np.random.uniform(low=0, high=1, size=(n_centers, dim))

# One (n_samples_around_c, dim) block of points per centre, in centre order.
blobs = [
    np.random.normal(center, 0.7, size=(n_samples_around_c, dim))
    for center in cluster_centers
]
# Cluster index of every point (stored as floats), one array per centre.
blob_labels = [np.full(len(blob), float(idx)) for idx, blob in enumerate(blobs)]

# Stack the per-centre blocks: latent_emb has one row per point and
# labels is a single column holding the matching cluster index.
latent_emb = np.vstack(blobs)
labels = np.concatenate(blob_labels).reshape(-1, 1)

In [None]:
# Project the embeddings to 2-D with t-SNE and show them as a scatter plot.
tsne_model = TSNE(n_components=2, perplexity=30)
coords_2d = tsne_model.fit_transform(latent_emb)
clustering = pd.DataFrame(coords_2d, columns=['x', 'y'])
sns.scatterplot(x='x', y='y', data=clustering)
plt.show()

In [None]:
# Compute Hopkins' statistic for the current embeddings and report it.
hopkins_value = hopkins_statistic(latent_emb)
print(f'Hopkins statistic: {hopkins_value}')

The value does not drop compared to the previous data set (std = 0.5), although the data is visibly less clustered.

## std = 0.8

In [None]:
# Synthetic clustered data: n_centers Gaussian blobs with std = 0.8 whose
# centres are drawn uniformly between 0 and 1 in each of the dim dimensions.
cluster_centers = np.random.uniform(low=0, high=1, size=(n_centers, dim))

# One (n_samples_around_c, dim) block of points per centre, in centre order.
blobs = [
    np.random.normal(center, 0.8, size=(n_samples_around_c, dim))
    for center in cluster_centers
]
# Cluster index of every point (stored as floats), one array per centre.
blob_labels = [np.full(len(blob), float(idx)) for idx, blob in enumerate(blobs)]

# Stack the per-centre blocks: latent_emb has one row per point and
# labels is a single column holding the matching cluster index.
latent_emb = np.vstack(blobs)
labels = np.concatenate(blob_labels).reshape(-1, 1)

In [None]:
# Project the embeddings to 2-D with t-SNE and show them as a scatter plot.
tsne_model = TSNE(n_components=2, perplexity=30)
coords_2d = tsne_model.fit_transform(latent_emb)
clustering = pd.DataFrame(coords_2d, columns=['x', 'y'])
sns.scatterplot(x='x', y='y', data=clustering)
plt.show()

In [None]:
# Compute Hopkins' statistic for the current embeddings and report it.
hopkins_value = hopkins_statistic(latent_emb)
print(f'Hopkins statistic: {hopkins_value}')

## std = 1.0

In [None]:
# Synthetic clustered data: n_centers Gaussian blobs with std = 1.0 whose
# centres are drawn uniformly between 0 and 1 in each of the dim dimensions.
cluster_centers = np.random.uniform(low=0, high=1, size=(n_centers, dim))

# One (n_samples_around_c, dim) block of points per centre, in centre order.
blobs = [
    np.random.normal(center, 1.0, size=(n_samples_around_c, dim))
    for center in cluster_centers
]
# Cluster index of every point (stored as floats), one array per centre.
blob_labels = [np.full(len(blob), float(idx)) for idx, blob in enumerate(blobs)]

# Stack the per-centre blocks: latent_emb has one row per point and
# labels is a single column holding the matching cluster index.
latent_emb = np.vstack(blobs)
labels = np.concatenate(blob_labels).reshape(-1, 1)

In [None]:
# Project the embeddings to 2-D with t-SNE and show them as a scatter plot.
tsne_model = TSNE(n_components=2, perplexity=30)
coords_2d = tsne_model.fit_transform(latent_emb)
clustering = pd.DataFrame(coords_2d, columns=['x', 'y'])
sns.scatterplot(x='x', y='y', data=clustering)
plt.show()

In [None]:
# Compute Hopkins' statistic for the current embeddings and report it.
hopkins_value = hopkins_statistic(latent_emb)
print(f'Hopkins statistic: {hopkins_value}')

## std = 3.0

In [None]:
# Synthetic clustered data: n_centers Gaussian blobs with std = 3.0 whose
# centres are drawn uniformly between 0 and 1 in each of the dim dimensions.
cluster_centers = np.random.uniform(low=0, high=1, size=(n_centers, dim))

# One (n_samples_around_c, dim) block of points per centre, in centre order.
blobs = [
    np.random.normal(center, 3.0, size=(n_samples_around_c, dim))
    for center in cluster_centers
]
# Cluster index of every point (stored as floats), one array per centre.
blob_labels = [np.full(len(blob), float(idx)) for idx, blob in enumerate(blobs)]

# Stack the per-centre blocks: latent_emb has one row per point and
# labels is a single column holding the matching cluster index.
latent_emb = np.vstack(blobs)
labels = np.concatenate(blob_labels).reshape(-1, 1)

In [None]:
# Project the embeddings to 2-D with t-SNE and show them as a scatter plot.
tsne_model = TSNE(n_components=2, perplexity=30)
coords_2d = tsne_model.fit_transform(latent_emb)
clustering = pd.DataFrame(coords_2d, columns=['x', 'y'])
sns.scatterplot(x='x', y='y', data=clustering)
plt.show()

In [None]:
# Compute Hopkins' statistic for the current embeddings and report it.
hopkins_value = hopkins_statistic(latent_emb)
print(f'Hopkins statistic: {hopkins_value}')

## One normal distribution

In [None]:
# A single Gaussian blob: n_samples points with mean 0 and std 1 in every
# one of the dim dimensions.
latent_emb = np.random.normal(loc=0, scale=1, size=(n_samples, dim))

In [None]:
# Project the embeddings to 2-D with t-SNE and show them as a scatter plot.
tsne_model = TSNE(n_components=2, perplexity=30)
coords_2d = tsne_model.fit_transform(latent_emb)
clustering = pd.DataFrame(coords_2d, columns=['x', 'y'])
sns.scatterplot(x='x', y='y', data=clustering)
plt.show()

In [None]:
# Compute Hopkins' statistic for the current embeddings and report it.
hopkins_value = hopkins_statistic(latent_emb)
print(f'Hopkins statistic: {hopkins_value}')