In [2]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
import pandas as pd
from random import sample
from numpy.random import uniform

In [17]:
from sklearn.cluster import KMeans

In [4]:
def hopkins_statistic(X, sample_ratio=0.05, random_state=None):
    """Estimate the clustering tendency of ``X`` with the Hopkins statistic.

    Two equally sized sets of ``m`` probes are compared:

    * ``m`` points drawn uniformly at random from the axis-aligned bounding
      box of ``X``; the distance from each to its nearest row of ``X`` is
      ``u_i``.
    * ``m`` rows sampled from ``X`` without replacement; the distance from
      each to its nearest *other* row of ``X`` is ``w_i``.

    The returned value is ``H = sum(u) / (sum(u) + sum(w))`` computed on raw
    (unexponentiated) nearest-neighbour distances. Uniformly scattered data
    gives values around 0.5; strongly clustered data pushes ``H`` towards 1.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to test. Converted to a float ``numpy.ndarray`` internally, so
        nested lists and DataFrames are accepted as well as arrays.
    sample_ratio : float, default=0.05
        Fraction of the rows of ``X`` used as probes (``m``). The 5% default
        is the value the original implementation attributed to the paper by
        Lawson and Jurs. At least one probe is always used.
    random_state : None, int, or anything accepted by numpy.random.default_rng
        Seed for the two random draws. ``None`` gives a different result on
        every call; pass an int for a reproducible value.

    Returns
    -------
    float
        The Hopkins statistic in ``[0, 1]``, or ``nan`` when every measured
        distance is zero (e.g. all rows of ``X`` are identical).

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, has fewer than two rows, or ``sample_ratio`` is
        outside ``(0, 1]``.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")

    n_samples, n_features = X.shape
    if n_samples < 2:
        # The 'nearest other point' distance is undefined for a single row.
        raise ValueError("X must contain at least two samples")
    if not 0 < sample_ratio <= 1:
        raise ValueError("sample_ratio must be in the interval (0, 1]")

    # int() truncates, so small inputs would otherwise yield zero probes and
    # make the neighbour queries below fail on empty arrays.
    sample_size = max(1, int(n_samples * sample_ratio))

    rng = np.random.default_rng(random_state)

    # Uniform random probes inside the bounding box of the data.
    uniform_probes = rng.uniform(
        X.min(axis=0), X.max(axis=0), size=(sample_size, n_features)
    )

    # Random subset of the real observations (no repeats).
    sampled_rows = X[rng.choice(n_samples, size=sample_size, replace=False)]

    # Neighbour index over the full data set, shared by both queries.
    nbrs = NearestNeighbors(n_neighbors=2).fit(X)

    # Synthetic probes are not members of X, so their first neighbour is the
    # distance we want.
    u_distances, _ = nbrs.kneighbors(uniform_probes, n_neighbors=1)
    u_sum = np.sum(u_distances[:, 0])

    # Sampled rows ARE members of X: neighbour 0 is the row itself at distance
    # zero, so take neighbour 1.
    w_distances, _ = nbrs.kneighbors(sampled_rows, n_neighbors=2)
    w_sum = np.sum(w_distances[:, 1])

    total = u_sum + w_sum
    if total == 0:
        # Degenerate data (every distance is zero): the ratio is undefined.
        return float("nan")
    return float(u_sum / total)
    

In [7]:
# Load the pre-computed graph embeddings from the processed-data folder.
# NOTE(review): read_pickle unpickles arbitrary Python objects, so only load
# files produced by this project's own pipeline, never untrusted ones.
embeddings = pd.read_pickle("../data/processed/graph_data.pkl")

In [12]:
# Stack the per-row embedding vectors into one feature matrix
# (assumes every `graph_embedding` entry has the same length; TODO confirm).
X = np.array(embeddings["graph_embedding"].tolist())

In [19]:
# Check clustering tendency of the embedding matrix before fitting K-Means.
# The estimate relies on random probes and a random subsample, so the value
# changes between runs.
h_stat = hopkins_statistic(X)
h_stat

0.9781230908605468

In [21]:
# Partition the embeddings into three clusters; random_state=0 seeds the
# centroid initialisation.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases; consider pinning it for reproducible results.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(X)

KMeans(n_clusters=3, random_state=0)

In [23]:
# Coordinates of the fitted centroids, one row per cluster.
centers = kmeans.cluster_centers_

In [25]:
# Show the centroid matrix as the cell output.
centers

array([[3.90778598e-01, 3.85547740e-01, 3.71681001e-01, ...,
        5.30734484e-05, 5.30722861e-05, 5.28602729e-05],
       [4.58564182e-01, 4.52152940e-01, 4.35176267e-01, ...,
        9.31045096e-05, 9.31563514e-05, 9.28456699e-05],
       [5.71871981e-01, 5.56748757e-01, 5.17239663e-01, ...,
        5.79353645e-04, 5.92751893e-04, 6.04627975e-04]])