## Hopkins Statistic

The following section includes code taken from Prathma Chowskey's Python implementation of calculating the Hopkins Statistic. His code repository for this can be found here: https://github.com/prathmachowksey/Hopkins-Statistic-Clustering-Tendency.

This analysis may be re-run using the 'hopkins' package in R to ensure that implementation was done correctly.

In [1]:
# import necessary libraries
import numpy as np
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from random import sample
from numpy.random import uniform

In [2]:
# function to compute hopkins's statistic for the dataframe X
def hopkins_statistic(X):
    
    X=X.values  #convert dataframe to a numpy array
    sample_size = int(X.shape[0]*0.05) #0.05 (5%) based on paper by Lawson and Jures
    
    
    #a uniform random sample in the original data space
    X_uniform_random_sample = uniform(X.min(axis=0), X.max(axis=0) ,(sample_size , X.shape[1]))
    
    
    
    #a random sample of size sample_size from the original data X
    random_indices=sample(range(0, X.shape[0], 1), sample_size)
    X_sample = X[random_indices]
   
    
    #initialise unsupervised learner for implementing neighbor searches
    neigh = NearestNeighbors(n_neighbors=2)
    nbrs=neigh.fit(X)
    
    #u_distances = nearest neighbour distances from uniform random sample
    u_distances , u_indices = nbrs.kneighbors(X_uniform_random_sample , n_neighbors=2)
    u_distances = u_distances[: , 0] #distance to the first (nearest) neighbour
    
    #w_distances = nearest neighbour distances from a sample of points from original data X
    w_distances , w_indices = nbrs.kneighbors(X_sample , n_neighbors=2)
    #distance to the second nearest neighbour (as the first neighbour will be the point itself, with distance = 0)
    w_distances = w_distances[: , 1]
    
 
    
    u_sum = np.sum(u_distances)
    w_sum = np.sum(w_distances)
    
    #compute and return hopkins' statistic
    H = u_sum/ (u_sum + w_sum)
    return H


### Hopkins Statistics for Data Including 0 Scores

In [4]:
import os
os.chdir("C:/Users/Elijah/Desktop/results-including-0-scores")
allData = pd.DataFrame()
for filename in os.listdir(os.getcwd()):
    file = pd.read_csv(filename)
    allData = pd.concat([allData, file])

In [6]:
allData2 = allData.drop(allData.columns[[0,1]], axis=1)

In [8]:
all_zscores = allData2.loc[:, ['Distance Z-score']]

In [10]:
H = hopkins_statistic(all_zscores)

In [13]:
# for average Hopkins Statistic
l = []
for i in range(20):
    H=hopkins_statistic(all_zscores)
    l.append(H)

np.mean(l)

0.9971425738069264

### Hopkins Statistics for Correct Problems Only

In [14]:
os.chdir("C:/Users/Elijah/Desktop/results-only-full-scores")
fullScoreData = pd.DataFrame()
for filename in os.listdir(os.getcwd()):
    file = pd.read_csv(filename)
    fullScoreData = pd.concat([fullScoreData, file])

In [16]:
fullScoreData2 = fullScoreData.drop(fullScoreData.columns[0], axis=1)

In [18]:
fullscore_zscores = fullScoreData2.loc[:, ['Distance Z-score']]

In [19]:
H = hopkins_statistic(fullscore_zscores)

In [21]:
# for average Hopkins Statistic
l = []
for i in range(20):
    H=hopkins_statistic(all_zscores)
    l.append(H)

np.mean(l)

0.9950216665530455