## Hopkins Statistic

The following section includes code taken from Prathma Chowskey's Python implementation of calculating the Hopkins Statistic. His code repository for this can be found here: https://github.com/prathmachowksey/Hopkins-Statistic-Clustering-Tendency.

This analysis may be re-run using the 'hopkins' package in R to ensure that implementation was done correctly.

In [2]:
# import necessary libraries
import numpy as np
from sklearn.cluster import KMeans
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from random import sample
from numpy.random import uniform
import os

In [2]:
# function to compute hopkins's statistic for the dataframe X
def hopkins_statistic(X):
    
    X=X.values  #convert dataframe to a numpy array
    sample_size = int(X.shape[0]*0.05) #0.05 (5%) based on paper by Lawson and Jures
    
    
    #a uniform random sample in the original data space
    X_uniform_random_sample = uniform(X.min(axis=0), X.max(axis=0) ,(sample_size , X.shape[1]))
    
    
    
    #a random sample of size sample_size from the original data X
    random_indices=sample(range(0, X.shape[0], 1), sample_size)
    X_sample = X[random_indices]
   
    
    #initialise unsupervised learner for implementing neighbor searches
    neigh = NearestNeighbors(n_neighbors=2)
    nbrs=neigh.fit(X)
    
    #u_distances = nearest neighbour distances from uniform random sample
    u_distances , u_indices = nbrs.kneighbors(X_uniform_random_sample , n_neighbors=2)
    u_distances = u_distances[: , 0] #distance to the first (nearest) neighbour
    
    #w_distances = nearest neighbour distances from a sample of points from original data X
    w_distances , w_indices = nbrs.kneighbors(X_sample , n_neighbors=2)
    #distance to the second nearest neighbour (as the first neighbour will be the point itself, with distance = 0)
    w_distances = w_distances[: , 1]
    
 
    
    u_sum = np.sum(u_distances)
    w_sum = np.sum(w_distances)
    
    #compute and return hopkins' statistic
    H = u_sum/ (u_sum + w_sum)
    return H


### Hopkins Statistics for Data Including 0 Scores

In [5]:
os.chdir("C:/Users/Elijah/Desktop/results-including-0-scores")
allData = pd.DataFrame()
for filename in os.listdir(os.getcwd()):
    file = pd.read_csv(filename)
    allData = pd.concat([allData, file])

In [6]:
allData2 = allData.drop(allData.columns[[0,1]], axis=1)

In [7]:
all_zscores = allData2.loc[:, ['Distance Z-score']]

In [8]:
allData2

Unnamed: 0,Year,Semester,Quiz #,Student ID,Coding Problem,Score,Maximum,Distance,Percent,Distance Z-score,Distance Min-Max Scaled
0,2017,fall,quiz06,s159c1ea3,AllCharsExcept,3.0,5,44.719697,0.334183,-0.331719,0.089071
1,2017,fall,quiz06,s1a2bc1e2,AllCharsExcept,5.0,5,37.507576,0.280288,-0.688284,0.015336
2,2017,fall,quiz06,s1a748834,AllCharsExcept,5.0,5,37.681818,0.281590,-0.679670,0.017117
3,2017,fall,quiz06,s204527a1,AllCharsExcept,5.0,5,48.295455,0.360904,-0.154934,0.125629
4,2017,fall,quiz06,s20ffbd50,AllCharsExcept,5.0,5,133.818182,1.000000,4.073288,1.000000
...,...,...,...,...,...,...,...,...,...,...,...
235,2018,fall,quiz11,sf1f60f2f,WriteAndGrade,50.0,50,200.698745,0.335913,-0.738330,0.048055
236,2018,fall,quiz11,sf39b2f6f,WriteAndGrade,50.0,50,221.213389,0.370248,-0.504492,0.097274
237,2018,fall,quiz11,sf80872c6,WriteAndGrade,49.0,50,357.129707,0.597734,1.044761,0.423366
238,2018,fall,quiz11,sfce0219,WriteAndGrade,37.0,50,209.790795,0.351130,-0.634693,0.069868


In [10]:
H = hopkins_statistic(all_zscores)

In [13]:
# for average Hopkins Statistic
l = []
for i in range(20):
    H=hopkins_statistic(all_zscores)
    l.append(H)

np.mean(l)

0.9971425738069264

### Hopkins Statistics for Correct Problems Only

In [14]:
os.chdir("C:/Users/Elijah/Desktop/results-only-full-scores")
fullScoreData = pd.DataFrame()
for filename in os.listdir(os.getcwd()):
    file = pd.read_csv(filename)
    fullScoreData = pd.concat([fullScoreData, file])

In [16]:
fullScoreData2 = fullScoreData.drop(fullScoreData.columns[0], axis=1)

In [18]:
fullscore_zscores = fullScoreData2.loc[:, ['Distance Z-score']]

In [19]:
H = hopkins_statistic(fullscore_zscores)

In [21]:
# for average Hopkins Statistic
l = []
for i in range(20):
    H=hopkins_statistic(all_zscores)
    l.append(H)

np.mean(l)

0.9950216665530455

### Hopkins Statistics for Data Including 0 Score & Within 25-75 Quartile

In [4]:
os.chdir("C:/Users/Elijah/Documents/GitHub/code-to-ast/data-analysis")
quart_data = pd.read_csv("25-75QuartileStudentData.csv")

In [9]:
student_list = quart_data['Student ID'].to_list()

In [18]:
data = allData2[allData2['Student ID'].isin(student_list)]

In [20]:
data_final = data.loc[:, ['Distance Z-score']]

In [22]:
# for average Hopkins Statistic for 25-75 Quartile
l = []
for i in range(20):
    H=hopkins_statistic(data_final)
    l.append(H)

np.mean(l)

0.9894995488997429

In [23]:
hopkins_statistic(data_final)

0.9726714649200251

### Hopkins Statistics for 0-25 & 75-100 Quartiles

In [4]:
os.chdir("C:/Users/Elijah/Documents/GitHub/code-to-ast/data-analysis")
low_quart = pd.read_csv("0-25QuartileStudentData.csv")
high_quart = pd.read_csv("75-100QuartileStudentData.csv")

In [9]:
low_student_list = low_quart['Student ID'].to_list()
high_student_list = high_quart['Student ID'].to_list()
low_data = allData2[allData2['Student ID'].isin(low_student_list)]
high_data = allData2[allData2['Student ID'].isin(high_student_list)]

In [10]:
low_data_final = low_data.loc[:, ['Distance Z-score']]
high_data_final = high_data.loc[:, ['Distance Z-score']]

In [11]:
# for average Hopkins Statistic for 0-25 quartile
l = []
for i in range(20):
    H=hopkins_statistic(low_data_final)
    l.append(H)

np.mean(l)

0.9914030457402532

In [12]:
# for average Hopkins Statistic for 75-100 quartile
l = []
for i in range(20):
    H=hopkins_statistic(high_data_final)
    l.append(H)

np.mean(l)

0.9826367472445213