## Load libraries

In [5]:
# Libraries to work cross-platform
import os

# Libraries to work with dataset
import numpy as np
import pandas as pd

# Libraries for evaluation
from pyclustertend import hopkins

# Libraries for monitoring operation process
from datetime import datetime
from tqdm import tqdm

## Configure and declare global variables

In [2]:
# Resolve the project directories for the current operating system.
# Set the THESIS_BASE_DIR environment variable to point the notebook at a
# different location without editing this cell.
os_name = os.name

_DEFAULT_BASE_DIRS = {
    # Windows platform
    "nt": "E:/THIENDHB_GOOGLEDRIVE/MASTER TILBURG/THESIS/",
    # Linux platform (os.name is "posix" on macOS as well)
    "posix": "/media/pinkalinux/WORK/THIENDHB_GOOGLEDRIVE/MASTER TILBURG/THESIS/",
}

BASE_DIR = os.environ.get("THESIS_BASE_DIR") or _DEFAULT_BASE_DIRS.get(os_name)
if BASE_DIR is None:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise RuntimeError(
        "Unsupported platform %r: set the THESIS_BASE_DIR environment "
        "variable to the project directory." % os_name
    )
if not BASE_DIR.endswith(("/", "\\")):
    # Later cells build paths by plain string concatenation, so the
    # trailing separator is required.
    BASE_DIR += "/"

INPUT_DIR = BASE_DIR + "DATASET/INPUT/"
OUTPUT_DIR = BASE_DIR + "DATASET/OUTPUT/"

# Random seed shared by the stochastic steps of the analysis
SEED = 6886
%matplotlib inline

## Import data

In [3]:
# Read the skill embedding matrix from the output directory and show its
# dimensions as the cell result.
skill_embeddings = np.load(
    f"{OUTPUT_DIR}skill_feat_embeddings.npy"
)
skill_embeddings.shape

(361584, 600)

## Test cluster tendency

In [18]:
def get_hopkins_stats(X, n, seed=None):
    """
    Run the Hopkins test to check the cluster tendency of a dataset and
    report how long the test took.

    Inputs:
    - X: dataset to test (handed unchanged to `hopkins`)
    - n: sample size to test; must satisfy 1 <= n <= len(X)
    - seed: optional integer. When given, NumPy's global random state is
      seeded with it before the test so that repeated runs can be compared.
      When None (default) the random state is left untouched.
    Return:
    - Hopkins statistics (= 1 - H): Closer to 0 means cluster tendency.
    - Duration: wall-clock time spent on the test, as a `datetime.timedelta`
      (not a number of minutes).
    Raises:
    - ValueError: if n is smaller than 1 or larger than the number of rows.
    """
    # Reject impossible sample sizes up front: the test itself can run for a
    # long time on a large dataset, so fail before starting it.
    n_rows = len(X)
    if not 1 <= n <= n_rows:
        raise ValueError(
            "Sample size n must be between 1 and %d, got %r" % (n_rows, n)
        )

    if seed is not None:
        # NOTE(review): assumes `hopkins` draws its samples from NumPy's
        # global random state -- confirm against the pyclustertend source.
        np.random.seed(seed)

    start_hopkin_time = datetime.now()
    print(
        "n =", n,
        "\nStart computing Hopkins statistic",
        start_hopkin_time.strftime("%Y-%m-%d %H:%M:%S.%f"),
    )

    h_stat = hopkins(X, n)

    end_hopkin_time = datetime.now()
    duration = end_hopkin_time - start_hopkin_time
    print(
        "End computing Hopkins statistic",
        end_hopkin_time.strftime("%Y-%m-%d %H:%M:%S.%f")
    )
    print("Duration", duration)
    print('Hopkins statistics =', h_stat)
    return h_stat, duration

In [11]:
# Test the cluster tendency of data for increasing sample sizes.
# On the full embedding matrix a single run took from several minutes to
# about an hour in the recorded output, so this cell is slow to re-run.
n_sample_list = [10, 100, 1000, 5000, 10000]
hopkins_list = []
duration_list = []

# Make the random sampling repeatable between runs of this cell.
# NOTE(review): assumes `hopkins` draws from NumPy's global random state --
# confirm against the pyclustertend source.
np.random.seed(SEED)

# Wrapping the iterable lets tqdm size, advance and close the bar itself,
# including when one of the runs raises.
for n in tqdm(n_sample_list, desc="Running Hopkins test"):
    hopkins_stat, duration = get_hopkins_stats(skill_embeddings, n)
    hopkins_list.append(hopkins_stat)
    duration_list.append(duration)

Running Hopkins test:   0%|                                                                      | 0/4 [00:00<?, ?it/s]

Start computing Hopkins statistic 2021-05-02 15:51:44.598395
End computing Hopkins statistic 2021-05-02 15:59:19.297918
Duration 0:07:34.699523
Start computing Hopkins statistic 2021-05-02 15:59:19.345102
End computing Hopkins statistic 2021-05-02 16:07:24.344824
Duration 0:08:04.999722
Start computing Hopkins statistic 2021-05-02 16:07:24.375987
End computing Hopkins statistic 2021-05-02 16:24:31.960523
Duration 0:17:07.584536
Start computing Hopkins statistic 2021-05-02 16:24:31.991765
End computing Hopkins statistic 2021-05-02 17:20:58.546461
Duration 0:56:26.554696


In [13]:
# Collect one row per tested sample size. The dataset label and the shape of
# the embedding matrix are repeated on every row.
# The column name "hopkins_stastitics" is misspelled but kept as is, because
# it ends up in the exported results file.
hopkins_result_df = pd.DataFrame(
    dict(
        dataset=["skill_wide_embeddings" for _ in n_sample_list],
        dataset_size=[skill_embeddings.shape for _ in n_sample_list],
        n=n_sample_list,
        hopkins_stastitics=hopkins_list,
        test_duration=duration_list,
    )
)

In [32]:
# Show the collected Hopkins test results as the cell output
hopkins_result_df

Unnamed: 0,dataset,dataset_size,n,hopkins_stastitics,test_duration
0,skill_wide_embeddings,"(361584, 600)",10,0.123751,7.34
1,skill_wide_embeddings,"(361584, 600)",100,0.0736615,8.04
2,skill_wide_embeddings,"(361584, 600)",1000,0.0675676,17.04
3,skill_wide_embeddings,"(361584, 600)",5000,0.0677865,56.26
4,skill_wide_embeddings,"(361584, 600)",10000,0.0688456,104.23


In [33]:
# Save the results table as CSV in the output directory, without the
# DataFrame index column.
hopkins_result_df.to_csv(
    f"{OUTPUT_DIR}skill_hopkins_test_result.csv",
    index=False,
)