## Load libraries

In [9]:
# Libraries to work cross-platform
import os

# Libraries to work with dataset
import numpy as np
import pandas as pd

# Libraries for evaluation
from pyclustertend import hopkins

# Libraries for monitoring operation process
from datetime import datetime
from tqdm import tqdm

## Configurate and declare global variables

In [10]:
os_name = os.name

if os_name == 'nt':
    """Windows platform"""
    BASE_DIR = "E:/THIENDHB_GOOGLEDRIVE/MASTER TILBURG/THESIS/"

elif os_name == 'posix':
    """Linux platform"""
    BASE_DIR = "/media/pinkalinux/WORK/THIENDHB_GOOGLEDRIVE/MASTER TILBURG/THESIS/"

INPUT_DIR = BASE_DIR + "DATASET/INPUT/"
OUTPUT_DIR = BASE_DIR + "DATASET/OUTPUT/"
RESULT_DIR = BASE_DIR + "RESULTS/"

SEED = 6886
%matplotlib inline

## Import data

In [11]:
skill_embeddings = np.load(OUTPUT_DIR + "skill_feat_halfsize_embeddings.npy")
skill_embeddings.shape

(257205, 150)

## Test cluster tendency

In [12]:
def get_hopkins_stats(X, n):
    """
    Function to run the Hopkins test to test the cluster tendency
    Inputs:
    - X: dataset to test 
    - n: sample size to test
    Return:
    - Hopkins statistics (= 1 - H): Closer to 0 means cluster tendency. 
    - Duration: time spent in minutes to run the Hopkins test
    """
    start_hopkin_time = datetime.now()
    print(
        "n =", n,
        "\nStart computing Hopkins statistic",
        start_hopkin_time.strftime("%Y-%m-%d %H:%M:%S.%f"),
    )

    h_stat = hopkins(X, n)

    end_hopkin_time = datetime.now()
    print(
        "End computing Hopkins statistic",
        end_hopkin_time.strftime("%Y-%m-%d %H:%M:%S.%f")
    )
    print("Duration", end_hopkin_time - start_hopkin_time)
    print('Hopkins statistics =', h_stat)
    return h_stat, round((end_hopkin_time - start_hopkin_time).seconds/60, 2)

In [13]:
# Test the cluster tendency of data
n_sample_list = [50, 100, 1000, 5000, 10000]
hopkins_list = []
duration_list = []

tqdm_bar = tqdm(desc="Running Hopkins test", total=len(n_sample_list))
for n in n_sample_list:
    hopkins_stat, duration = get_hopkins_stats(skill_embeddings, n)
    hopkins_list.append(hopkins_stat)
    duration_list.append(duration)
    tqdm_bar.update(1)
tqdm_bar.close()

Running Hopkins test:   0%|                                                                      | 0/5 [00:00<?, ?it/s]

n = 50 
Start computing Hopkins statistic 2021-05-17 22:42:19.167699


Running Hopkins test:  20%|████████████▍                                                 | 1/5 [00:43<02:52, 43.24s/it]

End computing Hopkins statistic 2021-05-17 22:43:02.405016
Duration 0:00:43.237317
Hopkins statistics = 0.024671787817664628
n = 100 
Start computing Hopkins statistic 2021-05-17 22:43:02.411995


Running Hopkins test:  40%|████████████████████████▊                                     | 2/5 [02:47<04:33, 91.01s/it]

End computing Hopkins statistic 2021-05-17 22:45:06.859543
Duration 0:02:04.447548
Hopkins statistics = 0.04004249502577315
n = 1000 
Start computing Hopkins statistic 2021-05-17 22:45:06.863545


Running Hopkins test:  60%|████████████████████████████████████▌                        | 3/5 [10:19<08:31, 255.73s/it]

End computing Hopkins statistic 2021-05-17 22:52:38.612397
Duration 0:07:31.748852
Hopkins statistics = 0.02885201502123857
n = 5000 
Start computing Hopkins statistic 2021-05-17 22:52:38.614396


Running Hopkins test:  80%|████████████████████████████████████████████████▊            | 4/5 [28:39<09:49, 589.19s/it]

End computing Hopkins statistic 2021-05-17 23:10:58.979225
Duration 0:18:20.364829
Hopkins statistics = 0.03082334135271056
n = 10000 
Start computing Hopkins statistic 2021-05-17 23:10:58.981222


Running Hopkins test: 100%|█████████████████████████████████████████████████████████████| 5/5 [54:29<00:00, 653.82s/it]

End computing Hopkins statistic 2021-05-17 23:36:48.259854
Duration 0:25:49.278632
Hopkins statistics = 0.03106992018099733





In [14]:
hopkins_result_df = pd.DataFrame(
    {
        "dataset": ["skill_halfsize_embeddings"]*len(n_sample_list),
        "dataset_size": [skill_embeddings.shape]*len(n_sample_list),
        "n": n_sample_list,
        "hopkins_stastitics": hopkins_list,
        "test_duration": duration_list
    }
)

In [15]:
hopkins_result_df

Unnamed: 0,dataset,dataset_size,n,hopkins_stastitics,test_duration
0,skill_halfsize_embeddings,"(257205, 150)",50,0.024672,0.72
1,skill_halfsize_embeddings,"(257205, 150)",100,0.040042,2.07
2,skill_halfsize_embeddings,"(257205, 150)",1000,0.028852,7.52
3,skill_halfsize_embeddings,"(257205, 150)",5000,0.030823,18.33
4,skill_halfsize_embeddings,"(257205, 150)",10000,0.03107,25.82


In [16]:
hopkins_result_df.to_csv(
    RESULT_DIR + "skill_halfsize_hopkins_test_result.csv", index=False
)