## Load libraries

In [1]:
# Libraries to work cross-platform
import os

# Libraries to work with dataset
import numpy as np
import pandas as pd

# Libraries for evaluation
from pyclustertend import hopkins

# Libraries for monitoring operation process
from datetime import datetime
from tqdm import tqdm

## Configure and declare global variables

In [2]:
# Resolve the project root for the current platform. The input/output
# directories are derived from it once, below, so they cannot drift apart
# between platforms.
os_name = os.name

if os_name == 'nt':
    # Windows platform
    BASE_DIR = "E:/THIENDHB_GOOGLEDRIVE/MASTER TILBURG/THESIS/"
elif os_name == 'posix':
    # Linux platform
    BASE_DIR = "/media/pinkalinux/WORK/THIENDHB_GOOGLEDRIVE/MASTER TILBURG/THESIS/"
else:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise RuntimeError(
        "Unsupported platform (os.name = {!r}): no BASE_DIR is configured "
        "for it.".format(os_name)
    )

INPUT_DIR = BASE_DIR + "DATASET/INPUT/"
OUTPUT_DIR = BASE_DIR + "DATASET/OUTPUT/"

# Seed for reproducibility (not referenced by the cells shown in this section).
SEED = 6886
%matplotlib inline

## Import data

In [3]:
# Load the saved title embeddings from the output directory.
title_embeddings_path = OUTPUT_DIR + "title_embeddings.npy"
title_embeddings = np.load(title_embeddings_path)

# Show the array dimensions as the cell output.
title_embeddings.shape

(18992, 300)

## Test cluster tendency

In [4]:
def get_hopkins_stats(X, n):
    """
    Run the Hopkins test on X with sample size n and time the run.

    The start time, end time, elapsed time and the resulting statistic are
    printed to stdout as a side effect.

    Inputs:
    - X: dataset to test
    - n: sample size to test
    Return:
    - Hopkins statistics (= 1 - H), as returned by `hopkins(X, n)`:
      closer to 0 means cluster tendency.
    - Duration: wall-clock time spent running the test, in minutes,
      rounded to 2 decimals.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S.%f"

    start_hopkin_time = datetime.now()
    print(
        "n =", n,
        "\nStart computing Hopkins statistic",
        start_hopkin_time.strftime(timestamp_format),
    )

    h_stat = hopkins(X, n)

    end_hopkin_time = datetime.now()
    elapsed = end_hopkin_time - start_hopkin_time
    print(
        "End computing Hopkins statistic",
        end_hopkin_time.strftime(timestamp_format)
    )
    print("Duration", elapsed)
    print('Hopkins statistics =', h_stat)

    # total_seconds() covers the whole interval. The `.seconds` attribute is
    # only the whole-seconds component: it drops the fractional part and any
    # full days, which under-reports the duration.
    return h_stat, round(elapsed.total_seconds() / 60, 2)

In [8]:
# Test the cluster tendency of data at increasing sample sizes
n_sample_list = [50, 100, 1000, 5000, 10000]
hopkins_list = []
duration_list = []

# NOTE(review): SEED from the config cell is never applied. If `hopkins`
# draws from NumPy's global RNG, seeding before this loop would make the
# statistics repeatable across runs - confirm before relying on it.

# Iterating over the bar advances it once per sample size and closes it when
# the loop exits, even if one of the runs raises.
tqdm_bar = tqdm(n_sample_list, desc="Running Hopkins test")
for n in tqdm_bar:
    hopkins_stat, duration = get_hopkins_stats(title_embeddings, n)
    hopkins_list.append(hopkins_stat)
    duration_list.append(duration)


Running Hopkins test:   0%|                                                                      | 0/5 [00:00<?, ?it/s][A

n = 50 
Start computing Hopkins statistic 2021-05-17 14:02:38.186160



Running Hopkins test:  20%|████████████▍                                                 | 1/5 [00:03<00:14,  3.57s/it][A

End computing Hopkins statistic 2021-05-17 14:02:41.750163
Duration 0:00:03.564003
Hopkins statistics = 0.051022755850972816
n = 100 
Start computing Hopkins statistic 2021-05-17 14:02:41.754162



Running Hopkins test:  40%|████████████████████████▊                                     | 2/5 [00:07<00:11,  3.93s/it][A

End computing Hopkins statistic 2021-05-17 14:02:45.939158
Duration 0:00:04.184996
Hopkins statistics = 0.062280359369503116
n = 1000 
Start computing Hopkins statistic 2021-05-17 14:02:45.941175



Running Hopkins test:  60%|█████████████████████████████████████▏                        | 3/5 [00:24<00:19,  9.71s/it][A

End computing Hopkins statistic 2021-05-17 14:03:02.531194
Duration 0:00:16.590019
Hopkins statistics = 0.057781820855310695
n = 5000 
Start computing Hopkins statistic 2021-05-17 14:03:02.534195



Running Hopkins test:  80%|█████████████████████████████████████████████████▌            | 4/5 [01:36<00:34, 34.20s/it][A

End computing Hopkins statistic 2021-05-17 14:04:14.269237
Duration 0:01:11.735042
Hopkins statistics = 0.05711136666156168
n = 10000 
Start computing Hopkins statistic 2021-05-17 14:04:14.271195



Running Hopkins test: 100%|██████████████████████████████████████████████████████████████| 5/5 [03:55<00:00, 47.02s/it][A

End computing Hopkins statistic 2021-05-17 14:06:33.275789
Duration 0:02:19.004594
Hopkins statistics = 0.05744909443742822





In [9]:
# Collect one row per Hopkins run: the dataset label and shape are repeated
# for every sample size, next to that run's statistic and duration.
n_runs = len(n_sample_list)
result_columns = {
    "dataset": ["title_embeddings"] * n_runs,
    "dataset_size": [title_embeddings.shape] * n_runs,
    "n": n_sample_list,
    "hopkins_stastitics": hopkins_list,
    "test_duration": duration_list,
}
hopkins_result_df = pd.DataFrame(result_columns)

In [10]:
# Bare last expression: the notebook renders the results table as the cell output.
hopkins_result_df

Unnamed: 0,dataset,dataset_size,n,hopkins_stastitics,test_duration
0,title_embeddings,"(18992, 300)",50,0.051023,0 days 00:00:03.564003
1,title_embeddings,"(18992, 300)",100,0.06228,0 days 00:00:04.184996
2,title_embeddings,"(18992, 300)",1000,0.057782,0 days 00:00:16.590019
3,title_embeddings,"(18992, 300)",5000,0.057111,0 days 00:01:11.735042
4,title_embeddings,"(18992, 300)",10000,0.057449,0 days 00:02:19.004594


In [11]:
# Save the Hopkins results table to the output directory, without the index column.
hopkins_result_path = OUTPUT_DIR + "title_hopkins_test_result.csv"
hopkins_result_df.to_csv(hopkins_result_path, index=False)