In [1]:
%matplotlib inline
%load_ext autoreload

%autoreload 2
from tqdm import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
import time
from collections import defaultdict
from collections import Counter
import numpy as np
from sklearn.metrics import adjusted_rand_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from copy import deepcopy
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD

from hdbscan import HDBSCAN
import cluster_combinations
import data_helper

In [2]:
# Image dataset: load the training split once and expose it under both the
# raw* names and the *_mnist names used by the experiment loop.
# NOTE(review): the '../fashion' path suggests this is Fashion-MNIST rather
# than the digits dataset -- confirm against data_helper.load_mnist.
rawX, rawy = data_helper.load_mnist(path='../fashion', kind='train')
X_mnist, y_mnist = rawX, rawy

# Text dataset: 20 Newsgroups training split. Targets are taken as-is and the
# raw documents are converted to TF-IDF weighted token counts.
train_bunch = fetch_20newsgroups(subset="train")
y_news = train_bunch['target']
X_news = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
]).fit_transform(train_bunch['data'])


In [None]:
# `min_samples` passed to both HDBSCAN and MultiscaleHDBSCAN.
MIN_SAMPLES = 5
# `min_cluster_size` passed to both HDBSCAN and MultiscaleHDBSCAN.
MIN_CLUSTER_SIZE = 5
# Number of alpha values drawn (with replacement) for each experiment.
ALPHA_LIST_SIZE = 5
# Number of decisive (non-tied, non-skipped) experiments to collect per dataset.
NUM_EXPERIMENTS = 500
# Maximum number of rows taken from the full dataset for each experiment.
BOOTSTRAP_SAMPLE_SIZE = 1000

def get_experimental_results(X, y, alpha_list):
    """Score plain HDBSCAN against MultiscaleHDBSCAN on one sample.

    HDBSCAN is fitted once per value in ``alpha_list`` and only the highest
    adjusted Rand index (ARI) against ``y`` is kept. MultiscaleHDBSCAN is
    fitted once with the whole ``alpha_list`` and scored the same way. Both
    clusterers use the module-level ``MIN_SAMPLES`` and ``MIN_CLUSTER_SIZE``.

    Args:
        X: Feature matrix handed to ``fit_predict`` of both clusterers.
        y: Ground-truth labels used as the ARI reference.
        alpha_list: Iterable of alpha values; must be non-empty.

    Returns:
        Tuple ``(best_hdbscan_score, mhdbscan_score)``.

    Raises:
        ValueError: If ``alpha_list`` is empty (from ``max``). Errors raised
            by the clusterers themselves are not caught here and propagate
            to the caller.
    """

    def _hdbscan_ari(alpha):
        # One plain HDBSCAN run at a single alpha, scored against the truth.
        clusterer = HDBSCAN(
            alpha=alpha,
            min_samples=MIN_SAMPLES,
            min_cluster_size=MIN_CLUSTER_SIZE,
            # Our MHDBSCAN implementation is built on the prims_kdtree algorithm and supports single clusters
            algorithm="prims_kdtree",
            allow_single_cluster=True,
        )
        return adjusted_rand_score(y, clusterer.fit_predict(X))

    # Give plain HDBSCAN its best case: keep the top score over all alphas.
    best_hdbscan_score = max(_hdbscan_ari(alpha) for alpha in alpha_list)

    multiscale = cluster_combinations.MultiscaleHDBSCAN(
        alpha_list=alpha_list,
        min_samples=MIN_SAMPLES,
        min_cluster_size=MIN_CLUSTER_SIZE,
    )
    mhdbscan_score = adjusted_rand_score(y, multiscale.fit_predict(X))

    return best_hdbscan_score, mhdbscan_score

# Dedicated, seeded generator so the sampled row subsets and alpha lists are
# identical on every Restart & Run All (TruncatedSVD below is already pinned).
experiment_rng = np.random.RandomState(0)

# Candidate alpha values; loop-invariant, so built once.
alpha_grid = np.linspace(0.001, 3, 1000)

for dataset_name, X_full, y_full in [("news", X_news, y_news), ("mnist", X_mnist, y_mnist)]:
    num_hbscan_is_better, num_mhbscan_is_better = 0, 0
    # Attempts that do not count towards NUM_EXPERIMENTS, tracked so the
    # reader can see how much was excluded from the win ratio.
    num_ties, num_skipped = 0, 0
    while num_mhbscan_is_better + num_hbscan_is_better < NUM_EXPERIMENTS:

        # Draw a random subset of rows (without replacement) and a random
        # list of alpha hyperparameters (with replacement) for this experiment.
        indices = experiment_rng.permutation(X_full.shape[0])[:BOOTSTRAP_SAMPLE_SIZE]
        alpha_list = experiment_rng.choice(alpha_grid, ALPHA_LIST_SIZE)

        X = TruncatedSVD(random_state=0).fit_transform(X_full[indices])
        y = y_full[indices]
        try:
            hdbscan_score, mhdbscan_score = get_experimental_results(
                X=X, y=y, alpha_list=alpha_list)
        except ValueError:
            # Skip alphas where all trees in the condensed_trees list have no clusters
            num_skipped += 1
            continue

        if mhdbscan_score > hdbscan_score:
            num_mhbscan_is_better += 1
        elif hdbscan_score > mhdbscan_score:
            num_hbscan_is_better += 1
        else:
            # Tie: neither counter moved, so the total is unchanged. Skipping
            # the report here avoids re-printing the same progress line while
            # the total sits on a multiple of 10.
            num_ties += 1
            continue

        total_experiments = num_mhbscan_is_better + num_hbscan_is_better
        if total_experiments % 10 == 0:
            win_ratio = num_mhbscan_is_better / total_experiments
            # Standard error of a binomial proportion.
            standard_error = np.sqrt((win_ratio*(1-win_ratio)) / total_experiments)
            print("({}, {}) {} out of {} completed. MHDBSCAN Win Ratio: {} Standard Error: {}".format(
                BOOTSTRAP_SAMPLE_SIZE,
                dataset_name,
                total_experiments,
                NUM_EXPERIMENTS,
                win_ratio,
                standard_error))

    # The win ratio above only covers decisive runs; report what was left out.
    print("({}, {}) finished: {} ties and {} skipped runs were excluded from the win ratio.".format(
        BOOTSTRAP_SAMPLE_SIZE,
        dataset_name,
        num_ties,
        num_skipped))


(1000, news) 10 out of 500 completed. MHDBSCAN Win Ratio: 0.6 Standard Error: 0.15491933384829668
(1000, news) 20 out of 500 completed. MHDBSCAN Win Ratio: 0.7 Standard Error: 0.10246950765959599
(1000, news) 30 out of 500 completed. MHDBSCAN Win Ratio: 0.7 Standard Error: 0.08366600265340757
