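This notebook runs a grid search over `min_samples` and `min_cluster_size` to find the best parameters for HDBSCAN, a non-parametric, hierarchical, density-based clustering algorithm. Candidate parameter combinations are ranked by their silhouette score.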

In [20]:
import sys
import time
import pandas as pd
sys.path.append('../')
from hdbscan import HDBSCAN
from sklearn.model_selection import ParameterGrid
from functions.clustering import silhouette_scorer

In [21]:
# Load the cleaned daily clustering input from its pickle file.
# NOTE(review): the rest of the notebook treats the result as a DataFrame
# with 'id' and 'date' columns - confirm against the file that is loaded.
input_path = '../data/clustering_input/clustering_df_daily_clean.pkl'
data = pd.read_pickle(input_path)

In [22]:
# Prepare data for clustering: keep the 'id' and 'date' columns aside, then
# remove both from the frame in place so that only the remaining columns are
# passed to the clustering model.
user_id, dates = data['id'], data['date']
data.drop(['id', 'date'], axis='columns', inplace=True)

In [23]:
# Candidate values for each HDBSCAN parameter; the grid search below tries
# every combination of them. These values are for the daily data.
param_dist = {
    'min_samples': [7, 40, 83, 200, 400],
    'min_cluster_size': [65, 400, 830, 2000, 4000],
}
# Alternative grid for the hourly data:
# param_dist = {'min_samples': [60, 550, 1600, 4000], 'min_cluster_size': [600, 5500, 16000, 40000]}

In [None]:
# Grid search: build one HDBSCAN model per parameter combination and score it.
# silhouette_scorer comes from functions.clustering and is not visible here;
# presumably it fits the model on `data` and returns a silhouette score
# (higher is better) - TODO confirm.
results = []  # (params, score) pairs, in the order the grid yields them
start = time.time()
print("Performing grid search")
for params in ParameterGrid(param_dist):
    print("for parameters:", params)
    model = HDBSCAN(**params)
    score = silhouette_scorer(model, data)
    results.append((params, score))
elapsed = time.time() - start  # wall-clock seconds for the whole search
print("finished after", elapsed)

# Pick the entry with the highest score; on ties max() keeps the first one seen.
best = max(results, key=lambda entry: entry[1])
best_params, best_score = best
print("Best parameters:", best_params)
print("Best silhouette score:", best_score)