# HDBSCAN tuning

In [1]:
!pip install hdbscan

Collecting hdbscan
  Using cached hdbscan-0.8.27-cp36-cp36m-linux_x86_64.whl
Installing collected packages: hdbscan
Successfully installed hdbscan-0.8.27
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m


## Data Loading and Preprocessing

In [2]:
import pandas as pd
import hdbscan
from sklearn import model_selection
from sklearn.metrics import make_scorer
import logging # to further silence deprecation warnings

## Prepare data

In [3]:
feature_df = pd.read_csv("data/x_train_transformed_df.csv")  # using pre-processed training dataset
print( feature_df.shape )
feature_df.head(3)

(3968, 14)


Unnamed: 0,-7.558620396262513e-12,-2.5195401320875045e-12,0.04186291447346667,0.04262949171029229,0.04262949171029229.1,0.0,0.0.1,0.0.2,0.0.3,0.0.4,0.0.5,0.0.6,0.0.7,0.0.8
0,-7.55862e-12,-2.51954e-12,0.041863,0.042629,0.042629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-7.55862e-12,-2.51954e-12,0.041863,0.042629,0.042629,3.425587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-7.55862e-12,-2.51954e-12,0.041863,0.042629,0.042629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#---------------------------------------------------------------------------------------------------------------------------------------

## Find optimal parameters

In [4]:
logging.captureWarnings(True)
hdb = hdbscan.HDBSCAN(gen_min_span_tree=True)

In [5]:
param_dist = {'min_samples': [10,20,30,50],
              'min_cluster_size':[40, 50,100],  
              'cluster_selection_method' : ['eom','leaf'],
              'metric' : ['euclidean','manhattan'] 
             }

In [6]:
validity_scorer = make_scorer(hdbscan.validity.validity_index,greater_is_better=True)

In [7]:
grid_search = model_selection.GridSearchCV(hdb, 
                                           param_dist,
                                           scoring=validity_scorer, 
                                           refit=True,
                                           cv=10, 
                                           return_train_score=True)

In [8]:
grid_search.fit(feature_df)

GridSearchCV(cv=10, estimator=HDBSCAN(gen_min_span_tree=True),
             param_grid={'cluster_selection_method': ['eom', 'leaf'],
                         'metric': ['euclidean', 'manhattan'],
                         'min_cluster_size': [40, 50, 100],
                         'min_samples': [10, 20, 30, 50]},
             return_train_score=True, scoring=make_scorer(validity_index))

In [9]:
print(f"Best Parameters {grid_search.best_params_}")
print(f"DBCV score :{grid_search.best_estimator_.relative_validity_}")    # DBCV: https://github.com/christopherjenness/DBCV

Best Parameters {'cluster_selection_method': 'eom', 'metric': 'euclidean', 'min_cluster_size': 40, 'min_samples': 10}
DBCV score :0.9489197777196966
