In [None]:
# pip install torchmetrics
# pip install stepmix
# pip install kneed

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
from joblib import Parallel, delayed # for parallelization
from itertools import product

# Preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Clustering
from sklearn.cluster import KMeans, AgglomerativeClustering, HDBSCAN
from stepmix.stepmix import StepMix

# Evaluation
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import torch
from torchmetrics.clustering import DunnIndex
from collections import Counter
from kneed import KneeLocator

# Visualization
from sklearn.decomposition import PCA
from scipy.spatial import ConvexHull

# Preparation
## Data

In [2]:
data2004_i = pd.read_parquet("data/data2004_i.parquet") # load imputed data

# Dataset with numeric outcomes
data_n = data2004_i[[
    'clseusa_n', 'ambornin_n', 'amcit_n', 'amlived_n', 'amenglsh_n', 
     'amchrstn_n', 'amgovt_n', 'amfeel_n', 'amcitizn_n', 'amshamed_n', 
     'belikeus_n', 'ambetter_n', 'ifwrong_n', 'proudsss_n', 'proudgrp_n', 
     'proudpol_n', 'prouddem_n', 'proudeco_n', 'proudspt_n', 'proudart_n', 
     'proudhis_n', 'proudmil_n', 'proudsci_n']]

# Dataset with categorical outcomes
data_f = data2004_i[[
     'clseusa_f', 'ambornin_f', 'amcit_f', 'amlived_f', 'amenglsh_f', 
     'amchrstn_f', 'amgovt_f', 'amfeel_f', 'amcitizn_f', 'amshamed_f', 
     'belikeus_f', 'ambetter_f', 'ifwrong_f', 'proudsss_f', 'proudgrp_f', 
     'proudpol_f', 'prouddem_f', 'proudeco_f', 'proudspt_f', 'proudart_f', 
     'proudhis_f', 'proudmil_f', 'proudsci_f']]

# Dataset with controls
controls = data2004_i[[
    'sex', 'race_f', 'born_usa', 'party_fs', 'religstr_f', 
    'reltrad_f', 'region_f']]

## Validity indexes

In [3]:
# Custom score functions to avoid throwing errors when undefined
def sil_score(data, pred_clust):
    try:
        sil_score = silhouette_score(data, pred_clust)
    except ValueError:
        sil_score = np.nan
    return sil_score

def ch_score(data, pred_clust):
    try:
        ch_score = calinski_harabasz_score(data, pred_clust)
    except ValueError:
        ch_score = np.nan
    return ch_score

def db_score(data, pred_clust):
    try:
        db_score = davies_bouldin_score(data, pred_clust)
    except ValueError:
        db_score = np.nan
    return db_score

def dunn_score(data, pred_clust):
    torch_data = np.array(data)
    torch_data = torch.tensor(torch_data, dtype=torch.float32)
    torch_pred_clust = torch.tensor(pred_clust, dtype=torch.int64)

    dunn_metric = DunnIndex()
    
    try:
        dunn_score = float(dunn_metric(torch_data, torch_pred_clust))
    except Exception:
        dunn_score = np.nan
 
    return dunn_score

def inertia(data, labels):
    data = np.asarray(data)
    
    inertia = 0
    for cluster in np.unique(labels):
        cluster_points = data[labels == cluster]
        cluster_centroid = np.mean(cluster_points, axis=0)
        inertia += np.sum((cluster_points - cluster_centroid) ** 2)
        
    return inertia

def clust_size(labels):
    cluster_sizes = Counter(labels)
    min_size = min(cluster_sizes.values())
    max_size = max(cluster_sizes.values())
    
    return min_size, max_size

In [4]:
# Function to return all validity indexes at once
def get_metrics(model, params, n, data, pred_clust, **additional_metrics):
    base_metrics = {
        'model': model,
        'params': params,
        'n_clust': n,
        'min_clust_size': clust_size(pred_clust)[0],
        'max_clust_size': clust_size(pred_clust)[1],
        'silhouette': sil_score(data, pred_clust),
        'calinski_harabasz': ch_score(data, pred_clust),
        'davies_bouldin': db_score(data, pred_clust),
        'dunn': dunn_score(data, pred_clust),
        'inertia': inertia(data, pred_clust)
    }

    base_metrics.update(additional_metrics)
    return base_metrics

In [5]:
# Function to display the optimal numbers of clutsters according to each validity index
def elbow_plot(df, val_index):
    res = df.dropna(subset=[val_index])

    x = res["n_clust"]
    y = res[val_index]

    if val_index in ['davies_bouldin', 'entropy']:
        knee_locator = KneeLocator(x, y, curve='concave', direction='increasing')
    else:
        knee_locator = KneeLocator(x, y, curve='convex', direction='decreasing')

    plt.figure(figsize=(8, 4))
    plt.plot(x, y, marker="o", linestyle="-", label=val_index)
    plt.axvline(x=knee_locator.knee, color="r", linestyle="--", label=f"Optimal k={knee_locator.knee}")
    plt.xlabel("Number of Clusters")
    plt.ylabel(f"{val_index} index")
    plt.title(f"Elbow Method for {val_index} index")
    plt.legend()
    plt.show()

## Parameters

In [6]:
max_clust = 12
max_threads = 8

val_indexes = ['silhouette', 'calinski_harabasz', 'davies_bouldin', 'dunn', 'inertia']

# Latent models
With the StepMix package, see: https://github.com/Labo-Lacourse/stepmix

In [None]:
# Parameters
clust_range = range(1, max_clust+1)

opt_params = {
    'method': 'gradient',
    'intercept': True,
    'max_iter': 2500,
}

In [None]:
# Without covariates
def do_StepMix(n, type, data):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)

        latent_mod = StepMix(
            n_components = n, 
            measurement = type, 
            n_init = 3,
            init_params = 'kmeans',
            structural_params = opt_params,
            random_state = 123)
        
        latent_mod.fit(data)
        pred_clust = latent_mod.predict(data)

        model = 'LCA' if type == 'categorical' else 'LPA'
        params = 'without covariates'
        loglik = latent_mod.score(data)
        aic = latent_mod.aic(data)
        bic = latent_mod.aic(data)
        entropy = latent_mod.entropy(data)

    return get_metrics(model, params, n, data, pred_clust, LL = loglik, aic = aic, bic = bic, entropy = entropy)

data = data_f.apply(lambda col: LabelEncoder().fit_transform(col))
cat_results = Parallel(n_jobs=8)(delayed(do_StepMix)(n, 'categorical', data) for n in clust_range)
LCA_all = pd.DataFrame(cat_results)

# Data preprocessing?
num_results = Parallel(n_jobs=8)(delayed(do_StepMix)(n, 'continuous', data_n) for n in clust_range)
LPA_all = pd.DataFrame(num_results)

In [None]:
for val_index in val_indexes + ['aic', 'bic', 'entropy']:
    elbow_plot(LCA_all, val_index)

In [None]:
# With covariates
def do_StepMix_covar(n, type, data):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        
        latent_mod = StepMix(
            n_components = n,
            measurement = type,
            n_init = 3,
            init_params = 'kmeans',
            structural = 'covariate', 
            n_steps = 1,
            structural_params = opt_params,
            random_state = 123)
        
        latent_mod.fit(data, controls_dum)
        pred_clust = latent_mod.predict(data)
        
        model = 'LCA' if type == 'categorical' else 'LPA'
        params = 'with covariates'
        loglik = latent_mod.score(data)
        aic = latent_mod.aic(data)
        bic = latent_mod.aic(data)
        entropy = latent_mod.entropy(data)

    return get_metrics(model, params, n, data, pred_clust, LL = loglik, aic = aic, bic = bic, entropy = entropy)

controls_dum = pd.get_dummies(controls)

data = data_f.apply(lambda col: LabelEncoder().fit_transform(col))
cat_results = Parallel(n_jobs=max_threads)(delayed(do_StepMix_covar)(n, 'categorical', data) for n in clust_range)
LCA_covar_all = pd.DataFrame(cat_results)

# Data preprocessing?
num_results = Parallel(n_jobs=max_threads)(delayed(do_StepMix_covar)(n, 'continuous', data_n) for n in clust_range)
LPA_covar_all = pd.DataFrame(num_results)

## Best latent models

In [None]:
# How to select models based on aic / bic: using their absolute minimum, or an elbow method?
# Absolute minimum yields the model with the most classes, so not appropriate
LCA_aic_min = LCA_all.sort_values('aic', ascending=True).iloc[0]
LCA_bic_min = LCA_all.sort_values('bic', ascending=True).iloc[0]

LPA_aic_min = LPA_all.sort_values('aic', ascending=True).iloc[0]
LPA_bic_min = LPA_all.sort_values('bic', ascending=True).iloc[0]

abs_fit = pd.DataFrame([LCA_aic_min, LCA_bic_min, LPA_aic_min, LPA_bic_min])
abs_fit = abs_fit.drop_duplicates().reset_index(drop=True)
abs_fit

In [None]:
# Find best models according to relative fit = LRT / BLRT / BVR (LCA only)

In [None]:
# Find best model through the Elbow method
def elbow_method(df, val_index):
    res = df.dropna(subset=[val_index])

    x = res["n_clust"]
    y = res[val_index]

    if val_index in ['davies_bouldin', 'entropy']:
        knee_locator = KneeLocator(x, y, curve='concave', direction='increasing')
    else:
        knee_locator = KneeLocator(x, y, curve='convex', direction='decreasing')
    
    return res[res["n_clust"] == knee_locator.knee]

models = [LCA_all, LPA_all] + [LCA_covar_all, LPA_covar_all]

params = product(models, val_indexes + ['aic', 'bic', 'entropy'])

latent_elbow = pd.DataFrame()
for model, val_index in params:
    best_model = elbow_method(model, val_index)
    latent_elbow = pd.concat([latent_elbow, best_model], ignore_index=True)

In [None]:
# Find absolute best models for each validity index
latent_elbow = latent_elbow.drop_duplicates().reset_index(drop=True)
# Need to add colums indicating which validity index is maximized.
# After that, duplicate models should be merged, not dropped.

best_silhouette = latent_elbow.sort_values('silhouette', ascending=False).iloc[0]
best_ch = latent_elbow.sort_values('calinski_harabasz', ascending=False).iloc[0]
best_db = latent_elbow.sort_values('davies_bouldin', ascending=True).iloc[0]
best_dunn = latent_elbow.sort_values('dunn', ascending=False).iloc[0]
best_inertia = latent_elbow.sort_values('inertia', ascending=False).iloc[0]

best_aic = latent_elbow.sort_values('aic', ascending=True).iloc[0]
best_bic = latent_elbow.sort_values('bic', ascending=True).iloc[0]
best_entropy = latent_elbow.sort_values('entropy', ascending=False).iloc[0]

latent_best = pd.DataFrame([best_silhouette, best_ch, best_db, best_dunn, best_inertia])
latent_best = latent_best.drop_duplicates().reset_index(drop=True)

In [None]:
latent_best

The inclusion of covariates makes almost no difference.

All selected models have 3-4 clusters

The best model overall seems to be the LPA one.
- It has lower entropy, meaning it classifies the individuals with better certainty.
- It has lower aic and bic, meaning better model fit.

# K-means

## Flexible kmeans

In [14]:
from scipy.spatial.distance import cdist

class FlexibleKMeans:
    """
    K-Means implementation supporting different distance metrics and center computation methods.
    
    Parameters:
    -----------
    n_clusters : int
        Number of clusters
    metric : str, default='euclidean'
        Distance metric: 'euclidean', 'manhattan', 'chebyshev'
    center_method : str, default='mean'
        Method to compute cluster centers: 'mean' (centroid), 'median', 'medoid'
    max_iter : int, default=100
        Maximum number of iterations
    random_state : int or None, default=None
        Random state for reproducibility
    """
    
    def __init__(self, n_clusters, metric='euclidean', center_method='mean', 
                 max_iter=100, random_state=None):
        self.n_clusters = n_clusters
        self.metric = metric
        self.center_method = center_method
        self.max_iter = max_iter
        self.random_state = random_state
        
        # Define mapping from user-friendly names to scipy metrics
        self.metric_mapping = {
            'euclidean': 'euclidean',
            'manhattan': 'cityblock',
            'chebyshev': 'chebyshev'
        }
        
        # Validate inputs
        valid_metrics = list(self.metric_mapping.keys())
        if metric not in valid_metrics:
            raise ValueError(f"metric must be one of {valid_metrics}")
            
        valid_centers = ['mean', 'median', 'medoid']
        if center_method not in valid_centers:
            raise ValueError(f"center_method must be one of {valid_centers}")
    
    def _compute_distances(self, X, centers):
        """Compute distances between points and centers using specified metric."""
        return cdist(X, centers, metric=self.metric_mapping[self.metric])
    
    def _compute_centers(self, X, labels):
        """Compute new centers using specified method."""
        new_centers = np.zeros((self.n_clusters, X.shape[1]))
        
        for i in range(self.n_clusters):
            cluster_points = X[labels == i]
            
            if len(cluster_points) == 0:
                continue
                
            if self.center_method == 'mean':
                new_centers[i] = np.mean(cluster_points, axis=0)
            
            elif self.center_method == 'median':
                new_centers[i] = np.median(cluster_points, axis=0)
            
            elif self.center_method == 'medoid':
                # For medoid, find the point that minimizes sum of distances to other points
                distances = self._compute_distances(cluster_points, cluster_points)
                medoid_idx = np.argmin(np.sum(distances, axis=1))
                new_centers[i] = cluster_points[medoid_idx]
        
        return new_centers
    
    def fit(self, X):
        """
        Fit the model to the data.
        
        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Training data (numpy array or pandas DataFrame)
        
        Returns:
        --------
        self : object
            Fitted estimator
        """
        # Convert pandas DataFrame to numpy array if necessary
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        X = np.asarray(X)
        
        if self.random_state is not None:
            np.random.seed(self.random_state)
            
        # Initialize centers randomly
        idx = np.random.choice(len(X), self.n_clusters, replace=False)
        self.cluster_centers_ = X[idx].copy()
        
        for iteration in range(self.max_iter):
            # Store old centers for convergence check
            old_centers = self.cluster_centers_.copy()
            
            # Assign points to nearest center
            distances = self._compute_distances(X, self.cluster_centers_)
            self.labels_ = np.argmin(distances, axis=1)
            
            # Update centers
            self.cluster_centers_ = self._compute_centers(X, self.labels_)
            
            # Check for convergence
            if np.allclose(old_centers, self.cluster_centers_):
                self.n_iter_ = iteration + 1
                break
        else:
            self.n_iter_ = self.max_iter
            
        # Compute final inertia (sum of squared distances to closest center)
        self.inertia_ = np.sum(np.min(distances, axis=1) ** 2)
        
        return self
    
    def fit_predict(self, X):
        """
        Fit the model and return cluster labels.
        
        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            Training data
            
        Returns:
        --------
        labels : array of shape (n_samples,)
            Index of the cluster each sample belongs to
        """
        self.fit(X)
        return self.labels_
    
    def predict(self, X):
        """
        Predict the closest cluster for each sample in X.
        
        Parameters:
        -----------
        X : array-like of shape (n_samples, n_features)
            New data to predict (numpy array or pandas DataFrame)
            
        Returns:
        --------
        labels : array of shape (n_samples,)
            Index of the cluster each sample belongs to
        """
        # Convert pandas DataFrame to numpy array if necessary
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        X = np.asarray(X)
        
        distances = self._compute_distances(X, self.cluster_centers_)
        return np.argmin(distances, axis=1)

In [15]:
def do_kmeans(n, dist, link):
    kmeans = FlexibleKMeans(
        n_clusters = n,
        metric = dist,
        center_method = link,
        random_state = 42)

    pred_clust = kmeans.fit_predict(data)
    
    model = 'kmeans'
    params = f"distance = {dist}, linkage = {link}"
    
    return get_metrics(model, params, n, data, pred_clust)

scaler = StandardScaler()
data = scaler.fit_transform(data_n)

clust_range = range(1, max_clust+1)
distances = ['euclidean', 'manhattan', 'chebyshev']
linkages = ['mean', 'median', 'medoid']
params = product(clust_range, distances, linkages)

results = Parallel(n_jobs=max_threads)(delayed(do_kmeans)(n, dist, link) for n, dist, link in params)
kmeans_all = pd.DataFrame(results)

In [16]:
# Find best model for each combination of parameters through the Elbow method
def elbow_method(dist, link, val_index):
    params = f"distance = {dist}, linkage = {link}"
    res = kmeans_all[kmeans_all['params'] == params]
    
    res = res.dropna(subset=[val_index])

    x = res["n_clust"]
    y = res[val_index]

    if val_index == 'davies_bouldin':
        knee_locator = KneeLocator(x, y, curve='concave', direction='increasing')
    else:
        knee_locator = KneeLocator(x, y, curve='convex', direction='decreasing')
    
    return res[res["n_clust"] == knee_locator.knee]

kmeans_elbow = pd.DataFrame()

distances = ['euclidean', 'manhattan', 'chebyshev']
linkages = ['mean', 'median', 'medoid']
models = product(distances, linkages)

for dist, link in models:
    for val_index in val_indexes:
        best_mod = elbow_method(dist, link, val_index)
        kmeans_elbow = pd.concat([kmeans_elbow, best_mod], ignore_index=True)

In [17]:
# Find absolute best models for each validity index
kmeans_elbow = kmeans_elbow.drop_duplicates().reset_index(drop=True)
# Need to add colums indicating which validity index is maximized.
# After that, duplicate models should be merged, not dropped.

best_silhouette = kmeans_elbow.sort_values('silhouette', ascending=False).iloc[0]
best_ch = kmeans_elbow.sort_values('calinski_harabasz', ascending=False).iloc[0]
best_db = kmeans_elbow.sort_values('davies_bouldin', ascending=True).iloc[0]
best_dunn = kmeans_elbow.sort_values('dunn', ascending=False).iloc[0]
best_inertia = kmeans_elbow.sort_values('inertia', ascending=False).iloc[0]

kmeans_best = pd.DataFrame([best_silhouette, best_ch, best_db, best_dunn, best_inertia])
kmeans_best = kmeans_best.drop_duplicates().reset_index(drop=True)

In [18]:
kmeans_best

Unnamed: 0,model,params,n_clust,min_clust_size,max_clust_size,silhouette,calinski_harabasz,davies_bouldin,dunn,inertia
0,kmeans,"distance = manhattan, linkage = mean",3,117,980,0.242334,142.351508,2.403297,0.403745,22629.298954
1,kmeans,"distance = manhattan, linkage = medoid",2,277,938,0.231802,213.129413,2.080685,0.420379,23768.730029
2,kmeans,"distance = chebyshev, linkage = median",2,259,956,-0.018157,30.503825,4.555947,0.144715,27259.493964


## Simple kmeans

In [32]:
def do_kmeans(n):
    kmeans = KMeans(
        n_clusters=n,
        init='k-means++',
        n_init=25,
        random_state=42)
    pred_clust = kmeans.fit_predict(data)
    
    model = 'kmeans'
    params = 'centroid'
    
    return get_metrics(model, params, n, data, pred_clust)

scaler = StandardScaler()
data = scaler.fit_transform(data_n)

clust_range = range(1, max_clust+1)

results = Parallel(n_jobs=max_threads)(delayed(do_kmeans)(n) for n in clust_range)
kmeans_all = pd.DataFrame(results)

In [None]:
# Find best model for each combination of parameters through the Elbow method
def elbow_method(val_index):
    res = kmeans_all.dropna(subset=[val_index])

    x = res["n_clust"]
    y = res[val_index]

    if val_index == 'davies_bouldin':
        knee_locator = KneeLocator(x, y, curve='concave', direction='increasing')
    else:
        knee_locator = KneeLocator(x, y, curve='convex', direction='decreasing')
    
    return res[res["n_clust"] == knee_locator.knee]

kmeans_elbow = pd.DataFrame()

for val_index in val_indexes:
    best_mod = elbow_method(val_index)
    kmeans_elbow = pd.concat([kmeans_elbow, best_mod], ignore_index=True)

In [None]:
# Find absolute best models for each validity index
kmeans_elbow = kmeans_elbow.drop_duplicates().reset_index(drop=True)
# Need to add colums indicating which validity index is maximized.
# After that, duplicate models should be merged, not dropped.

best_silhouette = kmeans_elbow.sort_values('silhouette', ascending=False).iloc[0]
best_ch = kmeans_elbow.sort_values('calinski_harabasz', ascending=False).iloc[0]
best_db = kmeans_elbow.sort_values('davies_bouldin', ascending=True).iloc[0]
best_dunn = kmeans_elbow.sort_values('dunn', ascending=False).iloc[0]
best_inertia = kmeans_elbow.sort_values('inertia', ascending=False).iloc[0]

kmeans_best = pd.DataFrame([best_silhouette, best_ch, best_db, best_dunn, best_inertia])
kmeans_best = kmeans_best.drop_duplicates().reset_index(drop=True)

In [None]:
kmeans_best

# AHC

In [None]:
# Fit the models
def do_AHC(n, dist, link):
    ahc = AgglomerativeClustering(
        n_clusters = n,
        metric = dist,
        linkage = link)
    
    ahc.fit(data)
    pred_clust = ahc.labels_

    model = 'AHC'
    params = f"distance = {dist}, linkage = {link}"

    return get_metrics(model, params, n, data, pred_clust)

scaler = StandardScaler()
data = scaler.fit_transform(data_n)

clust_range = range(1, max_clust+1)
distances = ['manhattan', 'euclidean', 'chebyshev']
linkages = ['single', 'average', 'complete']
params = product(clust_range, distances, linkages)

results = Parallel(n_jobs=max_threads)(delayed(do_AHC)(n, dist, link) for n, dist, link in params)
results.extend([do_AHC(n, 'euclidean', 'ward') for n in clust_range])
ahc_all = pd.DataFrame(results)

In [None]:
# Find best model for each combination of parameters through the Elbow method
def elbow_method(dist, link, val_index):
    params = f"distance = {dist}, linkage = {link}"
    res = ahc_all[ahc_all['params'] == params]
    
    res = res.dropna(subset=[val_index])

    x = res["n_clust"]
    y = res[val_index]

    if val_index == 'davies_bouldin':
        knee_locator = KneeLocator(x, y, curve='concave', direction='increasing')
    else:
        knee_locator = KneeLocator(x, y, curve='convex', direction='decreasing')
    
    return res[res["n_clust"] == knee_locator.knee]

ahc_elbow = pd.DataFrame()

distances = ['manhattan', 'euclidean', 'chebyshev']
linkages = ['single', 'average', 'complete']
models = product(distances, linkages)

for dist, link in models:
    for val_index in val_indexes:
        best_mod = elbow_method(dist, link, val_index)
        ahc_elbow = pd.concat([ahc_elbow, best_mod], ignore_index=True)

In [None]:
# Find absolute best models for each validity index
ahc_elbow = ahc_elbow.drop_duplicates().reset_index(drop=True)
# Need to add colums indicating which validity index is maximized.
# After that, duplicate models should be merged, not dropped.

best_silhouette = ahc_elbow.sort_values('silhouette', ascending=False).iloc[0]
best_ch = ahc_elbow.sort_values('calinski_harabasz', ascending=False).iloc[0]
best_db = ahc_elbow.sort_values('davies_bouldin', ascending=True).iloc[0]
best_dunn = ahc_elbow.sort_values('dunn', ascending=False).iloc[0]
best_inertia = ahc_elbow.sort_values('inertia', ascending=False).iloc[0]

ahc_best = pd.DataFrame([best_silhouette, best_ch, best_db, best_dunn, best_inertia])
ahc_best = ahc_best.drop_duplicates().reset_index(drop=True)

In [None]:
ahc_best

AHC yields only one interesting model, where the smallest cluster is not nearly empty. This model have 4 clusters. But its biggest cluster gathers 85 % of the individuals, meaning the others are really small.

# HDBSCAN

In [None]:
# Fit the models
def do_hdbscan(dist, min_c, min_s):
    hdb = HDBSCAN(
        metric = dist,
        min_cluster_size = min_c, 
        min_samples = min_s)
        
    pred_clust = hdb.fit_predict(data)

    model = 'HDBSCAN'
    params = f"distance = {dist}, min_cluster_size = {min_c}, min_samples = {min_s}"
    n = len(set(pred_clust[pred_clust != -1]))
    noise_freq = 100 * sum(pred_clust == -1) / len(pred_clust)

    return get_metrics(model, params, n, data, pred_clust, noise = noise_freq)

scaler = StandardScaler()
data = scaler.fit_transform(data_n)

distances = ['euclidean', 'chebyshev']
min_cluster_sizes = range(2, 16)
min_samples_range = range(1, 16)
params = product(distances, min_cluster_sizes, min_samples_range)

results = Parallel(n_jobs=max_threads)(delayed(do_hdbscan)(dist, min_c, min_s) for dist, min_c, min_s in params)
hdbscan_all = pd.DataFrame(results)

In [None]:
# The Elbow method is inapplicable here. We simply select the model maximizing each validity index.
best_silhouette = hdbscan_all.sort_values('silhouette', ascending=False).iloc[0]
best_ch = hdbscan_all.sort_values('calinski_harabasz', ascending=False).iloc[0]
best_db = hdbscan_all.sort_values('davies_bouldin', ascending=True).iloc[0]
best_dunn = hdbscan_all.sort_values('dunn', ascending=False).iloc[0]
best_inertia = hdbscan_all.sort_values('inertia', ascending=False).iloc[0]

hdbscan_best = pd.DataFrame([best_silhouette, best_ch, best_db, best_dunn, best_inertia])
hdbscan_best = hdbscan_best.drop_duplicates().reset_index(drop=True)

In [None]:
hdbscan_best

HDBSCAN clusters all the individuals together. Based on density, there is only one cluster.

In [None]:
# Histogram of the number of clusters selected by HDBSCAN modelsgrouping the values above 15
bins = list(range(1, 16)) + [15.5]
labels = list(range(1, 15)) + ['15+']
plot_data = hdbscan_all['n_clust'].apply(lambda x: x if x <= 15 else 15.5)

plt.figure(figsize=(8, 4))
plt.hist(plot_data, bins=bins, edgecolor='black', align='left', rwidth=0.8)
plt.xticks(bins[:-1], labels)
plt.xlabel('Number of Clusters')
plt.ylabel('Number of Models')
plt.title('Number of clusters selected by HDBSCAN models')
plt.show()

In [None]:
# Fitting one of the best models for n=2
scaler = StandardScaler()
data = scaler.fit_transform(data_n)

hdb = HDBSCAN(metric = 'euclidean', min_cluster_size = 4, min_samples = 2)  
pred_clust = hdb.fit_predict(data)
n_clusters = len(set(pred_clust[pred_clust != -1]))

# Plotting datapoints and clusters in 2D space
plt.figure(figsize=(8, 6))

## PCA to define the 2D space
pca = PCA(n_components=2)
reduced_space = pca.fit_transform(data)

## Create hulls around clusters
hull_vertices = []
hull_colors = []
for i in range(n_clusters):
    cluster_points = reduced_space[pred_clust == i]
    if len(cluster_points) > 2:
        hull = ConvexHull(cluster_points)
        hull_vertices.append((
            cluster_points[hull.vertices, 0],
            cluster_points[hull.vertices, 1]
        ))
        hull_colors.append(i)

## Plot datapoints
scatter = plt.scatter(reduced_space[:, 0], reduced_space[:, 1], 
                     c=pred_clust, cmap='tab10', 
                     s=15, edgecolors='k')

## Plot hulls using the same colormap
for vertices, i in zip(hull_vertices, hull_colors):
    plt.fill(vertices[0], vertices[1], 
             alpha=0.7,
             color=scatter.cmap(scatter.norm(i)))

legend = plt.legend(*scatter.legend_elements())
plt.xlabel("Dim 1")
plt.ylabel("Dim 2")
plt.title("Clusters with Convex Hulls")
plt.show()

# Aggregate and display results

In [None]:
best_mod_list = [kmeans_best, ahc_best, hdbscan_best]
best_models = pd.concat(best_mod_list, ignore_index=True)

In [None]:
best_models

In [None]:
# Selecting the best performing model on each criteria across model classes eliminates hdbscan models
# Which could mean hdbscan is underperforming
# Or is picking non-convex clusters
# Or that data is non-clusterable!
best_mod_list = [kmeans_best, ahc_best, hdbscan_best]
best_models = pd.concat(best_mod_list, ignore_index=True)

best_silhouette = best_models.sort_values('silhouette', ascending=False).iloc[0]
best_ch = best_models.sort_values('calinski_harabasz', ascending=False).iloc[0]
best_db = best_models.sort_values('davies_bouldin', ascending=True).iloc[0]
best_dunn = best_models.sort_values('dunn', ascending=False).iloc[0]
best_inertia = best_models.sort_values('inertia', ascending=False).iloc[0]

best_models = pd.DataFrame([best_silhouette, best_ch, best_db, best_dunn, best_inertia])
best_models = best_models.drop_duplicates().reset_index(drop=True)

In [None]:
best_models

In [None]:
# Histogram
bins = np.arange(best_models['n_clust'].min() - 0.5, best_models['n_clust'].max() + 1.5, 1)

plt.figure(figsize=(8, 4))
plt.hist(best_models['n_clust'], bins=bins, edgecolor='black', rwidth=0.8)
plt.xlabel('Number of Clusters')
plt.ylabel('Number of Models')
plt.title('Optimal number of clusters according to best models')
plt.show()

# Clusters visualization 

In [None]:
# PCA to represent the clusters in 2D
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(data)

In [None]:
# Fit an arbitrary kmeans model
scaler = StandardScaler()
data = scaler.fit_transform(data_n)

kmeans = KMeans(n_clusters=7, random_state=42)
pred_clust = kmeans.fit_predict(data)

### Datapoints alone

In [None]:
plt.figure(figsize=(8, 6))
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=pred_clust, cmap='tab10', s=20, edgecolors='k')
plt.xlabel("Dim 1")
plt.ylabel("Dim 2")
plt.axhline(y=0, color='#333333', linestyle='--', linewidth=1)
plt.axvline(x=0, color='#333333', linestyle='--', linewidth=1)
plt.title("Clusters")
plt.show()

### With decision boundaries

In [None]:
# Create a grid for boundary visualization in 2D space
x_min, x_max = X_reduced[:, 0].min() - 0.5, X_reduced[:, 0].max() + 0.5
y_min, y_max = X_reduced[:, 1].min() - 0.5, X_reduced[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300), np.linspace(y_min, y_max, 300))

# Project grid points back to original space
grid_points_2D = np.c_[xx.ravel(), yy.ravel()]
grid_points_original = pca.inverse_transform(grid_points_2D)

# Predict clusters in the original space
grid_clusters = kmeans.predict(grid_points_original).reshape(xx.shape)

plt.figure(figsize=(8, 6))

# Create scatter plot first to get the color mapping
scatter = plt.scatter(X_reduced[:, 0], X_reduced[:, 1], 
                     c=pred_clust, cmap='tab10', 
                     s=15, edgecolors='k')

# Plot boundaries using the same colormap and normalization
plt.contourf(xx, yy, grid_clusters, 
             alpha=0.3, 
             cmap=scatter.cmap,
             norm=scatter.norm)

# Plot centroids with labels
centroids_pca = pca.transform(kmeans.cluster_centers_)
for i, (x, y) in enumerate(centroids_pca):
    plt.text(x, y, str(i), color='white', fontsize=12, 
             ha='center', va='center', fontweight='bold',
             bbox=dict(facecolor='black', edgecolor='none', boxstyle='round,pad=0.2'))

plt.xlabel("Dim 1")
plt.ylabel("Dim 2")
plt.title("Clusters with Decision Boundaries")
plt.show()

### With convex hulls

In [None]:
plt.figure(figsize=(8, 6))

# Collect all hull vertices
hull_vertices = []
hull_colors = []
for i in range(kmeans.n_clusters):
    cluster_points = X_reduced[pred_clust == i]
    if len(cluster_points) > 2:
        hull = ConvexHull(cluster_points)
        hull_vertices.append((
            cluster_points[hull.vertices, 0],
            cluster_points[hull.vertices, 1]
        ))
        hull_colors.append(i)

# Plot datapoints
scatter = plt.scatter(X_reduced[:, 0], X_reduced[:, 1], 
                     c=pred_clust, cmap='tab10', 
                     s=15, edgecolors='k')

# Plot all hulls using the same colormap
for vertices, i in zip(hull_vertices, hull_colors):
    plt.fill(vertices[0], vertices[1], 
             alpha=0.3,
             color=scatter.cmap(scatter.norm(i)))

legend = plt.legend(*scatter.legend_elements())
plt.xlabel("Dim 1")
plt.ylabel("Dim 2")
plt.title("Clusters with Convex Hulls")
plt.show()

# Hopkins Statistic

Function from the pyclustertend package, which could not be installed because its depencies are outdated.
See: https://pyclustertend.readthedocs.io/en/latest/_modules/pyclustertend/hopkins.html

In [None]:
from sklearn.neighbors import BallTree

def hopkins(data_frame, sampling_size):
    """Assess the clusterability of a dataset. A score between 0 and 1, a score around 0.5 express
    no clusterability and a score tending to 0 express a high cluster tendency.

    Parameters
    ----------
    data_frame : numpy array
        The input dataset
    sampling_size : int
        The sampling size which is used to evaluate the number of DataFrame.

    Returns
    ---------------------
    score : float
        The hopkins score of the dataset (between 0 and 1)
    """
    
    if type(data_frame) == np.ndarray:
        data_frame = pd.DataFrame(data_frame)

    # Sample n observations from D:P
    if sampling_size > data_frame.shape[0]:
        raise Exception(
            'The number of sample of sample is bigger than the shape of D')

    data_frame_sample = data_frame.sample(n=sampling_size)

    # Get the distance to their neirest neighbors in D:X
    tree = BallTree(data_frame, leaf_size=2)
    dist, _ = tree.query(data_frame_sample, k=2)
    data_frame_sample_distances_to_nearest_neighbours = dist[:, 1]

    # Randomly simulate n points with the same variation as in D:Q
    max_data_frame = data_frame.max()
    min_data_frame = data_frame.min()

    uniformly_selected_values_0 = np.random.uniform(min_data_frame[0], max_data_frame[0], sampling_size)
    uniformly_selected_values_1 = np.random.uniform(min_data_frame[1], max_data_frame[1], sampling_size)

    uniformly_selected_observations = np.column_stack((uniformly_selected_values_0, uniformly_selected_values_1))
    if len(max_data_frame) >= 2:
        for i in range(2, len(max_data_frame)):
            uniformly_selected_values_i = np.random.uniform(min_data_frame[i], max_data_frame[i], sampling_size)
            to_stack = (uniformly_selected_observations, uniformly_selected_values_i)
            uniformly_selected_observations = np.column_stack(to_stack)

    uniformly_selected_observations_df = pd.DataFrame(uniformly_selected_observations)

    # Get the distance to their neirest neighbors in D:Y
    tree = BallTree(data_frame, leaf_size=2)
    dist, _ = tree.query(uniformly_selected_observations_df, k=1)
    uniformly_df_distances_to_nearest_neighbours = dist

    # Return the hopkins score
    x = sum(data_frame_sample_distances_to_nearest_neighbours)
    y = sum(uniformly_df_distances_to_nearest_neighbours)

    if x + y == 0:
        raise Exception('The denominator of the hopkins statistics is null')

    return x / (x + y)[0]

In [None]:
float(hopkins(data_n.values, data_n.shape[0]))