In [None]:
# pip install torchmetrics
# pip install stepmix
# pip install kneed

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
from joblib import Parallel, delayed # for parallelization
from itertools import product

# Preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Clustering
from sklearn.cluster import KMeans, AgglomerativeClustering, HDBSCAN
from stepmix.stepmix import StepMix

# Evaluation
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import torch
from torchmetrics.clustering import DunnIndex
from kneed import KneeLocator

# Data, parameters and validity indexes

In [None]:
# Load the imputed data
data2004_i = pd.read_parquet("data/data2004_i.parquet")

# Outcome variable stems; every stem is selected below in a numeric ("_n")
# and a categorical ("_f") version, in this order.
_outcome_stems = [
    'clseusa', 'ambornin', 'amcit', 'amlived', 'amenglsh',
    'amchrstn', 'amgovt', 'amfeel', 'amcitizn', 'amshamed',
    'belikeus', 'ambetter', 'ifwrong', 'proudsss', 'proudgrp',
    'proudpol', 'prouddem', 'proudeco', 'proudspt', 'proudart',
    'proudhis', 'proudmil', 'proudsci',
]

# Dataset with numeric outcomes
data_n = data2004_i[[f"{stem}_n" for stem in _outcome_stems]]

# Dataset with categorical outcomes
data_f = data2004_i[[f"{stem}_f" for stem in _outcome_stems]]

# Dataset with controls
controls = data2004_i[[
    'sex', 'race_f', 'born_usa', 'party_fs', 'religstr_f',
    'reltrad_f', 'region_f',
]]

In [None]:
# Upper bound on the number of clusters explored. It is used as range(1, max_clust)
# in the model-fitting cells, so the largest size actually fitted is max_clust - 1.
max_clust = 12

In [None]:
# Custom score functions to avoid throwing errors when undefined
def sil_score(data, pred_clust):
    """Silhouette score of a partition, or NaN when it cannot be computed.

    Delegates to ``silhouette_score(data, pred_clust)`` and turns the
    ``ValueError`` it raises for unsupported inputs into ``np.nan``.
    """
    try:
        return silhouette_score(data, pred_clust)
    except ValueError:
        return np.nan

def ch_score(data, pred_clust):
    """Calinski-Harabasz score of a partition, or NaN when it cannot be computed.

    Delegates to ``calinski_harabasz_score(data, pred_clust)`` and turns the
    ``ValueError`` it raises for unsupported inputs into ``np.nan``.
    """
    try:
        return calinski_harabasz_score(data, pred_clust)
    except ValueError:
        return np.nan

def db_score(data, pred_clust):
    """Davies-Bouldin score of a partition, or NaN when it cannot be computed.

    Delegates to ``davies_bouldin_score(data, pred_clust)`` and turns the
    ``ValueError`` it raises for unsupported inputs into ``np.nan``.
    """
    try:
        return davies_bouldin_score(data, pred_clust)
    except ValueError:
        return np.nan

def dunn_score(data, pred_clust):
    """Dunn index of a partition via torchmetrics, or NaN if the metric raises.

    ``data`` is converted to a float32 tensor and ``pred_clust`` to an int64
    tensor before being handed to a fresh ``DunnIndex`` instance. Any exception
    raised while evaluating the metric is mapped to ``np.nan``.
    """
    features = torch.tensor(np.array(data), dtype=torch.float32)
    assignments = torch.tensor(pred_clust, dtype=torch.int64)
    metric = DunnIndex()

    try:
        return float(metric(features, assignments))
    except Exception:
        return np.nan

def inertia(data, labels):
    """Within-cluster sum of squared deviations from each cluster's mean.

    For every distinct value in ``labels`` the rows of ``data`` carrying that
    label are centred on their own mean, and the squared deviations are summed
    over all clusters. Every distinct label counts as a cluster.
    """
    points = np.asarray(data)

    total = 0
    for label in np.unique(labels):
        members = points[labels == label]
        deviations = members - np.mean(members, axis=0)
        total += np.sum(deviations ** 2)

    return total

In [None]:
# should add min and average cluster size

In [None]:
# Function to display the optimal number of clusters according to each validity index
def elbow_plot(df, val_index):
    """Plot a validity index against the number of clusters and mark the knee.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table with an ``n_clust`` column and a ``val_index`` column.
    val_index : str
        Name of the validity-index column to plot. Rows where it is missing
        are dropped before plotting.

    Notes
    -----
    The knee is searched with ``KneeLocator`` on a concave/increasing curve
    for ``'davies_bouldin'`` and on a convex/decreasing curve for every other
    index. When no knee is found (``knee`` is None) the curve is still drawn,
    only the vertical marker is omitted.
    """
    res = df.dropna(subset=[val_index])

    x = res["n_clust"]
    y = res[val_index]

    if val_index == 'davies_bouldin':
        knee_locator = KneeLocator(x, y, curve='concave', direction='increasing')
    else:
        knee_locator = KneeLocator(x, y, curve='convex', direction='decreasing')
    knee = knee_locator.knee

    plt.figure(figsize=(8, 5))
    plt.plot(x, y, marker="o", linestyle="-", label=val_index)
    # KneeLocator reports None when it cannot find a knee; axvline cannot draw at x=None.
    if knee is not None:
        plt.axvline(x=knee, color="r", linestyle="--", label=f"Optimal k={knee}")
    plt.xlabel("Number of Clusters")
    plt.ylabel(f"{val_index} index")
    plt.title(f"Elbow Method for {val_index} index")
    plt.legend()
    plt.show()

# Latent models
With the StepMix package

Documentation : https://github.com/Labo-Lacourse/stepmix

In [None]:
# Candidate numbers of latent components: 1 .. max_clust - 1 (range() excludes its stop value)
clust_range = range(1, max_clust)

## Without covariates

In [None]:
def do_StepMix(n, type, data, random_state=123):
    """Fit a StepMix model without covariates and score the resulting partition.

    Parameters
    ----------
    n : int
        Number of latent components.
    type : str
        Value passed to StepMix as ``measurement``. ``'categorical'`` is
        reported as 'LCA', anything else as 'LPA'. The name shadows the
        builtin ``type`` but is kept so existing calls keep working.
    data : array-like
        Indicators used for fitting, prediction and every validity index.
    random_state : int or None, default 123
        Seed forwarded to StepMix so repeated runs give the same result, in
        line with ``do_StepMix_covar``. Pass None for unseeded initialisation.

    Returns
    -------
    dict
        Model label, parameter label, ``n_clust``, AIC, BIC and the
        silhouette, Calinski-Harabasz, Davies-Bouldin, Dunn and inertia scores.
    """
    with warnings.catch_warnings():
        # Only FutureWarning is silenced; other warnings still surface.
        warnings.simplefilter("ignore", category=FutureWarning)

        model = StepMix(
            n_components = n,
            measurement = type,
            n_init = 3,
            random_state = random_state)

        model.fit(data)
        pred_clust = model.predict(data)

        return {
            'model': 'LCA' if type == 'categorical' else 'LPA',
            'params': 'no covariates',
            'n_clust': n,
            'aic': model.aic(data),
            'bic': model.bic(data),
            'silhouette': sil_score(data, pred_clust),
            'calinski_harabasz': ch_score(data, pred_clust),
            'davies_bouldin': db_score(data, pred_clust),
            'dunn': dunn_score(data, pred_clust),
            'inertia': inertia(data, pred_clust)
        }

# Latent class analysis: integer-encode each categorical outcome column,
# then fit one model per candidate number of components.
data = data_f.apply(lambda col: LabelEncoder().fit_transform(col))
lca_jobs = [delayed(do_StepMix)(n, 'categorical', data) for n in clust_range]
cat_results = Parallel(n_jobs=8)(lca_jobs)
LCA_all = pd.DataFrame(cat_results)

# Latent profile analysis on the numeric outcomes.
lpa_jobs = [delayed(do_StepMix)(n, 'continuous', data_n) for n in clust_range]
num_results = Parallel(n_jobs=8)(lpa_jobs)
LPA_all = pd.DataFrame(num_results)

In [None]:
# One elbow plot per validity index for the latent profile models
lpa_plot_indexes = ('silhouette', 'calinski_harabasz', 'davies_bouldin', 'dunn', 'inertia')
for val_index in lpa_plot_indexes:
    elbow_plot(LPA_all, val_index)

## With covariates

In [None]:
def do_StepMix_covar(n, type, data):
    """Fit a one-step StepMix model with covariates and score the partition.

    ``n`` is the number of components and ``type`` the StepMix ``measurement``
    ('categorical' is reported as 'LCA', anything else as 'LPA'). The
    structural model is 'covariate', configured by the notebook-level
    ``opt_params`` and fitted on the notebook-level ``controls_dum``.

    Returns a dict with the model label, 'with covariates', ``n_clust``, AIC,
    BIC and the five validity indexes computed on ``data``.
    """
    with warnings.catch_warnings():
        # Only FutureWarning is silenced.
        warnings.simplefilter("ignore", category=FutureWarning)

        estimator = StepMix(
            n_components=n,
            measurement=type,
            n_init=3,
            n_steps=1,
            structural='covariate',
            structural_params=opt_params,
            init_params='kmeans',
            random_state=123,
        )
        estimator.fit(data, controls_dum)

        # NOTE(review): covariates are passed to fit() only; predict(), aic() and
        # bic() below receive the indicators alone - confirm this is intended.
        labels = estimator.predict(data)
        model_label = 'LCA' if type == 'categorical' else 'LPA'

        summary = {
            'model': model_label,
            'params': 'with covariates',
            'n_clust': n,
            'aic': estimator.aic(data),
            'bic': estimator.bic(data),
        }
        summary['silhouette'] = sil_score(data, labels)
        summary['calinski_harabasz'] = ch_score(data, labels)
        summary['davies_bouldin'] = db_score(data, labels)
        summary['dunn'] = dunn_score(data, labels)
        summary['inertia'] = inertia(data, labels)
        return summary

# Settings forwarded to StepMix as structural_params by do_StepMix_covar
opt_params = dict(method='gradient', intercept=True, max_iter=2500)

# Dummy-coded controls, used as covariates by do_StepMix_covar
controls_dum = pd.get_dummies(controls)

# Latent class models with covariates on the integer-encoded categorical outcomes
data = data_f.apply(lambda col: LabelEncoder().fit_transform(col))
lca_covar_jobs = [delayed(do_StepMix_covar)(n, 'categorical', data) for n in clust_range]
cat_results = Parallel(n_jobs=8)(lca_covar_jobs)
LCA_covar_all = pd.DataFrame(cat_results)

# Latent profile models with covariates on the numeric outcomes
lpa_covar_jobs = [delayed(do_StepMix_covar)(n, 'continuous', data_n) for n in clust_range]
num_results = Parallel(n_jobs=8)(lpa_covar_jobs)
LPA_covar_all = pd.DataFrame(num_results)

## Best latent models

In [None]:
# Find best models according to absolute fit = min aic / bic

In [None]:
# Find best models according to relative fit = LRT / BLRT / BVR (LCA only)

In [None]:
# Find best model through the Elbow method
## Work in progress
def elbow_method(df, val_index):
    """Return the row(s) of ``df`` whose ``n_clust`` is the knee of ``val_index``.

    Rows where ``val_index`` is missing are dropped first. The knee is searched
    with ``KneeLocator`` on a concave/increasing curve for 'davies_bouldin' and
    on a convex/decreasing curve for every other index. An empty frame is
    returned when no usable rows remain or when no knee is found.
    """
    res = df.dropna(subset=[val_index])
    if res.empty:
        return res

    x = res["n_clust"]
    y = res[val_index]

    if val_index == 'davies_bouldin':
        knee_locator = KneeLocator(x, y, curve='concave', direction='increasing')
    else:
        knee_locator = KneeLocator(x, y, curve='convex', direction='decreasing')

    if knee_locator.knee is None:
        return res.iloc[0:0]
    return res[res["n_clust"] == knee_locator.knee]

models = [LCA_all, LPA_all]
val_indexes = ['silhouette', 'calinski_harabasz', 'davies_bouldin', 'dunn', 'inertia']
params = product(models, val_indexes)

# Collect the selected rows and concatenate once, instead of growing a
# DataFrame inside the loop starting from an empty frame.
best_rows = []
for model, val_index in params:
    best_model = elbow_method(model, val_index)
    if not best_model.empty:
        best_rows.append(best_model)

best_latent = pd.concat(best_rows, ignore_index=True) if best_rows else pd.DataFrame()

In [None]:
# Find absolute best models for each validity index

# K-means

In [None]:
# Standardize the numeric outcomes before k-means
scaler = StandardScaler()
data = scaler.fit_transform(data_n)  # read as a global by do_kmeans below

results = []  # placeholder; reassigned with the Parallel output later in this cell

def do_kmeans(n):
    """Fit k-means with ``n`` clusters on the notebook-level ``data`` and score it.

    Uses k-means++ initialisation, 25 restarts and a fixed random_state of 42.
    Returns a dict with the model label, parameter label, ``n_clust`` and the
    five validity indexes.
    """
    estimator = KMeans(n_clusters=n, init='k-means++', n_init=25, random_state=42)
    labels = estimator.fit_predict(data)

    scores = {
        'silhouette': sil_score(data, labels),
        'calinski_harabasz': ch_score(data, labels),
        'davies_bouldin': db_score(data, labels),
        'dunn': dunn_score(data, labels),
        'inertia': inertia(data, labels),
    }
    return {'model': 'kmeans', 'params': 'centroid', 'n_clust': n, **scores}

# Candidate numbers of clusters: 1 .. max_clust - 1
clust_range = range(1, max_clust)

# One k-means fit per candidate size, run in parallel
kmeans_jobs = [delayed(do_kmeans)(n) for n in clust_range]
results = Parallel(n_jobs=8)(kmeans_jobs)

kmeans_all = pd.DataFrame(results)

In [None]:
# Add other models, which are not implemented in sklearn

In [None]:
# Find best model for each combination of parameters through the Elbow method
def elbow_method(val_index):
    """Return the row(s) of ``kmeans_all`` whose ``n_clust`` is the knee of ``val_index``.

    Rows where ``val_index`` is missing are dropped first. The knee is searched
    with ``KneeLocator`` on a concave/increasing curve for 'davies_bouldin' and
    on a convex/decreasing curve for every other index. An empty frame is
    returned when no usable rows remain or when no knee is found.
    """
    res = kmeans_all.dropna(subset=[val_index])
    if res.empty:
        return res

    x = res["n_clust"]
    y = res[val_index]

    if val_index == 'davies_bouldin':
        knee_locator = KneeLocator(x, y, curve='concave', direction='increasing')
    else:
        knee_locator = KneeLocator(x, y, curve='convex', direction='decreasing')

    if knee_locator.knee is None:
        return res.iloc[0:0]
    return res[res["n_clust"] == knee_locator.knee]

val_indexes = ['silhouette', 'calinski_harabasz', 'davies_bouldin', 'dunn', 'inertia']

# Collect the selected rows and concatenate once, instead of growing a
# DataFrame inside the loop starting from an empty frame.
elbow_rows = []
for val_index in val_indexes:
    best_mod = elbow_method(val_index)
    if not best_mod.empty:
        elbow_rows.append(best_mod)

kmeans_elbow = pd.concat(elbow_rows, ignore_index=True) if elbow_rows else pd.DataFrame()

In [None]:
# Find absolute best models for each validity index
kmeans_elbow = kmeans_elbow.drop_duplicates().reset_index(drop=True)
# TODO: add columns indicating which validity index each row optimizes.
# After that, duplicate models should be merged, not dropped.

# Silhouette, Calinski-Harabasz and Dunn are better when higher; Davies-Bouldin
# and inertia (within-cluster sum of squares) are better when lower, so those
# two are sorted in ascending order.
best_silhouette = kmeans_elbow.sort_values('silhouette', ascending=False).iloc[0]
best_ch = kmeans_elbow.sort_values('calinski_harabasz', ascending=False).iloc[0]
best_db = kmeans_elbow.sort_values('davies_bouldin', ascending=True).iloc[0]
best_dunn = kmeans_elbow.sort_values('dunn', ascending=False).iloc[0]
best_inertia = kmeans_elbow.sort_values('inertia', ascending=True).iloc[0]

# One row per criterion; identical winners are collapsed
kmeans_best = pd.DataFrame([best_silhouette, best_ch, best_db, best_dunn, best_inertia])
kmeans_best = kmeans_best.drop_duplicates().reset_index(drop=True)

# AHC

In [None]:
# Fit the models
# Standardize the numeric outcomes before hierarchical clustering
scaler = StandardScaler()
data = scaler.fit_transform(data_n)  # read as a global by do_AHC below

results = []  # placeholder; reassigned with the Parallel output later in this cell

def do_AHC(n, dist, link):
    """Fit agglomerative clustering on the notebook-level ``data`` and score it.

    ``n`` is the number of clusters, ``dist`` the metric and ``link`` the
    linkage passed to ``AgglomerativeClustering``. Returns a dict with the
    model label, a parameter description, ``n_clust`` and the five validity
    indexes.
    """
    estimator = AgglomerativeClustering(n_clusters=n, metric=dist, linkage=link)
    labels = estimator.fit(data).labels_

    scores = {
        'silhouette': sil_score(data, labels),
        'calinski_harabasz': ch_score(data, labels),
        'davies_bouldin': db_score(data, labels),
        'dunn': dunn_score(data, labels),
        'inertia': inertia(data, labels),
    }
    return {
        'model': 'AHC',
        'params': f"distance = {dist}, linkage = {link}",
        'n_clust': n,
        **scores,
    }

# Grid: every candidate size crossed with every distance / linkage pair
clust_range = range(1, max_clust)
distances = ['manhattan', 'euclidean', 'chebyshev']
linkages = ['single', 'average', 'complete']
params = product(clust_range, distances, linkages)

ahc_jobs = [delayed(do_AHC)(n, dist, link) for n, dist, link in params]
results = Parallel(n_jobs=8)(ahc_jobs)

# Ward linkage is added separately, with the euclidean metric only
results += [do_AHC(n, 'euclidean', 'ward') for n in clust_range]

ahc_all = pd.DataFrame(results)

In [None]:
# Find best model for each combination of parameters through the Elbow method
def elbow_method(dist, link, val_index):
    """Return the row(s) of ``ahc_all`` at the knee of ``val_index`` for one AHC setup.

    The results are first restricted to the rows whose ``params`` label matches
    the given distance and linkage, then rows with a missing ``val_index`` are
    dropped. The knee is searched with ``KneeLocator`` on a concave/increasing
    curve for 'davies_bouldin' and on a convex/decreasing curve for every other
    index. An empty frame is returned when no usable rows remain or when no
    knee is found.
    """
    params = f"distance = {dist}, linkage = {link}"
    # Filter on ahc_all itself (the previous mask referenced an undefined name).
    res = ahc_all[ahc_all['params'] == params]

    res = res.dropna(subset=[val_index])
    if res.empty:
        return res

    x = res["n_clust"]
    y = res[val_index]

    if val_index == 'davies_bouldin':
        knee_locator = KneeLocator(x, y, curve='concave', direction='increasing')
    else:
        knee_locator = KneeLocator(x, y, curve='convex', direction='decreasing')

    if knee_locator.knee is None:
        return res.iloc[0:0]
    return res[res["n_clust"] == knee_locator.knee]

distances = ['manhattan', 'euclidean', 'chebyshev']
linkages = ['single', 'average', 'complete']
# NOTE(review): the euclidean/ward models stored in ahc_all are not part of
# this grid, so they are never considered here - confirm this is intended.
models = product(distances, linkages)

val_indexes = ['silhouette', 'calinski_harabasz', 'davies_bouldin', 'dunn', 'inertia']

# Collect the selected rows and concatenate once, instead of growing a
# DataFrame inside the loop starting from an empty frame.
elbow_rows = []
for dist, link in models:
    for val_index in val_indexes:
        best_mod = elbow_method(dist, link, val_index)
        if not best_mod.empty:
            elbow_rows.append(best_mod)

ahc_elbow = pd.concat(elbow_rows, ignore_index=True) if elbow_rows else pd.DataFrame()

In [None]:
# Find absolute best models for each validity index
ahc_elbow = ahc_elbow.drop_duplicates().reset_index(drop=True)
# TODO: add columns indicating which validity index each row optimizes.
# After that, duplicate models should be merged, not dropped.

# Silhouette, Calinski-Harabasz and Dunn are better when higher; Davies-Bouldin
# and inertia (within-cluster sum of squares) are better when lower, so those
# two are sorted in ascending order.
best_silhouette = ahc_elbow.sort_values('silhouette', ascending=False).iloc[0]
best_ch = ahc_elbow.sort_values('calinski_harabasz', ascending=False).iloc[0]
best_db = ahc_elbow.sort_values('davies_bouldin', ascending=True).iloc[0]
best_dunn = ahc_elbow.sort_values('dunn', ascending=False).iloc[0]
best_inertia = ahc_elbow.sort_values('inertia', ascending=True).iloc[0]

# One row per criterion; identical winners are collapsed
ahc_best = pd.DataFrame([best_silhouette, best_ch, best_db, best_dunn, best_inertia])
ahc_best = ahc_best.drop_duplicates().reset_index(drop=True)

# HDBSCAN

In [None]:
# Standardize the numeric outcomes before HDBSCAN
scaler = StandardScaler()
data = scaler.fit_transform(data_n)  # read as a global by do_hdbscan below

results = []  # placeholder; reassigned with the Parallel output later in this cell

def do_hdbscan(dist, min_c, min_s):
    """Fit HDBSCAN on the notebook-level ``data`` and score the partition.

    ``dist`` is the metric, ``min_c`` the ``min_cluster_size`` and ``min_s``
    the ``min_samples`` setting. The returned dict holds the model label, a
    parameter description, the number of clusters excluding the -1 label, the
    percentage of points labelled -1, and the five validity indexes.

    The validity indexes receive the raw labels, so points labelled -1 are
    scored as one additional group.
    """
    clusterer = HDBSCAN(metric=dist, min_cluster_size=min_c, min_samples=min_s)
    labels = clusterer.fit_predict(data)

    is_noise = labels == -1
    n_clusters = len(set(labels[~is_noise]))
    noise_freq = 100 * sum(is_noise) / len(labels)

    scores = {
        'silhouette': sil_score(data, labels),
        'calinski_harabasz': ch_score(data, labels),
        'davies_bouldin': db_score(data, labels),
        'dunn': dunn_score(data, labels),
        'inertia': inertia(data, labels),
    }
    return {
        'model': 'HDBSCAN',
        'params': f"distance = {dist}, min_cluster_size = {min_c}, min_samples = {min_s}",
        'n_clust': n_clusters,
        'noise': noise_freq,
        **scores,
    }

# Hyper-parameter grid: metric x min_cluster_size (2..15) x min_samples (1..15)
distances = ['euclidean', 'chebyshev']
min_cluster_sizes = range(2, 16)
min_samples_range = range(1, 16)
params = product(distances, min_cluster_sizes, min_samples_range)

# One HDBSCAN fit per grid point, run in parallel
hdbscan_jobs = [delayed(do_hdbscan)(*combo) for combo in params]
results = Parallel(n_jobs=8)(hdbscan_jobs)

hdbscan_all = pd.DataFrame(results)

In [None]:
# The Elbow method is inapplicable here. We simply select the model maximizing each validity index.
# Silhouette, Calinski-Harabasz and Dunn are better when higher; Davies-Bouldin
# and inertia (within-cluster sum of squares) are better when lower, so those
# two are sorted in ascending order.
best_silhouette = hdbscan_all.sort_values('silhouette', ascending=False).iloc[0]
best_ch = hdbscan_all.sort_values('calinski_harabasz', ascending=False).iloc[0]
best_db = hdbscan_all.sort_values('davies_bouldin', ascending=True).iloc[0]
best_dunn = hdbscan_all.sort_values('dunn', ascending=False).iloc[0]
best_inertia = hdbscan_all.sort_values('inertia', ascending=True).iloc[0]

# One row per criterion; identical winners are collapsed
hdbscan_best = pd.DataFrame([best_silhouette, best_ch, best_db, best_dunn, best_inertia])
hdbscan_best = hdbscan_best.drop_duplicates().reset_index(drop=True)

In [None]:
# Histogram grouping the values above 15
# Bin edges 1..15 plus a closing edge at 15.5; counts above 15 are mapped to
# 15.5 so they land in the last bin, which is labelled '15+'.
bins = [*range(1, 16), 15.5]
labels = [*range(1, 15), '15+']
plot_data = hdbscan_all['n_clust'].apply(lambda x: x if x <= 15 else 15.5)

fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(plot_data, bins=bins, edgecolor='black', align='left', rwidth=0.8)
ax.set_xticks(bins[:-1])
ax.set_xticklabels(labels)
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Number of Models')
ax.set_title('Number of clusters selected by HDBSCAN models')
plt.show()

# Aggregate and display results

In [None]:
# Stack the per-method winners (k-means, AHC, HDBSCAN) into a single table
best_mod_list = [kmeans_best, ahc_best, hdbscan_best]
best_models = pd.concat(best_mod_list, ignore_index=True)

In [None]:
best_models

In [None]:
# Selecting the best performing model on each criteria across model classes eliminates hdbscan models
# Which could mean hdbscan is underperforming
# Or is picking non-convex clusters
# Or that data is non-clusterable!
# Stack the per-method winners, then keep the best row per criterion across methods
best_mod_list = [kmeans_best, ahc_best, hdbscan_best]
best_models = pd.concat(best_mod_list, ignore_index=True)

# Silhouette, Calinski-Harabasz and Dunn are better when higher; Davies-Bouldin
# and inertia (within-cluster sum of squares) are better when lower, so those
# two are sorted in ascending order.
best_silhouette = best_models.sort_values('silhouette', ascending=False).iloc[0]
best_ch = best_models.sort_values('calinski_harabasz', ascending=False).iloc[0]
best_db = best_models.sort_values('davies_bouldin', ascending=True).iloc[0]
best_dunn = best_models.sort_values('dunn', ascending=False).iloc[0]
best_inertia = best_models.sort_values('inertia', ascending=True).iloc[0]

# One row per criterion; identical winners are collapsed
best_models = pd.DataFrame([best_silhouette, best_ch, best_db, best_dunn, best_inertia])
best_models = best_models.drop_duplicates().reset_index(drop=True)

In [None]:
best_models

In [None]:
# Histogram
# Unit-width bins centred on each integer number of clusters
n_clust_best = best_models['n_clust']
bins = np.arange(n_clust_best.min() - 0.5, n_clust_best.max() + 1.5, 1)

fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(n_clust_best, bins=bins, edgecolor='black', rwidth=0.8)
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Number of Models')
ax.set_title('Optimal number of clusters according to best models')
plt.show()

# Clusters visualization 

In [None]:
from sklearn.decomposition import PCA
from scipy.spatial import ConvexHull

In [None]:
# PCA to represent the clusters in 2D
# NOTE: `data` is not defined in this cell. In top-to-bottom execution it is the
# standardized numeric outcome matrix left over from the HDBSCAN section.
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(data)

## For kmeans

In [None]:
# Fit an arbitrary model
# Standardize the numeric outcomes
scaler = StandardScaler()
data = scaler.fit_transform(data_n)

# n_clusters=7 is an arbitrary choice for illustration; n_init is left at its default
kmeans = KMeans(n_clusters=7, random_state=42)
pred_clust = kmeans.fit_predict(data)  # labels used to colour the plots below

### Datapoints alone

In [None]:
# Scatter of the two PCA dimensions, coloured by cluster label
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=pred_clust, cmap='tab10', s=20, edgecolors='k')
ax.set_xlabel("Dim 1")
ax.set_ylabel("Dim 2")
# Dashed reference lines through the origin
ax.axhline(y=0, color='#333333', linestyle='--', linewidth=1)
ax.axvline(x=0, color='#333333', linestyle='--', linewidth=1)
ax.set_title("Clusters")
plt.show()

### With decision boundaries

In [None]:
# Create a grid for boundary visualization in 2D space
# Bounding box of the projected points, padded by 0.5 on every side
dim1, dim2 = X_reduced[:, 0], X_reduced[:, 1]
x_min, x_max = dim1.min() - 0.5, dim1.max() + 0.5
y_min, y_max = dim2.min() - 0.5, dim2.max() + 0.5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300), np.linspace(y_min, y_max, 300))

# Map every grid node back to the original feature space
grid_points_2D = np.column_stack([xx.ravel(), yy.ravel()])
grid_points_original = pca.inverse_transform(grid_points_2D)

# Assign each grid node to its k-means cluster
grid_clusters = kmeans.predict(grid_points_original).reshape(xx.shape)

fig, ax = plt.subplots(figsize=(8, 6))

# The scatter is drawn first so that its colormap and normalization can be reused
scatter = ax.scatter(dim1, dim2, c=pred_clust, cmap='tab10', s=15, edgecolors='k')

# Shade the regions with the same colormap and normalization as the points
ax.contourf(xx, yy, grid_clusters, alpha=0.3, cmap=scatter.cmap, norm=scatter.norm)

# Label each centroid with its cluster index
centroids_pca = pca.transform(kmeans.cluster_centers_)
label_box = dict(facecolor='black', edgecolor='none', boxstyle='round,pad=0.2')
for idx, (cx, cy) in enumerate(centroids_pca):
    ax.text(cx, cy, str(idx), color='white', fontsize=12,
            ha='center', va='center', fontweight='bold', bbox=label_box)

ax.set_xlabel("Dim 1")
ax.set_ylabel("Dim 2")
ax.set_title("Clusters with Decision Boundaries")
plt.show()

### With convex hulls

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))

# Outline of each cluster in the PCA plane (clusters with fewer than 3 points are skipped)
hull_vertices = []
hull_colors = []
for cluster_id in range(kmeans.n_clusters):
    members = X_reduced[pred_clust == cluster_id]
    if len(members) <= 2:
        continue
    outline = members[ConvexHull(members).vertices]
    hull_vertices.append((outline[:, 0], outline[:, 1]))
    hull_colors.append(cluster_id)

# Points first, so the hulls can reuse the scatter's colormap and normalization
scatter = ax.scatter(X_reduced[:, 0], X_reduced[:, 1],
                     c=pred_clust, cmap='tab10',
                     s=15, edgecolors='k')

# Fill every hull with the colour of its cluster
for (xs, ys), cluster_id in zip(hull_vertices, hull_colors):
    ax.fill(xs, ys, alpha=0.3, color=scatter.cmap(scatter.norm(cluster_id)))

legend = ax.legend(*scatter.legend_elements())
ax.set_xlabel("Dim 1")
ax.set_ylabel("Dim 2")
ax.set_title("Clusters with Convex Hulls")
plt.show()

## For HDBSCAN
Example of non-convex clusters in the PCA space

In [None]:
# Example of non-convex clusters in the PCA space
# Standardize the numeric outcomes
scaler = StandardScaler()
data = scaler.fit_transform(data_n)

# NOTE: pred_clust and n_clusters are reassigned by the next cell ("Best-performing
# model"); when both cells are run in order, the hull plot shows that later model.
hdb = HDBSCAN(min_cluster_size = 5, min_samples = 1)  
pred_clust = hdb.fit_predict(data)
n_clusters = len(set(pred_clust[pred_clust != -1]))  # label -1 is excluded from the count

In [None]:
# Best-performing model
# Standardize the numeric outcomes
scaler = StandardScaler()
data = scaler.fit_transform(data_n)

# Refit with the selected hyper-parameters; the labels feed the hull plot in the next cell
hdb = HDBSCAN(metric = 'euclidean', min_cluster_size = 4, min_samples = 2)  
pred_clust = hdb.fit_predict(data)
n_clusters = len(set(pred_clust[pred_clust != -1]))  # label -1 is excluded from the count

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))

# Outline of each cluster in the PCA plane (clusters with fewer than 3 points are skipped);
# only labels 0 .. n_clusters - 1 are visited, so points labelled -1 get no hull.
hull_vertices = []
hull_colors = []
for cluster_id in range(n_clusters):
    members = X_reduced[pred_clust == cluster_id]
    if len(members) <= 2:
        continue
    outline = members[ConvexHull(members).vertices]
    hull_vertices.append((outline[:, 0], outline[:, 1]))
    hull_colors.append(cluster_id)

# Points first, so the hulls can reuse the scatter's colormap and normalization
scatter = ax.scatter(X_reduced[:, 0], X_reduced[:, 1],
                     c=pred_clust, cmap='tab10',
                     s=15, edgecolors='k')

# Fill every hull with the colour of its cluster
for (xs, ys), cluster_id in zip(hull_vertices, hull_colors):
    ax.fill(xs, ys, alpha=0.7, color=scatter.cmap(scatter.norm(cluster_id)))

legend = ax.legend(*scatter.legend_elements())
ax.set_xlabel("Dim 1")
ax.set_ylabel("Dim 2")
ax.set_title("Clusters with Convex Hulls")
plt.show()