In [None]:
# pip install torchmetrics
# pip install stepmix

In [None]:
import pandas as pd
import numpy as np
import warnings

from joblib import Parallel, delayed # for parallelization

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.cluster import KMeans, HDBSCAN

import torch
from torchmetrics.clustering import DunnIndex

from stepmix.stepmix import StepMix

# Data, parameters and performance metrics

In [None]:
# Load the imputed data
data2004_i = pd.read_parquet("data/data2004_i.parquet")

# Column names of the three variable groups used throughout the notebook
numeric_cols = [
    'clseusa_n', 'ambornin_n', 'amcit_n', 'amlived_n', 'amenglsh_n',
    'amchrstn_n', 'amgovt_n', 'amfeel_n', 'amcitizn_n', 'amshamed_n',
    'belikeus_n', 'ambetter_n', 'ifwrong_n', 'proudsss_n', 'proudgrp_n',
    'proudpol_n', 'prouddem_n', 'proudeco_n', 'proudspt_n', 'proudart_n',
    'proudhis_n', 'proudmil_n', 'proudsci_n',
]
categorical_cols = [
    'clseusa_f', 'ambornin_f', 'amcit_f', 'amlived_f', 'amenglsh_f',
    'amchrstn_f', 'amgovt_f', 'amfeel_f', 'amcitizn_f', 'amshamed_f',
    'belikeus_f', 'ambetter_f', 'ifwrong_f', 'proudsss_f', 'proudgrp_f',
    'proudpol_f', 'prouddem_f', 'proudeco_f', 'proudspt_f', 'proudart_f',
    'proudhis_f', 'proudmil_f', 'proudsci_f',
]
control_cols = [
    'sex', 'race_f', 'born_usa', 'party_fs', 'religstr_f',
    'reltrad_f', 'region_f',
]

# Dataset with numeric outcomes
data_n = data2004_i.loc[:, numeric_cols]

# Dataset with categorical outcomes
data_f = data2004_i.loc[:, categorical_cols]

# Dataset with controls
controls = data2004_i.loc[:, control_cols]

In [None]:
# Exclusive upper bound on the number of clusters explored: the searches below
# iterate over range(..., max_clust), so the largest value tried is max_clust - 1.
max_clust = 12

In [None]:
# Custom score functions to avoid throwing errors when undefined
def sil_score(data, pred_clust):
    """Silhouette score of a partition, or NaN when it is undefined.

    Parameters
    ----------
    data : array-like
        Observations, passed unchanged to `silhouette_score`.
    pred_clust : array-like
        Cluster label of each observation.

    Returns
    -------
    float
        The silhouette score, or `np.nan` if `silhouette_score` raises a
        ValueError.
    """
    try:
        return silhouette_score(data, pred_clust)
    except ValueError:
        return np.nan

def ch_score(data, pred_clust):
    """Calinski-Harabasz score of a partition, or NaN when it is undefined.

    Parameters
    ----------
    data : array-like
        Observations, passed unchanged to `calinski_harabasz_score`.
    pred_clust : array-like
        Cluster label of each observation.

    Returns
    -------
    float
        The Calinski-Harabasz score, or `np.nan` if
        `calinski_harabasz_score` raises a ValueError.
    """
    try:
        return calinski_harabasz_score(data, pred_clust)
    except ValueError:
        return np.nan

def db_score(data, pred_clust):
    """Davies-Bouldin score of a partition, or NaN when it is undefined.

    Parameters
    ----------
    data : array-like
        Observations, passed unchanged to `davies_bouldin_score`.
    pred_clust : array-like
        Cluster label of each observation.

    Returns
    -------
    float
        The Davies-Bouldin score, or `np.nan` if `davies_bouldin_score`
        raises a ValueError.
    """
    try:
        return davies_bouldin_score(data, pred_clust)
    except ValueError:
        return np.nan

def dunn_score(data, pred_clust):
    """Dunn index of a partition (torchmetrics `DunnIndex`), or NaN on failure.

    Parameters
    ----------
    data : array-like
        Observations; converted to a float32 tensor.
    pred_clust : array-like
        Cluster label of each observation; converted to an int64 tensor.

    Returns
    -------
    float
        The Dunn index, or `np.nan` if evaluating the metric raises any
        exception. Errors raised while converting the inputs to tensors are
        not caught.
    """
    points = torch.tensor(np.array(data), dtype=torch.float32)
    assignments = torch.tensor(pred_clust, dtype=torch.int64)
    metric = DunnIndex()

    try:
        return float(metric(points, assignments))
    except Exception:
        return np.nan

def inertia(data, labels):
    """Within-cluster sum of squared deviations from each cluster's centroid.

    Parameters
    ----------
    data : array-like
        Observations (one row per observation); converted with `np.asarray`.
    labels : array-like
        Cluster label of each observation.

    Returns
    -------
    number
        Sum over clusters of the squared distances between the cluster's
        points and their mean. Returns 0 when there are no labels.
    """
    points = np.asarray(data)

    total = 0
    for label in np.unique(labels):
        members = points[labels == label]
        deviations = members - members.mean(axis=0)
        total += np.square(deviations).sum()

    return total

In [None]:
# List intended to collect the best model of each method once KneeLocator has
# been applied (it is only initialised here; nothing is appended to it yet)
best_models = []

# Latent models
With the StepMix package

Documentation : https://github.com/Labo-Lacourse/stepmix

In [None]:
# Numbers of latent components to fit: 1, ..., max_clust - 1 (upper bound excluded)
clust_range = range(1, max_clust)

## Without covariates

In [None]:
def do_StepMix(n, type, data, random_state=123):
    """Fit a StepMix latent model with `n` components and score the partition.

    Parameters
    ----------
    n : int
        Number of latent components.
    type : str
        StepMix measurement model (this notebook uses 'categorical' and
        'continuous'). The name shadows the builtin `type`; it is kept so
        existing calls keep working.
    data : DataFrame or array
        Indicator variables the model is fitted on.
    random_state : int or None, default 123
        Seed forwarded to StepMix so repeated runs give the same results.
        The default matches the seed used in `do_StepMix_covar`.

    Returns
    -------
    dict
        Number of components, AIC, BIC, and the clustering validity indices
        computed on the predicted class memberships.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)

        model = StepMix(
            n_components = n,
            measurement = type,
            n_init = 3,
            random_state = random_state)

        model.fit(data)
        pred_clust = model.predict(data)

        return {
            'n_components': n,
            'aic': model.aic(data),
            'bic': model.bic(data),
            'silhouette': sil_score(data, pred_clust),
            'calinski_harabasz': ch_score(data, pred_clust),
            'davies_bouldin': db_score(data, pred_clust),
            'dunn': dunn_score(data, pred_clust),
            'inertia': inertia(data, pred_clust)
        }

# Encode the levels of every categorical indicator with LabelEncoder
data = data_f.apply(lambda col: LabelEncoder().fit_transform(col))

# Latent class analysis: categorical indicators, one fit per number of classes
cat_res = Parallel(n_jobs=8)(
    delayed(do_StepMix)(n_classes, 'categorical', data)
    for n_classes in clust_range
)
LCA_res = pd.DataFrame(cat_res)

# Latent profile analysis: numeric indicators, one fit per number of profiles
num_res = Parallel(n_jobs=8)(
    delayed(do_StepMix)(n_profiles, 'continuous', data_n)
    for n_profiles in clust_range
)
LPA_res = pd.DataFrame(num_res)

In [None]:
# apply kneelocator to LCA_res and LPA_res here (for each performance metrics)

## With covariates

In [None]:
def do_StepMix_covar(n, type, data):
    """Fit a one-step StepMix model with covariates and score the partition.

    The structural (covariate) model uses the module-level `controls_dum`
    as covariates and `opt_params` as its optimisation settings; both must
    be defined before this function is called.

    Parameters
    ----------
    n : int
        Number of latent components.
    type : str
        StepMix measurement model (this notebook uses 'categorical' and
        'continuous'). The name shadows the builtin `type`; it is kept so
        existing calls keep working.
    data : DataFrame or array
        Indicator variables of the measurement model.

    Returns
    -------
    dict
        Number of components, AIC, BIC, and the clustering validity indices
        computed on the predicted class memberships.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)

        model = StepMix(
            n_components = n,
            measurement = type,
            n_init = 3,
            n_steps = 1,
            structural = 'covariate',
            structural_params = opt_params,
            init_params = 'kmeans',
            random_state = 123)

        model.fit(data, controls_dum)
        # The covariates are passed again here so that class memberships and
        # information criteria are evaluated on the model that was fitted
        # (measurement + covariate part), not on the measurement part alone.
        pred_clust = model.predict(data, controls_dum)

        return {
            'n_components': n,
            'aic': model.aic(data, controls_dum),
            'bic': model.bic(data, controls_dum),
            'silhouette': sil_score(data, pred_clust),
            'calinski_harabasz': ch_score(data, pred_clust),
            'davies_bouldin': db_score(data, pred_clust),
            'dunn': dunn_score(data, pred_clust),
            'inertia': inertia(data, pred_clust)
        }

# Settings of the structural (covariate) model, read by do_StepMix_covar
opt_params = dict(method='gradient', intercept=True, max_iter=2500)

# Dummy-coded controls used as covariates, read by do_StepMix_covar
controls_dum = pd.get_dummies(controls)

# Encode the levels of every categorical indicator with LabelEncoder
data = data_f.apply(lambda col: LabelEncoder().fit_transform(col))

# Latent class analysis with covariates, one fit per number of classes
cat_res = Parallel(n_jobs=8)(
    delayed(do_StepMix_covar)(n_classes, 'categorical', data)
    for n_classes in clust_range
)
LCA_covar_res = pd.DataFrame(cat_res)

# Latent profile analysis with covariates, one fit per number of profiles
num_res = Parallel(n_jobs=8)(
    delayed(do_StepMix_covar)(n_profiles, 'continuous', data_n)
    for n_profiles in clust_range
)
LPA_covar_res = pd.DataFrame(num_res)

In [None]:
# apply kneelocator to LCA_covar_res and LPA_covar_res here

# K-means

In [None]:
# Standardize the numeric outcomes before clustering
scaler = StandardScaler()
data = scaler.fit_transform(data_n)

results = []

# Every (n_init, n_clusters) pair, in the order two nested loops would visit them
kmeans_grid = [
    (n_init, n_clusters)
    for n_init in [5, 10, 20]
    for n_clusters in range(2, max_clust)
]

for n_init, n_clusters in kmeans_grid:
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', n_init=n_init, random_state=42)
    pred_clust = kmeans.fit_predict(data)

    # One result row per fitted model: hyperparameters first, then the validity indices
    row = {'n_clusters': n_clusters, 'n_init': n_init}
    row['silhouette'] = sil_score(data, pred_clust)
    row['calinski_harabasz'] = ch_score(data, pred_clust)
    row['davies_bouldin'] = db_score(data, pred_clust)
    row['dunn'] = dunn_score(data, pred_clust)
    row['inertia'] = inertia(data, pred_clust)
    results.append(row)

    # kneelocator here
    # add results to best_models

kmeans_res = pd.DataFrame(results)

In [None]:
# Best K-means row per index: (column, sort ascending?).
# Davies-Bouldin is the only index where lower is better.
best_silhouette, best_calinski, best_davies = (
    kmeans_res.sort_values(metric, ascending=lower_is_better).iloc[0]
    for metric, lower_is_better in [
        ('silhouette', False),
        ('calinski_harabasz', False),
        ('davies_bouldin', True),
    ]
)

In [None]:
# Example to compute the average inertia across models with the same number of clusters
# kmeans_res.groupby('n_clusters')['inertia'].mean()

# AHC

# HDBSCAN

In [None]:
# Standardize the numeric outcomes before clustering
scaler = StandardScaler()
data = scaler.fit_transform(data_n)

min_cluster_sizes = range(2, 16)
min_samples_range = range(1, 16)

# min_samples_range = [5, 10, 20, 50]
# min_cluster_sizes = [5, 10, 20, 30, 40, 50, 60]

results = []

for min_cluster_size in min_cluster_sizes:
    for min_samples in min_samples_range:
        hdb = HDBSCAN(
            min_cluster_size = min_cluster_size,
            min_samples = min_samples)

        pred_clust = hdb.fit_predict(data)

        # HDBSCAN labels noise points -1. Noise is not a cluster, so it is
        # excluded from the validity indices (otherwise it would be scored as
        # one extra, very spread-out cluster) and reported separately below.
        is_noise = pred_clust == -1
        clustered_data = data[~is_noise]
        clustered_labels = pred_clust[~is_noise]

        n_clusters = len(np.unique(clustered_labels))
        noise_freq = 100 * is_noise.mean()  # share of observations labelled as noise, in %

        results.append({
            'min_cluster_size': min_cluster_size,
            'min_samples': min_samples,
            'n_clusters': n_clusters,
            'noise': noise_freq,
            'silhouette': sil_score(clustered_data, clustered_labels),
            'calinski_harabasz': ch_score(clustered_data, clustered_labels),
            'davies_bouldin': db_score(clustered_data, clustered_labels),
            'dunn': dunn_score(clustered_data, clustered_labels),
            'inertia': inertia(clustered_data, clustered_labels)
        })

hdbscan_res = pd.DataFrame(results)

In [None]:
# Best HDBSCAN row per index: (column, sort ascending?).
# Davies-Bouldin is the only index where lower is better.
best_silhouette, best_calinski, best_davies = (
    hdbscan_res.sort_values(metric, ascending=lower_is_better).iloc[0]
    for metric, lower_is_better in [
        ('silhouette', False),
        ('calinski_harabasz', False),
        ('davies_bouldin', True),
    ]
)

In [None]:
# apply kneelocator to hdbscan_res here

In [None]:
# Distinct numbers of clusters found across the HDBSCAN hyperparameter grid
hdbscan_res['n_clusters'].unique()

# Spectral clustering

In [None]:
from sklearn.cluster import SpectralClustering
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.neighbors import kneighbors_graph

results = []
affinity_methods = ['nearest_neighbors', 'rbf']  # Different affinity computations
kernel_params = [0.1, 0.5, 1.0, 2.0]  # Different gamma values for RBF kernel

# Standardize the raw numeric outcomes. Scaling `data_n` (rather than whatever
# `data` a previous cell left behind) keeps this cell independent of run order.
scaler = StandardScaler()
data = scaler.fit_transform(data_n)

for n_clusters in range(2, max_clust):
    for affinity in affinity_methods:
        # gamma only parameterises the RBF kernel. The nearest-neighbours
        # affinity is therefore fitted once per n_clusters (with the
        # estimator's own neighbour graph) and its gamma is recorded as NaN.
        gammas = kernel_params if affinity == 'rbf' else [np.nan]

        for gamma in gammas:
            kernel_kwargs = {'gamma': gamma} if affinity == 'rbf' else {}

            spectral = SpectralClustering(
                n_clusters = n_clusters,
                affinity = affinity,
                random_state = 42,
                **kernel_kwargs
            )

            pred_clust = spectral.fit_predict(data)

            unique_clusters = len(np.unique(pred_clust))

            results.append({
                'n_clusters': n_clusters,                # number of clusters requested
                'affinity': affinity,
                'gamma': gamma,
                'n_clusters_found': unique_clusters,     # number of distinct labels returned
                'silhouette': sil_score(data, pred_clust),
                'calinski_harabasz': ch_score(data, pred_clust),
                'davies_bouldin': db_score(data, pred_clust),
                'dunn': dunn_score(data, pred_clust),
                'inertia': inertia(data, pred_clust)
            })

spec_res = pd.DataFrame(results)

# Best row per index (Davies-Bouldin: lower is better)
best_silhouette = spec_res.sort_values('silhouette', ascending=False).iloc[0]
best_calinski = spec_res.sort_values('calinski_harabasz', ascending=False).iloc[0]
best_davies = spec_res.sort_values('davies_bouldin', ascending=True).iloc[0]

# Affinity propagation

In [None]:
from sklearn.cluster import AffinityPropagation

# Standardize the numeric outcomes, as done for the other distance-based
# methods in this notebook (K-means, HDBSCAN, spectral clustering)
scaler = StandardScaler()
data = scaler.fit_transform(data_n)

# random_state fixes the starting state so the run is reproducible
model = AffinityPropagation(damping=0.7, max_iter=350, convergence_iter=25, random_state=42)
model.fit(data)
pred_clust = model.labels_

# Single result row: number of clusters found and the validity indices
results = [{
    'n_components': len(set(pred_clust)),
    'silhouette': sil_score(data, pred_clust),
    'calinski_harabasz': ch_score(data, pred_clust),
    'davies_bouldin': db_score(data, pred_clust),
    'dunn': dunn_score(data, pred_clust),
    'inertia': inertia(data, pred_clust)
}]

af_res = pd.DataFrame(results)

# Aggregate results

In [None]:
# Results dataframes to merge, after harmonizing their structure
# (or unnecessary, if the best models are already stored in a dedicated dataframe?)
# LCA_res
# LPA_res
# LCA_covar_res
# LPA_covar_res
# kmeans_res
# hdbscan_res

# Visualization for kmeans

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from scipy.spatial import ConvexHull

In [None]:
# PCA to represent the clusters in 2D.
# The projection is fitted on the standardized numeric outcomes, i.e. the same
# feature space the visualised K-means / HDBSCAN models are fitted on. This is
# required for `pca.inverse_transform` -> `kmeans.predict` and for
# `pca.transform(kmeans.cluster_centers_)` to be meaningful, and it makes the
# cell independent of whatever `data` a previous cell left behind.
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(StandardScaler().fit_transform(data_n))

In [None]:
# Fit an arbitrary model
scaler = StandardScaler()
data = scaler.fit_transform(data_n)

kmeans = KMeans(n_clusters=7, random_state=42)
pred_clust = kmeans.fit_predict(data)

## Datapoints alone

In [None]:
# Observations in the PCA plane, coloured by cluster
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=pred_clust, cmap='tab10', s=20, edgecolors='k')
ax.set_xlabel("Dim 1")
ax.set_ylabel("Dim 2")

# Dashed reference lines through the origin of the PCA plane
reference_style = dict(color='#333333', linestyle='--', linewidth=1)
ax.axhline(y=0, **reference_style)
ax.axvline(x=0, **reference_style)

ax.set_title("Clusters")
plt.show()

## With decision boundaries

In [None]:
# Create a grid for boundary visualization in 2D space
x_min, x_max = X_reduced[:, 0].min() - 0.5, X_reduced[:, 0].max() + 0.5
y_min, y_max = X_reduced[:, 1].min() - 0.5, X_reduced[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300), np.linspace(y_min, y_max, 300))

# Project grid points back to original space
grid_points_2D = np.c_[xx.ravel(), yy.ravel()]
grid_points_original = pca.inverse_transform(grid_points_2D)

# Predict clusters in the original space
grid_clusters = kmeans.predict(grid_points_original).reshape(xx.shape)

plt.figure(figsize=(8, 6))

# Create scatter plot first to get the color mapping
scatter = plt.scatter(X_reduced[:, 0], X_reduced[:, 1],
                     c=pred_clust, cmap='tab10',
                     s=15, edgecolors='k')

# Plot boundaries using the same colormap and normalization.
# Cluster labels are integers 0..k-1, so the contour levels are placed at the
# half-integers -0.5, 0.5, ..., k-0.5: each filled band then contains exactly
# one label and is coloured at its midpoint, i.e. with that label's colour.
# (Default levels would split the label range arbitrarily and mix clusters.)
region_levels = np.arange(kmeans.n_clusters + 1) - 0.5
plt.contourf(xx, yy, grid_clusters,
             levels=region_levels,
             alpha=0.3,
             cmap=scatter.cmap,
             norm=scatter.norm)

# Plot centroids with labels
centroids_pca = pca.transform(kmeans.cluster_centers_)
for i, (x, y) in enumerate(centroids_pca):
    plt.text(x, y, str(i), color='white', fontsize=12,
             ha='center', va='center', fontweight='bold',
             bbox=dict(facecolor='black', edgecolor='none', boxstyle='round,pad=0.2'))

plt.xlabel("Dim 1")
plt.ylabel("Dim 2")
plt.title("Clusters with Decision Boundaries")
plt.show()

## With convex hulls

In [None]:
plt.figure(figsize=(8, 6))

# Outline of every cluster that has enough points (> 2) for a convex hull
hull_vertices = []
hull_colors = []
for label in range(kmeans.n_clusters):
    members = X_reduced[pred_clust == label]
    if len(members) <= 2:
        continue
    outline = members[ConvexHull(members).vertices]
    hull_vertices.append((outline[:, 0], outline[:, 1]))
    hull_colors.append(label)

# Observations, coloured by cluster
scatter = plt.scatter(X_reduced[:, 0], X_reduced[:, 1],
                     c=pred_clust, cmap='tab10',
                     s=15, edgecolors='k')

# Shade each hull with the colour the scatter plot assigned to its cluster
for (hull_x, hull_y), label in zip(hull_vertices, hull_colors):
    hull_color = scatter.cmap(scatter.norm(label))
    plt.fill(hull_x, hull_y, alpha=0.3, color=hull_color)

legend = plt.legend(*scatter.legend_elements())
plt.xlabel("Dim 1")
plt.ylabel("Dim 2")
plt.title("Clusters with Convex Hulls")
plt.show()

# Visualization for HDBSCAN
Example of non-convex clusters in the PCA plane

In [None]:
# Standardize the numeric outcomes, then fit one HDBSCAN model to visualise
scaler = StandardScaler()
data = scaler.fit_transform(data_n)

hdb = HDBSCAN(min_cluster_size=5, min_samples=1)
pred_clust = hdb.fit(data).labels_

# Number of clusters found, not counting the noise label (-1)
n_clusters = int(np.unique(pred_clust[pred_clust != -1]).size)

In [None]:
plt.figure(figsize=(8, 6))

# Outline of every cluster that has enough points (> 2) for a convex hull
hull_vertices = []
hull_colors = []
for label in range(n_clusters):
    members = X_reduced[pred_clust == label]
    if len(members) <= 2:
        continue
    outline = members[ConvexHull(members).vertices]
    hull_vertices.append((outline[:, 0], outline[:, 1]))
    hull_colors.append(label)

# Observations, coloured by cluster label
scatter = plt.scatter(X_reduced[:, 0], X_reduced[:, 1],
                     c=pred_clust, cmap='tab10',
                     s=15, edgecolors='k')

# Shade each hull with the colour the scatter plot assigned to its cluster
for (hull_x, hull_y), label in zip(hull_vertices, hull_colors):
    hull_color = scatter.cmap(scatter.norm(label))
    plt.fill(hull_x, hull_y, alpha=0.7, color=hull_color)

legend = plt.legend(*scatter.legend_elements())
plt.xlabel("Dim 1")
plt.ylabel("Dim 2")
plt.title("Clusters with Convex Hulls")
plt.show()