In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.cluster import KMeans, HDBSCAN

from stepmix.stepmix import StepMix
# Documentation : https://github.com/Labo-Lacourse/stepmix

# Data and parameters

In [2]:
# Load the imputed data set
data2004_i = pd.read_parquet("data/data2004_i.parquet")

# Outcome items in their numeric coding (columns suffixed `_n`)
data_n = data2004_i.loc[:, [
    'clseusa_n', 'ambornin_n', 'amcit_n', 'amlived_n',
    'amenglsh_n', 'amchrstn_n', 'amgovt_n', 'amfeel_n',
    'amcitizn_n', 'amshamed_n', 'belikeus_n', 'ambetter_n',
    'ifwrong_n', 'proudsss_n', 'proudgrp_n', 'proudpol_n',
    'prouddem_n', 'proudeco_n', 'proudspt_n', 'proudart_n',
    'proudhis_n', 'proudmil_n', 'proudsci_n',
]]

# The same outcome items in their categorical coding (columns suffixed `_f`)
data_f = data2004_i.loc[:, [
    'clseusa_f', 'ambornin_f', 'amcit_f', 'amlived_f',
    'amenglsh_f', 'amchrstn_f', 'amgovt_f', 'amfeel_f',
    'amcitizn_f', 'amshamed_f', 'belikeus_f', 'ambetter_f',
    'ifwrong_f', 'proudsss_f', 'proudgrp_f', 'proudpol_f',
    'prouddem_f', 'proudeco_f', 'proudspt_f', 'proudart_f',
    'proudhis_f', 'proudmil_f', 'proudsci_f',
]]

# Control variables, used as covariates by the "with covariates" models
controls = data2004_i.loc[:, [
    'sex', 'race_f', 'born_usa', 'party_fs',
    'religstr_f', 'reltrad_f', 'region_f',
]]

In [3]:
# Candidate numbers of clusters / classes tried by the model loops: 1 through 12
clust_range = range(1, 13)

In [4]:
# StepMix relies on a deprecated version of scikit-learn and floods the output
# with FutureWarnings; silence that warning category for the whole session.
warnings.simplefilter('ignore', FutureWarning)

# Performance metrics

In [5]:
# Custom score functions to avoid throwing errors when undefined
def sil_score(data, pred_clust):
    """Silhouette score of `pred_clust` on `data`, or NaN when it is undefined.

    Thin wrapper around `silhouette_score`: a ValueError raised by the scorer
    is turned into `np.nan` so that a model-selection loop can keep going.
    """
    try:
        return silhouette_score(data, pred_clust)
    except ValueError:
        return np.nan

def ch_score(data, pred_clust):
    """Calinski-Harabasz score of `pred_clust` on `data`, or NaN when undefined.

    Thin wrapper around `calinski_harabasz_score`: a ValueError raised by the
    scorer is turned into `np.nan` so that a model-selection loop can keep going.
    """
    try:
        return calinski_harabasz_score(data, pred_clust)
    except ValueError:
        return np.nan

def db_score(data, pred_clust):
    """Davies-Bouldin score of `pred_clust` on `data`, or NaN when undefined.

    Thin wrapper around `davies_bouldin_score`: a ValueError raised by the
    scorer is turned into `np.nan` so that a model-selection loop can keep going.
    """
    try:
        return davies_bouldin_score(data, pred_clust)
    except ValueError:
        return np.nan

In [6]:
# Inertia (total within-cluster sum of squares) for an arbitrary labelling
def compute_inertia(data, labels):
    """Return the total within-cluster sum of squares as a single float.

    For every distinct label, the squared deviations of the cluster's points
    from the cluster mean are summed over all points and all variables; the
    per-cluster totals are then added up.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Numeric observations (NumPy array, pandas DataFrame, nested list...).
    labels : array-like of shape (n_samples,)
        Cluster label of each observation.

    Returns
    -------
    float
        The inertia; 0.0 when there are no observations.
    """
    # Work on plain arrays: with a DataFrame, np.sum reduces column by column
    # and the result would be one value per variable instead of a scalar.
    points = np.asarray(data, dtype=float)
    labels = np.asarray(labels)

    inertia = 0.0
    for cluster in np.unique(labels):
        cluster_points = points[labels == cluster]
        cluster_centroid = cluster_points.mean(axis=0)
        inertia += float(np.sum((cluster_points - cluster_centroid) ** 2))
    return inertia

def compute_inertia_2(data, labels):
    """Total within-cluster sum of squares, computed from explicit centroids.

    The centroid of each distinct label is computed first; the squared
    deviations of every cluster's points from its own centroid are then
    summed per cluster, and the per-cluster sums are added together.
    """
    cluster_ids = np.unique(labels)

    # One centroid row per cluster, in the order of `cluster_ids`
    centers = np.array([data[labels == cid].mean(axis=0) for cid in cluster_ids])

    per_cluster = []
    for position, cid in enumerate(cluster_ids):
        deviations = data[labels == cid] - centers[position]
        per_cluster.append(np.sum(deviations ** 2))

    return np.sum(per_cluster)

# Toy check: two groups of three points that differ only in their first coordinate
data = np.array([
    [1, 2], [1, 4], [1, 0],      # labelled 0
    [10, 2], [10, 4], [10, 0],   # labelled 1
])
labels = np.repeat([0, 1], 3)

total_inertia = compute_inertia(data, labels)
print(total_inertia)

16.0


# Latent class analysis
## Without covariates

In [None]:
# Latent class analysis on the label-encoded categorical items: one model per
# candidate number of classes, each summarised by one row of fit statistics.
results = []
data = data_f.apply(lambda col: LabelEncoder().fit_transform(col))

for n in clust_range:
    # Fixed seed so the n_init random restarts give the same table on every
    # run (same seed as the covariate models further down).
    model = StepMix(n_components=n,
                    measurement='categorical',
                    n_init=5,
                    random_state=123)

    model.fit(data)
    pred_clust = model.predict(data)

    results.append({
        'n_components': n,
        'aic': model.aic(data),
        'bic': model.bic(data),
        # Pass a plain array: on a DataFrame the sum is taken per column and
        # the cell would hold one value per variable instead of a scalar.
        'inertia': compute_inertia(data.to_numpy(), pred_clust),
        # The score wrappers return NaN where a score is undefined
        'silhouette': sil_score(data, pred_clust),
        'calinski_harabasz': ch_score(data, pred_clust),
        'davies_bouldin': db_score(data, pred_clust)
    })

StepMix_fnc_results = pd.DataFrame(results)

In [None]:
# Fit statistics of the LCA without covariates, one row per number of classes
StepMix_fnc_results

## With covariates

In [None]:
# Latent class analysis with the controls as covariates (one-step estimation).
# Observed so far: Newton-Raphson does not converge above 3 classes.
# Ideas still open: another optimiser such as 'gradient' (really slow, needs
# parallelizing), or fitting on the non-imputed data.
results = []

data = data_f.apply(lambda col: LabelEncoder().fit_transform(col))

# Drop one reference level per categorical control. Together with
# 'intercept': True, a full set of dummies is perfectly collinear with the
# intercept (dummy-variable trap), a likely cause of the convergence failures.
# dtype=float keeps the design matrix numeric on pandas versions where
# get_dummies returns booleans by default.
controls_dum = pd.get_dummies(controls, drop_first=True, dtype=float)

opt_params = {
    'method': 'newton-raphson',
    'intercept': True,
    'max_iter': 500,
}

for n in clust_range:
    model = StepMix(n_components=n, 
                    measurement='categorical', 
                    n_init=3,
                    n_steps=1,
                    structural='covariate', 
                    structural_params=opt_params,
                    init_params='kmeans',
                    verbose=0, 
                    random_state=123)
    model.fit(data, controls_dum)
    # NOTE(review): predict/aic/bic below only receive the measurement data,
    # not controls_dum -- confirm whether the covariates should be passed too.
    pred_clust = model.predict(data)
    
    results.append({
        'n_components': n,
        'aic': model.aic(data),
        'bic': model.bic(data),
        # The score wrappers return NaN where a score is undefined
        'silhouette': sil_score(data, pred_clust),
        'calinski_harabasz': ch_score(data, pred_clust),
        'davies_bouldin': db_score(data, pred_clust)
    })

StepMix_fwc_results = pd.DataFrame(results)

In [None]:
# Attempt at fitting the covariate models in parallel, one task per candidate
# number of classes. Does not seem to work within notebooks.
from concurrent.futures import ProcessPoolExecutor

# NOTE(review): this rebinds the notebook-wide clust_range, so every later cell
# that loops over clust_range is also restricted to 1-5 once this cell has run.
clust_range = range(1,6)

results = []

data = data_f.apply(lambda col: LabelEncoder().fit_transform(col))
# NOTE(review): full dummy coding combined with 'intercept': True is collinear
# by construction -- confirm whether a reference level should be dropped.
controls_dum = pd.get_dummies(controls)

opt_params = {
    'method': 'gradient',
    'intercept': True,
    'max_iter': 1000,
}

def run_StepMix(n):
    """Fit the one-step covariate LCA with `n` classes and return its fit statistics.

    Reads `data`, `controls_dum` and `opt_params` from the notebook namespace.
    Returns a dict holding `n`, AIC, BIC and the three clustering scores
    (NaN where a score is undefined).
    """
    model = StepMix(
        n_components=n,
        measurement='categorical',
        n_init=3,
        n_steps=1,
        structural='covariate',
        structural_params=opt_params,
        init_params='kmeans',
        verbose=0,
        random_state=123,
    )
    model.fit(data, controls_dum)
    pred_clust = model.predict(data)

    row = {'n_components': n}
    row['aic'] = model.aic(data)
    row['bic'] = model.bic(data)
    row['silhouette'] = sil_score(data, pred_clust)
    row['calinski_harabasz'] = ch_score(data, pred_clust)
    row['davies_bouldin'] = db_score(data, pred_clust)
    return row

# executor.map yields the rows in the order of clust_range
with ProcessPoolExecutor(max_workers=8) as executor:
    results = [row for row in executor.map(run_StepMix, clust_range)]

StepMix_fwc_results = pd.DataFrame(results)

In [None]:
# Fit statistics of the LCA with covariates (from whichever of the two cells above ran last)
StepMix_fwc_results

# Latent profile analysis
## Without covariates

In [None]:
# Latent profile analysis on the numeric items: one model per candidate number
# of profiles, each summarised by one row of fit statistics.
results = []
data = data_n

for n in clust_range:
    # Fixed seed so the n_init random restarts give the same table on every
    # run (same seed as the covariate models).
    model = StepMix(n_components=n,
                    measurement='continuous',
                    n_init=5,
                    random_state=123)
    model.fit(data)
    pred_clust = model.predict(data)

    results.append({
        'n_components': n,
        'aic': model.aic(data),
        'bic': model.bic(data),
        # The score wrappers return NaN where a score is undefined
        'silhouette': sil_score(data, pred_clust),
        'calinski_harabasz': ch_score(data, pred_clust),
        'davies_bouldin': db_score(data, pred_clust)
    })

StepMix_nnc_results = pd.DataFrame(results)

In [None]:
# Fit statistics of the LPA without covariates, one row per number of profiles
StepMix_nnc_results

## With covariates

In [None]:
# Latent profile analysis with the controls as covariates (one-step estimation).
# Observed so far: does not seem to converge either.
results = []

data = data_n

# Drop one reference level per categorical control. Together with
# 'intercept': True, a full set of dummies is perfectly collinear with the
# intercept (dummy-variable trap), a likely cause of the convergence failures.
# dtype=float keeps the design matrix numeric on pandas versions where
# get_dummies returns booleans by default.
controls_dum = pd.get_dummies(controls, drop_first=True, dtype=float)

opt_params = {
    'method': 'newton-raphson',
    'intercept': True,
    'max_iter': 500,
}

for n in clust_range:
    model = StepMix(n_components=n, 
                    measurement='continuous', 
                    n_init=3,
                    n_steps=1,
                    structural='covariate', 
                    structural_params=opt_params,
                    init_params='kmeans',
                    verbose=0, 
                    random_state=123)
    model.fit(data, controls_dum)
    # NOTE(review): predict/aic/bic below only receive the measurement data,
    # not controls_dum -- confirm whether the covariates should be passed too.
    pred_clust = model.predict(data)

    results.append({
        'n_components': n,
        'aic': model.aic(data),
        'bic': model.bic(data),
        # The score wrappers return NaN where a score is undefined
        'silhouette': sil_score(data, pred_clust),
        'calinski_harabasz': ch_score(data, pred_clust),
        'davies_bouldin': db_score(data, pred_clust)
    })

StepMix_nwc_results = pd.DataFrame(results)

In [None]:
# Fit statistics of the LPA with covariates, one row per number of profiles
StepMix_nwc_results

# K-means

In [None]:
# K-means on the standardised numeric items: grid over the number of clusters,
# the initialisation scheme and the number of restarts.
data = data_n
results = []

scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

for n_clusters in clust_range:
    for init_method in ['k-means++', 'random']:
        for n_init in [5, 10]:
            kmeans = KMeans(
                n_clusters=n_clusters, 
                init=init_method, 
                n_init=n_init, 
                random_state=42
            )

            pred_clust = kmeans.fit_predict(scaled_data)

            # Recompute the inertia on the same standardised array the model
            # was fitted on, so it is a scalar comparable with kmeans.inertia_.
            computed_inertia = compute_inertia(scaled_data, pred_clust)

            results.append({
                'n_clusters': n_clusters,
                'init_method': init_method,
                'n_init': n_init,
                'inertia': kmeans.inertia_,
                'computed_inertia': computed_inertia,
                # clust_range starts at 1, where the scikit-learn scores raise
                # ValueError; the wrappers record NaN there instead of
                # aborting the whole grid.
                'silhouette': sil_score(scaled_data, pred_clust),
                'calinski_harabasz': ch_score(scaled_data, pred_clust),
                'davies_bouldin': db_score(scaled_data, pred_clust)
            })

results = pd.DataFrame(results)

# Best configuration according to each score (NaN rows are sorted last)
best_silhouette = results.sort_values('silhouette', ascending=False).iloc[0]
best_calinski = results.sort_values('calinski_harabasz', ascending=False).iloc[0]
best_davies = results.sort_values('davies_bouldin', ascending=True).iloc[0]  # Lower is better

# Mean inertia per number of clusters across the initialisation settings
inertias = results.groupby('n_clusters')['inertia'].mean()

In [None]:
# K-means configuration with the highest silhouette score
best_silhouette

# HDBSCAN

In [None]:
# HDBSCAN on the standardised data: grid over min_cluster_size and min_samples.
# `data` is whatever an earlier cell last assigned to it.
results = []
min_cluster_sizes = range(2, 16)
min_samples_range = range(1, 16)

scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Score columns, in the order they appear in the results table
scorers = {
    'silhouette': sil_score,
    'calinski_harabasz': ch_score,
    'davies_bouldin': db_score,
}

for min_cluster_size in min_cluster_sizes:
    for min_samples in min_samples_range:
        hdb = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples)
        pred_clust = hdb.fit_predict(scaled_data)

        # Points labelled -1 are left out of the cluster count...
        n_clusters = len(set(pred_clust[pred_clust != -1]))

        row = {
            'min_cluster_size': min_cluster_size,
            'min_samples': min_samples,
            'n_clusters': n_clusters,
        }
        # ...but the scores are computed on all labels, -1 included.
        for score_name, score_fn in scorers.items():
            row[score_name] = score_fn(scaled_data, pred_clust)
        results.append(row)

results = pd.DataFrame(results)

# Best configuration according to each score (lower is better for Davies-Bouldin)
best_silhouette = results.sort_values('silhouette', ascending=False).iloc[0]
best_calinski = results.sort_values('calinski_harabasz', ascending=False).iloc[0]
best_davies = results.sort_values('davies_bouldin', ascending=True).iloc[0]

In [None]:
# HDBSCAN configuration with the highest silhouette score
best_silhouette

In [None]:
# Inertia of the labels left in pred_clust (the last fit of the grid above),
# measured on the unscaled `data`; points labelled -1 count as one more group.
compute_inertia(data, pred_clust)

# Spectral clustering

In [None]:
from sklearn.cluster import SpectralClustering
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.neighbors import kneighbors_graph

# Spectral clustering on the standardised data: grid over the number of
# clusters and the affinity (with several gamma values for the RBF kernel).
results = []
affinity_methods = ['nearest_neighbors', 'rbf']  # Different affinity computations
kernel_params = [0.1, 0.5, 1.0, 2.0]  # Different gamma values for RBF kernel

# Standardize the data first
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

for n_clusters in clust_range:
    for affinity in affinity_methods:
        # gamma only parametrises the RBF kernel and is ignored by the
        # nearest-neighbours affinity: fit that affinity once per n_clusters
        # (recorded with gamma = NaN) instead of four identical times.
        gammas = kernel_params if affinity == 'rbf' else [np.nan]

        for gamma in gammas:
            extra_params = {'gamma': gamma} if affinity == 'rbf' else {}

            # SpectralClustering builds its own affinity matrix from
            # `affinity`; no precomputed connectivity matrix is needed.
            spectral = SpectralClustering(
                n_clusters=n_clusters,
                affinity=affinity,
                random_state=42,
                **extra_params
            )

            # Fit the model and predict clusters
            pred_clust = spectral.fit_predict(scaled_data)

            # Count unique clusters
            unique_clusters = len(np.unique(pred_clust))

            # Compute metrics (only if more than one cluster)
            if unique_clusters > 1:
                results.append({
                    'n_clusters': n_clusters,
                    'affinity': affinity,
                    'gamma': gamma,
                    'n_unique_clusters': unique_clusters,
                    'silhouette': sil_score(scaled_data, pred_clust),
                    'calinski_harabasz': calinski_harabasz_score(scaled_data, pred_clust),
                    'davies_bouldin': davies_bouldin_score(scaled_data, pred_clust)
                })

# Convert results to DataFrame
results = pd.DataFrame(results)

# Find best configurations by different metrics (lower is better for Davies-Bouldin)
best_silhouette = results.sort_values('silhouette', ascending=False).iloc[0]
best_calinski = results.sort_values('calinski_harabasz', ascending=False).iloc[0]
best_davies = results.sort_values('davies_bouldin', ascending=True).iloc[0]

In [None]:
# Spectral clustering configuration with the highest silhouette score
best_silhouette

# Affinity propagation

In [None]:
from sklearn.cluster import AffinityPropagation

# Affinity propagation picks the number of clusters itself, so there is a
# single fit. The seed makes the run reproducible.
model = AffinityPropagation(damping=0.7, max_iter=350, convergence_iter=25,
                            random_state=42)
model.fit(data)
pred_clust = model.labels_

# The score wrappers return NaN instead of raising when a score is undefined,
# e.g. when every observation ends up with the same label.
results = pd.DataFrame([{
    'n_components': len(set(pred_clust)),
    'silhouette': sil_score(data, pred_clust),
    'calinski_harabasz': ch_score(data, pred_clust),
    'davies_bouldin': db_score(data, pred_clust)
}])

In [None]:
# Number of clusters found by affinity propagation and its scores
results