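In this section we will learn how to perform clustering on a product manifold. First, we write an evaluation function that reports seven clustering metrics: accuracy (ACC), normalized mutual information (NMI), purity, F1, precision, recall, and adjusted Rand index (ARI). For each metric, the closer the value is to 1, the better.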


In [2]:
import numpy as np
from sklearn.metrics import (
    normalized_mutual_info_score,
    adjusted_rand_score,
    precision_score,
    recall_score,
    f1_score
)
from scipy.optimize import linear_sum_assignment
from collections import Counter

def compute_label_alignment(y_true, y_pred):
    """Match predicted cluster ids to true class ids and score the match.

    A contingency table between predicted and true labels is built, the
    one-to-one assignment that maximizes the number of agreeing samples is
    solved with ``scipy.optimize.linear_sum_assignment``, and ``y_pred`` is
    relabelled through that assignment.

    Args:
        y_true: Array-like of non-negative integer class labels.
        y_pred: Array-like of non-negative integer cluster labels, one per
            entry of ``y_true``.

    Returns:
        tuple: ``(acc, y_pred_aligned)`` where ``acc`` is the fraction of
        samples whose aligned prediction equals the true label and
        ``y_pred_aligned`` is a 1-D array holding ``y_pred`` relabelled
        through the assignment.

    Raises:
        ValueError: If the inputs differ in length, are empty, or contain
            negative labels.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.size} and {y_pred.size}"
        )
    if y_pred.size == 0:
        raise ValueError("y_true and y_pred must not be empty")
    # Labels are used directly as table indices; a negative value would wrap
    # around to the end of the table and silently corrupt the counts.
    if min(y_true.min(), y_pred.min()) < 0:
        raise ValueError("labels must be non-negative integers")

    # Square table so that every predicted id receives exactly one target id,
    # even when the numbers of clusters and classes differ.
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    # np.add.at accumulates repeated (pred, true) pairs, unlike `w[...] += 1`.
    np.add.at(w, (y_pred, y_true), 1)

    # linear_sum_assignment minimizes cost, so turn counts into costs.
    row_ind, col_ind = linear_sum_assignment(w.max() - w)

    # Lookup table: predicted id -> matched true id.
    mapping = np.empty(D, dtype=col_ind.dtype)
    mapping[row_ind] = col_ind
    y_pred_aligned = mapping[y_pred]

    acc = w[row_ind, col_ind].sum() / y_pred.size
    return acc, y_pred_aligned

def purity_score(y_true, y_pred):
    """Return the clustering purity of ``y_pred`` with respect to ``y_true``.

    For every predicted cluster, the size of its most frequent true label is
    counted; purity is the sum of those counts divided by ``len(y_true)``.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted cluster labels.

    Returns:
        float: Purity score.
    """
    truth = np.asarray(y_true)
    assigned = np.asarray(y_pred)

    majority_total = 0
    for cluster_id in np.unique(assigned):
        member_rows = np.nonzero(assigned == cluster_id)[0]
        ranked = Counter(truth[member_rows]).most_common(1)
        # Skip the cluster if the selection turned out to be empty.
        if not ranked:
            continue
        _, majority_size = ranked[0]
        majority_total += majority_size

    return majority_total / len(truth)

def evaluate(y_true, y_pred, method='macro'):
    """Compute a vector of clustering metrics for one set of predictions.

    NMI, purity and ARI are computed on the raw cluster ids. Precision,
    recall and F1 are computed on the cluster ids after they have been
    aligned to the true labels by ``compute_label_alignment``, which also
    yields the accuracy.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted cluster labels.
        method: Averaging mode forwarded as ``average=`` to the sklearn
            precision / recall / F1 functions.

    Returns:
        numpy.ndarray: ``[acc, nmi, purity, f1, precision, recall, ari]``.
    """
    truth = np.asarray(y_true)
    clusters = np.asarray(y_pred)

    # Accuracy plus the relabelled predictions used by the class-wise scores.
    accuracy, matched = compute_label_alignment(truth, clusters)

    # Keyword arguments shared by the three class-wise scores.
    averaging = {"average": method, "zero_division": 0}

    return np.array([
        accuracy,
        normalized_mutual_info_score(truth, clusters),
        purity_score(truth, clusters),
        f1_score(truth, matched, **averaging),
        precision_score(truth, matched, **averaging),
        recall_score(truth, matched, **averaging),
        adjusted_rand_score(truth, clusters),
    ])


Import the necessary packages.


In [3]:
import torch
from manify.manifolds import ProductManifold
from manify.clustering.fuzzy_kmeans import RiemannianFuzzyKMeans
import numpy as np

Next, build a **Product Manifold** from a signature, i.e. a list of `(curvature, dimension)` pairs with one pair per component, as shown below.

In [4]:
# 1. Define the signature: a 3-factor manifold, given as one
#    (curvature, dimension) pair per factor.
import numpy as np
signature = [
    (0.0, 4),   # zero curvature, dimension 4: R^4 (Euclidean space)
    (1.0, 4),   # positive curvature, dimension 4: S^4 (Spherical space)
    (-1.0, 4),  # negative curvature, dimension 4: H^4 (Hyperbolic space)
]

# 2. Construct the ProductManifold (without stereographic projection)
P = ProductManifold(signature, device="cpu", stereographic=False)

In [5]:
# Experiment settings shared by the data-generation and clustering cells.
# n_clusters: number of mixture classes and of clusters to fit
# seed:       random seed for data generation and the clustering model
n_clusters, seed = 3, 0
# opt: optimizer name, lr: its learning rate, tol: value passed as `tol`
# (all three are handed to RiemannianFuzzyKMeans)
opt, lr, tol = "adan", 0.01, 1e-6

In [6]:
# 3. Generate data using gaussian_mixture
#    - num_points=500: sample 500 points
#    - num_classes=n_clusters: generate n_clusters class labels (for clustering)
#    - seed=seed: fix the random seed for reproducibility
#    - task="classification": NOTE(review) presumably requests discrete class
#      labels rather than continuous targets - confirm against the manify docs
#    - cov_scale_points=.1: NOTE(review) presumably scales how widely points
#      spread around their class centre - confirm against the manify docs
X, y_true = P.gaussian_mixture(
    num_points=500,
    num_classes=n_clusters,
    seed=seed,
    task="classification",
    cov_scale_points=.1 # <--- try decreasing this value
)
# Convert the labels to a NumPy array for the NumPy/sklearn-based evaluate().
y_true = np.array(y_true)

Call the `RiemannianFuzzyKMeans` algorithm from the `fuzzy_kmeans` module in the `manify` clustering package to perform clustering on a manifold.

In [7]:
# Fit Riemannian fuzzy k-means on the product manifold P using the shared
# hyperparameters, then take the cluster labels it assigns to X.
# verbose=True prints the loss at every iteration.
model = RiemannianFuzzyKMeans(
    n_clusters,
    manifold=P,
    random_state=seed,
    max_iter=1000,
    tol=tol,
    optimizer=opt,
    lr=lr,
    verbose=True,
)
labels = model.fit_predict(X)

RFK iter 1, loss=1911.6559
RFK iter 2, loss=1909.2720
RFK iter 3, loss=1907.2013
RFK iter 4, loss=1905.3054
RFK iter 5, loss=1903.5615
RFK iter 6, loss=1901.9583
RFK iter 7, loss=1900.4875
RFK iter 8, loss=1899.1450
RFK iter 9, loss=1897.9203
RFK iter 10, loss=1896.8094
RFK iter 11, loss=1895.7996
RFK iter 12, loss=1894.8835
RFK iter 13, loss=1894.0524
RFK iter 14, loss=1893.2966
RFK iter 15, loss=1892.6089
RFK iter 16, loss=1891.9822
RFK iter 17, loss=1891.4094
RFK iter 18, loss=1890.8866
RFK iter 19, loss=1890.4080
RFK iter 20, loss=1889.9674
RFK iter 21, loss=1889.5638
RFK iter 22, loss=1889.1930
RFK iter 23, loss=1888.8501
RFK iter 24, loss=1888.5347
RFK iter 25, loss=1888.2428
RFK iter 26, loss=1887.9733
RFK iter 27, loss=1887.7229
RFK iter 28, loss=1887.4913
RFK iter 29, loss=1887.2771
RFK iter 30, loss=1887.0775
RFK iter 31, loss=1886.8916
RFK iter 32, loss=1886.7195
RFK iter 33, loss=1886.5583
RFK iter 34, loss=1886.4092
RFK iter 35, loss=1886.2698
RFK iter 36, loss=1886.1398
R

What if we don't use a manifold-based clustering method and instead apply standard KMeans? We'll compare the results to evaluate the performance difference.

In [8]:
from sklearn.cluster import KMeans

# Baseline: standard KMeans applied directly to the coordinates in X.
# Reuse the shared `seed` (instead of a hard-coded 0) so that both methods
# stay in sync when the seed is changed in the settings cell.
kmeans = KMeans(n_clusters=n_clusters, random_state=seed)
# Fit the data and get the cluster labels in one call, mirroring the
# fit_predict usage of RiemannianFuzzyKMeans.
labels_km = kmeans.fit_predict(X)

In [9]:
# Each row holds [acc, nmi, purity, f1, precision, recall, ari], in the order
# returned by evaluate(); a leading axis is added so each prints as one row.
result = evaluate(y_true, labels)[np.newaxis, :]      # Riemannian fuzzy k-means
result2 = evaluate(y_true, labels_km)[np.newaxis, :]  # standard KMeans
for scores in (result, result2):
    print(scores)

[[0.998      0.98869808 0.998      0.99811609 0.99801587 0.99822695
  0.99363694]]
[[0.44       0.07972898 0.444      0.30688818 0.33132184 0.40742235
  0.03685404]]


The performance of **Riemannian Fuzzy KMeans** seems to be much better than that of standard **KMeans**. Let's try adjusting some parameters to see if we can improve or better understand the results!
