# ARI scores for the precomputed datasets and clustering algorithms

In [1]:
# Reload edited modules automatically before each cell execution, so changes
# to the project scripts imported below do not require a kernel restart.
%load_ext autoreload
%autoreload 2

import os
# XLA client memory settings: select the "platform" allocator and switch off
# up-front preallocation. Presumably consumed by JAX-based code in the
# project scripts imported later -- TODO confirm which module needs them.
os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform"
os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false"

In [2]:
import sys
sys.path.insert(0, '../../scripts')
import our_datasets
import our_algorithms
from tqdm import tqdm
import pickle
import sklearn
import numpy as np
from nbformat.sign import algorithms
import pandas




/user/ritzert/micromamba/envs/corc_env/lib/python3.11/site-packages/phate/__init__.py


In [4]:
# Datasets to evaluate, referenced by name; the helpers below look for a
# "<name>.pickle" file per dataset inside the cache directory.
datasets = [
    "noisy_circles",
    "noisy_moons",
    "varied",
    "aniso",
    "blobs",
]
# Disabled alternative: take the dataset selection from our_datasets instead.
# datasets = our_datasets.COMPLEX_DATASETS
# datasets.keys()

# Bare last expression: echoes our_algorithms.ALGORITHM_SELECTOR as the cell
# output for reference; it has no effect on the variables defined above.
our_algorithms.ALGORITHM_SELECTOR

In [5]:
# copied the following list from "our_algorithms.py" script
#
# A "\n" inside a name is stripped by the helpers below when they build the
# cache filename, e.g. "MiniBatch\nKMeans" -> "<dataset>_MiniBatchKMeans.pickle"
# (presumably the line break exists for plot/table labels -- TODO confirm).
#
# NOTE(review): this assignment rebinds `algorithms`, replacing the object of
# the same name imported via `from nbformat.sign import algorithms` in the
# import cell; that import looks accidental.
algorithms = [
    "MiniBatch\nKMeans",
    "Agglomerative\nClustering",
    "HDBSCAN",
    "Gaussian\nMixture",
    "t-Student\nMixture",
    # Disabled entries (not evaluated):
    # "DBSCAN",
    # "BIRCH",
    # "OPTICS",
    "Spectral\nClustering",
    "Affinity\nPropagation",
    "MeanShift",
    "Leiden",
    "PAGA",
    "Ward",
    # "Stavia",
    "GWG-dip",
    "GWG-pvalue",
    "TMM-NEB",
    "GMM-NEB",
]


In [6]:
# Show the working directory; the relative "../../cache" paths used below
# are resolved against it.
!! pwd

['/srv/user/ritzert/git/cluster_vs_continuum/notebooks/metrics']

In [7]:
# Directory holding the pickled datasets and clustering results.
# NOTE(review): the helper functions below default to this same path and the
# calls in this notebook do not pass `cache_path=`, so editing this variable
# alone has no effect unless it is passed explicitly.
cache_path = "../../cache"

In [8]:
def check_missing_files(datasets, algorithms, cache_path="../../cache"):
    """List the cache files that are expected but absent on disk.

    For every dataset name the file ``<cache_path>/<dataset>.pickle`` is
    expected, followed by one ``<cache_path>/<dataset>_<algorithm>.pickle``
    per algorithm. Line breaks in an algorithm name (a real newline or the
    two-character sequence backslash + "n") are removed before the filename
    is built.

    Parameters
    ----------
    datasets : iterable of str
        Dataset names.
    algorithms : iterable of str
        Algorithm names, possibly containing line breaks.
    cache_path : str
        Directory that is searched for the pickle files.

    Returns
    -------
    list of str
        Paths of the missing files, in dataset-major order (the dataset file
        first, then its algorithm files).
    """

    def _expected_paths():
        # Yield every path the cache should contain, in reporting order.
        for dataset_name in datasets:
            yield f"{cache_path}/{dataset_name}.pickle"
            for raw_name in algorithms:
                flat_name = raw_name.replace("\\n", "\n").replace("\n", "")
                yield f"{cache_path}/{dataset_name}_{flat_name}.pickle"

    return [path for path in _expected_paths() if not os.path.exists(path)]

In [27]:
def compute_ari_scores(datasets, algorithms, cache_path="../../cache", missing_score=-1.0):
    """Compute the adjusted Rand index of every cached clustering result.

    For each dataset, ``<cache_path>/<dataset>.pickle`` is loaded; its
    ``"dataset"`` entry must unpack into ``(X, y)`` where ``y`` holds the
    reference labels. For each algorithm, the fitted object is loaded from
    ``<cache_path>/<dataset>_<algorithm>.pickle`` (line breaks removed from
    the algorithm name) and its predicted labels are taken from, in order of
    preference: ``labels_``, ``predict_with_target(X, n_clusters)`` with
    ``n_clusters`` set to the number of distinct values in ``y``, or
    ``predict(X)``.

    Parameters
    ----------
    datasets : sequence of str
        Dataset names; become the columns of the result.
    algorithms : sequence of str
        Algorithm names; become the rows of the result.
    cache_path : str
        Directory containing the pickle files.
    missing_score : float
        Value stored when an algorithm's result file does not exist. The
        default of -1 keeps the previous behaviour; pass ``float("nan")`` to
        keep missing entries distinguishable from real scores.

    Returns
    -------
    pandas.DataFrame
        Scores with one row per algorithm and one column per dataset.

    Raises
    ------
    FileNotFoundError
        If a dataset pickle is missing (missing algorithm pickles are only
        reported via ``print`` and filled with ``missing_score``).
    """
    # Import the submodule explicitly: a bare `import sklearn` does not
    # guarantee that `sklearn.metrics` has been loaded as an attribute.
    from sklearn.metrics import adjusted_rand_score

    # Pre-fill with the sentinel; every available result overwrites its cell.
    ari_scores = np.full((len(datasets), len(algorithms)), missing_score, dtype=float)

    for i_dataset, dataset_name in enumerate(tqdm(datasets)):
        dataset_filename = f"{cache_path}/{dataset_name}.pickle"
        # NOTE(security): pickle.load can execute arbitrary code -- only point
        # cache_path at files produced by trusted project scripts.
        with open(dataset_filename, "rb") as f:
            dataset_info = pickle.load(f)

        X, y = dataset_info["dataset"]

        for i_algorithm, algorithm_name in enumerate(algorithms):
            alg_name = algorithm_name.replace("\\n", "\n").replace("\n", "")
            alg_filename = f"{cache_path}/{dataset_name}_{alg_name}.pickle"
            if not os.path.exists(alg_filename):
                print("skipping", alg_filename)
                continue

            with open(alg_filename, "rb") as f:
                algorithm = pickle.load(f)

            if hasattr(algorithm, "labels_"):
                y_pred = algorithm.labels_.astype(int)
            elif hasattr(algorithm, "predict_with_target"):
                y_pred = algorithm.predict_with_target(X, len(np.unique(y))).astype(int)
            else:
                y_pred = algorithm.predict(X)

            ari_scores[i_dataset, i_algorithm] = adjusted_rand_score(y, y_pred)

    # Transpose so that algorithms are rows and datasets are columns.
    return pandas.DataFrame(
        ari_scores.transpose(),
        index=list(algorithms),
        columns=list(datasets),
    )

In [29]:
def style_dataFrame(df):

    def red_green_scale(val):
        if val < 0:
            return f'background-color: rgba(255, 0, 0, {abs(val)})'  # Red, alpha scales with magnitude
        elif val > 0:
            return f'background-color: rgba(0, 128, 0, {val})'      # Green, alpha scales with magnitude
        else:
            return ''  # White (default background)
    
    # df_nlargest = df.style.apply(lambda x: ['background-color: yellow' if v == x.nlargest(1).iloc[0] else '' for v in x], axis=0).format("{:.2f}")
    df_nlargest = df.style.apply(lambda x: ['background-color: yellow' if v == x.nlargest(1).iloc[0] else '' for v in x], axis=0).format("{:.2f}")

    styled_df = df.style.applymap(red_green_scale).format("{:.2f}")

    return styled_df


In [13]:
# An empty list means every expected dataset/result pickle is in the cache.
check_missing_files(datasets, algorithms)

[]

In [28]:
# Score table for `datasets`: rows are algorithms, columns are datasets.
ari_scores_2d = compute_ari_scores(datasets, algorithms)

100%|██████████| 5/5 [00:10<00:00,  2.08s/it]


In [31]:
# Display the score table with the red/green background shading.
style_dataFrame(ari_scores_2d)

Unnamed: 0,noisy_circles,noisy_moons,varied,aniso,blobs
MiniBatch KMeans,-0.0,0.48,0.84,0.59,0.98
Agglomerative Clustering,0.02,0.13,0.96,0.54,-0.0
HDBSCAN,1.0,1.0,0.92,0.89,0.57
Gaussian Mixture,-0.0,0.5,0.95,1.0,0.98
t-Student Mixture,-0.0,0.15,0.95,1.0,0.98
Spectral Clustering,1.0,1.0,0.95,0.85,0.97
Affinity Propagation,-0.0,0.47,0.81,0.59,0.98
MeanShift,0.01,0.54,0.86,0.52,0.98
Leiden,1.0,1.0,0.96,0.59,0.96
PAGA,0.1,0.13,0.36,0.59,0.96


# ARI scores for the high-dimensional datasets

In [33]:
# Dataset names taken from the project's our_datasets module; the bare last
# expression echoes them as the cell output.
complex_datasets = our_datasets.COMPLEX_DATASETS
complex_datasets

['blobs1_8',
 'blobs1_16',
 'blobs1_32',
 'blobs1_64',
 'blobs2_8',
 'blobs2_16',
 'blobs2_32',
 'blobs2_64',
 'densired8',
 'densired16',
 'densired32',
 'densired64',
 'mnist8',
 'mnist16',
 'mnist32',
 'mnist64']

In [21]:
# List the cache files that are still missing for these datasets.
check_missing_files(complex_datasets, algorithms)

['../../cache/blobs1_32_TMM-NEB.pickle',
 '../../cache/blobs1_64_TMM-NEB.pickle',
 '../../cache/blobs2_64_TMM-NEB.pickle',
 '../../cache/densired64_TMM-NEB.pickle']

In [32]:
# Result files that are missing are reported as "skipping ..." and scored -1.
ari_scores_complex = compute_ari_scores(complex_datasets, algorithms)

  0%|          | 0/16 [00:00<?, ?it/s]

6 clusters is not achievable.
Working with 5 clusters instead.


 12%|█▎        | 2/16 [00:02<00:16,  1.15s/it]

skipping ../../cache/blobs1_32_TMM-NEB.pickle


 19%|█▉        | 3/16 [00:04<00:23,  1.81s/it]

skipping ../../cache/blobs1_64_TMM-NEB.pickle


 25%|██▌       | 4/16 [00:08<00:28,  2.36s/it]

6 clusters is not achievable.
Working with 4 clusters instead.


 38%|███▊      | 6/16 [00:11<00:19,  1.98s/it]

6 clusters is not achievable.
Working with 4 clusters instead.


 44%|████▍     | 7/16 [00:13<00:18,  2.05s/it]

skipping ../../cache/blobs2_64_TMM-NEB.pickle


 69%|██████▉   | 11/16 [00:47<00:40,  8.06s/it]

skipping ../../cache/densired64_TMM-NEB.pickle


 94%|█████████▍| 15/16 [01:41<00:13, 13.42s/it]

10 clusters is not achievable.
Working with 2 clusters instead.


100%|██████████| 16/16 [01:59<00:00,  7.44s/it]


In [34]:
# Display the high-dimensional score table with the red/green shading.
style_dataFrame(ari_scores_complex)

Unnamed: 0,blobs1_8,blobs1_16,blobs1_32,blobs1_64,blobs2_8,blobs2_16,blobs2_32,blobs2_64,densired8,densired16,densired32,densired64,mnist8,mnist16,mnist32,mnist64
MiniBatch KMeans,0.65,0.68,0.94,0.73,0.56,0.34,0.37,0.58,0.63,0.63,0.5,0.74,0.57,0.44,0.32,0.26
Agglomerative Clustering,0.0,-0.0,0.0,0.0,0.01,0.01,0.01,0.01,0.0,0.19,0.44,0.53,0.0,0.0,0.0,0.0
HDBSCAN,0.0,0.0,0.0,0.0,0.23,0.08,-0.03,-0.02,0.0,0.0,0.44,0.0,0.02,0.03,0.05,0.05
Gaussian Mixture,0.69,0.92,0.92,0.72,0.37,0.53,0.37,0.57,0.46,0.63,0.5,0.64,0.75,0.71,0.62,0.57
t-Student Mixture,0.71,0.7,0.23,0.03,0.56,0.65,0.1,0.07,0.66,0.66,0.58,0.63,0.76,0.9,0.77,0.75
Spectral Clustering,0.8,0.91,0.9,0.94,0.55,0.55,0.52,0.55,0.83,0.45,0.55,0.91,0.76,0.78,0.74,0.56
Affinity Propagation,0.77,0.79,0.54,0.36,0.56,0.49,0.33,0.16,0.59,0.61,0.49,0.54,0.57,0.34,0.26,0.18
MeanShift,0.59,0.0,0.0,0.0,0.91,0.12,0.0,0.0,0.79,0.78,0.0,0.44,0.47,0.0,0.0,0.0
Leiden,0.79,0.88,0.9,0.93,0.56,0.59,0.53,0.54,0.83,0.77,0.76,0.85,0.89,0.92,0.93,0.75
PAGA,0.79,0.88,0.9,0.93,0.56,0.59,0.53,0.55,0.83,0.77,0.76,0.86,0.89,0.92,0.93,0.74
