# ARI scores for the precomputed datasets and clustering algorithms

In [1]:
# Reload edited modules automatically before each cell runs, so changes to
# the project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os
# XLA/JAX GPU memory settings: allocate device memory on demand instead of
# preallocating a large pool up front.
# NOTE(review): these only take effect if set before JAX initialises its
# backend; presumably JAX is pulled in by the `corc` imports below — confirm.
os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform"
os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false"

In [2]:
import corc.our_datasets
import corc.our_algorithms
from tqdm import tqdm
import pickle
import sklearn
import numpy as np
import pandas


In [3]:
# Names of the 2D datasets and the collection of algorithm names to evaluate.
datasets_2d = corc.our_datasets.DATASETS2D
algorithms = corc.our_algorithms.ALGORITHM_SELECTOR
datasets_2d

['noisy_circles', 'noisy_moons', 'blobs', 'varied', 'aniso', 'clusterlab10']

In [4]:
# Show the working directory: the cache path used in this notebook is relative to it.
!! pwd

['/scratch-grete/usr/nimmr000/git/cluster_vs_continuum/notebooks/metrics']

In [5]:
# Directory (relative to the working directory) that holds the pickled
# datasets and fitted algorithms.
cache_path = "../../cache"

In [40]:
def check_missing_files(datasets, algorithms, cache_path="../../cache"):
    """List every expected cache pickle that does not exist on disk.

    For each dataset the expected files are ``{cache_path}/{dataset}.pickle``
    followed by one ``{cache_path}/{dataset}_{algorithm}.pickle`` per algorithm.
    Line breaks in an algorithm name (real newlines as well as the literal
    two-character sequence backslash + ``n``) are stripped before the file
    name is built.

    Parameters
    ----------
    datasets : iterable of str
        Dataset names.
    algorithms : iterable of str
        Algorithm names.
    cache_path : str
        Directory containing the pickles.

    Returns
    -------
    list of str
        Paths of the missing files, in dataset order, with the dataset file
        listed before its algorithm files.
    """
    absent = []
    for dataset_name in datasets:
        # Expected files for this dataset: the dataset pickle first, then one
        # pickle per algorithm.
        expected = [f"{cache_path}/{dataset_name}.pickle"]
        for raw_name in algorithms:
            flat_name = raw_name.replace("\\n", "\n").replace("\n", "")
            expected.append(f"{cache_path}/{dataset_name}_{flat_name}.pickle")
        absent.extend(path for path in expected if not os.path.exists(path))
    return absent

In [50]:
def compute_ari_scores(datasets, algorithms, cache_path="../../cache"):
    """Compute the adjusted Rand index (ARI) of cached algorithms on cached datasets.

    For every dataset, ``{cache_path}/{dataset}.pickle`` is unpickled and its
    ``"dataset"`` entry is unpacked into ``X, y``. For every algorithm, the
    object stored in ``{cache_path}/{dataset}_{algorithm}.pickle`` is
    unpickled, labels are obtained from it, and they are scored against ``y``.
    Line breaks in an algorithm name (real or the escaped two-character form)
    are removed before the file name is built.

    Parameters
    ----------
    datasets : sequence of str
        Dataset names; become the columns of the result.
    algorithms : sequence of str
        Algorithm names; become the index of the result.
    cache_path : str
        Directory containing the pickles.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm and one column per dataset. A cell is ``-1.0``
        when the algorithm pickle is missing (a "skipping" message is printed).

    Raises
    ------
    FileNotFoundError
        If a dataset pickle is missing (it is opened unconditionally).

    Notes
    -----
    ``pickle.load`` executes arbitrary code from the file; only point
    ``cache_path`` at caches you produced yourself.
    """
    # Local import: a bare ``import sklearn`` does not guarantee that the
    # ``sklearn.metrics`` submodule has been loaded.
    from sklearn.metrics import adjusted_rand_score

    ari_scores = np.zeros((len(datasets), len(algorithms)))
    for i_dataset, dataset_name in enumerate(tqdm(datasets)):
        dataset_filename = f"{cache_path}/{dataset_name}.pickle"
        with open(dataset_filename, "rb") as f:
            dataset_info = pickle.load(f)

        X, y = dataset_info["dataset"]
        # Depends only on the dataset, so compute it once per dataset.
        num_classes = len(np.unique(y))

        for i_algorithm, algorithm_name in enumerate(algorithms):
            alg_name = algorithm_name.replace("\\n", "\n").replace("\n", "")
            alg_filename = f"{cache_path}/{dataset_name}_{alg_name}.pickle"
            if not os.path.exists(alg_filename):
                print("skipping", alg_filename)
                # Sentinel for "no cached result for this combination".
                ari_scores[i_dataset, i_algorithm] = -1.
                continue
            with open(alg_filename, "rb") as f:
                algorithm = pickle.load(f)

            # Extract predictions. The order of the checks matters: GWGMara is
            # tested first so it always takes the target-cluster-count path.
            # NOTE(review): corc.graph_metrics is not imported explicitly in
            # this notebook; presumably it is loaded as a side effect of the
            # other corc imports — confirm.
            if isinstance(algorithm, corc.graph_metrics.gwgmara.GWGMara):
                y_pred = algorithm.predict(X, target_number_clusters=num_classes)
            elif hasattr(algorithm, "labels_"):
                y_pred = algorithm.labels_.astype(int)
            elif hasattr(algorithm, "predict_with_target"):
                y_pred = algorithm.predict_with_target(X, num_classes).astype(int)
            else:
                y_pred = algorithm.predict(X)

            ari_scores[i_dataset, i_algorithm] = adjusted_rand_score(y, y_pred)

    # Transpose so that algorithms are rows and datasets are columns.
    return pandas.DataFrame(
        ari_scores.T,
        index=list(algorithms),
        columns=list(datasets),
    )

In [51]:
def style_dataFrame(df):
    """Return a Styler that colours every cell of ``df`` on a red/green scale.

    Negative values get a red background whose opacity is ``abs(value)``;
    positive values get a green background whose opacity is ``value``. Values
    for which neither comparison holds (zero, NaN) are left unstyled. All
    values are displayed with two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table to style (only its ``.style`` accessor is used).

    Returns
    -------
    pandas.io.formats.style.Styler
        The styled view of ``df``.
    """

    def red_green_scale(val):
        """Map one cell value to a CSS ``background-color`` declaration."""
        if val < 0:
            return f'background-color: rgba(255, 0, 0, {abs(val)})'  # Red, alpha scales with magnitude
        elif val > 0:
            return f'background-color: rgba(0, 128, 0, {val})'      # Green, alpha scales with magnitude
        else:
            return ''  # White (default background)

    styler = df.style
    # Styler.applymap was renamed to Styler.map in pandas 2.1 and the old name
    # emits a FutureWarning; prefer the new name and fall back on older pandas.
    elementwise = styler.map if hasattr(styler, "map") else styler.applymap
    return elementwise(red_green_scale).format("{:.2f}")


In [52]:
# Verify that every 2D dataset / algorithm pickle exists in the configured cache.
check_missing_files(datasets_2d, algorithms, cache_path=cache_path)

[]

In [53]:
# ARI table for the 2D datasets, read from the configured cache directory.
ari_scores_2d = compute_ari_scores(datasets_2d, algorithms, cache_path=cache_path)

100%|██████████| 6/6 [00:00<00:00, 22.60it/s]


In [54]:
# Display the 2D ARI table with the red/green background scale.
style_dataFrame(ari_scores_2d)

  styled_df = df.style.applymap(red_green_scale).format("{:.2f}")


Unnamed: 0,noisy_circles,noisy_moons,blobs,varied,aniso,clusterlab10
MiniBatch KMeans,-0.0,0.48,0.98,0.84,0.59,1.0
Agglomerative Clustering,-0.0,0.37,0.96,0.95,0.53,1.0
HDBSCAN,1.0,1.0,0.57,0.92,0.89,1.0
Gaussian Mixture,-0.0,0.5,0.98,0.95,1.0,1.0
t-Student Mixture,-0.0,0.15,0.98,0.95,1.0,1.0
Spectral Clustering,1.0,1.0,0.97,0.95,0.85,0.68
Affinity Propagation,-0.0,0.47,0.98,0.81,0.59,0.47
MeanShift,0.01,0.54,0.98,0.86,0.52,0.53
Leiden,1.0,1.0,0.96,0.96,0.59,1.0
PAGA,1.0,1.0,0.96,0.0,0.9,1.0


# High-dimensional datasets

In [55]:
# Names of the high-dimensional datasets to evaluate.
complex_datasets = corc.our_datasets.COMPLEX_DATASETS
complex_datasets

['blobs1_8',
 'blobs1_16',
 'blobs1_32',
 'blobs1_64',
 'blobs2_8',
 'blobs2_16',
 'blobs2_32',
 'blobs2_64',
 'densired8',
 'densired16',
 'densired32',
 'densired64',
 'densired_soft_8',
 'densired_soft_16',
 'densired_soft_32',
 'densired_soft_64',
 'mnist8',
 'mnist16',
 'mnist32',
 'mnist64']

In [56]:
# Verify that every high-dimensional dataset / algorithm pickle exists in the configured cache.
check_missing_files(complex_datasets, algorithms, cache_path=cache_path)

['../../cache/blobs2_64_TMM-NEB.pickle']

In [57]:
# ARI table for the high-dimensional datasets, read from the configured cache directory.
ari_scores_complex = compute_ari_scores(complex_datasets, algorithms, cache_path=cache_path)

 25%|██▌       | 5/20 [00:00<00:00, 17.78it/s]

6 clusters is not achievable.
Working with 2 clusters instead.
6 clusters is not achievable.
Working with 3 clusters instead.


 35%|███▌      | 7/20 [00:00<00:00, 15.90it/s]

skipping ../../cache/blobs2_64_TMM-NEB.pickle
6 clusters is not achievable.
Working with 3 clusters instead.


100%|██████████| 20/20 [00:07<00:00,  2.85it/s]


In [58]:
# Display the high-dimensional ARI table with the red/green background scale.
style_dataFrame(ari_scores_complex)

  styled_df = df.style.applymap(red_green_scale).format("{:.2f}")


Unnamed: 0,blobs1_8,blobs1_16,blobs1_32,blobs1_64,blobs2_8,blobs2_16,blobs2_32,blobs2_64,densired8,densired16,densired32,densired64,densired_soft_8,densired_soft_16,densired_soft_32,densired_soft_64,mnist8,mnist16,mnist32,mnist64
MiniBatch KMeans,0.65,0.68,0.94,0.73,0.56,0.34,0.37,0.58,0.63,0.63,0.5,0.74,0.63,0.49,0.53,0.49,0.57,0.44,0.32,0.26
Agglomerative Clustering,0.73,0.83,0.8,0.8,0.55,0.65,0.54,0.52,0.68,0.66,0.59,0.75,0.56,0.87,0.9,0.64,0.8,0.68,0.62,0.49
HDBSCAN,0.0,0.0,0.0,0.0,0.23,0.08,-0.03,-0.02,0.0,0.0,0.44,0.0,0.01,0.0,0.0,0.0,0.02,0.03,0.05,0.05
Gaussian Mixture,0.69,0.92,0.92,0.72,0.37,0.53,0.37,0.57,0.46,0.63,0.5,0.64,0.4,0.54,0.41,0.43,0.75,0.71,0.62,0.57
t-Student Mixture,0.71,0.7,0.23,0.03,0.56,0.65,0.1,0.07,0.66,0.66,0.58,0.63,0.49,0.5,0.9,0.39,0.76,0.9,0.77,0.75
Spectral Clustering,0.8,0.91,0.9,0.94,0.55,0.55,0.52,0.55,0.83,0.77,0.55,0.91,0.75,0.73,0.94,0.94,0.76,0.78,0.74,0.56
Affinity Propagation,0.77,0.79,0.54,0.36,0.56,0.49,0.33,0.16,0.59,0.61,0.49,0.54,0.58,0.54,0.83,-0.0,0.57,0.34,0.26,0.18
MeanShift,0.59,0.0,0.0,0.0,0.91,0.12,0.0,0.0,0.79,0.78,0.0,0.44,0.69,0.79,0.51,0.0,0.47,0.0,0.0,0.0
Leiden,0.79,0.88,0.9,0.93,0.56,0.59,0.53,0.4,0.83,0.77,0.76,0.85,0.89,0.93,0.91,0.82,0.89,0.92,0.93,0.7
PAGA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.96,1.0,0.89,1.0,0.49,0.82,0.91,0.64,0.0,0.0,0.0,0.0
