# ARI scores for the precomputed datasets and clustering algorithms

In [1]:
%load_ext autoreload
%autoreload 2

import os
# Configure XLA's GPU memory handling through environment variables. They are
# set in the first cell so they are in place before any later import can
# initialise XLA.
# Per the JAX GPU-memory-allocation documentation: the "platform" allocator
# allocates on demand instead of using a preallocated pool, and
# PREALLOCATE="false" disables reserving a large share of GPU memory up front.
# NOTE(review): presumably needed by the project-local modules imported in the
# next cell -- confirm that they actually use JAX/XLA.
os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform"
os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false"

In [11]:
import sys
sys.path.insert(0, '../../scripts')
import our_datasets
import our_algorithms
from tqdm import tqdm
import pickle
import sklearn
import numpy as np
from nbformat.sign import algorithms
import pandas


In [17]:
# Dataset names to evaluate, taken from the project-local our_datasets module.
datasets = our_datasets.COMPLEX_DATASETS
datasets  # bare expression: shows the list as the cell output

['blobs1_8',
 'blobs1_16',
 'blobs1_32',
 'blobs1_64',
 'blobs2_8',
 'blobs2_16',
 'blobs2_32',
 'blobs2_64',
 'densired8',
 'densired16',
 'densired32',
 'densired64',
 'mnist8',
 'mnist16',
 'mnist32',
 'mnist64']

our_algorithms.ALGORITHM_SELECTOR

In [27]:
# Algorithm names to evaluate; this list was copied from the "our_algorithms.py"
# script.
# Several names contain an embedded newline. check_missing_files and
# compute_ari_scores strip it when building cache filenames, e.g.
# "MiniBatch\nKMeans" -> "<dataset>_MiniBatchKMeans.pickle".
# Entries that are commented out are excluded from the evaluation.
# NOTE: this assignment shadows the `algorithms` name imported from
# nbformat.sign in the import cell above.
algorithms = [
    "MiniBatch\nKMeans",
    "Agglomerative\nClustering",
    "HDBSCAN",
    "Gaussian\nMixture",
    "t-Student\nMixture",
    # "DBSCAN",
    # "BIRCH",
    # "OPTICS",
    "Spectral\nClustering",
    "Affinity\nPropagation",
    "MeanShift",
    "Leiden",
    "PAGA",
    "Ward",
    # "Stavia",
    "GWG-dip",
    "GWG-pvalue",
    "TMM-NEB",
    "GMM-NEB",
]


In [5]:
!! pwd

['/user/ritzert/git/cluster_vs_continuum/notebooks/metrics']

In [6]:
# Directory containing the pickled dataset and algorithm files, given relative
# to the notebook's working directory. Same value as the `cache_path` default
# of check_missing_files and compute_ari_scores.
cache_path = "../../cache"

In [None]:
def check_missing_files(datasets, algorithms, cache_path="../../cache"):
    """Return the expected cache files that do not exist on disk.

    For every dataset the file ``{cache_path}/{dataset}.pickle`` is expected,
    and for every (dataset, algorithm) pair the file
    ``{cache_path}/{dataset}_{algorithm}.pickle``. Newlines in an algorithm
    name (real ones as well as the two-character sequence backslash + "n")
    are removed before the filename is built.

    Args:
        datasets: Iterable of dataset names.
        algorithms: Iterable of algorithm names.
        cache_path: Directory that is searched for the pickle files.

    Returns:
        List of missing file paths, in dataset order; within one dataset the
        dataset file comes first, followed by the algorithm files in the
        order of ``algorithms``.
    """
    absent = []

    def _record_if_absent(path):
        # Collect the path only when nothing exists at that location.
        if not os.path.exists(path):
            absent.append(path)

    for dataset_name in datasets:
        _record_if_absent(f"{cache_path}/{dataset_name}.pickle")

        for raw_name in algorithms:
            # Turn a literal "\n" into a newline first, then drop all newlines.
            flat_name = raw_name.replace("\\n", "\n").replace("\n", "")
            _record_if_absent(f"{cache_path}/{dataset_name}_{flat_name}.pickle")

    return absent

In [None]:
def compute_ari_scores(datasets,algorithms, cache_path="../../cache"):
    """Compute the adjusted Rand index for every (dataset, algorithm) pair.

    Each dataset is read from ``{cache_path}/{dataset}.pickle``; the pickled
    object must provide a ``"dataset"`` entry that unpacks into ``(X, y)``,
    where ``y`` is used as the reference labelling. Each algorithm is read
    from ``{cache_path}/{dataset}_{algorithm}.pickle`` (newlines removed from
    the algorithm name). Predicted labels are taken from, in order of
    preference: the ``labels_`` attribute, ``predict_with_target(X, k)`` with
    ``k`` the number of distinct values in ``y``, or ``predict(X)``.

    Algorithm files that do not exist are reported with a "skipping" message
    and their score stays at the initial value 0.0, so a 0.0 in the table can
    mean either "ARI of zero" or "no cached result".

    NOTE: the files are loaded with ``pickle``, which can execute arbitrary
    code; only point ``cache_path`` at a trusted cache directory.

    Args:
        datasets: Sequence of dataset names (used as column labels).
        algorithms: Sequence of algorithm names (used as row labels).
        cache_path: Directory containing the pickle files.

    Returns:
        A pandas ``Styler`` over a DataFrame with one row per algorithm and
        one column per dataset, in which the best score of every column has a
        yellow background.

    Raises:
        FileNotFoundError: If a dataset pickle is missing (only missing
            algorithm files are skipped).
    """
    # Imported explicitly: a bare `import sklearn` does not guarantee that the
    # `sklearn.metrics` submodule has been loaded.
    from sklearn.metrics import adjusted_rand_score

    ari_scores = np.zeros((len(datasets), len(algorithms)))
    for i_dataset, dataset_name in enumerate(tqdm(datasets)):
        dataset_filename = f"{cache_path}/{dataset_name}.pickle"
        with open(dataset_filename, "rb") as f:
            dataset_info = pickle.load(f)

        X, y = dataset_info["dataset"]

        for i_algorithm, algorithm_name in enumerate(algorithms):
            # Same filename normalisation as check_missing_files.
            alg_name = algorithm_name.replace("\\n", "\n").replace("\n", "")
            alg_filename = f"{cache_path}/{dataset_name}_{alg_name}.pickle"
            # Check the file itself rather than a notebook-global
            # `missing_files` list, which may be undefined or may have been
            # computed for a different cache_path.
            if not os.path.exists(alg_filename):
                print("skipping", alg_filename)
                continue
            with open(alg_filename, "rb") as f:
                algorithm = pickle.load(f)

            if hasattr(algorithm, "labels_"):
                y_pred = algorithm.labels_.astype(int)
            elif hasattr(algorithm, "predict_with_target"):
                # Ask for as many clusters as there are distinct reference labels.
                y_pred = algorithm.predict_with_target(X, len(np.unique(y))).astype(int)
            else:
                y_pred = algorithm.predict(X)
            ari_scores[i_dataset, i_algorithm] = adjusted_rand_score(y, y_pred)

    # Transpose so that rows are algorithms and columns are datasets.
    df = pandas.DataFrame(ari_scores.transpose())
    df.columns = datasets
    df.index = algorithms

    def _highlight_column_max(column):
        # Compute the column maximum once instead of once per cell.
        best = column.max()
        return ["background-color: yellow" if v == best else "" for v in column]

    df_nlargest = df.style.apply(_highlight_column_max, axis=0)
    return df_nlargest