# Get the table containing dataset statistics

In [4]:
%load_ext autoreload
%autoreload 2

import os
os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform"
os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false"
os.environ['JAX_ENABLE_X64'] = '1'

cache_path = "../../cache/"

In [1]:
import numpy as np
import pandas as pd
import pickle
import corc.utils
import sklearn
import tqdm

In [11]:
dataset_names = [
    "noisy_circles",
    "noisy_moons",
    "varied",
    "aniso",
    "blobs",
    "uniform_circle",
    "clusterlab10",
    "blobs1_8",
    "blobs1_16",
    "blobs1_32",
    "blobs1_64",
    "blobs2_8",
    "blobs2_16",
    "blobs2_32",
    "blobs2_64",
    "densired8",
    "densired16",
    "densired32",
    "densired64",
    "densired_soft_8",
    "densired_soft_16",
    "densired_soft_32",
    "densired_soft_64",
    "mnist8",
    "mnist16",
    "mnist32",
    "mnist64",
]

dataset_displaynames = {
    ###########################
    ##### fig 1 datasets ######
    ###########################
    "noisy_moons": "noisy\nmoons",
    "noisy_circles": "noisy\ncircles",
    "varied": "varied\ndensity",
    "aniso": "anisotropic\nblobs",
    "blobs": "Gaussian\nblobs",
    "clusterlab10": "clusterlab10",
    ###########################
    ##### fig 2 datasets ######
    ###########################
    "blobs1_8": "Gaussian\nblobs 8D",
    "blobs1_16": "Gaussian\nblobs 16D",
    "blobs1_32": "Gaussian\nblobs 32D",
    "blobs1_64": "Gaussian\nblobs 64D",
    "blobs2_8": "Gaussian\nblobs 8D",
    "blobs2_16": "Gaussian\nblobs 16D",
    "blobs2_32": "Gaussian\nblobs 32D",
    "blobs2_64": "Gaussian\nblobs 64D",
    "densired8": "Densired\n'circles' 8D",
    "densired16": "Densired\n'circles' 16D",
    "densired32": "Densired\n'circles' 32D",
    "densired64": "Densired\n'circles' 64D",
    "densired_soft_8": "Densired\n'Stud-t' 8D",
    "densired_soft_16": "Densired\n'Stud-t' 16D",
    "densired_soft_32": "Densired\n'Stud-t' 32D",
    "densired_soft_64": "Densired\n'Stud-t' 64D",
    "mnist8": "MNIST-Nd\n8D",
    "mnist16": "MNIST-Nd\n16D",
    "mnist32": "MNIST-Nd\n32D",
    "mnist64": "MNIST-Nd\n64D",
}

In [14]:

dataset_stats_list = []
for dataset_name in dataset_names:
    X, y, tsne = corc.utils.load_dataset(dataset_name, cache_path=cache_path)
    display_name = dataset_displaynames.get(dataset_name, dataset_name).replace('\n', ' ')
    dataset_stats = {
        "dataset_name": display_name,
        "n_classes": len(np.unique(y)),
        "n_points": len(X),
        "dimension": X.shape[-1]
    }
    dataset_stats_list.append(dataset_stats)

df = pd.DataFrame(dataset_stats_list)


In [15]:
df

Unnamed: 0,dataset_name,n_classes,n_points,dimension
0,noisy circles,2,1000,2
1,noisy moons,2,1000,2
2,varied density,3,1000,2
3,anisotropic blobs,3,1000,2
4,Gaussian blobs,3,1000,2
5,uniform_circle,1,1000,2
6,clusterlab10,6,300,2
7,Gaussian blobs 8D,6,996,8
8,Gaussian blobs 16D,6,996,16
9,Gaussian blobs 32D,6,996,32


In [16]:
print(df.to_latex(index=False))

\begin{tabular}{lrrr}
\toprule
dataset_name & n_classes & n_points & dimension \\
\midrule
noisy circles & 2 & 1000 & 2 \\
noisy moons & 2 & 1000 & 2 \\
varied density & 3 & 1000 & 2 \\
anisotropic blobs & 3 & 1000 & 2 \\
Gaussian blobs & 3 & 1000 & 2 \\
uniform_circle & 1 & 1000 & 2 \\
clusterlab10 & 6 & 300 & 2 \\
Gaussian blobs 8D & 6 & 996 & 8 \\
Gaussian blobs 16D & 6 & 996 & 16 \\
Gaussian blobs 32D & 6 & 996 & 32 \\
Gaussian blobs 64D & 6 & 996 & 64 \\
Gaussian blobs 8D & 6 & 1000 & 8 \\
Gaussian blobs 16D & 6 & 1000 & 16 \\
Gaussian blobs 32D & 6 & 1000 & 32 \\
Gaussian blobs 64D & 6 & 1000 & 64 \\
Densired 'circles' 8D & 6 & 10000 & 8 \\
Densired 'circles' 16D & 6 & 10000 & 16 \\
Densired 'circles' 32D & 6 & 10000 & 32 \\
Densired 'circles' 64D & 6 & 10000 & 64 \\
Densired 'Stud-t' 8D & 6 & 10000 & 8 \\
Densired 'Stud-t' 16D & 6 & 10000 & 16 \\
Densired 'Stud-t' 32D & 6 & 10000 & 32 \\
Densired 'Stud-t' 64D & 6 & 10000 & 64 \\
MNIST-Nd 8D & 10 & 10000 & 8 \\
MNIST-Nd 16D & 10 