# Clustering ML Repo
> Effect of embedding type and dimension on accuracy of clustering for classification tasks in ML Repo

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Clustering must allow custom distance metric
# from sklearn.cluster import AgglomerativeClustering
# from sklearn.metrics import pairwise_distances
from util import cluster, mixture_embedding

# Label tokens
from sklearn.preprocessing import OrdinalEncoder
from sklearn.decomposition import PCA

INFO: Using numpy backend


In [5]:
def load_task(data_path, **embed_kwargs) -> (pd.DataFrame, pd.DataFrame):
    # Get OTU table
    otu_path = f"{data_path}/gg/otutable.txt"
    otu_table = pd.read_table(otu_path, index_col=0).T
    X = otu_table / otu_table.values.sum(axis=1, keepdims=True)

    # Tokenize labels
    labels_path = f"{data_path}/task.txt"
    labels = pd.read_table(labels_path, index_col=0)
    label_encoder = OrdinalEncoder()
    label_encoder.fit(labels)
    y = label_encoder.transform(labels)

    return X, y

def experiment(X, y, euc_embeddings, hyp_embeddings):
    """For a dataset and a known dimensionality, get cluster scores"""
    hyp_dim = hyp_embeddings.shape[1]
    euc_dim = euc_embeddings.shape[1]
    assert hyp_dim == euc_dim

    X_raw = X.copy()
    X_pca = PCA(n_components=hyp_dim).fit_transform(X)
    X_euc = mixture_embedding(X, euc_embeddings, geometry="euclidean")
    X_hyp = mixture_embedding(X, hyp_embeddings, geometry="hyperbolic")

    # Cluster
    # out_df = pd.DataFrame(columns=["name", "dim", "type", "ARI", "accuracy"])
    out = []
    for X, name in zip([X_raw, X_pca, X_euc, X_hyp], ["raw", "pca", "euc", "hyp"]):
        _, y_pred = cluster(X, n_clusters=2, labels=y)
        out.append({
            "dim": hyp_dim,
            "type": name,
            "ARI": adjusted_rand_score(y_pred, y),
            "accuracy": accuracy_score(y_pred, y)
        })

    return out

import os

big_df = pd.DataFrame(columns=["name", "dim", "type", "ARI", "accuracy"])
for embed_dim in [16, 128]:
    for dir in os.listdir("../../data/interim/mlrepo"):
        # Check it's a directory
        if not os.path.isdir(f"../../data/interim/mlrepo/{dir}"):
            continue

        # Get data
        try:
            X, y = load_task(f"../../data/interim/mlrepo/{dir}")
        except Exception as e:
            print(e)
            continue

        # Get embeddings
        euc_embeddings = pd.read_csv(
            f"~/DATA/otu_embeddings/embeddings_euclidean_{embed_dim}.csv", 
            index_col=0
        )
        hyp_embeddings = pd.read_csv(
            f"~/DATA/otu_embeddings/embeddings_hyperbolic_{embed_dim}.csv", 
            index_col=0
        )

        # Run experiment
        try:
            # Pandas 2.0 does not support appending:
            out_df = pd.concat(
                experiment(X, y, euc_embeddings, hyp_embeddings)
            )
            big_df = pd.concat([big_df, out_df])
        except Exception as e:
            print(e)
            continue

big_df.to_csv("../../data/processed/mlrepo_scores.csv")

[Errno 2] No such file or directory: '../../data/interim/mlrepo/sokol/task.txt'
[Errno 2] No such file or directory: '../../data/interim/mlrepo/ravel/task.txt'
name 'data_path' is not defined
[Errno 2] No such file or directory: '../../data/interim/mlrepo/karlsson/gg/otutable.txt'
[Errno 2] No such file or directory: '../../data/interim/mlrepo/gevers/task.txt'
name 'data_path' is not defined
[Errno 2] No such file or directory: '../../data/interim/mlrepo/qin2014/gg/otutable.txt'


KeyboardInterrupt: 

In [4]:
pd.__version__

'2.0.2'