# Distance metric comparison
> Do the pairwise distances between mixture embeddings correlate with things like beta-diversity?

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Clustering must allow custom distance metric
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import pairwise_distances

from skbio import TreeNode
from skbio.diversity import beta_diversity
from skbio.diversity.beta import unweighted_unifrac, weighted_unifrac

from tqdm import tqdm

from geomstats.geometry.hyperbolic import Hyperbolic

from util import mixture_embedding

INFO: Using numpy backend


In [2]:
# For UniFrac:
# Load tree; iterate through all branches and print length; fix if None
tree = TreeNode.read(
    "/home/phil/DATA/greengenes/data/gg_13_5_otus_99_annotated.tree"
)
for i in tree.postorder(include_self=False):
    if i.length is None:
        print(i.name, i.length)
        i.length = 0.0

In [3]:
HEAD=10

In [4]:
# Head 20 = ~19 minutes for the whole thing
# Head 50 = ~2 hours
# Running this on the full table takes a long time, approx. 2 hours

# OK, that was too slow. Let's do dists separately for each study:
def get_dists(dist_function, embed=None, head=10, **kwargs):
    dists = []
    dir = "/home/phil/mixture_embeddings/data/interim/mlrepo_clean/classification"
    for subdir in tqdm(os.listdir(dir)):
        path = os.path.join(dir, subdir, "otus.txt")
        if os.path.exists(path):
            otu_table = pd.read_table(path, dtype={0: str})
            
            otu_table = otu_table.set_index(
                otu_table.columns[0]
            ).astype(float).T

            if head is not None:
                otu_table = otu_table.head(head)

            if dist_function == "unifrac":
                sample_dists = pdist(
                    otu_table, 
                    metric=unweighted_unifrac, 
                    otu_ids=otu_table.columns, 
                    tree=tree.shear(otu_table.columns) # Shearing is faster
                )
            else:
                sample_dists = pdist(otu_table, metric=dist_function, **kwargs)
            dists.append(sample_dists)
    
    # return np.array([x.data.flatten() for x in dists])
    return np.concatenate(dists)

# unifrac_dists = get_dists("unifrac")
unifrac_dists = get_dists("unifrac", head=HEAD)

# Takes more than 20 minutes to run on first OTU table...
np.save(f"data/processed/distances/unifrac_dists_top{HEAD}.npy", unifrac_dists)

  0%|          | 0/19 [00:00<?, ?it/s]

100%|██████████| 19/19 [02:42<00:00,  8.56s/it]


In [5]:
# Beta diversity
dists = []
dir = "/home/phil/mixture_embeddings/data/interim/mlrepo_clean/classification"
for subdir in tqdm(os.listdir(dir)):
    path = os.path.join(dir, subdir, "otus.txt")
    if os.path.exists(path):
        otu_table = pd.read_table(path, dtype={0: str})
        
        otu_table = otu_table.set_index(
            otu_table.columns[0]
        ).astype(float).T

        otu_table = otu_table.head(HEAD)

        sample_dists = beta_diversity(counts=otu_table, metric="braycurtis")
        dists.append(sample_dists)
    
beta_diversity_dists = np.array([x.data.flatten() for x in dists])

np.save(f"data/processed/distances/beta_diversity_dists_top{HEAD}.npy", beta_diversity_dists)

100%|██████████| 19/19 [00:01<00:00, 14.66it/s]


In [6]:
for size in [2, 4, 8, 16, 32, 64, 128]:
    # Get embedding distance matrices
    euc_embeddings = pd.read_csv(f"/home/phil/DATA/otu_embeddings/embeddings_euclidean_{size}.csv", dtype={0: str})
    euc_embeddings = euc_embeddings.set_index(euc_embeddings.columns[0])

    hyp_embeddings = pd.read_csv(f"/home/phil/DATA/otu_embeddings/embeddings_hyperbolic_{size}.csv", dtype={0: str})
    hyp_embeddings = hyp_embeddings.set_index(hyp_embeddings.columns[0])

    hyp_manifold = Hyperbolic(size, default_coords_type="ball")

    euc_dists = []
    hyp_dists = []
    dir = "/home/phil/mixture_embeddings/data/interim/mlrepo_clean/classification"
    for subdir in tqdm(os.listdir(dir)):
        path = os.path.join(dir, subdir, "otus.txt")
        if os.path.exists(path):
            otu_table = pd.read_table(path, dtype={0: str})
            
            otu_table = otu_table.set_index(
                otu_table.columns[0]
            ).astype(float).T

            otu_table = otu_table.head(HEAD)

            otu_table_euc = mixture_embedding(
                otu_table,
                geometry="euclidean",
                otu_embeddings=euc_embeddings
            )

            otu_table_hyp = mixture_embedding(
                otu_table,
                geometry="hyperbolic",
                otu_embeddings=hyp_embeddings
            )

            euc_dists.append(pdist(sample_dists, metric="euclidean"))
            hyp_dists.append(pdist(sample_dists, metric=hyp_manifold._metric.dist))
    
    euc_dists = np.array([x.flatten() for x in euc_dists])
    hyp_dists = np.array([x.flatten() for x in hyp_dists])

    np.save(f"data/processed/distances/euc_dists_top{HEAD}_{size}.npy", euc_dists)
    np.save(f"data/processed/distances/euc_dists_top{HEAD}_{size}.npy", hyp_dists)

100%|██████████| 10/10 [00:01<00:00,  5.04it/s]
  0%|          | 0/19 [00:02<?, ?it/s]


ValueError: A 2-dimensional array must be passed.

In [None]:
# Load all dists

