# Distance metric comparison
> Do the pairwise distances between mixture embeddings correlate with established beta-diversity measures such as Bray-Curtis and UniFrac?

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Clustering must allow custom distance metric
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import pairwise_distances

from skbio import TreeNode
from skbio.diversity import beta_diversity
from skbio.diversity.beta import unweighted_unifrac, weighted_unifrac

from tqdm import tqdm

from geomstats.geometry.hyperbolic import Hyperbolic

from util import mixture_embedding

In [None]:
# For UniFrac:
# Read the reference tree, then walk every node except the root; any node
# whose branch length is missing is printed and given a length of 0.0.
tree = TreeNode.read("/home/phil/phylosig/greengenes/data/gg_13_5_otus/trees/97_otus.tree")
for node in tree.postorder(include_self=False):
    if node.length is not None:
        continue
    print(node.name, node.length)
    node.length = 0.0

k__Bacteria None


In [None]:
# Number of leading rows kept from each study's transposed OTU table before
# distances are computed; also embedded in the output file names (`..._top{HEAD}.npy`).
HEAD=10

In [None]:
# Timing notes:
#   Head 20 = ~19 minutes for the whole thing
#   Head 50 = ~2 hours
#   Running this on the full table takes a long time, approx. 2 hours
# That was too slow, so distances are computed separately for each study.
def get_dists(
    dist_function,
    embed=None,
    head=10,
    *,
    data_dir="/home/phil/mixture_embeddings/data/interim/mlrepo_clean/classification",
    **kwargs,
):
    """Compute within-study pairwise distances and concatenate them.

    Every subdirectory of ``data_dir`` that contains an ``otus.txt`` file is
    treated as one study. The table is read, indexed by its first column, cast
    to float and transposed, so that its columns are the ids passed to UniFrac
    as ``otu_ids`` and its rows are the items being compared. Pairwise
    distances between rows are computed with ``scipy.spatial.distance.pdist``.

    Args:
        dist_function: The string ``"unifrac"`` to use unweighted UniFrac on
            the module-level ``tree`` (sheared to the table's columns), or any
            metric accepted by ``pdist`` (name or callable).
        embed: Unused; kept so existing callers keep working.
        head: If not None, only the first ``head`` rows of each transposed
            table are used.
        data_dir: Directory whose subdirectories hold the ``otus.txt`` files.
        **kwargs: Forwarded to ``pdist`` for non-UniFrac metrics.

    Returns:
        A 1-D array holding the condensed distance vector of every study,
        concatenated in ``os.listdir`` order.

    Raises:
        ValueError: If no ``otus.txt`` file is found under ``data_dir``.
    """
    study_dists = []
    for study in tqdm(os.listdir(data_dir)):
        otu_path = os.path.join(data_dir, study, "otus.txt")
        if not os.path.exists(otu_path):
            continue

        # Keep the id column as strings so ids are not parsed as numbers.
        raw = pd.read_table(otu_path, dtype={0: str})
        otu_table = raw.set_index(raw.columns[0]).astype(float).T

        if head is not None:
            otu_table = otu_table.head(head)

        if dist_function == "unifrac":
            # pdist forwards the extra keyword arguments to the metric callable.
            sample_dists = pdist(
                otu_table,
                metric=unweighted_unifrac,
                otu_ids=otu_table.columns,
                tree=tree.shear(otu_table.columns),  # Shearing is faster
            )
        else:
            sample_dists = pdist(otu_table, metric=dist_function, **kwargs)
        study_dists.append(sample_dists)

    if not study_dists:
        # np.concatenate([]) would raise a ValueError that does not say why.
        raise ValueError(f"No 'otus.txt' tables found under {data_dir!r}")

    return np.concatenate(study_dists)

# UniFrac distances on the first HEAD rows of every study.
# Timing note: takes more than 20 minutes to run on first OTU table...
unifrac_dists = get_dists(dist_function="unifrac", head=HEAD)

unifrac_out = f"data/processed/distances/unifrac_dists_top{HEAD}.npy"
np.save(unifrac_out, unifrac_dists)

100%|██████████| 19/19 [01:14<00:00,  3.90s/it]


In [None]:
# Beta diversity (Bray-Curtis), computed separately for each study on the
# first HEAD rows of the transposed OTU table.
dists = []
# Named `data_dir` rather than `dir` so the builtin dir() stays usable in the
# notebook session.
data_dir = "/home/phil/mixture_embeddings/data/interim/mlrepo_clean/classification"
for subdir in tqdm(os.listdir(data_dir)):
    path = os.path.join(data_dir, subdir, "otus.txt")
    if os.path.exists(path):
        # Keep the id column as strings so ids are not parsed as numbers.
        otu_table = pd.read_table(path, dtype={0: str})

        otu_table = otu_table.set_index(
            otu_table.columns[0]
        ).astype(float).T

        otu_table = otu_table.head(HEAD)

        sample_dists = beta_diversity(counts=otu_table, metric="braycurtis")
        dists.append(sample_dists)

# NOTE(review): `x.data.flatten()` stores each study's full square matrix
# (diagonal and both triangles), whereas get_dists() returns condensed pdist
# vectors concatenated into one 1-D array. Confirm downstream code accounts
# for the different layout. Rows only stack into a 2-D array when every study
# contributes the same number of samples.
beta_diversity_dists = np.array([x.data.flatten() for x in dists])

np.save(f"data/processed/distances/beta_diversity_dists_top{HEAD}.npy", beta_diversity_dists)

100%|██████████| 19/19 [00:01<00:00, 18.22it/s]


In [None]:
# Get embedding distance matrices: embed each study's samples as mixtures of
# OTU embeddings (Euclidean and hyperbolic), then take pairwise distances
# between the embedded samples within each study.
euc_embeddings = pd.read_csv("/home/phil/DATA/otu_embeddings/embeddings_euclidean_16.csv", dtype={0: str})
euc_embeddings = euc_embeddings.set_index(euc_embeddings.columns[0])

hyp_embeddings = pd.read_csv("/home/phil/DATA/otu_embeddings/embeddings_hyperbolic_16.csv", dtype={0: str})
hyp_embeddings = hyp_embeddings.set_index(hyp_embeddings.columns[0])

hyp_manifold = Hyperbolic(16, default_coords_type="ball")

euc_dists = []
hyp_dists = []
# Named `data_dir` rather than `dir` so the builtin dir() stays usable in the
# notebook session.
data_dir = "/home/phil/mixture_embeddings/data/interim/mlrepo_clean/classification"
for subdir in tqdm(os.listdir(data_dir)):
    path = os.path.join(data_dir, subdir, "otus.txt")
    if os.path.exists(path):
        # Keep the id column as strings so ids are not parsed as numbers.
        otu_table = pd.read_table(path, dtype={0: str})

        otu_table = otu_table.set_index(
            otu_table.columns[0]
        ).astype(float).T

        otu_table = otu_table.head(HEAD)

        otu_table_euc = mixture_embedding(
            otu_table,
            geometry="euclidean",
            otu_embeddings=euc_embeddings
        )

        otu_table_hyp = mixture_embedding(
            otu_table,
            geometry="hyperbolic",
            otu_embeddings=hyp_embeddings
        )

        # Distances must be taken between the embedded samples of THIS study.
        # The previous code passed `sample_dists`, a leftover from the
        # beta-diversity cell, so the embeddings above were never used.
        # Assumes mixture_embedding returns one row per sample - TODO confirm.
        euc_dists.append(pdist(otu_table_euc, metric="euclidean"))
        hyp_dists.append(pdist(otu_table_hyp, metric=hyp_manifold._metric.dist))

# One row of condensed distances per study; rows only stack into a 2-D array
# when every study contributes the same number of samples.
euc_dists = np.array([x.flatten() for x in euc_dists])
hyp_dists = np.array([x.flatten() for x in hyp_dists])

np.save(f"data/processed/distances/euc_dists_top{HEAD}.npy", euc_dists)
# Hyperbolic distances get their own file; they used to overwrite the
# Euclidean output.
np.save(f"data/processed/distances/hyp_dists_top{HEAD}.npy", hyp_dists)