In [1]:
import os

import numpy as np
import tables as tb
from numpy.typing import NDArray

# Cap the Polars thread pool via environment variable. The variable is set
# *before* `import polars` (hence the E402 suppression on the import below);
# presumably Polars only reads it at import time -- confirm against the docs.
os.environ["POLARS_MAX_THREADS"] = "128"
import polars as pl  # noqa: E402

# Alias for int64 numpy arrays; used below to annotate cluster-label arrays.
LongArray = NDArray[np.int64]

# Turn on the global string cache. The genome columns are Categorical both in
# `genome_info` and in the AAI table and are joined against each other later.
pl.enable_string_cache()

In [2]:
# Record the interpreter and package versions this notebook was run with.
%load_ext watermark
%watermark -vp numpy,polars,tables

Python implementation: CPython
Python version       : 3.11.3
IPython version      : 8.13.2

numpy : 1.23.5
polars: 0.20.6
tables: 3.8.0



See the notebook "AAI.ipynb" for more information about the protein-protein similarity searches and the AAI calculation.

In [3]:
# Genome table restricted to the "test" dataset split, keeping only the integer
# id and the genome name. The name is cast to Categorical, which is the dtype
# of the genome columns in the AAI table it is joined against later.
# NOTE(review): cluster labels are attached to this frame positionally further
# down, so its row order has to match the column order of the cluster-label
# arrays in the clustering file -- confirm.
genome_info = (
    pl.read_csv("supplementary_table_1.tsv", separator="\t")
    .filter(pl.col("dataset") == "test")
    .select("genome_id", "genome")
    .cast({"genome": pl.Categorical})
)

genome_info

genome_id,genome
i64,cat
0,"""IMGVR_UViG_256…"
1,"""IMGVR_UViG_256…"
2,"""IMGVR_UViG_257…"
3,"""IMGVR_UViG_257…"
4,"""IMGVR_UViG_264…"
5,"""IMGVR_UViG_267…"
6,"""IMGVR_UViG_271…"
7,"""IMGVR_UViG_271…"
8,"""IMGVR_UViG_330…"
9,"""IMGVR_UViG_330…"


In [4]:
# Load the precomputed AAI table: each row holds a query genome, a target
# genome, their AAI value and the shared-gene / protein counts (see the schema
# in the cell output).
aai = pl.read_parquet("aai.parquet")
aai

query_genome,target_genome,aai,shared_genes_n_rows,shared_genes,query_ptns,target_ptns,query_shared,target_shared
cat,cat,f32,u32,u16,u16,u16,f32,f32
"""IMGVR_UViG_271…","""IMGVR_UViG_330…",0.529368,19,17,121,66,0.140496,0.257576
"""IMGVR_UViG_280…","""IMGVR_UViG_330…",0.326,1,1,51,47,0.019608,0.021277
"""IMGVR_UViG_280…","""IMGVR_UViG_287…",0.225,2,2,51,53,0.039216,0.037736
"""IMGVR_UViG_330…","""IMGVR_UViG_330…",0.751731,26,26,41,36,0.634146,0.722222
"""IMGVR_UViG_330…","""IMGVR_UViG_258…",0.729048,21,21,41,41,0.512195,0.512195
"""IMGVR_UViG_330…","""IMGVR_UViG_269…",0.763077,26,26,41,49,0.634146,0.530612
"""IMGVR_UViG_330…","""IMGVR_UViG_264…",0.735381,21,21,41,39,0.512195,0.538462
"""IMGVR_UViG_330…","""IMGVR_UViG_266…",0.674438,16,16,41,38,0.390244,0.421053
"""IMGVR_UViG_330…","""IMGVR_UViG_330…",0.262,1,1,211,82,0.004739,0.012195
"""IMGVR_UViG_330…","""IMGVR_UViG_283…",0.261,1,1,211,71,0.004739,0.014085


In [5]:
def compute_aai_per_clustering(
    aai: pl.DataFrame,
    cluster_labels: LongArray,
    genome_info: pl.DataFrame,
    k: int,
    res: float,
    **kwargs,
) -> pl.LazyFrame:
    """Build the lazy per-cluster AAI summary for a single clustering.

    Parameters
    ----------
    aai
        Pairwise table with at least the columns ``query_genome``,
        ``target_genome`` and ``aai``.
    cluster_labels
        One cluster label per row of ``genome_info``; attached positionally.
    genome_info
        Genome table with a ``genome`` column that matches the genome columns
        of ``aai``.
    k, res
        Clustering parameters, stored on every output row as the columns
        ``k`` and ``resolution``.
    **kwargs
        Extra constant columns (name -> value) appended to every output row.

    Returns
    -------
    pl.LazyFrame
        One row per cluster that has more than one member and at least one
        within-cluster AAI pair, with the columns ``cluster``,
        ``cluster_size``, ``k``, ``resolution``, ``aai_mean``,
        ``cluster_size_weight``, ``weighted_aai_mean``, ``singletons`` and
        one column per keyword argument. ``singletons`` is the number of
        distinct cluster labels minus the number of rows in this summary.
        Nothing is collected here; the caller decides when to execute.
    """
    labeled_genomes = genome_info.with_columns(
        cluster=pl.lit(cluster_labels),
        k=pl.lit(k),
        resolution=pl.lit(res),
    ).with_columns(cluster_size=pl.count("genome").over("cluster"))

    # Counted eagerly, before switching to the lazy API.
    total_clusters = labeled_genomes["cluster"].n_unique()

    query_side = labeled_genomes.lazy()
    # The target side only needs to contribute its cluster label.
    target_side = query_side.drop("cluster_size", "k", "resolution")

    extra_columns = {name: pl.lit(value) for name, value in kwargs.items()}

    # Keep only genome pairs whose two members sit in the same cluster.
    within_cluster_pairs = (
        aai.lazy()
        .join(query_side, left_on="query_genome", right_on="genome")
        .rename({"cluster": "query_cluster"})
        .join(target_side, left_on="target_genome", right_on="genome")
        .rename({"cluster": "target_cluster"})
        .filter(pl.col("query_cluster") == pl.col("target_cluster"))
    )

    # Mean AAI per cluster; clusters with a single member are discarded.
    per_cluster = (
        within_cluster_pairs
        .group_by("query_cluster")
        .agg(
            pl.first("cluster_size", "k", "resolution"),
            pl.col("aai").mean().name.suffix("_mean"),
        )
        .filter(pl.col("cluster_size") > 1)
    )

    # Weight each remaining cluster by its share of the remaining genomes.
    size_share = pl.col("cluster_size") / pl.col("cluster_size").sum()
    weighted = per_cluster.with_columns(
        cluster_size_weight=size_share,
    ).with_columns(
        weighted_aai_mean=pl.col("aai_mean") * pl.col("cluster_size_weight"),
    )

    return weighted.rename({"query_cluster": "cluster"}).with_columns(
        singletons=total_clusters - pl.len(),
        **extra_columns,
    )

In [6]:
def compute_aai(
    aai: pl.DataFrame,
    genome_info: pl.DataFrame,
    clustering_file: str,
    test_dataset_only: bool = True,
    low_memory: bool = False,
) -> pl.DataFrame:
    """Score every stored genome clustering by its within-cluster AAI.

    Note: this can be very memory intensive depending on the number of
    input genomes. Use the ``low_memory`` flag to stream the data for better
    memory efficiency if needed. This will trade off computation speed.

    The clustering file is an .h5 with the following structure:
    .
    |__train
    |  |__method_0
    |  |  |__metadata
    |  |  |__data
    ...
    |__test
       |__method_0
          |__metadata
          |__data

    The clustering metadata is a numpy struct array with the fields:
    (genome_k, genome_resolution). These correspond to the number of nearest
    neighbors and the resolution of the Leiden clustering algorithm.

    Parameters
    ----------
    aai
        Pairwise AAI table passed through to ``compute_aai_per_clustering``.
    genome_info
        Genome table passed through to ``compute_aai_per_clustering``, which
        attaches the cluster labels to it positionally. Its rows therefore
        have to line up with the columns of every ``data`` array processed.
    clustering_file
        Path to the .h5 file described above.
    test_dataset_only
        If True (default), only the ``test`` group of the file is processed.
    low_memory
        If True, the lazy per-clustering queries are collected in streaming
        mode.

    Returns
    -------
    pl.DataFrame
        One row per (method, k, resolution) with the columns ``aai_score``,
        ``singletons``, ``n_clusters``, ``used_data``, ``total_genomes``,
        ``inclusion_weight`` and ``weighted_aai_score``. When
        ``test_dataset_only`` is False, rows are additionally keyed by a
        leading ``dataset`` column so that the datasets are scored separately.
    """

    lazy_summaries: list[pl.LazyFrame] = []
    with tb.open_file(clustering_file) as fp:
        # this is a numpy struct array with the following fields:
        # (genome_k, genome_resolution)
        # NOTE(review): the metadata of test/pst-large is reused for every
        # dataset and method below; this assumes they all store their
        # clustering iterations in the same order -- confirm.
        clu_metadata = fp.root.test["pst-large"].metadata[:]

        # I tried several different parameters for genome clustering, but I will
        # filter for only the iterations included in the PST manuscript
        # (explicit bool dtype keeps the mask usable for indexing even when the
        # metadata array is empty)
        clu_metadata_mask = np.array(
            [genome_res > 0.01 for _, genome_res in clu_metadata],
            dtype=bool,
        )

        mask_idx = np.where(clu_metadata_mask)[0]
        clu_metadata = clu_metadata[clu_metadata_mask]

        for dataset in fp.root:
            dataset_name = dataset._v_name

            if test_dataset_only and dataset_name != "test":
                continue

            for method in dataset:
                method_name = method._v_name

                # this is just an int array with a cluster assignment for each genome
                # shape: (num clustering iters, num genomes)
                clusters: LongArray = method.data[mask_idx, :]

                for clu, (genome_k, genome_res) in zip(clusters, clu_metadata):
                    lazy_summaries.append(
                        compute_aai_per_clustering(
                            aai,
                            clu,
                            genome_info,
                            genome_k,
                            genome_res,
                            method=method_name,
                            dataset=dataset_name,
                        )
                    )

    # `dataset` has to be part of the key: otherwise the train and test rows of
    # the same (method, k, resolution) would be merged into a single score.
    group_keys = ["dataset", "method", "k", "resolution"]
    summary = (
        pl.concat(
            pl.collect_all(lazy_summaries, streaming=low_memory)
        )
        .group_by(group_keys)
        .agg(
            aai_score=pl.sum("weighted_aai_mean"),
            singletons=pl.first("singletons"),
            n_clusters=pl.len(),
            used_data=pl.sum("cluster_size")
        )
        .with_columns(
            total_genomes=pl.col("used_data") + pl.col("singletons"),
        )
        .with_columns(
            inclusion_weight=pl.col("used_data") / pl.col("total_genomes"),
        )
        .with_columns(
            weighted_aai_score=pl.col("aai_score") * pl.col("inclusion_weight"),
        )
        .sort(group_keys)
    )

    if test_dataset_only:
        # Only one dataset was processed, so the key column is constant; drop
        # it to keep the original output schema on the default path.
        summary = summary.drop("dataset")

    return summary

In [7]:
# Score the clusterings of the test dataset only (low_memory keeps its default).
summary = compute_aai(aai, genome_info, "genome_clusters.h5", test_dataset_only=True)
summary

method,k,resolution,aai_score,singletons,n_clusters,used_data,total_genomes,inclusion_weight,weighted_aai_score
str,i32,f64,f64,u32,u32,u32,u32,f64,f64
"""ctx-avg-large""",2,0.1,0.89048,1649,26509,149606,151255,0.989098,0.880772
"""ctx-avg-large""",2,1.0,0.961506,72892,36133,78363,151255,0.518085,0.498142
"""ctx-avg-large""",5,0.1,0.758411,1460,9755,149795,151255,0.990347,0.75109
"""ctx-avg-large""",5,1.0,0.942322,40751,39370,110504,151255,0.730581,0.688442
"""ctx-avg-large""",10,0.1,0.660018,1301,4673,149954,151255,0.991399,0.654341
"""ctx-avg-large""",10,1.0,0.906208,24038,32177,127217,151255,0.841076,0.76219
"""ctx-avg-large""",15,0.1,0.651686,1241,4277,150014,151255,0.991795,0.646339
"""ctx-avg-large""",15,1.0,0.864299,17233,25133,134022,151255,0.886067,0.765826
"""ctx-avg-large""",25,0.1,0.649798,1177,4128,150078,151255,0.992218,0.644742
"""ctx-avg-large""",25,1.0,0.780194,11002,15465,140253,151255,0.927262,0.723444
