In [1]:
import os
import tables as tb

os.environ["POLARS_MAX_THREADS"] = "128"
import polars as pl
import polars.selectors as cs

In [2]:
%load_ext watermark
%watermark -vp polars,tables

Python implementation: CPython
Python version       : 3.11.3
IPython version      : 8.13.2

polars: 0.20.6
tables: 3.8.0



# Acquiring a candidate set of proteins for structural searches
To search for capsid folds, we used foldseek. Instead of directly predicting a structure (which can be computationally expensive), we used the ProstT5 model implemented in foldseek that translates amino acid alphabets (proteins) to a structural alphabet. This can use pre-existing text alignment strategies (notably implemented in mmseqs2, reimplemented for foldseek) to search structural alphabet databases instead of the most computationally expensive 3D structural alignment.

For our investigations, rather than focus on all proteins in our dataset (which would take a long time), we only kept proteins belonging to protein clusters that had either VOG-detectable capsids and unannotated proteins. Our implicit assumption is that these represent protein clusters that are structurally similar, which we indirectly infer by detecting if these unannotated proteins contain capsid-like structural folds.

In [3]:
# for simplicity we are changing the "vog_category"
# for major capsid proteins to "MCP" instead of "structural"
mcp_query = (
    pl.col("vog_annot").str.contains("(?i)(Major )?capsid protein") &
    (~pl.col("vog_annot").str.contains("Minor")) &
    pl.col("vog_bitscore").ge(75)
)

ptn_info = (
    pl.scan_csv("supplementary_tables/supplementary_table_2.tsv", separator="\t")
    .filter(pl.col("dataset") == "test")
    .with_columns(
        vog_category = (
            pl.when(mcp_query)
            .then(pl.lit("MCP"))
            .otherwise(pl.col("vog_category"))
        )
    )
    .drop(cs.starts_with("phrog"))
    .drop("genome", "genome_id", "dataset")
    .collect()
)



ptn_info.head()

ptn,ptn_id,vog_bitscore,vog_annot,vog_category
str,i64,f64,str,str
"""IMGVR_UViG_256…",0,,"""unknown functi…","""unknown"""
"""IMGVR_UViG_256…",1,,"""unknown functi…","""unknown"""
"""IMGVR_UViG_256…",2,,"""unknown functi…","""unknown"""
"""IMGVR_UViG_256…",3,,"""unknown functi…","""unknown"""
"""IMGVR_UViG_256…",4,,"""unknown functi…","""unknown"""


We select our candidates from all combinations of protein-genome clustering. We only consider genome clustering with the hyperparameters of k=15 and resolution=1.0 (high):

In [4]:
candidates: list[pl.LazyFrame] = []
ptn_clusters_file = "datasets/protein_clusters/embedding-based_protein_clusters_per_genome_cluster.h5"

ptn_embedding_methods = ["esm-small", "esm-large", "pst-small", "pst-large", "genslm"]
with tb.open_file(ptn_clusters_file) as fp:
    for gm_node in fp.root:
        genome_method = gm_node._v_name

        for ptn_method in ptn_embedding_methods:
            clustering_metadata = gm_node["metadata"][:]
            ptn_clusters = gm_node[ptn_method][:]
            genome_clusters = gm_node["genome"][:]

            for pclu, gclu, (genome_k, genome_res, protein_k, protein_res) in zip(
                ptn_clusters, genome_clusters, clustering_metadata
            ):
                if (genome_k != 15) or (genome_res != 1.0) or (protein_res == 0.85):
                    continue

                ptn_df = (
                    ptn_info
                    .lazy()
                    .with_columns(
                        genome_cluster = pl.lit(gclu),
                        protein_cluster = pl.lit(pclu),
                    )
                    .with_columns(
                        cluster_size = pl.len().over("genome_cluster", "protein_cluster")
                    )
                    .filter(pl.col("cluster_size") > 1)
                    .with_columns(
                        num_categories = pl.n_unique("vog_category").over("genome_cluster", "protein_cluster"),
                        interested = (
                            pl.col("vog_category")
                            .is_in(["MCP", "unknown"])
                            .all()
                            .over("genome_cluster", "protein_cluster")
                        )
                    )
                    .filter(
                        (pl.col("num_categories") == 2)
                        & pl.col("interested")
                    )
                    .select("ptn")
                )

                candidates.append(ptn_df)

candidate_proteins = (
    pl.concat(pl.collect_all(candidates))
    ["ptn"]
    .value_counts()
    .filter(pl.col("count") >= 10)
)

len(candidate_proteins)

72322

To this set of 72,322 proteins, we considered that these proteins belong to sequence identity-based clusters, since those are the most similar proteins in the traditional sense. These additional proteins are homologs to those above. Think of these more as like positive controls.

In [5]:
mmseqs_clusters = (
    pl.read_csv(
        "datasets/protein_clusters/sequence_identity_clusters.tsv", 
        separator="\t",
        has_header=False,
        new_columns=["rep", "ptn"],
    )
    .with_columns(
        cluster_size = pl.col("ptn").len().over("rep"),
        cluster_id = pl.col("rep").rle_id(),
    )
)

First we need to detect the mmseqs clusters that contain capsids:

In [6]:
detectable_mcp_clusters = (
    ptn_info
    .filter(pl.col("vog_category") == "MCP")
    .select("ptn")
    .join(mmseqs_clusters, on="ptn")
    .filter(pl.col("cluster_size") > 1)
    ["cluster_id"]
)

len(detectable_mcp_clusters)

46564

Then choose all mmseqs clusters that contain both capsids and unannotated proteins:

In [7]:
unknown_with_mcp_clusters = (
    ptn_info
    .join(mmseqs_clusters, on="ptn")
    .filter(pl.col("cluster_id").is_in(detectable_mcp_clusters))
    .group_by("cluster_id")
    .agg(
        pl.first("cluster_size"),
        cats = pl.col("vog_category").unique(),
    )
    # should also have at least 1 MCP due to the filter on line 4
    .filter(pl.col("cats").list.contains("unknown"))
    ["cluster_id"]
)

mmseqs_ptn_candidates = (
    mmseqs_clusters
    .filter(pl.col("cluster_id").is_in(unknown_with_mcp_clusters))
    ["ptn"]
)

len(mmseqs_ptn_candidates)

40357

Thus, the total number of candidate proteins to search for structural similarity to capsid proteins is:

In [8]:
total_candidates = set(candidate_proteins["ptn"]) | set(mmseqs_ptn_candidates)

len(total_candidates)

100704

We then extracted the 100,704 candidated MCPs into a separate FASTA file and ran the following `foldseek` (v9.427df8a) commands to:
1. Convert these protein sequences in to the 3Di-structure sequence:

```bash
# download ProstT5 model
foldseek databases ProstT5 prostt5 tmp

# translate protein amino acids to structure
foldseek createdb FASTAFILE structDB --prostt5-model prostt5
```

2. Search against Protein Data Bank structures

```bash
# use foldseek to download structures
foldseek databases PDB pdb tmp

# then search
foldseek search structDB pdb/pdb searchDB tmp
```

3. Then compute alignment stats

```bash
foldseek convertalis structDB pdb/pdb searchDB
```

The structural alignments from this analysis are available with the supplementary data at the path: `fig4/capsid_candidates_structural_search_against_pdb.tsv`

Then, to determine which PDB structures corresponded to capsids, we queried PDB at the web server ([https://www.rcsb.org](https://www.rcsb.org)) with: `capsid, major capsid, coat, minor capsid, virion`, and selected all PDB IDs the came up. This list of capsid PDB IDs is available with supplementary data at: `fig4/capsid_pdb_ids.txt`.

# Analyzing structural searches

We used the above 2 files to process the structural alignment results:

In [9]:
foldseek_columns = "query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qlen,tlen".split(",")

with open("supplementary_data/fig4/capsid_pdb_ids.txt") as fp:
    capsid_pdb_ids = {line.rstrip() for line in fp}

struct_aln = (
    pl.scan_csv(
        "supplementary_data/fig4/capsid_candidates_structural_search_against_pdb.tsv",
        separator="\t",
        has_header=False,
        new_columns=foldseek_columns,
    )
    .filter(pl.col("bits") >= 100.)
    .with_columns(
        pdb_id = pl.col("target").str.extract("^(.*)?-.*$").str.to_uppercase(),
        chain = pl.col("target").str.split("_").list.get(1).str.to_uppercase(),
    )
    .with_columns(
        is_capsid = pl.col("pdb_id").is_in(capsid_pdb_ids),
    )
    .sort("bits", descending=True)
    # choose best PDB hit for each query protein
    .unique(subset="query", keep="first")
    .collect()
)

struct_aln

query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qlen,tlen,pdb_id,chain,is_capsid
str,str,f64,i64,i64,i64,i64,i64,i64,i64,f64,i64,i64,i64,str,str,bool
"""IMGVR_UViG_330…","""8i4l-assembly1…",0.278,359,233,0,13,371,4,327,5.8330e-21,857,378,327,"""8I4L""","""D""",true
"""IMGVR_UViG_287…","""8k37-assembly1…",0.102,142,98,0,4,145,5,114,0.01722,176,152,114,"""8K37""","""M""",true
"""IMGVR_UViG_282…","""7rfo-assembly1…",0.12,163,122,0,81,243,187,326,0.00008,303,578,330,"""7RFO""","""B""",false
"""IMGVR_UViG_284…","""7z45-assembly1…",0.156,336,262,0,37,372,37,347,4.0700e-16,695,374,347,"""7Z45-ASSEMBLY1…","""A-4""",false
"""IMGVR_UViG_330…","""1opo-assembly1…",0.177,296,219,0,29,324,2,268,1.1720e-8,451,329,268,"""1OPO""","""C""",true
"""IMGVR_UViG_330…","""8h89-assembly1…",0.235,339,252,0,32,370,32,361,2.1360e-15,639,370,368,"""8H89""","""D""",true
"""IMGVR_UViG_330…","""6tb9-assembly1…",0.166,309,235,0,157,465,13,295,6.7220e-16,675,466,297,"""6TB9""","""C5""",false
"""IMGVR_UViG_330…","""1opo-assembly1…",0.155,317,226,0,103,419,1,268,3.6360e-9,430,423,268,"""1OPO""","""C""",true
"""IMGVR_UViG_330…","""7a02-assembly1…",0.1,112,100,0,28,139,1,112,0.004642,206,139,112,"""7A02""","""A""",false
"""IMGVR_UViG_264…","""6xgq-assembly1…",0.269,329,237,0,15,343,7,331,7.8890e-24,994,343,332,"""6XGQ""","""G""",true


Now we compute for each protein cluster only containing only unannotated proteins and VOG-detectable capsids the proportion of unannotated proteins that have structural similarity to capsids in the above table:

In [10]:
results: list[pl.LazyFrame] = []

ptn_embedding_methods = [
    "esm-small", "esm-large", "pst-small", "pst-large", "genslm"

]
with tb.open_file(ptn_clusters_file) as fp:
    for gm_node in fp.root:
        genome_method = gm_node._v_name

        for ptn_method in ptn_embedding_methods:
            clustering_metadata = gm_node["metadata"][:]
            ptn_clusters = gm_node[ptn_method][:]
            genome_clusters = gm_node["genome"][:]

            for pclu, gclu, (genome_k, genome_res, protein_k, protein_res) in zip(
                ptn_clusters, genome_clusters, clustering_metadata
            ):
                if (genome_k != 15) or (genome_res != 1.0) or (protein_res == 0.85):
                    continue

                cluster_df = (
                    ptn_info
                    .lazy()
                    .with_columns(
                        genome_cluster = pl.lit(gclu),
                        protein_cluster = pl.lit(pclu),
                    )
                    .with_columns(
                        cluster_size = pl.len().over("genome_cluster", "protein_cluster")
                    )
                    .filter(pl.col("cluster_size") > 1)
                    .join(struct_aln.lazy(), left_on="ptn", right_on="query")
                    .with_columns(
                        ptn_method=pl.lit(ptn_method),
                        genome_method=pl.lit(genome_method),
                        genome_k=pl.lit(genome_k),
                        genome_r=pl.lit(genome_res),
                        ptn_r=pl.lit(protein_res),
                        ptn_k=pl.lit(protein_k),
                    )
                )

                results.append(cluster_df)

cluster_summary = pl.concat(pl.collect_all(results))
cluster_summary

ptn,ptn_id,vog_bitscore,vog_annot,vog_category,genome_cluster,protein_cluster,cluster_size,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qlen,tlen,pdb_id,chain,is_capsid,ptn_method,genome_method,genome_k,genome_r,ptn_r,ptn_k
str,i64,f64,str,str,u32,u32,u32,str,f64,i64,i64,i64,i64,i64,i64,i64,f64,i64,i64,i64,str,str,bool,str,str,i32,f64,f64,i32
"""IMGVR_UViG_257…",43,,"""unknown functi…","""unknown""",2,1,16,"""4dam-assembly2…",0.162,104,85,0,1,103,1,104,0.0001624,262,105,108,"""4DAM""","""H""",false,"""esm-small""","""ctx-avg-large""",15,1.0,0.1,5
"""IMGVR_UViG_264…",1012,260.42,"""sp|P15895|ORF4…","""unknown""",60,0,15,"""4v8p-assembly4…",0.097,115,103,0,14,128,33,147,0.2194,121,143,156,"""4V8P""","""GP""",false,"""esm-small""","""ctx-avg-large""",15,1.0,0.1,5
"""IMGVR_UViG_264…",1014,,"""unknown functi…","""unknown""",60,2,13,"""3pvv-assembly2…",0.086,58,49,0,45,99,23,80,0.6603,109,150,95,"""3PVV""","""B""",false,"""esm-small""","""ctx-avg-large""",15,1.0,0.1,5
"""IMGVR_UViG_264…",1016,332.97,"""sp|P15893|ORF2…","""unknown""",60,3,7,"""6p8v-assembly1…",0.165,146,107,0,75,203,72,217,0.00001,267,224,302,"""6P8V""","""B""",false,"""esm-small""","""ctx-avg-large""",15,1.0,0.1,5
"""IMGVR_UViG_264…",1021,,"""unknown functi…","""unknown""",60,1,18,"""3pvv-assembly2…",0.086,58,49,0,45,99,23,80,0.6603,109,150,95,"""3PVV""","""B""",false,"""esm-small""","""ctx-avg-large""",15,1.0,0.1,5
"""IMGVR_UViG_264…",1022,,"""unknown functi…","""unknown""",60,1,18,"""3pvv-assembly2…",0.086,58,49,0,45,99,23,80,0.6603,109,150,95,"""3PVV""","""B""",false,"""esm-small""","""ctx-avg-large""",15,1.0,0.1,5
"""IMGVR_UViG_264…",1023,177.56,"""sp|P15892|ORF1…","""unknown""",60,1,18,"""8snb-assembly1…",0.109,146,118,0,68,213,3,135,0.1312,124,423,529,"""8SNB""","""7G""",false,"""esm-small""","""ctx-avg-large""",15,1.0,0.1,5
"""IMGVR_UViG_264…",1025,,"""unknown functi…","""unknown""",60,0,15,"""4oq2-assembly1…",0.177,117,94,0,1,115,22,138,0.001093,183,115,297,"""4OQ2""","""A""",false,"""esm-small""","""ctx-avg-large""",15,1.0,0.1,5
"""IMGVR_UViG_265…",1096,,"""unknown functi…","""unknown""",64,3,15,"""7u8r-assembly1…",0.12,166,118,0,4,169,13,147,0.698,113,186,150,"""7U8R""","""H""",false,"""esm-small""","""ctx-avg-large""",15,1.0,0.1,5
"""IMGVR_UViG_265…",1098,,"""unknown functi…","""unknown""",64,3,15,"""4jiu-assembly1…",0.242,95,67,0,135,224,2,96,0.000051,262,229,105,"""4JIU""","""A""",false,"""esm-small""","""ctx-avg-large""",15,1.0,0.1,5


In [11]:
group_cols = ["genome_method", "ptn_method", "genome_k", "genome_r", "ptn_k", "ptn_r"]

# need this to compute prop of only unannotated proteins that are capsids
detected_capsids = (
    cluster_summary
    .filter(pl.col("vog_category") != "unknown")
    .group_by(*group_cols, "genome_cluster", "protein_cluster")
    .agg(
        known_mcps = pl.len()
    )
)

detected_capsids

genome_method,ptn_method,genome_k,genome_r,ptn_k,ptn_r,genome_cluster,protein_cluster,known_mcps
str,str,i32,f64,i32,f64,u32,u32,u32
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,202,99,1
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,380,0,1
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,957,2,1
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,988,0,2
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,2129,0,1
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,2512,0,1
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,2859,0,1
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,2332,0,2
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,3411,0,1
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,850,3,2


In [12]:
per_cluster_proportion_summary = (
    cluster_summary
    .filter(
        pl.col("is_capsid") & (pl.col("vog_category") == "unknown")
    )
    .group_by(*group_cols, "genome_cluster", "protein_cluster")
    .agg(
        pl.first("cluster_size"),
        capsid_hits = pl.len(),
    )
    .join(detected_capsids, on=group_cols + ["genome_cluster", "protein_cluster"])
    .with_columns(
        prop = pl.col("capsid_hits") / (pl.col("cluster_size") - pl.col("known_mcps")),
        weight = pl.col("cluster_size") / pl.sum("cluster_size").over(group_cols)
    )
    .with_columns(
        weighted_prop = pl.col("prop") * pl.col("weight")
    )
)

per_cluster_proportion_summary

genome_method,ptn_method,genome_k,genome_r,ptn_k,ptn_r,genome_cluster,protein_cluster,cluster_size,capsid_hits,known_mcps,prop,weight,weighted_prop
str,str,i32,f64,i32,f64,u32,u32,u32,u32,u32,f64,f64,f64
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,3047,0,15,1,1,0.071429,0.000577,0.000041
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,1899,1,19,1,1,0.055556,0.000731,0.000041
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,20079,1,18,1,2,0.0625,0.000693,0.000043
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,1042,4,14,1,1,0.076923,0.000539,0.000041
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,20467,5,19,1,1,0.055556,0.000731,0.000041
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,21163,9,16,1,1,0.066667,0.000616,0.000041
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,24446,0,10,2,1,0.222222,0.000385,0.000086
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,19388,11,14,1,1,0.076923,0.000539,0.000041
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,10876,8,22,1,1,0.047619,0.000847,0.00004
"""ctx-avg-large""","""esm-small""",15,1.0,5,0.1,24280,0,29,1,1,0.035714,0.001116,0.00004


Then, we use a weighted mean (weights = protein cluster size) to summarize for each genome clustering / protein clustering configuration:

In [13]:
proportion_summary = (
    per_cluster_proportion_summary
    .group_by(group_cols)
    .agg(
        pl.sum("weighted_prop", "cluster_size")
    )
    .rename({"weighted_prop": "prop"})
    .sort(["prop", "cluster_size"], descending=True)
)

proportion_summary

genome_method,ptn_method,genome_k,genome_r,ptn_k,ptn_r,prop,cluster_size
str,str,i32,f64,i32,f64,f64,u32
"""genslm""","""pst-small""",15,1.0,5,0.5,0.387778,3299
"""ctx-avg-large""","""pst-small""",15,1.0,5,0.5,0.376659,3062
"""esm2_t30_150M""","""pst-small""",15,1.0,5,0.5,0.374475,2921
"""pst-small""","""pst-small""",15,1.0,5,0.5,0.371701,2875
"""hyena-dna""","""pst-small""",15,1.0,5,0.5,0.370921,3202
"""kmer""","""genslm""",15,1.0,5,0.5,0.370719,4374
"""kmer""","""pst-small""",15,1.0,5,0.5,0.368512,2985
"""pst-large""","""pst-small""",15,1.0,5,0.5,0.36673,3173
"""esm2_t6_8M""","""pst-small""",15,1.0,5,0.5,0.363603,3146
"""genslm""","""genslm""",15,1.0,5,0.5,0.353726,5681


The above table was used to make **Extended Data Figure 9A**.

For **Figure 4C**, we further summarized this to focus on the differences between protein embeddings used for clustering, regardless of genome clustering:

In [14]:
(
    proportion_summary
    .group_by("ptn_method", "ptn_k", "ptn_r")
    .agg(
        pl.mean("prop")
    )
    .sort(["ptn_k", "ptn_r", "prop"], descending=[False, False, True])
)

ptn_method,ptn_k,ptn_r,prop
str,i32,f64,f64
"""pst-small""",5,0.1,0.141046
"""pst-large""",5,0.1,0.105428
"""genslm""",5,0.1,0.084609
"""esm-small""",5,0.1,0.074845
"""esm-large""",5,0.1,0.073542
"""pst-small""",5,0.5,0.370149
"""genslm""",5,0.5,0.298805
"""pst-large""",5,0.5,0.23857
"""esm-small""",5,0.5,0.235549
"""esm-large""",5,0.5,0.232024
