In [1]:
import os
import itertools as it
from functools import partial
# NOTE: this assignment is deliberately placed before `import polars`.
# Presumably polars sizes its thread pool from POLARS_MAX_THREADS once, when it
# is first loaded, so setting the variable after the import would have no
# effect -- confirm against the polars docs before reordering these lines.
os.environ["POLARS_MAX_THREADS"] = "128"
import polars as pl

In [2]:
# Print the Python/IPython versions and the installed polars version so the
# environment that produced the outputs below is recorded with the notebook.
%load_ext watermark
%watermark -vp polars

Python implementation: CPython
Python version       : 3.11.3
IPython version      : 8.13.2

polars: 0.20.6



In [3]:
# Keyword patterns used to bin protein descriptions into broad functional
# categories.
#
# * Every entry is a regex fragment. The fragments of one category are joined
#   with `|` and matched case-insensitively (see `_combine`), so regex
#   metacharacters in an entry are live, not literal.
# * Insertion order matters: the categories are tested in the order listed
#   here and the first category that matches a description wins.
functional_categories = {
    "structural": [
        "head",
        "capsid",
        "tail",
        "baseplate",
        "fiber",
        "sheath",
        "collar",
        "neck",
        "spike",
        "plate",
        "tube",
        "connector",
        "polyprotein",
        "scaffold",
        "interna",
        "coat",
        "structur",
        "inner",
        "assembl",
        "adsorp",
        "attach",
        "envelop",
        "tape",
        "virion",
    ],
    "packaging": [
        "terminase",
        "portal",
        "packag",
    ],
    "exit": [
        "lysis",
        "holin",
        "spanin",
        "Rz",
        "lysin",
        "lysozyme",
        "peptidoglyc",
        "muram",
        "Virion export",
        "budd",
    ],
    "integration": [
        "integra",
        "transpos",
        "excision",
    ],
    "gene expression": [
        "DNA-directed RNA pol",
        "sigma",
        "tRNA",
        "RNA[- ]binding",
        "mRNA",
        "RNA[- ](lig|2'-phospho|silenc|helic|mediat|splic)",
        "RNAP",
        "transcrip",
        "transla",
    ],
    "anti-host defense": [
        "methyl",
        "restri",
        "modif",
        "CRISPR",
        "RNA-guided DNA endonuclease",
        "apoptos",
        # "rII" followed by A, B, a space, or the end of the description.
        # The end-of-string case needs its own alternative: inside a character
        # class (`[AB $]`) the `$` is a literal dollar sign, not an anchor, so
        # a description ending in "rII" would never match.
        "rII(?:[AB ]|$)",
    ],
    "replication": [
        "DNA",
        "repl",
        "RNA[- ]dep",
        "RNA[- ]directed",
        # Also listed under "structural", which is tested first, so this entry
        # never decides the outcome; kept to preserve the original keyword list.
        "polyprotein",
        "nucl",
        "d[ATCG]TP",
        "thymid",
        "Phosphoribosylformylglycinamidine",
        "puri",
        "pyrimid",
    ],
    "unknown": [
        "hypoth",
        "uncharacter",
        "unclass",
        "unident",
        "unknown",
        "DUF",
        "REFSEQ protein",
    ],
}

# Name of the column whose text is searched for the category keywords.
col = "description"

def _combine(col: str, cats: list[str]) -> pl.Expr:
    """Build a boolean expression that is true where `col` matches any pattern.

    Args:
        col: Name of the string column to search.
        cats: Regex fragments; they are joined with `|`, so a row matches when
            at least one fragment is found anywhere in the value.

    Returns:
        A polars expression applying `str.contains` to `col` with the joined
        fragments prefixed by the case-insensitive flag `(?i)`.
    """
    alternation = "|".join(cats)
    pattern = f"(?i){alternation}"
    return pl.col(col).str.contains(pattern)

# `col` is pre-bound by keyword, so call sites must pass the patterns as
# `cats=...` (a positional argument would collide with `col`).
combine = partial(_combine, col=col)

# Build a single chained when/then/otherwise expression that labels each row
# with the first functional category whose keyword patterns match.
#
# The chain is seeded from whatever category `functional_categories` lists
# first, rather than from a hard-coded key plus a "skip one item" slice, so the
# test order always follows the dict's insertion order and no category can be
# dropped or duplicated if the dict is edited.
category_items = iter(functional_categories.items())

first_cat, first_patterns = next(category_items)
function_expr = pl.when(combine(cats=first_patterns)).then(pl.lit(first_cat))

# Remaining categories, in insertion order; earlier branches take precedence.
for cat, values in category_items:
    function_expr = function_expr.when(combine(cats=values)).then(pl.lit(cat))

# Final fallbacks, tested only after every category above (including
# "unknown") failed to match: a bare "RNA pol" mention is labelled as gene
# expression, and anything still unmatched becomes "other".
function_expr = (
    function_expr
    .when(combine(cats=["RNA pol"]))
    .then(pl.lit("gene expression"))
    .otherwise(pl.lit("other"))
)

In [4]:
# Load the VOG r219 annotation table, shorten its column names, and append the
# keyword-derived `function` label computed by `function_expr`.
vog_column_names = {
    "#GroupName": "hmm",
    "ProteinCount": "n_ptns",
    "SpeciesCount": "n_species",
    "FunctionalCategory": "category",
    "ConsensusFunctionalDescription": "description",
}

vog_annotations_raw = pl.read_csv("vog.annotations.tsv", separator="\t")
vog_metadata = (
    vog_annotations_raw
    .rename(vog_column_names)
    .with_columns(function_expr.alias("function"))
)

# Widen the string column display so long descriptions are not truncated.
with pl.Config(fmt_str_lengths=200):
    display(vog_metadata)

hmm,n_ptns,n_species,category,description,function
str,i64,i64,str,str,str
"""VOG00001""",1531,42,"""Xh""","""sp|Q5UQJ2|YR863_MIMIV Putative ankyrin repeat protein R863""","""other"""
"""VOG00002""",1272,995,"""Xu""","""sp|Q5UR40|YR559_MIMIV Uncharacterized HTH-type transcriptional regulator R559""","""gene expression"""
"""VOG00003""",332,69,"""Xh""","""sp|Q5UPJ9|YL122_MIMIV Putative ankyrin repeat protein L122""","""other"""
"""VOG00004""",1037,83,"""Xh""","""sp|Q5UPF8|YL088_MIMIV Putative ankyrin repeat protein L88""","""other"""
"""VOG00005""",213,42,"""Xu""","""sp|Q91FD6|388R_IIV6 Putative MSV199 domain-containing protein 388R""","""other"""
"""VOG00006""",309,13,"""Xu""","""sp|Q5UQL2|YL414_MIMIV Putative F-box and FNIP repeat-containing protein L414""","""other"""
"""VOG00007""",893,715,"""XhXrXs""","""sp|P17766|POLG_PPVNA Genome polyprotein""","""structural"""
"""VOG00008""",1353,1040,"""Xr""","""sp|O22001|VXIS_BPMD2 Excisionase""","""integration"""
"""VOG00009""",145,107,"""Xu""","""sp|Q5UPT4|YL247_MIMIV Uncharacterized HNH endonuclease L247""","""replication"""
"""VOG00010""",269,75,"""Xu""","""REFSEQ 2OG-Fe(II) oxygenase""","""other"""


This table can be found in the PST manuscript as **Supplementary Table 6**.