In [1]:
import bionumpy as bnp
from bionumpy.bnpdataclass import BNPDataClass
import numpy as np
import re
from tempfile import NamedTemporaryFile
from fastatools import FastaFile
from contextlib import contextmanager

# Silence FutureWarning messages: numpy emits one that bionumpy itself should
# handle. With the package versions reported by the watermark cell below,
# everything in this notebook should run as shown.
# NOTE: no `module=` filter is given, so this hides every FutureWarning raised
# in this session, not only the numpy/bionumpy one.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Type variable used to annotate functions that accept any BNPDataClass subclass.
from typing import TypeVar
T = TypeVar("T", bound=BNPDataClass)

In [2]:
# package versions
%load_ext watermark
%watermark -vp bionumpy,numpy

Python implementation: CPython
Python version       : 3.11.3
IPython version      : 8.13.2

bionumpy: 1.0.8
numpy   : 1.23.5



`fastatools` is a simple FASTA-parsing library I wrote. Note that it is imported as `fastatools` but installed under a different name: `pip install pyfastatools`.

In [3]:
# bionumpy only works with ACGT characters
@contextmanager
def remove_ambiguous_chars(file: str):
    """Yield a ``FastaFile`` over a temporary copy of ``file`` without ambiguity codes.

    Every record parsed from ``file`` is rewritten to a temporary ``.fna`` file
    created in the current working directory, with the IUPAC ambiguity codes
    (N, W, K, R, S, B, D, H, Y, M, V -- upper or lower case) deleted from its
    sequence. The temporary file is closed, and therefore removed
    (``NamedTemporaryFile`` default), when the ``with`` block using this context
    manager exits, so anything reading from it must finish inside that block.

    Args:
        file: Path to the input FASTA file.

    Yields:
        A ``FastaFile`` constructed from the temporary file's name.
    """
    # Character class instead of an alternation; IGNORECASE so that soft-masked
    # (lower-case) ambiguity codes are stripped as well.
    pattern = re.compile(r"[NWKRSBDHYMV]", re.IGNORECASE)

    # Creating the file in a `with` statement guarantees it is closed on any
    # exit path, and avoids touching an unbound `tmpfile` during cleanup if the
    # file could not be created in the first place.
    with NamedTemporaryFile("w+", dir=".", suffix=".fna") as tmpfile:
        for record in FastaFile(file).parse():
            record.sequence = pattern.sub("", record.sequence)
            record.write(tmpfile)  # type: ignore

        # Consumers re-open the file by name, so buffered text must be on disk
        # before the name is handed out; the handle is rewound as before.
        tmpfile.flush()
        tmpfile.seek(0)

        yield FastaFile(tmpfile.name)

In [4]:
@bnp.streamable(np.vstack)
def kmers(chunk: T, k: int = 4):
    """Return k-mer frequencies for the sequences in ``chunk``.

    The sequences are re-encoded with ``bnp.DNAEncoding``, split into k-mers,
    and counted; the counts are then divided by their sum along the last axis
    so that each slice along that axis is a frequency profile.

    Args:
        chunk: Object exposing a ``sequence`` attribute (annotated as a
            ``BNPDataClass`` subclass).
        k: k-mer length.

    Returns:
        The normalized k-mer counts.

    NOTE(review): if a sequence yields no k-mers (e.g. it is shorter than
    ``k``), its total is presumably 0 and the division gives NaN -- confirm
    whether such sequences can reach this function.
    """
    dna = bnp.change_encoding(chunk.sequence, bnp.DNAEncoding)
    kmer_counts = bnp.count_encoded(bnp.get_kmers(dna, k=k)).counts
    totals = kmer_counts.sum(-1, keepdims=True)
    return kmer_counts / totals

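`process` is the main function to use. It takes the path to a genome FASTA file and returns a matrix of 4-mer frequencies with one row per sequence, or a single averaged row when `is_vmag=True`.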

In [5]:
# rows of the result are expected to follow the order of the sequences in the input file
def process(file: str, is_vmag: bool = False, k: int = 4) -> np.ndarray:
    """Compute k-mer frequency profiles for the sequences in a FASTA file.

    The file is first copied to a temporary FASTA without ambiguous
    nucleotides (see ``remove_ambiguous_chars``), then read in chunks with
    bionumpy and passed to ``kmers``.

    Args:
        file: Path to the input genome FASTA file.
        is_vmag: Set to True when the file holds a single vMAG, i.e. all
            sequences are fragments of the same genome; the per-sequence
            profiles are then averaged into a single row.
        k: k-mer length forwarded to ``kmers``. Defaults to 4, the value that
            was previously hard-coded.

    Returns:
        A ``float32`` array of k-mer frequencies. When ``is_vmag`` is True the
        result is the mean over axis 0 with that axis kept (one row).
    """
    with remove_ambiguous_chars(file) as fasta:
        stream = bnp.open(
            fasta._file,
            buffer_type=bnp.io.MultiLineFastaBuffer,
        ).read_chunks()

        # Keep this inside the `with` block: the chunks are read from the
        # temporary file, which is gone once the context manager exits.
        kmer_freq = kmers(stream, k=k).astype(np.float32)

        # if input file is for a single vMAG, then all sequences are fragments
        # of the same genome, so we should average them
        if is_vmag:
            kmer_freq = kmer_freq.mean(0, keepdims=True)

    return kmer_freq

The genome FASTA file used below is not distributed with this notebook because it consists of publicly available genomes, which are listed in `Supplementary Table 1`.

In [6]:
# each sequence in this file is a separate genome, so no averaging is requested
process("datasets/genomes/training_viruses.fna", is_vmag=False)

array([[0.00397601, 0.00411079, 0.00380753, ..., 0.00360536, 0.00566076,
        0.00532381],
       [0.00587671, 0.00493573, 0.00594773, ..., 0.00582345, 0.00447411,
        0.00722605],
       [0.01522759, 0.0102066 , 0.01514528, ..., 0.00847806, 0.00263396,
        0.00864269],
       ...,
       [0.00316154, 0.00435881, 0.00621083, ..., 0.00319895, 0.00278739,
        0.00246937],
       [0.01490281, 0.00539957, 0.00453564, ..., 0.01166307, 0.00475162,
        0.01555076],
       [0.01008002, 0.00592447, 0.00572792, ..., 0.00769339, 0.00746876,
        0.01142777]], dtype=float32)

The results from the above function can be stored in a variety of ways, but the most common would be `.h5` (HDF5) files, written with the Python `h5py` or `PyTables` (`tables`) libraries.

The benefit of `.h5` files is that the outputs from all genomes can be stacked into a single matrix for storage. You can create a separate list to keep track of the order of genomes. 