Exploring the SARS-CoV-2 genome. We will download two GenBank records (accessions NC_045512.2 and OR575624.1), referred to below as `alpha` and `omicron`.

In [None]:
import pathlib

import cogent3


def download_genbank_seq(accession: str, outpath: pathlib.Path) -> None:
    """Fetch one nucleotide record from NCBI in GenBank format and save it.

    Parameters
    ----------
    accession
        identifier of the record in the NCBI ``nuccore`` database,
        e.g. ``"NC_045512.2"``
    outpath
        where the downloaded bytes are written; an existing file is
        overwritten

    Raises
    ------
    ValueError
        if the server returns an empty response. Nothing is written in that
        case, so callers that skip the download when ``outpath`` exists do
        not end up caching an unusable file.

    Notes
    -----
    Both the URL and the output path are opened with ``cogent3.open_``.
    Presumably it picks compression from the file suffix (callers here pass
    ``*.gz`` paths) -- confirm against the cogent3 documentation.
    """
    # NCBI serves E-utilities over HTTPS; requesting it directly avoids an
    # unencrypted round trip.
    url = (
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
        f"?db=nuccore&id={accession}&rettype=gb&retmode=text"
    )
    with cogent3.open_(url, mode="rb") as infile:
        data = infile.read()

    if not data:
        raise ValueError(f"empty response from NCBI for accession {accession!r}")

    with cogent3.open_(outpath, mode="wb") as outfile:
        outfile.write(data)


# Download the GenBank record only if it is not already on disk, so later
# runs reuse the local copy.
alpha_path = pathlib.Path("alpha.gb.gz")
if not alpha_path.exists():
    download_genbank_seq("NC_045512.2", alpha_path)

# Load the record as a DNA sequence; later cells query its features.
alpha = cogent3.load_seq(alpha_path, moltype="dna")  # TODO: add support for new_type=True

Show the genetic organisation of the SARS-CoV-2 genome.

In [None]:
# Build a drawable restricted to the gene and UTR feature types.
fig = alpha.get_drawable(
    biotype=("gene", "5'UTR", "3'UTR"),
)
fig.layout.title = "SARS-CoV-2 genome"
# Set the figure size through an in-place `|=` update of the layout.
fig.layout |= dict(width=1000, height=300)
fig.show()

Get the feature for the spike (S) gene.

In [None]:
# Take the first feature returned for gene "S"; raises IndexError if the
# query returns nothing.
feature = list(alpha.get_features(name="S", biotype="gene"))[0]
feature

Get the sequence corresponding to that feature.

In [None]:
# Index the genome with the feature selected above to get the spike gene.
alpha_spike = alpha[feature]

Translate it into a protein sequence.

In [None]:
# Shown as the cell output only; the translation is not stored in a variable.
alpha_spike.get_translation()

In [None]:
# Repeat the download / load / slice steps for the second GenBank record.
omicron_path = pathlib.Path("omicron.gb.gz")
if not omicron_path.exists():
    download_genbank_seq("OR575624.1", omicron_path)

omicron = cogent3.load_seq(omicron_path, moltype="dna")
# NOTE: this rebinds `feature`, which the earlier `alpha_spike = alpha[feature]`
# cell also reads. Re-running that cell after this one would index `alpha`
# with the omicron feature instead.
feature = list(omicron.get_features(name="S", biotype="gene"))[0]
omicron_spike = omicron[feature]
# Collection of the two full-length genomes (not just the spike genes),
# displayed as the cell output.
coll = cogent3.make_unaligned_seqs({"alpha": alpha, "omicron": omicron}, moltype="dna")
coll

In [None]:
# Put the two spike gene sequences into one unaligned collection.
spike_seqs = cogent3.make_unaligned_seqs(
    {"alpha": alpha_spike, "omicron": omicron_spike}, moltype="dna"
)

# Dotplot of the two spike genes; the left track is removed before display.
dp = spike_seqs.dotplot(window=20, threshold=20, k=10, title="Spike gene")
dp.remove_track(left_track=True)
dp.show(width=1000, height=1000)

Download the SARS-CoV-2 genome alignment and prepare it for analysis.

In [None]:
import pathlib

import cogent3


def get_sars_alignment(outpath):
    """Return the SARS-CoV-2 genome alignment as DNA, downloading it if needed.

    Parameters
    ----------
    outpath
        path object for the local FASTA copy. If it exists it is loaded
        directly; otherwise the alignment is fetched from the UCSC download
        server and written to this path before being returned.
    """

    def _second_last_field(label):
        # keep only the second-to-last "|"-separated field of a FASTA label
        return label.split("|")[-2]

    if not outpath.exists():
        url = "https://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/public-latest.all.msa.fa.xz"
        downloaded = cogent3.load_aligned_seqs(
            url,
            format="fasta",
            label_to_name=_second_last_field,
            moltype="dna",
        )
        # cache locally so subsequent calls skip the download
        downloaded.write(outpath, format="fasta", block_size=100_000)
        return downloaded

    return cogent3.load_aligned_seqs(outpath, moltype="dna")


def get_spike_coords_from_known(aln):
    """Return ``(start, end)`` of the spike gene in alignment coordinates.

    A "Spike" gene feature is added to the single sequence PP692424.1 taken
    from ``aln``, and the start and end of that feature's map are returned.
    """
    seqid = "PP692424.1"
    single = aln.take_seqs([seqid]).to_type(array_align=False)
    # span taken from the PP692424.1 GenBank record, adjusted after an error
    # was discovered in the annotation
    single.annotation_db.add_feature(
        seqid=seqid,
        biotype="gene",
        name="Spike",
        spans=[(21469, 25264)],
    )
    spike = list(single.get_features(seqid=seqid, name="Spike"))[0]
    # the feature map holds the gene boundaries in alignment coordinates
    spike_map = spike.map
    return spike_map.start, spike_map.end


def sars(
    aln_path=None,
    spike_span=(21469, 25264),
    max_missing_frac=0.01,
):
    """Return the spike-gene alignment restricted to well-sequenced genomes.

    Parameters
    ----------
    aln_path
        local path of the cached genome alignment, handed to
        ``get_sars_alignment``. Defaults to ``public-latest.all.msa.fa`` in
        the working directory.
    spike_span
        ``(start, end)`` used to slice the genome alignment as
        ``aln[start:end]``. ``get_spike_coords_from_known(aln)`` can be used
        to derive this span from the alignment instead of the default.
    max_missing_frac
        sequences are kept only if their fraction of missing data within the
        spike slice is strictly below this value (default is 1%).

    Returns
    -------
    The spike alignment containing only the retained sequences, after
    ``omit_gap_pos`` has been applied.

    Raises
    ------
    ValueError
        if no sequence passes the missing-data filter.
    """
    import numpy

    if aln_path is None:
        aln_path = pathlib.Path("public-latest.all.msa.fa")
    aln = get_sars_alignment(pathlib.Path(aln_path))

    start, end = spike_span
    spike = aln[start:end]
    names = numpy.array(spike.names)
    # Per-sequence fraction of positions whose array code is above 4; these
    # are counted as missing data (presumably ambiguity characters such as
    # N -- confirm against the moltype alphabet).
    missing_frac = (spike.array_seqs > 4).sum(axis=1) / len(spike)
    kept = names[missing_frac < max_missing_frac]
    if kept.size == 0:
        # fail with a clear message instead of reaching 1 / 0 below
        raise ValueError(
            f"no sequences have < {max_missing_frac:.2%} missing data in the spike region"
        )

    spike = spike.take_seqs(kept.tolist())
    # removing redundant gaps
    # NOTE(review): 1 / len(kept) is a very small allowed gap fraction; the
    # cogent3 default (None) is documented as dropping only all-gap columns.
    # Confirm the stricter threshold is intended.
    spike = spike.omit_gap_pos(allowed_gap_frac=1 / len(kept))
    return spike

In [None]:
# This step can take a few minutes on first run because the data has to be
# downloaded and cleaned up. The filtered spike alignment is then reduced to
# the slice [1925:2050], a patch of coevolving sites discovered earlier.
aln = sars()[1925:2050]

In [None]:
# Coevolution analysis of the reduced alignment using the "rmi" statistic;
# a heatmap drawable is requested and read back from the result afterwards.
dmat = aln.coevolution(stat="rmi", drawable="heatmap", show_progress=True)

In [None]:
# Pull the drawable from the coevolution result, title it and display it.
fig = dmat.drawable
fig.layout.title = "Clustered coevolving positions within SPIKE"
fig.show()