# Mapping of sequential to reference site numbers

In [None]:
import io
import os
import re
import subprocess
import tempfile

import Bio.SeqIO

import pandas as pd

import yaml

In [None]:
# The pipeline configuration is expected in the working directory; if it is
# not there (e.g. when running interactively), move up one directory first.
config_path = "config.yaml"
if not os.path.isfile(config_path):
    os.chdir("../")

# Parse the YAML configuration into `config`, which later cells index by key.
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)

In [None]:
# Protein encoded by the reference gene. With `cds=True` Biopython validates
# the input as a complete coding sequence and excludes the terminal stop codon
# from the returned protein.
ref = str(Bio.SeqIO.read(config["reference_gene"], "fasta").seq.translate(cds=True))

# The extended gene must be exactly one upper-case run of nucleotides,
# optionally flanked by lower-case nucleotides; only the upper-case run is
# translated.
seq = Bio.SeqIO.read(config["extended_gene"], "fasta").seq
m = re.fullmatch(r"[acgt]*(?P<gene>[ACGT]+)[acgt]*", str(seq))
if m is None:
    # Raise instead of `assert` so the check still runs under `python -O`, and
    # so it fails the same way as the stop-codon check below.
    raise ValueError("not single upper case gene")
seq = str(seq[m.start("gene"): m.end("gene")].translate())

# A stop codon ("*") is tolerated only as the very last residue.
if "*" in seq[: -1]:
    raise ValueError(f"premature stop codons in {seq=}")

In [None]:
# Align the two protein sequences with MAFFT: write them to a temporary FASTA
# file, run `mafft` on that file, and read the FASTA alignment from its stdout.
with tempfile.NamedTemporaryFile("w") as f:
    f.write(f">sequence\n{seq}\n>reference\n{ref}\n")
    f.flush()  # make sure the records are on disk before mafft opens the file
    res = subprocess.run(["mafft", f.name], capture_output=True)

# Stop here if mafft failed, reporting its own error output. Otherwise the
# empty stdout would yield an empty `alignment` and only fail later with an
# uninformative missing-key error.
if res.returncode != 0:
    mafft_stderr = res.stderr.decode("utf-8", errors="replace")
    raise RuntimeError(
        f"mafft failed with exit code {res.returncode}:\n{mafft_stderr}"
    )

# Map each aligned record's id ("sequence" / "reference") to its gapped string.
alignment = {
    s.id: str(s.seq)
    for s in Bio.SeqIO.parse(io.StringIO(res.stdout.decode("utf-8")), "fasta")
}

In [None]:
# Build the table mapping 1-based sequential sites (numbering in the aligned
# "sequence") to 1-based reference sites (numbering in the aligned "reference").
aligned_seq = alignment["sequence"]
aligned_ref = alignment["reference"]

# A gap in the reference would be a residue with no reference number.
assert "-" not in aligned_ref, "cannot handle insertions in sequence"

records = []
site = ref_site = 0
for aa, ref_aa in zip(aligned_seq, aligned_ref):
    assert ref_aa != "-"
    # Every alignment column holds a reference residue, so the reference
    # counter always advances.
    ref_site += 1
    if aa != "-":
        # Only columns where the sequence has a residue get a sequential
        # number and a row in the table.
        site += 1
        records.append((site, ref_site, aa, ref_aa))

column_names = ["sequential_site", "reference_site", "aa", "reference_aa"]
df = pd.DataFrame(records, columns=column_names)

# Show the sites where the sequence differs from the reference.
print("Here are mutated sites:")
mutated = df.query("aa != reference_aa").reset_index(drop=True)
display(mutated)

# Write the full mapping (not just the mutated sites) to the configured CSV.
df.to_csv(config["sequential_to_reference"], index=False)