# Mapping of sequential to reference site numbers

In [1]:
import io
import os
import re
import subprocess
import tempfile

import Bio.SeqIO

import pandas as pd

import yaml

In [2]:
if not os.path.isfile("config.yaml"):
    os.chdir("../")  # if running interactively
    
with open("config.yaml") as f:
    config = yaml.safe_load(f)

In [3]:
ref = str(Bio.SeqIO.read(config["reference_gene"], "fasta").seq.translate(cds=True))

# get upper case portion
seq = Bio.SeqIO.read(config["extended_gene"], "fasta").seq
m = re.fullmatch("[acgt]*(?P<gene>[ACGT]+)[acgt]*", str(seq))
assert m, "not single upper case gene"
seq = str(seq[m.start("gene"): m.end("gene")].translate())
if "*" in seq[: -1]:
    raise ValueError(f"premature stop codons in {seq=}")

In [4]:
with tempfile.NamedTemporaryFile("w") as f:
    f.write(f">sequence\n{seq}\n>reference\n{ref}\n")
    f.flush()
    res = subprocess.run(["mafft", f.name], capture_output=True)
    
alignment = {
    s.id: str(s.seq)
    for s in Bio.SeqIO.parse(io.StringIO(res.stdout.decode("utf-8")), "fasta")
}

In [6]:
assert "-" not in alignment["reference"], "cannot handle insertions in sequence"

records = []
site = ref_site = 0
for aa, ref_aa in zip(alignment["sequence"], alignment["reference"]):
    assert ref_aa != "-"
    if aa == "-":
        ref_site += 1
    else:
        site += 1
        ref_site += 1
        records.append((site, ref_site, aa, ref_aa))
        
df = pd.DataFrame(
    records, columns=["sequential_site", "reference_site", "aa", "reference_aa"],
)

print("Here are mutated sites:")
display(df.query("aa != reference_aa").reset_index(drop=True))

df.to_csv(config["sequential_to_reference"], index=False)

AssertionError: 

In [13]:
records = []
site = ref_site = 0
consecutive_dash_count = 0

for aa, ref_aa in zip(alignment["sequence"], alignment["reference"]):
    if aa == "-":
        consecutive_dash_count += 1
        ref_site_with_letter = f"{ref_site}{chr(ord('a') + consecutive_dash_count - 1)}"
        records.append((site, ref_site_with_letter, aa, ref_aa))
    else:
        site += 1
        ref_site += 1
        consecutive_dash_count = 0  # Reset consecutive dash count
        records.append((site, ref_site, aa, ref_aa))

df = pd.DataFrame(
    records, columns=["sequential_site", "reference_site", "aa", "reference_aa"],
)

print("Here are mutated sites:")
display(df.query("aa != reference_aa").reset_index(drop=True))

df.to_csv(config["sequential_to_reference"], index=False)

Here are mutated sites:


Unnamed: 0,sequential_site,reference_site,aa,reference_aa
0,17,17,M,-
1,18,18,P,-
2,19,19,L,-
3,20,20,F,-
4,23,23,I,T
...,...,...,...,...
80,1248,1248q,-,K
81,1248,1248r,-,L
82,1248,1248s,-,H
83,1248,1248t,-,Y


In [7]:
alignment

{'sequence': 'MFVFLVLLPLVSSQCVMPLFNLITTTQ---SYTNSFTRGVYYPDKVFRSSVLHLTQDLFLPFFSNVTWFHAI--SGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVFIKVCEFQFCNDPFLDV-YHKNNKSWMESESGVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPI-IGRDFPQGFSALEPLVDLPIGINITRFQTLLALNRSYLTPGDSSSGWTAGAADYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNVTNLCPFHEVFNATRFASVYAWNRTRISNCVADYSVLYNFAPFFAFKCYGVSPTKLNDLCFTNVYADSFVIKGNEVSQIAPGQTGNIADYNYKLPDDFTGCVIAWNSNKLDSKHSGNYDYWYRSFRKSKLKPFERDISTEIYQAGNKPCKG-KGPNCYFPLQSYGFRPTYGVGHQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTKSNKKFLPFQQFGRDIVDTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQGVNCTEVSVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEYVNNSYECDIPIGAGICASYQTQTKSRRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLKRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKYFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLFSTASALGKLQDVVNHNAQALNTLVKQLSSKFGAISSVLNDILS

In [11]:
import csv

# Input sequences
sequence = 'MFVFLVLLPLVSSQCVMPLFNLITTTQ---SYTNSFTRGVYYPDKVFRSSVLHLTQDLFLPFFSNVTWFHAI--SGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVFIKVCEFQFCNDPFLDV-YHKNNKSWMESESGVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPI-IGRDFPQGFSALEPLVDLPIGINITRFQTLLALNRSYLTPGDSSSGWTAGAADYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNVTNLCPFHEVFNATRFASVYAWNRTRISNCVADYSVLYNFAPFFAFKCYGVSPTKLNDLCFTNVYADSFVIKGNEVSQIAPGQTGNIADYNYKLPDDFTGCVIAWNSNKLDSKHSGNYDYWYRSFRKSKLKPFERDISTEIYQAGNKPCKG-KGPNCYFPLQSYGFRPTYGVGHQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTKSNKKFLPFQQFGRDIVDTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQGVNCTEVSVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEYVNNSYECDIPIGAGICASYQTQTKSRRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLKRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKYFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLFSTASALGKLQDVVNHNAQALNTLVKQLSSKFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQLELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGS---------------------'
reference = 'MFVFLVLLPLVSSQCV----NLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT'

# Initialize lists to store data
sequential_site = []
reference_site = []
aa = []
reference_aa = []

# Initialize counters for sequential numbering and reference numbering
sequential_number = 1
reference_number = 1
consecutive_dash_count = 0  # Track consecutive dashes

# Iterate through the sequences
for s, r in zip(sequence, reference):
    # Skip '-' characters in both sequences
    if s != '-':
        # Append sequential site and amino acid
        sequential_site.append(sequential_number)
        aa.append(s)
        sequential_number += 1
        consecutive_dash_count = 0  # Reset consecutive dash count
    if r != '-':
        # Append reference site and reference amino acid
        reference_site.append(reference_number)
        reference_aa.append(r)
        reference_number += 1
        consecutive_dash_count = 0  # Reset consecutive dash count
    elif r == '-':
        # Append reference site with sequential alphabetical letter
        consecutive_dash_count += 1
        reference_site.append(f'{reference_number - consecutive_dash_count}{chr(ord("a") + consecutive_dash_count - 1)}')

# Create a list of dictionaries for CSV writing
data = [
    {
        "sequential_site": seq_site,
        "reference_site": ref_site,
        "aa": amino_acid,
        "reference_aa": ref_amino_acid
    }
    for seq_site, ref_site, amino_acid, ref_amino_acid in zip(sequential_site, reference_site, aa, reference_aa)
]

# Write the data to a CSV file
csv_filename = "sequence_alignment.csv"
with open(csv_filename, mode='w', newline='') as csv_file:
    fieldnames = ["sequential_site", "reference_site", "aa", "reference_aa"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    # Write the header
    writer.writeheader()

    # Write the data
    writer.writerows(data)

print(f"CSV file '{csv_filename}' has been created successfully.")


CSV file 'sequence_alignment.csv' has been created successfully.
