### Calculate minimum mutation distance from wildtype
Read in the antibody escape data and the unmutated (wildtype) reference nucleotide sequence, then calculate the minimum number of nucleotide mutations required to change the wildtype codon at each site into each mutant amino acid.

In [None]:
import pandas as pd
import altair as alt
import httpimport
from Bio.Seq import Seq
from Bio.Data import CodonTable
from itertools import product
import numpy as np


# Turn off Altair's max-rows limit on the data transformers. The return value
# is bound to `_` and never used (presumably so the notebook cell does not
# echo it as output).
_ = alt.data_transformers.disable_max_rows()

In [None]:
# Import custom altair theme from remote github using httpimport module
def import_theme_new():
    """Import ``main_theme`` via httpimport and register it as an enabled Altair theme."""
    # The import runs inside httpimport.github_repo(...), called with
    # "bblarsen-sci", "altair_themes", "main" (presumably user, repository and
    # git ref; this would need network access when the cell runs).
    with httpimport.github_repo("bblarsen-sci", "altair_themes", "main"):
        import main_theme

        # Register under the name "custom_theme"; enable=True is passed so the
        # theme is switched on as part of registration.
        @alt.theme.register("custom_theme", enable=True)
        def custom_theme():
            return main_theme.main_theme()


# Run once at notebook start so the theme is registered before any chart is built.
import_theme_new()

In [None]:
# read in escape data
# The CSV path is taken from `snakemake.input.escape_df`; `snakemake` is not
# defined in this notebook (presumably injected by Snakemake at run time).
# Later cells read the 'site', 'wildtype' and 'mutant' columns of this frame.
escape_df = pd.read_csv(snakemake.input.escape_df)
display(escape_df)

In [None]:
### Calculator for minimum mutations between amino acids
def hamming_distance(seq1, seq2):
    """Count the positions at which two sequences differ.

    Elements are compared pairwise in order. Comparison stops at the end of
    the shorter sequence, so callers are expected to pass equal-length inputs.
    """
    mismatches = 0
    for left, right in zip(seq1, seq2):
        if left != right:
            mismatches += 1
    return mismatches


def get_all_codons_for_aa(amino_acid, codon_table=1):
    """Return the list of codons that encode ``amino_acid``.

    Parameters:
    amino_acid: one-letter amino acid code, or "*" for a stop codon
    codon_table: key into ``CodonTable.unambiguous_dna_by_id`` (default 1)

    Returns:
    List of codon strings; empty when nothing in the table matches.
    """
    genetic_code = CodonTable.unambiguous_dna_by_id[codon_table]

    # Stop codons are read from their own attribute rather than from the
    # codon -> amino acid forward table.
    if amino_acid == "*":
        return list(genetic_code.stop_codons)

    return [
        triplet
        for triplet, residue in genetic_code.forward_table.items()
        if residue == amino_acid
    ]


def min_mutations_between_amino_acids(aa1, aa2, codon_table=1):
    """
    Return the fewest nucleotide substitutions that turn any codon for aa1
    into any codon for aa2.

    Returns 0 when aa1 and aa2 are equal, and NaN when either amino acid has
    no codons in the chosen codon table.
    """
    if aa1 == aa2:
        return 0

    source_codons = get_all_codons_for_aa(aa1, codon_table)
    target_codons = get_all_codons_for_aa(aa2, codon_table)

    # An empty codon list means the symbol is not a valid amino acid
    if not (source_codons and target_codons):
        return np.nan

    # Best case over every (source codon, target codon) pairing
    return min(
        hamming_distance(src, dst)
        for src in source_codons
        for dst in target_codons
    )


def calculate_mutation_distances(df, nucleotide_sequence=None):
    """
    Return a copy of the dataframe with a 'min_mutations' column added.

    Each row's value is the minimum number of nucleotide changes between any
    codon for the row's 'wildtype' amino acid and any codon for its 'mutant'
    amino acid; the input dataframe is left untouched.

    Parameters:
    df: pandas DataFrame with columns 'site', 'wildtype', 'mutant'
    nucleotide_sequence: optional nucleotide sequence (accepted but not used here)

    Returns:
    DataFrame with added 'min_mutations' column
    """

    def distance_for_row(row):
        return min_mutations_between_amino_acids(row["wildtype"], row["mutant"])

    annotated = df.copy()
    annotated["min_mutations"] = annotated.apply(distance_for_row, axis=1)
    return annotated


def get_actual_codon_at_site(nucleotide_sequence, site):
    """
    Extract the actual codon at a specific amino acid site from nucleotide sequence.
    Site numbering starts at 1.

    Parameters:
    nucleotide_sequence: sliceable nucleotide sequence, or None
    site: 1-based amino acid position

    Returns:
    The three-nucleotide slice for that site, or None when the sequence is
    None, the site is below 1, or the codon would run past the end of the
    sequence.
    """
    if nucleotide_sequence is None:
        return None

    # Sites are 1-based. A site below 1 would give a negative slice start and
    # silently return an empty string or nucleotides from the end of the sequence.
    if site < 1:
        return None

    # Convert to 0-based indexing and get the codon
    codon_start = (site - 1) * 3
    codon_end = codon_start + 3

    if codon_end > len(nucleotide_sequence):
        return None

    return nucleotide_sequence[codon_start:codon_end]


def calculate_mutation_distances_with_sequence(df, nucleotide_sequence):
    """
    Calculate mutation distances using the actual codons from the nucleotide sequence.

    For each row, the codon at the row's 'site' is taken from
    nucleotide_sequence and compared against every codon of the 'mutant'
    amino acid; the smallest number of differing nucleotides is stored.

    Parameters:
    df: pandas DataFrame with columns 'site', 'wildtype', 'mutant'
    nucleotide_sequence: wildtype nucleotide sequence, read in-frame from position 1

    Returns:
    Copy of df with added 'min_mutations' column. The value is 0 when
    wildtype and mutant are the same, and NaN when no codon exists for the
    site or the mutant symbol has no codons.
    """
    df = df.copy()

    def get_mutations_for_row(row):
        site = row["site"]
        wildtype_aa = row["wildtype"]
        mutant_aa = row["mutant"]

        if wildtype_aa == mutant_aa:
            return 0

        # Get the actual codon at this site
        actual_codon = get_actual_codon_at_site(nucleotide_sequence, site)

        if actual_codon is None:
            print(f"Warning: No codon found for site {site} in the sequence.")
            # Without a codon there is nothing to compare; report missing
            # instead of falling through and crashing on Seq(None).
            return np.nan

        # The codon-table codons are upper-case; normalise so a lower-case
        # sequence is not counted as differing at every position.
        actual_codon = actual_codon.upper()

        # Verify the actual codon codes for the wildtype amino acid
        actual_aa = str(Seq(actual_codon).translate())
        if actual_aa != wildtype_aa:
            print(
                f"Warning: Codon {actual_codon} at site {site} codes for {actual_aa}, not {wildtype_aa}"
            )

        # Get all possible codons for the mutant amino acid
        mutant_codons = get_all_codons_for_aa(mutant_aa)
        if not mutant_codons:
            # Unknown mutant symbol: NaN, matching min_mutations_between_amino_acids
            # (previously this case returned infinity).
            return np.nan

        # Find minimum distance from actual codon to any mutant codon
        return min(
            hamming_distance(actual_codon, mutant_codon)
            for mutant_codon in mutant_codons
        )

    df["min_mutations"] = df.apply(get_mutations_for_row, axis=1)
    return df


def load_sequence_from_fasta(file_path):
    """
    Read a FASTA file and return the sequence of its first record as a string.
    Returns None when the file contains no records.
    """
    from Bio import SeqIO

    with open(file_path, "r") as handle:
        # Only the first record is needed, so pull a single item from the parser
        first_record = next(SeqIO.parse(handle, "fasta"), None)
        if first_record is not None:
            return str(first_record.seq)
    return None


# From FASTA file
# Fail right away with an accurate message. Catching everything and carrying on
# would only postpone the failure to a NameError on `wildtype_sequence` below.
try:
    wildtype_sequence = load_sequence_from_fasta(snakemake.input.wildtype_sequence)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"FASTA file not found: {snakemake.input.wildtype_sequence}"
    ) from err

# load_sequence_from_fasta returns None when the file holds no records
if wildtype_sequence is None:
    raise ValueError(
        f"No sequences found in FASTA file: {snakemake.input.wildtype_sequence}"
    )

print(f"Loaded sequence: {wildtype_sequence}")
print(f"Sequence length: {len(wildtype_sequence)} nucleotides")
print(f"Codes for {len(wildtype_sequence) // 3} amino acids")
print()

# Calculate mutations with your sequence
df_with_sequence = calculate_mutation_distances_with_sequence(escape_df, wildtype_sequence)
print("Results with your wildtype sequence:")
display(df_with_sequence.head(20))

# Verify the sequence codes for your expected amino acids
print("\nSequence verification:")
translated = str(Seq(wildtype_sequence).translate())
print(f"Your sequence translates to: {translated}")


In [None]:
# Save the results
# Writes the annotated frame (escape data plus 'min_mutations') as CSV to the
# path given by `snakemake.output.min_mutation_distance`; index=False is passed
# so the DataFrame index is not written as a column.
df_with_sequence.to_csv(snakemake.output.min_mutation_distance, index=False)