# Antibody escape in natural sequences

In [1]:
# Imports
import os
import yaml
import warnings
import scipy as sp
import pandas as pd
import altair as alt
from Bio import SeqIO, AlignIO 

# Suppress warnings
# NOTE(review): the "ignore" filter is installed without a category, so every
# warning raised later in the notebook is hidden (including deprecation
# warnings from the imported libraries) — confirm this is intended.
warnings.simplefilter("ignore")

# Allow more rows for Altair by disabling its max-rows check on chart data.
# The return value is bound to `_`, presumably so the cell does not echo it.
_ = alt.data_transformers.disable_max_rows()

In [2]:
# this cell is tagged as `parameters` for papermill parameterization
# Every value defaults to None and must be supplied before the cells below run.

# Directory holding one "<antibody>_filtered_mut_effects.csv" file per antibody
filtered_antibody_csv_dir = None

# Tab-separated metadata table; read below using its "strain" and "virus" columns
sequence_metadata = None
# FASTA alignment whose record ids are matched to the metadata "strain" column
sequence_alignment = None

# Directory created (if missing) before the chart is saved
natural_diversity_outdir = None
# File path the combined Altair chart is saved to
antibody_escape_vs_nature_html = None

In [3]:
# # Uncomment to run interactively
# filtered_antibody_csv_dir = "../results/filtered_antibody_escape_CSVs/"

# sequence_metadata = "../non-pipeline_analyses/RABV_nextstrain/Results/metadata.tsv"
# sequence_alignment = "../non-pipeline_analyses/RABV_nextstrain/Results/Alignments/G_protein_alignment_ungapped.fasta"

# natural_diversity_outdir = "../results/natural_diversity_comparison/"
# antibody_escape_vs_nature_html = "../results/natural_diversity_comparison/antibody_escape_vs_natural_diversity.html"

Create a dataframe of amino-acid-level antibody escape measurements together with mutation frequencies, relative to the reference strain, calculated from the natural sequence alignment.

In [4]:
def get_natural_sequence_counts(site, amino_acid, natural_seqs_df):
    """
    Count the occurrences of an amino acid at a site across a dataframe
    of sequences.

    Parameters
    ----------
    site : int
        1-based position within the sequence strings.
    amino_acid : str
        Character compared against each sequence at `site`.
    natural_seqs_df : pandas.DataFrame
        Dataframe with a "sequence" column of strings.

    Returns
    -------
    int
        Number of rows whose sequence has `amino_acid` at `site`.

    Raises
    ------
    ValueError
        If `site` is smaller than 1. Without this check a site of 0 or
        below would wrap around through negative indexing and silently
        count residues from the end of each sequence.
    IndexError
        If any sequence is shorter than `site`.
    """
    if site < 1:
        raise ValueError(f"site must be a 1-based position (>= 1), got {site}")

    # Convert the 1-based site to a 0-based string index once
    position = site - 1
    return sum(
        1
        for seq in natural_seqs_df["sequence"].tolist()
        if seq[position] == amino_acid
    )

# Load the per-strain metadata table (tab-separated)
metadata_df = pd.read_csv(sequence_metadata, sep="\t")

# Load the alignment into a (strain, sequence) dataframe. All records are
# collected first and the dataframe is built once; appending one row at a
# time with .loc copies the frame on every insert.
natural_seqs_df = pd.DataFrame(
    [
        (str(record.id), str(record.seq))
        for record in AlignIO.read(sequence_alignment, "fasta")
    ],
    columns=["strain", "sequence"],
)

# Merge sequences onto the metadata. The left join keeps every metadata row
# (strains without an aligned sequence get NaN), and validate="one_to_one"
# raises if a strain name is duplicated on either side.
natural_seqs_df = (
    metadata_df.merge(
        natural_seqs_df,
        how="left",
        on=["strain"],
        validate="one_to_one",
    )
)

# Filter for only rabies sequences that actually have an aligned sequence
natural_seqs_df = (
    natural_seqs_df
    .query("virus == 'Rabies lyssavirus' & sequence.notna()")
    .reset_index(drop=True)
)

# Get the reference sequence for comparison. The row is selected by strain
# name and then taken positionally; looking up index label 0 would only work
# if the reference happened to be the first row of the filtered dataframe.
reference_strain = "NC_001542_2018-08-13"
reference_matches = natural_seqs_df.loc[
    natural_seqs_df["strain"] == reference_strain, "sequence"
]
if reference_matches.empty:
    raise ValueError(
        f"Reference strain {reference_strain} not found among the natural sequences"
    )
ref_sequence = reference_matches.iloc[0]

# Initialize mutant dataframe: one row per (site, amino acid) combination,
# with sites numbered from 1 along the reference sequence. The records are
# generated up front so the dataframe is built once rather than concatenated
# inside a loop.
amino_acids = ['R','K','H','D','E','Q','N','S','T','Y','W','F','A','I','L','M','V','G','P','C']
mutation_count_df = pd.DataFrame(
    [
        (site, wildtype, mutant)
        for site, wildtype in enumerate(ref_sequence, start=1)
        for mutant in amino_acids
    ],
    columns=["natural_sequence_site", "wildtype", "mutant"],
)

# Flag rows where the "mutant" is the reference amino acid itself
mutation_count_df["DMS reference"] = (
    mutation_count_df["wildtype"] == mutation_count_df["mutant"]
)

# Get natural sequence counts of each mutant and calculate mutation frequencies
# compared to the reference
mutation_count_df["natural_counts"] = [
    get_natural_sequence_counts(site, mutant, natural_seqs_df)
    for site, mutant in zip(
        mutation_count_df["natural_sequence_site"],
        mutation_count_df["mutant"],
    )
]

# The denominator is the number of natural sequences. The largest observed
# count only equals this when at least one site is perfectly conserved, so it
# is not a reliable stand-in.
number_sequences = len(natural_seqs_df)

# Frequencies are left missing (NaN) for the reference amino acid at each site
mutation_count_df["mutation_frequency"] = (
    (mutation_count_df["natural_counts"] / number_sequences)
    .where(~mutation_count_df["DMS reference"])
)

antibodies = [
    "RVC20",
    "RVA122",
    "17C7",
    "RVC58",
    "CR4098",
    "CR57",
    "CTB012",
    "RVC68",
]

# Columns that identify a mutation in both the natural-sequence table and the
# per-antibody escape tables
merge_keys = ["natural_sequence_site", "wildtype", "mutant"]

# Add escape to dataframe for each antibody
for i, antibody in enumerate(antibodies):
    escape_column = "floored_escape_" + antibody

    # Load data as dataframe. os.path.join keeps the path valid whether or not
    # the directory parameter ends with a path separator.
    escape_df = pd.read_csv(
        os.path.join(filtered_antibody_csv_dir, antibody + "_filtered_mut_effects.csv")
    )

    # Rename escape column to include antibody name
    escape_df = escape_df.rename(columns={"floored_escape": escape_column})

    # The site-numbering columns are pulled in from the first antibody only,
    # so later merges do not add suffixed duplicates of them.
    columns_to_merge = merge_keys + [escape_column]
    if i == 0:
        columns_to_merge = ["site", "sequential_site"] + columns_to_merge

    # Merge dataframes; the left join keeps every (site, mutant) row even when
    # the antibody has no measurement for it
    mutation_count_df = mutation_count_df.merge(
        escape_df[columns_to_merge],
        how="left",
        on=merge_keys,
        validate="one_to_one",
    )

# Replace 0 with small value to display on log scale (zero cannot be drawn on
# a log axis)
mutation_count_df["mutation_frequency"] = mutation_count_df["mutation_frequency"].replace(0, 0.00001) # smallest value ~0.0002

In [5]:
# One scatter panel per antibody: natural mutation frequency (log scale) on x,
# escape for that antibody on y
subplots = []
for antibody in antibodies:
    escape_column = "floored_escape_" + antibody

    curr_subplot = alt.Chart(mutation_count_df, title=antibody).mark_point(
        filled=True,
        color="black",
        size=75,
        opacity=0.15,
    ).encode(
        alt.X(
            "mutation_frequency",
            axis=alt.Axis(
                title="mutation frequency",
                domainWidth=1,
                domainColor="black",
                tickColor="black",
            ),
            scale=alt.Scale(type="log"),
        ),
        alt.Y(
            escape_column,
            axis=alt.Axis(
                title="escape",
                domainWidth=1,
                domainColor="black",
                tickColor="black",
            ),
        ),
        tooltip=[
            "site",
            "sequential_site",
            "natural_sequence_site",
            "wildtype",
            "mutant",
            "mutation_frequency",
            escape_column,
        ],
    ).properties(
        width=250,
        height=150,
    )

    subplots.append(curr_subplot)

# Arrange the panels in rows of four. Slicing the list (instead of indexing
# subplots[0]..subplots[7]) keeps the layout valid when antibodies are added
# to or removed from the list.
plots_per_row = 4
subplot_rows = [
    alt.hconcat(*subplots[start:start + plots_per_row], spacing=10)
    for start in range(0, len(subplots), plots_per_row)
]

natural_vs_antibody = alt.vconcat(
    *subplot_rows,
    spacing=10,
    title=["Mutational frequencies vs antibody escape", "for individual amino-acid mutations"],
).configure_axis(
    grid=False,
    labelFontSize=16,
    titleFontSize=16,
    labelFontWeight="normal",
    titleFontWeight="normal",
).configure_title(
    fontSize=24,
)

# Make output dir if doesn't exist. makedirs also creates missing parent
# directories, and exist_ok=True avoids the check-then-create race of
# os.path.exists followed by os.mkdir.
os.makedirs(natural_diversity_outdir, exist_ok=True)

print(f"Saving to {antibody_escape_vs_nature_html}")
natural_vs_antibody.save(antibody_escape_vs_nature_html)

# Display the chart as the cell output
natural_vs_antibody

Saving to ../results/natural_diversity_comparison/antibody_escape_vs_natural_diversity.html
