# Compare DMS data to natural sequence data

This notebook analyzes natural sequences for antibody escape predicted by DMS and shows the resulting neutralization validations. Additional analysis is performed by correlating natural sequence diversity with DMS data.

In [None]:
# Imports
import os
import warnings
import neutcurve
import numpy as np
import scipy as sp
import pandas as pd
import altair as alt
import seaborn as sns
import matplotlib.pyplot as plt
from Bio import SeqIO, AlignIO 

# Plotting colors: hex palette installed below as the seaborn default color
# cycle and also passed as the `colors` argument of the neutralization
# curve plot later in this notebook.
# NOTE(review): the name suggests an adjusted version of Paul Tol's "muted"
# scheme -- confirm the provenance before citing it.
tol_muted_adjusted = [
    "#000000",
    "#CC6677",
    "#1f78b4",
    "#DDCC77",
    "#117733",
    "#882255",
    "#88CCEE",
    "#44AA99",
    "#999933",
    "#AA4499",
    "#EE7733",
    "#CC3311",
    "#DDDDDD",
]

# Seaborn style settings
# These three calls mutate global seaborn/matplotlib state, so their order
# matters: `sns.set` applies the 300 dpi figure/savefig settings, after which
# the "ticks" style and the custom palette are applied on top.
sns.set(rc={"figure.dpi":300, "savefig.dpi":300})
sns.set_style("ticks")
sns.set_palette(tol_muted_adjusted)

# Suppress warnings
# NOTE: this ignores every warning category for the rest of the session,
# including deprecation warnings raised by the plotting libraries.
warnings.simplefilter("ignore")

In [None]:
# this cell is tagged as `parameters` for papermill parameterization
# Every value here is a `None` placeholder that is expected to be overridden
# at run time; the commented-out cell that follows lists example values for
# interactive use.

# Paths to the per-antibody filtered escape CSVs. Each file is read with
# `pd.read_csv` and must provide `site`, `mutation` and `escape_median`
# columns. The antibody name is taken from the file name (text before the
# first underscore), so the file naming matters.
filtered_escape_377H = None
filtered_escape_89F = None
filtered_escape_2510C = None
filtered_escape_121F = None
filtered_escape_256A = None
filtered_escape_372D = None

# Path to the functional effects CSV, plus the thresholds applied to its
# `times_seen` and `n_selections` columns when filtering mutations.
func_scores = None
min_times_seen = None
n_selections = None

# Path to the per-site natural variation CSV (`site`, `entropy` and
# `n_effective` columns are used) and to the FASTA protein alignment of
# natural sequences that is scanned for escape mutations.
natural_sequence_variation = None
natural_GPC_sequence_alignment = None

# Path to the fraction-infected CSV used for the neutralization curve fits,
# the output directory (created if missing), and the path the
# neutralization curve figure is saved to.
fraction_infected_natural_isolates = None
out_dir = None
neuts_image_path = None

In [None]:
# # Uncomment to run this notebook interactively
# filtered_escape_377H = "../results/filtered_antibody_escape_CSVs/377H_filtered_mut_effect.csv"
# filtered_escape_89F = "../results/filtered_antibody_escape_CSVs/89F_filtered_mut_effect.csv"
# filtered_escape_2510C = "../results/filtered_antibody_escape_CSVs/2510C_filtered_mut_effect.csv"
# filtered_escape_121F = "../results/filtered_antibody_escape_CSVs/121F_filtered_mut_effect.csv"
# filtered_escape_256A = "../results/filtered_antibody_escape_CSVs/256A_filtered_mut_effect.csv"
# filtered_escape_372D = "../results/filtered_antibody_escape_CSVs/372D_filtered_mut_effect.csv"

# func_scores = "../results/func_effects/averages/293T_entry_func_effects.csv"
# min_times_seen = 2
# n_selections = 8

# natural_sequence_variation = "../non-pipeline_analyses/LASV_phylogeny_analysis/Results/GPC_protein_variation.csv"
# natural_GPC_sequence_alignment = "../non-pipeline_analyses/LASV_phylogeny_analysis/Results/LASV_GPC_protein_alignment.fasta"

# fraction_infected_natural_isolates = "../data/validation_frac_infected_natural_isolates.csv"
# out_dir = "../results/validation_plots/"
# neuts_image_path = "../results/validation_plots/validation_neut_curves_natural_isolates.svg"

## Identify natural sequences that would potentially escape antibody neutralization

To identify natural isolates that could potentially escape any of the mapped antibodies, we first need to identify high-confidence escape mutations. We filter for the top 5% of escape mutations and then further restrict this list to mutations at sites that are in the top 5% of **summed escape** sites. Escape scores are clipped at a lower bound of 0 to focus on escape mutations rather than sensitizing mutations. Any sequence carrying one of these mutations is flagged as a potential escape isolate.

In [None]:
from functools import lru_cache


@lru_cache(maxsize=32)
def _top_escape_mutations(escape_file, percentile_escape):
    """
    Load an escape CSV and return its top escape mutations.

    A mutation is kept when its clipped `escape_median` is at or above the
    `percentile_escape` quantile of all mutations AND its site is at or above
    the same quantile of per-site summed escape.

    The result is cached per (file path, percentile) so that scanning many
    sequences against the same antibody reads and filters the CSV only once.
    Call `_top_escape_mutations.cache_clear()` if a file changes on disk
    during the session.

    Returns a tuple of `(wildtype, site, mutant, escape)` entries in file order.
    """

    # Load data as dataframe
    escape_df = pd.read_csv(escape_file)

    # Clip lower scores to 0 so sensitizing mutations do not offset escape
    escape_df["escape_median"] = escape_df["escape_median"].clip(lower=0)

    # Get muts for top escape (mask evaluated once for both columns)
    mut_cutoff = escape_df["escape_median"].quantile(percentile_escape)
    top_muts = escape_df.loc[
        escape_df["escape_median"] >= mut_cutoff,
        ["mutation", "escape_median"],
    ]

    # Calculate summed escape sites and get sites for top escape
    site_escape = escape_df.groupby("site")["escape_median"].sum()
    site_cutoff = site_escape.quantile(percentile_escape)
    top_sites = set(site_escape.index[site_escape >= site_cutoff].tolist())

    # Filter top escape muts based on top escape sites.
    # Mutations are written as <wildtype><site><mutant>, e.g. "K126N".
    selected = []
    for mutation, score in zip(
        top_muts["mutation"].tolist(),
        top_muts["escape_median"].tolist(),
    ):
        site = int(mutation[1:-1])
        if site in top_sites:
            selected.append((mutation[0], site, mutation[-1], score))

    return tuple(selected)


def determine_escape(percentile_escape, sequence, escape_file, strain, print_results=False):
    """
    Function that determines if a sequence contains any
    escape mutations given a percentile cutoff.

    Parameters
    ----------
    percentile_escape : quantile (0-1) used as the cutoff for both the
        per-mutation escape scores and the per-site summed escape.
    sequence : amino-acid string that is indexed with 1-based site numbers.
        NOTE(review): this assumes string positions match the DMS site
        numbering (i.e. no alignment gaps shift the positions) -- confirm.
    escape_file : path to a CSV with `site`, `mutation` and `escape_median`
        columns.
    strain : label used only in the printed report.
    print_results : when True, print one line per escape mutation found.

    Returns
    -------
    The sum of the escape scores of all top escape mutations present in
    `sequence` (0 when none are present).
    """

    # Initialize escape total
    escape = 0

    # Iterate through list of escape mutations
    for wildtype, site, mutant, score in _top_escape_mutations(escape_file, percentile_escape):
        # Skip sites that fall outside the sequence instead of raising
        # IndexError (or silently wrapping around for a site of 0)
        if not 0 < site <= len(sequence):
            continue
        if sequence[site - 1] == mutant:
            if print_results:
                print(f"{strain:<75} with \t {wildtype}{site}{mutant} \t DMS score: {score}")
            escape += score

    return escape

# Load alignment and metadata info
# The dataframe is built in a single construction from the alignment records
# (one row per record: its id and its sequence as a string) rather than
# enlarging it one row at a time, which copies the frame on every insertion.
natural_seqs_df = pd.DataFrame(
    [
        (str(curr_fasta.id), str(curr_fasta.seq))
        for curr_fasta in AlignIO.read(natural_GPC_sequence_alignment, "fasta")
    ],
    columns=["strain", "sequence"],
)

# Add antibody escape columns
# Initialised to 0 up front so the column order is fixed independently of the
# order in which the antibodies are processed below.
for escape_column_prefix in ["377H", "89F", "2510C", "121F", "256A", "372D"]:
    natural_seqs_df[escape_column_prefix + "_total_escape"] = 0

# Antibody escape dataframes and percentile cutoffs to use
antibody_files = [
    (filtered_escape_2510C, 0.95),
    (filtered_escape_121F, 0.95),
    (filtered_escape_377H, 0.95),
    (filtered_escape_256A, 0.95),
    (filtered_escape_372D, 0.95),
    (filtered_escape_89F, 0.95),
]

# For every antibody, score each natural sequence and print the isolates
# that carry at least one top escape mutation
for antibody, percentile in antibody_files:
    # Antibody name is the file name up to the first underscore
    antibody_name = antibody.split("/")[-1].split("_")[0]
    print(f"{antibody_name} potentially escaped by:")
    natural_seqs_df[antibody_name + "_total_escape"] = (
        natural_seqs_df.apply(lambda x: determine_escape(
            percentile,
            x["sequence"],
            antibody,
            x["strain"],
            print_results=True,
        ), axis=1)
    )
    print()

Four isolates were chosen from the above list based on the strongest escape mutations. We further calculate cumulative summed escape scores across all mutations present in each isolate even if the effects are small. However, this simple additive estimate probably does not correctly reflect the true impact of the combined mutations present in each isolate.

In [None]:
# Isolates chosen to validate for escape
chosen_isolates = [
    "GA391_OL774861_reverse_complement_1977-XX-XX",
    "LASV_H-sapiens-tc_NGA_2016_IRR_007_MK107922_2016-01-18",
    "Lassa_virus_H-sapiens-wt_NGA_2018_ISTH_1024_MH157037_2018-02-14",
    "LM395-SLE-2009_KM822115_2009-XX-XX",
]

# Keep only the rows belonging to the chosen isolates, renumbered from zero
is_chosen = natural_seqs_df["strain"].isin(chosen_isolates)
validation_isolates = natural_seqs_df[is_chosen].reset_index(drop=True)

# Recompute the per-antibody totals with a percentile cutoff of 0, i.e. the
# cutoff used for both the mutation and the site filter is the minimum value,
# so the escape of every mutation present in the isolate is added up
for escape_csv, _ in antibody_files:

    def _cumulative_escape(row, escape_csv=escape_csv):
        """Summed escape of one isolate row against the current antibody."""
        return determine_escape(0, row["sequence"], escape_csv, row["strain"])

    total_column = escape_csv.split("/")[-1].split("_")[0] + "_total_escape"
    validation_isolates[total_column] = validation_isolates.apply(
        _cumulative_escape,
        axis=1,
    )

## Pseudovirus neutralization validation assays for chosen natural isolates and corresponding single mutants

The following were chosen for validation:
- 8.9F
    - Natural isolate: Lassa_virus_H-sapiens-wt_NGA_2018_ISTH_1024_MH157037_2018-02-14
        - Corresponding single mutant: K126N (DMS score: 0.7906)
- 12.1F
    - Natural isolate: LM395-SLE-2009_KM822115_2009-XX-XX
        - Corresponding single mutant: N89D (DMS score: 2.515)
- 25.10C
    - Natural isolate: GA391_OL774861_reverse_complement_1977-XX-XX 
        - Corresponding single mutant: E228D (DMS score: 3.098)
- 37.7H
    - Natural isolate: GA391_OL774861_reverse_complement_1977-XX-XX 
        - Corresponding single mutant: H398K (DMS score: 2.949)
    - Natural isolate: LASV_H-sapiens-tc_NGA_2016_IRR_007_MK107922_2016-01-18
        - Corresponding single mutant: D401E (DMS score: 1.3)
- 37.2D
    - Natural isolate: GA391_OL774861_reverse_complement_1977-XX-XX
        - Corresponding single mutant: H398K (DMS score: 1.078)

**Note:** The D401E mutation present in the chosen natural isolates was not measured with high confidence in the DMS data for the 25.6A antibody selection, so 25.6A is omitted from the validation neutralization assays.

In [None]:
# Read neutralization data
frac_infected_natural_isolates = pd.read_csv(fraction_infected_natural_isolates)

# Rename viruses
rename_dict = {
    "WT" : "unmutated",
}

frac_infected_natural_isolates["virus"] = (
    frac_infected_natural_isolates["virus"].replace(rename_dict)
)

# Markers: the same circle marker for every curve (ten entries supplied)
markers = ["o"] * 10

# Fit hill curves using neutcurve with the bottom fixed at 0 and top at 1
fits = neutcurve.curvefits.CurveFits(
    data=frac_infected_natural_isolates,
    fixbottom=0,
    fixtop=1,
)

# IC values to calculate
fitParams = fits.fitParams(ics=[50, 80, 90, 95, 97, 98, 99])

# One column of panels, each with its own legend and axes
fig, axes = fits.plotSera(
    ncol=1,
    markers=markers,
    colors=tol_muted_adjusted,
    attempt_shared_legend=False,
    sharex=False,
    sharey=False,
    xlabel="",
    ylabel="",
)

# Panel titles, one per row of `axes`.
# NOTE(review): this assumes the panels come back in exactly this antibody
# order -- confirm against the order of the input data.
antibody_names = [
    "8.9F",
    "12.1F",
    "25.10C",
    "37.7H",
    "37.2D",
]

# Iterate over the titles themselves so the number of styled panels always
# matches the list above
for index, antibody_title in enumerate(antibody_names):
    ax = axes[index, 0]

    ax.set_title(
        antibody_title,
        weight="bold",
        fontsize=8,
    )
    ax.set_xlabel(
        "concentration (\u03BCg/mL)",
        fontsize=8,
    )
    ax.set_ylabel(
        "fraction infectivity",
        fontsize=8,
    )
    ax.set_ylim(-0.1, 1.3)
    ax.set_yticks([0, 0.5, 1.0])
    ax.set_yticklabels(labels=[0, 0.5, 1.0], fontsize=8)
    ax.set_xlim(0.0005, 15)
    ax.set_xticks([0.001, 0.01, 0.1, 1, 10])
    ax.set_xticklabels(labels=["$10^{-3}$", "$10^{-2}$", "$10^{-1}$", "$10^0$", "$10^1$"], fontsize=8)
    plt.setp(ax.collections, alpha=0.8, linewidths=0.5, colors="black") # for vertical error bar segment
    plt.setp(ax.lines, alpha=0.8, markeredgewidth=0.5, markeredgecolor="black", linewidth=1) # for the lines and markers
    sns.move_legend(
        ax,
        loc="upper left",
        borderaxespad=0,
        frameon=False,
        bbox_to_anchor=(1, 1),
        fontsize=8,
        markerscale=1,
        handletextpad=0.1,
        title="amino acid\nsubstitutions",
        title_fontproperties={"weight" : "bold", "size" : 8},
        alignment="left"
    )

    # Add edges to legend markers to match scatter plot.
    # `Legend.legendHandles` was renamed to `legend_handles` in matplotlib 3.7
    # and the old name was later removed, so support both spellings.
    legend_handles = getattr(ax.legend_, "legend_handles", None)
    if legend_handles is None:
        legend_handles = ax.legend_.legendHandles
    for ha in legend_handles:
        ha.set_markeredgecolor("black")
        ha.set_markeredgewidth(0.5)
        ha.set_linewidth(0)

    # Change all spines
    for side in ["top", "bottom", "left", "right"]:
        ax.spines[side].set_linewidth(1)
    ax.tick_params(axis="both", length=4, width=1)

width = 2.5
height = 10
fig.set_size_inches(width, height)

# Make output dir if doesn't exist (also creates missing parent directories
# and does not fail if the directory appears between check and creation)
os.makedirs(out_dir, exist_ok=True)

# Save fig (explicitly the figure built above rather than whichever figure
# happens to be current)
fig.savefig(neuts_image_path)

## Compare natural diversity to functional scores

Next, we compare the DMS data to natural sequence diversity. Here, natural sequence diversity is measured as the number of effective amino acids\* at each site, calculated from all available high-quality Lassa GPC sequences.

\*The formula for calculating effective amino acids is described in *Biophysical Models of Protein Evolution: Understanding the Patterns of Evolutionary Sequence Divergence*.

In [None]:
# Load data as dataframe
functional_scores = pd.read_csv(func_scores)

# Filter for minimum selections, times seen and no stop codons, then
# average the remaining mutation effects per site
functional_scores = (
    functional_scores.query(
        "n_selections >= @n_selections and times_seen >= @min_times_seen and mutant != '*'"
    )
    .drop(columns=["mutant", "times_seen"])
    .groupby(["site", "wildtype"])
    .aggregate({
        "effect" : "mean"
    })
    .reset_index()
)

# Load data as dataframe
natural_variation = pd.read_csv(natural_sequence_variation)

# Drop individual amino acid counts
natural_variation = natural_variation[["site", "entropy", "n_effective"]]

# Merge functional and natural dataframes (left merge: keep every site that
# has a functional score; `validate` raises if a site appears more than once)
merged_df = (
    functional_scores.merge(
        natural_variation,
        how="left",
        on=["site"],
        validate="one_to_one",
    )
)

# Add escape to dataframe for each antibody, remembering the column names in
# processing order so the total below never drifts from `antibody_files`
escape_columns = []
for antibody_file,_ in antibody_files:

    antibody_name = antibody_file.split("/")[-1].split("_")[0]
    escape_column = "escape_" + antibody_name

    # Load data as dataframe
    escape_df = pd.read_csv(antibody_file)

    # Clip lower scores to 0
    escape_df["escape_median"] = escape_df["escape_median"].clip(lower=0)

    # Calculate summed escape per site
    escape_df = (
        escape_df
        .groupby("site")
        .aggregate({"escape_median" : "sum"})
        .reset_index()
    )

    # Rename escape column to include antibody name
    escape_df = escape_df.rename(columns={"escape_median" : escape_column})

    # Merge dataframes
    merged_df = (
        merged_df.merge(
            escape_df,
            how="left",
            on=["site"],
            validate="one_to_one",
        )
    )
    escape_columns.append(escape_column)

# Total summed escape per site for all antibodies.
# Columns are added one after another with `+`, so a site that is missing
# (NaN) for any antibody after the left merges gets a NaN total.
total_escape = merged_df[escape_columns[0]]
for escape_column in escape_columns[1:]:
    total_escape = total_escape + merged_df[escape_column]
merged_df["total_escape"] = total_escape

# Sites of mutations for chosen validations
validation_sites = [89, 126, 228, 398, 401]
merged_df["site of validation"] = merged_df["site"].isin(validation_sites)

First, we look at the correlation between functional effects and antibody escape, stratified by antibody. As expected, the mutations that lead to escape tend to be more functionally tolerated. Furthermore, the mutations present in the natural isolates also tend to be functionally tolerated, with the notable exception of N89D, which is quite deleterious.


In [None]:
# Color scale for the "site of validation" flag: validated sites are drawn in
# opaque orange, all other sites in mostly transparent black
dom = [True, False]
rng = ["#EE7733FF", "#00000026"]

# Axis styling shared by both axes of every panel
shared_axis_style = {
    "domainWidth": 1,
    "domainColor": "black",
    "tickColor": "black",
}

# One scatter panel per antibody: summed site escape (x) against the
# functional effect on cell entry (y)
subplots = []
for antibody_file, _ in antibody_files:
    antibody_name = antibody_file.split("/")[-1].split("_")[0]
    escape_column = "escape_" + antibody_name

    x_channel = alt.X(
        escape_column,
        axis=alt.Axis(title="total site escape", **shared_axis_style),
        scale=alt.Scale(domainMin=-1),
    )
    y_channel = alt.Y(
        "effect",
        axis=alt.Axis(
            title="effect on cell entry",
            values=[-4, -3, -2, -1, 0, 1],
            **shared_axis_style,
        ),
        scale=alt.Scale(domain=[-4.5, 1]),
    )
    color_channel = alt.Color(
        "site of validation",
        scale=alt.Scale(domain=dom, range=rng),
    )

    panel = (
        alt.Chart(merged_df, title=antibody_name)
        .mark_point(filled=True, color="black", size=75)
        .encode(
            x_channel,
            y_channel,
            tooltip=["site", "wildtype", "effect", escape_column],
            color=color_channel,
        )
        .properties(width=150, height=150)
    )
    subplots.append(panel)

# Lay the panels out side by side and apply the shared font settings
func_vs_antibody = (
    alt.hconcat(
        *subplots,
        spacing=5,
        title="Correlations of functional effects and antibody escape",
    )
    .configure_axis(
        grid=False,
        labelFontSize=16,
        titleFontSize=16,
        labelFontWeight="normal",
        titleFontWeight="normal",
    )
    .configure_title(fontSize=24)
)

func_vs_antibody

Next, we look at the correlation of natural sequence diversity and antibody escape stratified by the different antibodies. The antibody 12.1F (and maybe 37.2D to a lesser extent) tends to be weakly escaped by mutations at sites with higher natural diversity. Of the chosen validations, site 126 tends to be most diverse but sites 228 and 398 also have increased diversity.

In [None]:
# Color scale for the "site of validation" flag: validated sites are drawn in
# opaque orange, all other sites in mostly transparent black
dom = [True, False]
rng = ["#EE7733FF", "#00000026"]

# Axis styling shared by both axes of every panel
shared_axis_style = {
    "domainWidth": 1,
    "domainColor": "black",
    "tickColor": "black",
}

# One scatter panel per antibody: summed site escape (x) against the number
# of effective amino acids observed in natural sequences (y)
subplots = []
for antibody_file, _ in antibody_files:
    antibody_name = antibody_file.split("/")[-1].split("_")[0]
    escape_column = "escape_" + antibody_name

    x_channel = alt.X(
        escape_column,
        axis=alt.Axis(title="total site escape", **shared_axis_style),
        scale=alt.Scale(domainMin=-1),
    )
    y_channel = alt.Y(
        "n_effective",
        axis=alt.Axis(
            title=["effective amino acids", "in natural sequences"],
            values=[1, 2, 3, 4],
            **shared_axis_style,
        ),
        scale=alt.Scale(domain=[0.9, 4]),
    )
    color_channel = alt.Color(
        "site of validation",
        scale=alt.Scale(domain=dom, range=rng),
    )

    panel = (
        alt.Chart(merged_df, title=antibody_name)
        .mark_point(filled=True, color="black", size=75)
        .encode(
            x_channel,
            y_channel,
            tooltip=["site", "wildtype", "n_effective", escape_column],
            color=color_channel,
        )
        .properties(width=150, height=150)
    )
    subplots.append(panel)

# Lay the panels out side by side and apply the shared font settings.
# The result is stored under the same variable name as the previous figure.
func_vs_antibody = (
    alt.hconcat(
        *subplots,
        spacing=5,
        title="Correlations of natural sequence diversity and antibody escape",
    )
    .configure_axis(
        grid=False,
        labelFontSize=16,
        titleFontSize=16,
        labelFontWeight="normal",
        titleFontWeight="normal",
    )
    .configure_title(fontSize=24)
)

func_vs_antibody

Finally, we are going to look at summary correlations for antibody escape, functional effects on cell entry, and natural sequence diversity. The total antibody escape was calculated by summing escape at every site across all antibodies mapped.

In [None]:
# Color scale for the "site of validation" flag: validated sites are drawn in
# opaque orange, all other sites in mostly transparent black
dom = [True, False]
rng = ["#EE7733FF", "#00000026"]

# Axis styling shared by every axis in the summary figure
summary_axis_style = {
    "domainWidth": 1,
    "domainColor": "black",
    "tickColor": "black",
}

# Tooltip fields common to all three charts (listed after the chart-specific ones)
escape_tooltip_fields = [
    "escape_2510C",
    "escape_121F",
    "escape_89F",
    "escape_377H",
    "escape_372D",
    "escape_256A",
    "total_escape",
]


def _total_escape_channel(channel):
    """Encode total escape on the given positional channel (alt.X or alt.Y)."""
    return channel(
        "total_escape",
        axis=alt.Axis(
            title=["total site escape across", "all mapped antibodies"],
            **summary_axis_style,
        ),
        scale=alt.Scale(domainMin=-1),
    )


def _effect_channel(channel):
    """Encode the cell entry effect on the given positional channel."""
    return channel(
        "effect",
        axis=alt.Axis(
            title="effect on cell entry",
            values=[-4, -3, -2, -1, 0, 1],
            **summary_axis_style,
        ),
        scale=alt.Scale(domain=[-4.5, 1]),
    )


def _n_effective_channel(channel):
    """Encode effective amino acids on the given positional channel."""
    return channel(
        "n_effective",
        axis=alt.Axis(
            title=["effective amino acids", "in natural sequences"],
            values=[1, 2, 3, 4],
            **summary_axis_style,
        ),
        scale=alt.Scale(domain=[0.9, 4]),
    )


def _paired_pearson(first_column, second_column):
    """Pearson correlation over rows where both columns are present."""
    paired = merged_df[[first_column, second_column]].dropna()
    return sp.stats.pearsonr(paired[first_column], paired[second_column])


def _summary_scatter(r_value, first_channel, second_channel, leading_tooltip):
    """Build one 300x300 scatter titled with the correlation coefficient."""
    return (
        alt.Chart(merged_df, title=f"r = {r_value:.2f}")
        .mark_point(filled=True, color="black", size=75)
        .encode(
            first_channel,
            second_channel,
            tooltip=leading_tooltip + escape_tooltip_fields,
            color=alt.Color(
                "site of validation",
                scale=alt.Scale(domain=dom, range=rng),
            ),
        )
        .properties(width=300, height=300)
    )


# Calculate statistics
# Total escape vs effect on cell entry
r, p = _paired_pearson("total_escape", "effect")
print(f"r correlation of total escape and effect on cell entry: {r:.2f}")

effect_vs_antibody = _summary_scatter(
    r,
    _total_escape_channel(alt.X),
    _effect_channel(alt.Y),
    ["site", "wildtype", "effect"],
)

# Total escape vs natural sequence diversity (escape is on the y axis here)
r, p = _paired_pearson("total_escape", "n_effective")
print(f"r correlation of total escape and natural sequence diversity: {r:.2f}")

natural_vs_antibody = _summary_scatter(
    r,
    _total_escape_channel(alt.Y),
    _n_effective_channel(alt.X),
    ["site", "wildtype", "n_effective"],
)

# Natural sequence diversity vs effect on cell entry
r, p = _paired_pearson("effect", "n_effective")
print(f"r correlation of natural sequence diversity and effect on cell entry: {r:.2f}")

natural_vs_func = _summary_scatter(
    r,
    _n_effective_channel(alt.X),
    _effect_channel(alt.Y),
    ["site", "wildtype", "effect", "n_effective"],
)

# Combine the three charts side by side with shared font settings
summary_corr_plot = (
    alt.hconcat(
        effect_vs_antibody,
        natural_vs_antibody,
        natural_vs_func,
        spacing=5,
        title="Correlations of summary statistics",
    )
    .configure_axis(
        grid=False,
        labelFontSize=16,
        titleFontSize=16,
        labelFontWeight="normal",
        titleFontWeight="normal",
    )
    .configure_title(fontSize=24)
)

summary_corr_plot