# Escape at key sites: logo plots and binding / escape correlations
Make logo plots of serum escape at key sites, and look at the relationship between escape and other phenotypes such as ACE2 binding.

First get input files / parameters from `papermill` and import Python modules:

In [1]:
# this cell is tagged as `parameters` for papermill parameterization
# Each name below is only a `None` placeholder here; the following "Parameters"
# cell assigns the values actually used by the rest of the notebook.
# path to CSV of averaged DMS measurements (loaded with `pd.read_csv`)
dms_csv = None
# path to CSV of per-antibody escape values (loaded with `pd.read_csv`)
per_antibody_csv = None
# location of a Pango consensus sequences JSON; presumably used by later cells
pango_consensus_seqs_json = None

In [2]:
# Parameters
# Concrete values that override the `None` placeholders from the cell tagged
# `parameters` (presumably injected by papermill at run time).
pango_consensus_seqs_json = "https://raw.githubusercontent.com/corneliusroemer/pango-sequences/c64ef05e53debaa9cc65dd56d6eb83e31517179c/data/pango-consensus-sequences_summary.json"
dms_csv = "results/summaries/summary.csv"
per_antibody_csv="results/summaries/per_antibody_escape.csv"

# Move the working directory up one level; presumably so the relative
# `results/...` paths above resolve -- TODO confirm.
# NOTE(review): the path is relative, so re-running this cell in the same
# kernel moves up one more directory each time.
import os
os.chdir("../")

In [16]:
import altair as alt

import dmslogo

import matplotlib.pyplot as plt

import numpy

import pandas as pd

# lift Altair's cap on the number of data rows a chart may embed; the result is
# bound to `_` so that it is not displayed as the cell output
_ = alt.data_transformers.disable_max_rows()

## Read input data
Keep only mutations with all phenotypes measured:

In [4]:
# read averages for all DMS measurements, shortening two of the phenotype
# column names and keeping only mutations for which *every* phenotype
# (sera escape, cell entry, and ACE2 binding) has a measured value
dms_df = (
    pd.read_csv(dms_csv)
    .rename(
        columns={"human sera escape": "sera escape", "spike mediated entry": "cell entry"}
    )
    # previously `sera escape` was tested twice and `ACE2 binding` not at all,
    # so mutations lacking an ACE2 binding measurement were retained
    .dropna(subset=["sera escape", "cell entry", "ACE2 binding"])
)

# read per antibody values, merge with averages to create escape_df
per_antibody_df = pd.read_csv(per_antibody_csv)

assert per_antibody_df["antibody_set"].nunique() == 1, "code expects 1 antibody_set"

# the two data frames may share only the mutation-identifying columns, because
# the merge below joins on whatever columns the frames have in common
if (
    (intersection := set(dms_df.columns).intersection(per_antibody_df.columns))
    != {"site", "wildtype", "mutant"}
):
    raise ValueError(f"unexpected {intersection=}")

# "average" is used below as the label for the serum-averaged escape, so it must
# not already be an antibody name. Check the column *values*: `in` applied
# directly to a Series tests its index labels, which made the old check vacuous.
assert "average" not in set(per_antibody_df["antibody"]), (
    "an antibody is already named 'average'"
)

# stack the averaged escape (labeled as antibody "average") on top of the
# per-antibody escape, then attach the other per-mutation phenotypes
escape_df = (
    pd.concat(
        [
            dms_df[["site", "wildtype", "mutant", "sera escape"]].rename(
                columns={"sera escape": "escape"}
            ).assign(antibody="average"),
            per_antibody_df.drop(columns="antibody_set"),
        ],
        ignore_index=True,
    )
    # each mutation occurs once in `dms_df` but once per antibody on the left
    .merge(dms_df.drop(columns="sera escape"), validate="many_to_one")
    # label such as "F2": wildtype amino acid followed by site number
    .assign(wildtype_site=lambda x: x["wildtype"] + x["site"].astype(str))
)

escape_df

Unnamed: 0,site,wildtype,mutant,escape,antibody,cell entry,ACE2 binding,sequential_site,region,wildtype_site
0,2,F,C,0.01114,average,0.101,0.02151,2,other,F2
1,2,F,C,-0.09066,serum 287C,0.101,0.02151,2,other,F2
2,2,F,C,-0.04613,serum 288C,0.101,0.02151,2,other,F2
3,2,F,C,0.16870,serum 343C,0.101,0.02151,2,other,F2
4,2,F,C,0.07028,serum 493C,0.101,0.02151,2,other,F2
...,...,...,...,...,...,...,...,...,...,...
61421,1152,L,L,0.00000,average,0.000,0.00000,1148,S2,L1152
61422,1193,L,L,0.00000,average,0.000,0.00000,1189,S2,L1193
61423,1211,K,K,0.00000,average,0.000,0.00000,1207,S2,K1211
61424,1212,W,W,0.00000,average,0.000,0.00000,1208,other,W1212


## Determine which sites to plot

In [44]:
# Summarize escape at each site with several metrics (max, min, and summed
# magnitudes), both for the average of antibodies and across individual antibodies.

# tag each row as belonging to the average or to an individual antibody
labeled_escape_df = escape_df.assign(
    is_average=lambda df: numpy.where(
        df["antibody"] == "average", "average of antibodies", "any antibody"
    )
)

# step 1: compute every metric at each site separately for each antibody
per_antibody_site_df = labeled_escape_df.groupby(
    ["is_average", "antibody", "site", "sequential_site"], as_index=False
).agg(
    max_escape=("escape", "max"),
    min_escape=("escape", "min"),
    mag_escape=("escape", lambda vals: vals.abs().sum()),
    mag_positive_escape=("escape", lambda vals: vals.clip(lower=0).sum()),
    mag_negative_escape=("escape", lambda vals: vals.clip(upper=0).abs().sum()),
)

# step 2: collapse over antibodies, keeping the most extreme value of each
# metric, then reshape to long form with one row per site and metric
site_id_cols = ["is_average", "site", "sequential_site"]
site_escape_df = (
    per_antibody_site_df
    .groupby(site_id_cols, as_index=False)
    .agg(
        {
            "max_escape": "max",
            "min_escape": "min",
            "mag_escape": "max",
            "mag_positive_escape": "max",
            "mag_negative_escape": "max",
        }
    )
    .melt(id_vars=site_id_cols, var_name="site metric", value_name="site escape")
)

# highlight the site currently under the mouse; nothing is selected otherwise
site_selection = alt.selection_point(fields=["site"], on="mouseover", empty=False)

# dropdown choosing which site metric is shown, starting on positive escape
site_metric_options = site_escape_df["site metric"].unique()
site_metric_dropdown = alt.binding_select(
    name="site metric",
    options=site_metric_options,
)
site_metric_selection = alt.selection_point(
    fields=["site metric"],
    value="mag_positive_escape",
    bind=site_metric_dropdown,
)

# tooltips for every column, showing float columns with two decimal places
site_tooltips = []
for col in site_escape_df.columns:
    if site_escape_df[col].dtype == float:
        site_tooltips.append(alt.Tooltip(col, format=".2f"))
    else:
        site_tooltips.append(col)

# the site under the mouse is drawn larger, opaque, and with a visible outline
highlight_stroke_width = alt.condition(site_selection, alt.value(2), alt.value(0))
highlight_opacity = alt.condition(site_selection, alt.value(1), alt.value(0.35))
highlight_size = alt.condition(site_selection, alt.value(70), alt.value(30))

# one row of points per `is_average` category, filtered to the chosen metric
site_escape_chart = (
    alt.Chart(site_escape_df)
    .add_params(site_selection, site_metric_selection)
    .transform_filter(site_metric_selection)
    .encode(
        alt.X(
            "site",
            sort=alt.SortField("sequential_site"),
            scale=alt.Scale(nice=False, zero=False),
        ),
        alt.Y("site escape"),
        alt.Row("is_average", title=None),
        tooltip=site_tooltips,
        strokeWidth=highlight_stroke_width,
        opacity=highlight_opacity,
        size=highlight_size,
    )
    .mark_circle(fill="black", stroke="red")
    .configure_axis(grid=False)
    .resolve_scale(y="independent")
    .properties(
        width=600,
        height=150,
        title="Escape at each site for average of antibodies or max for any antibody",
    )
)

site_escape_chart

In [41]:
# NOTE(review): prints the built-in documentation for `alt.binding_select`;
# this looks like leftover exploration rather than part of the analysis --
# consider deleting this cell (and its output) before sharing.
help(alt.binding_select)

Help on function binding_select in module altair.vegalite.v5.api:

binding_select(self, input=Undefined, options=Undefined, debounce=Undefined, element=Undefined, labels=Undefined, name=Undefined, **kwds)
    A select binding
    Mapping(required=[input, options])
    
    Parameters
    ----------
    
    input : enum('radio', 'select')
    
    options : List(Any)
        An array of options to select from.
    debounce : float
        If defined, delays event handling until the specified milliseconds have elapsed
        since the last event was fired.
    element : :class:`Element`
        An optional CSS selector string indicating the parent element to which the input
        element should be added. By default, all input elements are added within the parent
        container of the Vega view.
    labels : List(string)
        An array of label strings to represent the ``options`` values. If unspecified, the
        ``options`` value will be coerced to a string and used as the la