# Affinity effects of non-RBD mutations in natural sequences

In [1]:
# this cell is tagged as `parameters` for `papermill` parameterization
# path to the CSV of DMS measurements (read with `pd.read_csv` further down)
dms_summary_csv = None
# path to the JSON of Pango clade information (read with `json.load` further down)
pango_consensus_seqs_json = None

In [53]:
# Defaults for interactive use.  Only fill in a path that is still `None` so
# that values injected by `papermill` (which land right after the `parameters`
# cell) are not silently overwritten by these hard-coded ones.
if dms_summary_csv is None:
    dms_summary_csv = "results/summaries/summary.csv"
if pango_consensus_seqs_json is None:
    pango_consensus_seqs_json = "results/compare_natural/pango-consensus-sequences_summary.json"

# clades whose descendants are analyzed (the starting clades themselves are
# dropped from the clade table built below)
starting_clades = ["XBB"]

# initial positions of the chart sliders and the histogram bin width
init_min_cell_entry = -1.5
init_affinity_lower = -1.5
init_affinity_upper = 1.5
affinity_bin_width = 0.1
init_min_clade_count = 2

import os

# NOTE(review): this moves the working directory up one level, presumably so
# the relative `results/...` paths above resolve when the notebook is run from
# its own subdirectory -- confirm this is also wanted for `papermill` runs.
os.chdir("../")

In [3]:
import collections
import json

import altair as alt

import pandas as pd

# turn off Altair's maximum-rows check so the full data frames can be charted;
# the result is bound to `_`, presumably to keep it out of the cell output
_ = alt.data_transformers.disable_max_rows()

Get new spike mutations in Pango clades descended from starting clades:

In [4]:
# read the DMS summary, then shorten two of its phenotype column names
dms_summary = pd.read_csv(dms_summary_csv)
dms_summary = dms_summary.rename(
    columns={
        "spike mediated entry": "cell entry",
        "human sera escape": "sera escape",
    }
)

# load the Pango clade information (looked up by clade name below)
with open(pango_consensus_seqs_json) as f:
    pango_clades = json.load(f)

def build_records(c, recs):
    """Add Pango clade `c` and all of its descendants to `recs`.

    Clades are visited depth-first in pre-order, following each clade's
    "children" list in the module-level `pango_clades`.  A clade already
    present in ``recs["clade"]`` is skipped together with its subtree, so a
    clade reachable by several paths (or by several calls sharing the same
    `recs`) is recorded only once.

    Parameters
    ----------
    c : str
        Name of the clade to start from; must be a key of `pango_clades`.
    recs : dict of lists
        Modified in place.  For every newly visited clade one entry is
        appended to each of ``recs["clade"]``, ``recs["date"]`` (the clade's
        "designationDate") and ``recs["new_spike_muts"]`` (its new spike
        substitutions and deletions with the leading ``S:`` removed).  The
        three keys must resolve to lists, e.g. via
        ``collections.defaultdict(list)``.

    Returns
    -------
    None

    """
    # set for O(1) "already recorded" checks instead of scanning the list
    seen = set(recs["clade"])
    # explicit stack instead of recursion; children are pushed reversed so
    # they are popped in their listed order (same order as a recursive walk)
    stack = [c]
    while stack:
        clade = stack.pop()
        if clade in seen:
            continue
        seen.add(clade)
        info = pango_clades[clade]
        recs["clade"].append(clade)
        recs["date"].append(info["designationDate"])
        recs["new_spike_muts"].append(
            [
                mut.split(":")[1]
                for field in ["aaSubstitutionsNew", "aaDeletionsNew"]
                for mut in info[field]
                if mut.startswith("S:")
            ]
        )
        stack.extend(reversed(info["children"]))
        
# walk the clade tree from every starting clade, sharing one set of records
records = collections.defaultdict(list)
for starting_clade in starting_clades:
    build_records(starting_clade, records)

# one row per clade; keep only the descendants, not the starting clades themselves
pango_df = pd.DataFrame(records)
pango_df = pango_df[~pango_df["clade"].isin(starting_clades)]

Get the counts of how many times each mutation newly occurs in a clade:

In [5]:
mut_counts = (
    pango_df
    # one row per (clade, new spike mutation); clades with none yield a null row
    .explode("new_spike_muts")
    .dropna(subset=["new_spike_muts"])
    .rename(columns={"new_spike_muts": "mutation"})
    # number of clades in which each mutation newly occurs
    .groupby("mutation", as_index=False)
    .agg(count=("clade", "count"))
    # the site is everything between the first and last character of the mutation
    .assign(site=lambda df: df["mutation"].str.slice(1, -1).astype(int))
    # attach the region of each site; sites absent from the DMS data get a null region
    .merge(
        dms_summary[["site", "region"]].drop_duplicates(),
        how="left",
        validate="many_to_one",
    )
    .dropna(subset=["region"])
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)

Add DMS phenotypes:

In [6]:
# DMS phenotypes carried through the rest of the analysis
phenotypes = ["sera escape", "ACE2 affinity", "cell entry"]
assert set(phenotypes) <= set(dms_summary.columns)

# site -> wildtype amino acid in the DMS
dms_wt = dict(zip(dms_summary["site"], dms_summary["wildtype"]))

# site -> region in the DMS
site_to_region = dict(zip(dms_summary["site"], dms_summary["region"]))

# (site, wildtype, mutant) -> {phenotype: value}
dms_data = dms_summary.set_index(["site", "wildtype", "mutant"])[phenotypes].to_dict(
    orient="index"
)

def mut_dms(m):
    """Get DMS phenotypes for a mutation.

    Uses the module-level `phenotypes`, `dms_wt`, `site_to_region` and
    `dms_data`; none of them is modified.

    Parameters
    ----------
    m : str or null
        Mutation written as parent residue, site number, new residue
        (first character, middle digits, last character).

    Returns
    -------
    dict
        A new dict with one entry per name in `phenotypes` followed by
        ``"is_RBD"``.  If `m` is null or its site is not in `dms_wt`, every
        value (including ``"is_RBD"``) is ``pd.NA``.  Otherwise ``"is_RBD"``
        says whether the site's region equals ``"RBD"`` and the phenotype
        values are:

        - the measured values, if the parent is the DMS wildtype;
        - the negated measured values of the opposite mutation, if the new
          residue is the DMS wildtype;
        - the new residue's values minus the parent's values (both measured
          from the DMS wildtype) in all other cases;
        - ``pd.NA`` whenever a required measurement is missing.

    """
    null_d = {p: pd.NA for p in phenotypes}

    site = None if pd.isnull(m) else int(m[1:-1])
    if site is None or site not in dms_wt:
        return {**null_d, "is_RBD": pd.NA}

    parent, mut = m[0], m[-1]
    wt = dms_wt[site]
    try:
        if parent == wt:
            # build a new dict rather than handing out (and later writing
            # "is_RBD" into) the entry stored in the shared `dms_data` table
            measured = dms_data[(site, parent, mut)]
            d = {p: measured[p] for p in phenotypes}
        elif mut == wt:
            # reversion to the DMS wildtype: opposite of the forward effect
            forward = dms_data[(site, mut, parent)]
            d = {p: -forward[p] for p in phenotypes}
        else:
            # neither residue is the DMS wildtype: difference of the two effects
            parent_d = dms_data[(site, wt, parent)]
            mut_d = dms_data[(site, wt, mut)]
            d = {p: mut_d[p] - parent_d[p] for p in phenotypes}
    except KeyError:
        d = dict(null_d)

    d["is_RBD"] = (site_to_region[site] == "RBD")
    return d

# add one column per phenotype with the DMS value of each observed mutation
# (`mut_dms` is evaluated again for every phenotype)
for phenotype in phenotypes:
    mut_counts[phenotype] = mut_counts["mutation"].map(
        lambda m: mut_dms(m)[phenotype]
    )

# show the observed mutations ordered by their ACE2 affinity value
mut_counts.sort_values("ACE2 affinity")

Unnamed: 0,mutation,count,site,region,sera escape,ACE2 affinity,cell entry
5,L455F,10,455,RBD,0.1516,-0.9594,0.1212
195,L455W,1,455,RBD,0.2602,-0.8535,-0.06305
74,N856K,1,856,S2,0.3972,-0.8378,0.06736
148,F486A,1,486,RBD,0.8999,-0.8261,0.3447
103,V445S,1,445,RBD,0.01139,-0.7449,-0.00691
...,...,...,...,...,...,...,...
140,E554A,1,554,other,,,
151,E484T,1,484,RBD,,,3.13687
193,L242I,1,242,NTD,,,0.07593
206,G257V,1,257,NTD,,,-0.0315


Get data frame of all mutations with DMS data:

In [7]:
# drop the wildtype-to-wildtype rows
all_muts_dms = dms_summary.loc[dms_summary["wildtype"] != dms_summary["mutant"]]
# name each mutation as wildtype + site + mutant, e.g. "F2C"
all_muts_dms = all_muts_dms.assign(
    mutation=(
        all_muts_dms["wildtype"]
        + all_muts_dms["site"].astype(str)
        + all_muts_dms["mutant"]
    )
)
# keep only the mutation name, its region, and the phenotypes of interest
all_muts_dms = all_muts_dms[["mutation", "region", *phenotypes]]
all_muts_dms

Unnamed: 0,mutation,region,sera escape,ACE2 affinity,cell entry
0,F2C,other,0.01114,0.02151,0.10100
1,F2L,other,0.01876,-0.26980,0.09432
2,F2S,other,0.03169,-0.05642,0.05844
4,V3A,other,0.02402,-0.04977,-0.04154
5,V3F,other,0.11270,0.11390,-0.10570
...,...,...,...,...,...
8436,L1193M,S2,,0.04759,0.04979
8437,L1193P,S2,,,-6.42900
8439,K1211E,S2,,,-0.68540
8440,K1211R,S2,,,-0.52650


In [8]:
# slider: mutations with cell entry below this value are filtered out
cell_entry_slider = alt.param(
    bind=alt.binding_range(
        min=all_muts_dms["cell entry"].min(),
        max=0,
        name="minimum cell_entry",
    ),
    value=init_min_cell_entry,
)

# slider: ACE2 affinity values below this bound are raised to it
affinity_lower_slider = alt.param(
    bind=alt.binding_range(
        min=all_muts_dms["ACE2 affinity"].min(),
        max=0,
        name="clip ACE2 affinity at this lower bound",
    ),
    value=init_affinity_lower,
)

# slider: ACE2 affinity values above this bound are lowered to it
affinity_upper_slider = alt.param(
    bind=alt.binding_range(
        min=0,
        max=all_muts_dms["ACE2 affinity"].max(),
        name="clip ACE2 affinity at this upper bound",
    ),
    value=init_affinity_upper,
)

# per-region density of the clipped ACE2 affinities of all DMS mutations
density_plot = (
    alt.Chart(all_muts_dms)
    .mark_area()
    .properties(width=180, height=150)
    # transforms run in this order: filter, clip, then estimate the density
    .transform_filter(alt.datum["cell entry"] >= cell_entry_slider)
    .transform_calculate(
        affinity_clipped=alt.expr.min(
            alt.expr.max(alt.datum["ACE2 affinity"], affinity_lower_slider),
            affinity_upper_slider,
        ),
    )
    .transform_density(
        "affinity_clipped",
        groupby=["region"],
        as_=["affinity_clipped", "density"],
    )
    .encode(
        column="region",
        x=alt.X("affinity_clipped:Q", scale=alt.Scale(nice=False)),
        y=alt.Y("density:Q"),
    )
    .add_params(cell_entry_slider, affinity_lower_slider, affinity_upper_slider)
)

density_plot

In [68]:
# slider: mutations with cell entry below this value are filtered out
cell_entry_slider = alt.param(
    bind=alt.binding_range(
        min=all_muts_dms["cell entry"].min(),
        max=0,
        name="minimum cell_entry",
    ),
    value=init_min_cell_entry,
)

# slider: ACE2 affinity values below this bound are raised to it
affinity_lower_slider = alt.param(
    bind=alt.binding_range(
        min=all_muts_dms["ACE2 affinity"].min(),
        max=0,
        name="clip ACE2 affinity at this lower bound",
    ),
    value=init_affinity_lower,
)

# slider: ACE2 affinity values above this bound are lowered to it
affinity_upper_slider = alt.param(
    bind=alt.binding_range(
        min=0,
        max=all_muts_dms["ACE2 affinity"].max(),
        name="clip ACE2 affinity at this upper bound",
    ),
    value=init_affinity_upper,
)

# per-region histogram of the clipped ACE2 affinities of all DMS mutations
density_hist = (
    alt.Chart(all_muts_dms)
    .mark_bar()
    .properties(width=180, height=150)
    # transforms run in this order: filter on cell entry, then clip affinity
    .transform_filter(alt.datum["cell entry"] >= cell_entry_slider)
    .transform_calculate(
        affinity_clipped=alt.expr.min(
            alt.expr.max(alt.datum["ACE2 affinity"], affinity_lower_slider),
            affinity_upper_slider,
        ),
    )
    .encode(
        column="region",
        x=alt.X(
            "affinity_clipped:Q",
            title="ACE2 affinity",
            bin=alt.BinParams(step=affinity_bin_width),
            scale=alt.Scale(nice=False),
        ),
        y=alt.Y("count():Q", title="number of mutations"),
    )
    .add_params(cell_entry_slider, affinity_lower_slider, affinity_upper_slider)
)

density_hist

In [69]:
# slider: only show mutations that newly occur in at least this many clades
clade_count_slider = alt.param(
    value=init_min_clade_count,
    bind=alt.binding_range(
        name="minimum clades with new occurrence of mutation",
        min=1,
        step=1,
        max=min(10, mut_counts["count"].max()),
    ),
)

# Dot plot of the mutations observed in clades: one dot per mutation, stacked
# within each (affinity bin, region) by rank.
wilkinson_dot_plot = (
    alt.Chart(
        mut_counts
        # the chart data calls the column "affinity" from here on
        .rename(columns={"ACE2 affinity": "affinity"})
        .query("affinity.notnull()")
    )
    .transform_filter(alt.datum["cell entry"] >= cell_entry_slider)
    .transform_filter(alt.datum["count"] >= clade_count_slider)
    .transform_calculate(
        # Read the renamed "affinity" field: "ACE2 affinity" no longer exists
        # in this chart's data, so clipping it produced no usable value.
        affinity_clipped=alt.expr.min(
            alt.expr.max(alt.datum["affinity"], affinity_lower_slider),
            affinity_upper_slider,
        ),
    )
    .transform_bin(
        "binned_affinity",
        # bin the clipped value so the clip sliders act on this panel the same
        # way they act on the histogram it is stacked under
        field="affinity_clipped",
        bin=alt.BinParams(step=affinity_bin_width),
    )
    .transform_window(
        rank="rank()",
        groupby=["binned_affinity", "region"],
    )
    .encode(
        x=alt.X("binned_affinity:Q"),
        # the tooltip reports the unclipped affinity
        tooltip=[
            "mutation",
            alt.Tooltip("count", title="clade count"),
            alt.Tooltip("affinity", title="ACE2 affinity"),
        ],
        y=alt.Y("rank:O", sort="descending"),
        column="region",
    )
    .mark_circle()
    .properties(width=180, height=150)
    .add_params(
        cell_entry_slider,
        affinity_lower_slider,
        affinity_upper_slider,
        clade_count_slider,
    )
)

# histogram of all DMS mutations on top, observed mutations underneath
chart = (
    (density_hist & wilkinson_dot_plot)
)

chart

In [74]:
# spot check: show the DMS measurements for the single mutation A570D
all_muts_dms.query("mutation == 'A570D'")

Unnamed: 0,mutation,region,sera escape,ACE2 affinity,cell entry
4535,A570D,other,-0.5749,1.234,-1.375


In [11]:
# display the DMS summary table (columns renamed as above) for reference
dms_summary

Unnamed: 0,site,wildtype,mutant,sera escape,cell entry,ACE2 affinity,sequential_site,region
0,2,F,C,0.01114,0.10100,0.02151,2,other
1,2,F,L,0.01876,0.09432,-0.26980,2,other
2,2,F,S,0.03169,0.05844,-0.05642,2,other
3,2,F,F,0.00000,0.00000,0.00000,2,other
4,3,V,A,0.02402,-0.04154,-0.04977,3,other
...,...,...,...,...,...,...,...,...
8439,1211,K,E,,-0.68540,,1207,S2
8440,1211,K,R,,-0.52650,,1207,S2
8441,1211,K,K,0.00000,0.00000,0.00000,1207,S2
8442,1212,W,R,,-2.37700,-0.21050,1208,other
