# Compare DMS to natural sequence evolution

In [1]:
# this cell is tagged parameters for papermill parameterization
# Every name below is a `None` placeholder; each is used by later cells as noted.
dms_summary_csv = None  # path of the DMS summary CSV, read with `pd.read_csv`
pango_consensus_seqs_json = None  # path of the Pango clade JSON, read with `json.load`
starting_clade = None  # clade from which `build_records` starts collecting clades
dms_clade = None  # clade relative to which `mutations_from_dms_clade` is computed
n_random = None  # number of randomizations of the DMS data
exclude_clades = None  # clades dropped from `pango_df`

In [2]:
# NOTE(review): this cell assigns concrete values to every name declared in the parameters
# cell and runs after it, so these values replace the placeholders set there. Confirm it is
# meant to stay when the notebook is parameterized externally.
import os
# Relative change of working directory: running this cell a second time moves up another level.
os.chdir("../")

dms_summary_csv = "results/summaries/summary.csv"
pango_consensus_seqs_json = "pango-consensus-sequences_summary.json"
starting_clade = "XBB"
dms_clade = "XBB.1.5"
n_random = 10
exclude_clades = ["HK.3.1"]  # https://github.com/corneliusroemer/pango-sequences/issues/6

In [3]:
import collections
import json
import re

import altair as alt

import numpy

import pandas as pd

_ = alt.data_transformers.disable_max_rows()

## Read Pango clades and mutations

In [4]:
# parse the Pango clade JSON into a dict keyed by clade name
with open(pango_consensus_seqs_json) as f:
    pango_clades = json.loads(f.read())

def n_child_clades(c):
    """Count every descendant clade (children, their children, ...) of Pango clade `c`."""
    total = 0
    for child in pango_clades[c]["children"]:
        # a direct child counts once, plus all of its own descendants
        total += 1 + n_child_clades(child)
    return total

def build_records(c, recs):
    """Append information on clade `c` and then all of its descendants to `recs`.

    `recs` maps the keys "clade", "n_child_clades", "date" and "mutations_from_ref"
    to lists; one entry is appended to each list per clade, parent before children.
    Only spike ("S:") substitutions and deletions are kept, with the gene prefix removed.
    """
    recs["clade"].append(c)
    recs["n_child_clades"].append(n_child_clades(c))

    clade_info = pango_clades[c]
    recs["date"].append(clade_info["designationDate"])

    spike_mutation_lists = recs["mutations_from_ref"]
    spike_mutations = []
    for field in ("aaSubstitutions", "aaDeletions"):
        for mut in clade_info[field]:
            if mut.startswith("S:"):
                spike_mutations.append(mut.split(":")[1])
    spike_mutation_lists.append(spike_mutations)

    # recurse so each child's records follow those of its parent
    for c_child in clade_info["children"]:
        build_records(c_child, recs)
        
# collect records for the starting clade and all of its descendants
records = collections.defaultdict(list)
build_records(starting_clade, records)

pango_df = pd.DataFrame(records).query("clade not in @exclude_clades")

# spike mutations relative to the reference for the two clades we compare against
starting_clade_mutations_from_ref, dms_clade_mutations_from_ref = (
    pango_df.set_index("clade").at[clade, "mutations_from_ref"]
    for clade in (starting_clade, dms_clade)
)

def mutations_from(muts, from_muts):
    """Get the mutations in one sequence relative to another sequence.

    Both arguments list mutations relative to the same reference, each written as
    parent + site + mutant using upper-case letters or ``-`` (e.g. ``G252V``, ``Y144-``).

    Parameters
    ----------
    muts : iterable of str
        Mutations (from the reference) of the sequence of interest.
    from_muts : iterable of str
        Mutations (from the reference) of the sequence to express `muts` relative to.

    Returns
    -------
    list of str
        Mutations that convert the `from_muts` sequence into the `muts` sequence,
        ordered by site.

    Raises
    ------
    AssertionError
        If a differing mutation is not of the expected form, or the mutations at a
        site cannot be assigned one to each sequence.
    ValueError
        If more than two differing mutations share a site.
    """
    muts_set = set(muts)
    from_muts_set = set(from_muts)
    # only mutations present in exactly one of the two sequences matter
    new_muts = muts_set.symmetric_difference(from_muts_set)
    # raw string: `\-` and `\d` are regex escapes, not (invalid) string escapes
    assert all(re.fullmatch(r"[A-Z\-]\d+[A-Z\-]", m) for m in new_muts)

    # group the differing mutations by site
    new_muts_d = collections.defaultdict(list)
    for m in new_muts:
        new_muts_d[int(m[1: -1])].append(m)

    new_muts_list = []
    for site in sorted(new_muts_d):
        ms = new_muts_d[site]
        if len(ms) == 1:
            m = ms[0]
            if m in muts_set:
                # mutation only in the sequence of interest: keep as is
                new_muts_list.append(m)
            else:
                # mutation only in the other sequence: it is reverted, so flip it
                assert m in from_muts_set
                new_muts_list.append(m[-1] + m[1: -1] + m[0])
        else:
            # both sequences are mutated at this site, to different identities
            m, from_m = ms
            if m not in muts_set:
                from_m, m = m, from_m
            assert m in muts_set and from_m in from_muts_set
            # parent is the other sequence's mutant; site and mutant come from `m`
            new_muts_list.append(from_m[-1] + m[1: ])
    return new_muts_list

# express each clade's spike mutations relative to the starting clade (as a "; "-separated
# string) and relative to the DMS clade (as a list), then order clades by date
pango_df = (
    pango_df
    .assign(
        mutations_from_start_clade=lambda x: x["mutations_from_ref"].map(
            lambda ref_muts: "; ".join(
                mutations_from(ref_muts, starting_clade_mutations_from_ref)
            )
        ),
        mutations_from_dms_clade=lambda x: x["mutations_from_ref"].map(
            lambda ref_muts: mutations_from(ref_muts, dms_clade_mutations_from_ref)
        ),
        date=lambda x: pd.to_datetime(x["date"]),
    )
    .drop(columns="mutations_from_ref")
    .sort_values("date")
    .reset_index(drop=True)
)

pango_df

Unnamed: 0,clade,n_child_clades,date,mutations_from_start_clade,mutations_from_dms_clade
0,XBB,606,2022-09-17,,"[V252G, P486S]"
1,XBB.1,528,2022-10-03,G252V,[P486S]
2,XBB.1.1,0,2022-10-15,G252V,[P486S]
3,XBB.2,60,2022-10-15,D253G,"[V252G, D253G, P486S]"
4,XBB.3,5,2022-10-15,,"[V252G, P486S]"
...,...,...,...,...,...
601,JG.2,0,2023-09-04,Q52H; G75V; G252V; F456L; S486P,"[Q52H, G75V, F456L]"
602,XBB.1.16.24,0,2023-09-04,E180V; K182E; G252V; K478R; S486P; P521T; E554K,"[E180V, K182E, K478R, P521T, E554K]"
603,HK.11,0,2023-09-04,Q52H; G184S; G252V; F456L; S486P,"[Q52H, G184S, F456L]"
604,FY.4.1.2,0,2023-09-06,Y200C; G252V; Y451H; S486P; S494P,"[Y200C, Y451H, S494P]"


## Assign DMS phenotypes to Pango clades

First define a function that assigns DMS phenotypes to mutations:

In [5]:
# load the DMS summary data
dms_summary = pd.read_csv(dms_summary_csv)

# DMS phenotypes to analyze; each must be a column of the DMS summary
phenotypes = ["human sera escape", "spike mediated entry", "ACE2 affinity"]
assert set(phenotypes) <= set(dms_summary.columns)

# site -> wildtype identity in the DMS
dms_wt = dms_summary.set_index("site").loc[:, "wildtype"].to_dict()

# site -> region in the DMS
site_to_region = dms_summary.set_index("site").loc[:, "region"].to_dict()

def mut_dms(m, dms_data):
    """Get DMS phenotypes for a mutation.

    Parameters
    ----------
    m : str or null
        Mutation written as parent + site + mutant (e.g. ``F456L``), or a null value.
    dms_data : dict
        Maps ``(site, wildtype, mutant)`` to a dict of phenotype values. It is only
        read: the returned dict is always a new object.

    Returns
    -------
    dict
        One entry per name in `phenotypes` followed by ``"is_RBD"``.

        - Parent is the DMS wildtype: the measured values.
        - Mutant is the DMS wildtype: the negated values of the reverse mutation.
        - Neither is the DMS wildtype: mutant values minus parent values.
        - Needed measurement absent from `dms_data`: phenotypes are ``pd.NA``.
        - `m` null or site not in the DMS: every entry, including ``"is_RBD"``,
          is ``pd.NA``.
    """
    null_d = {k: pd.NA for k in phenotypes}
    if pd.isnull(m) or int(m[1: -1]) not in dms_wt:
        d = null_d
        d["is_RBD"] = pd.NA
    else:
        parent = m[0]
        site = int(m[1: -1])
        mut = m[-1]
        wt = dms_wt[site]
        if parent == wt:
            try:
                # copy so adding "is_RBD" below does not write into the caller's `dms_data`
                d = dict(dms_data[(site, parent, mut)])
            except KeyError:
                d = null_d
        elif mut == wt:
            # reversion to the DMS wildtype: negate the effect of the forward mutation
            try:
                d = {k: -v for (k, v) in dms_data[(site, mut, parent)].items()}
            except KeyError:
                d = null_d
        else:
            # neither identity is the DMS wildtype: difference of the two measured effects
            try:
                parent_d = dms_data[(site, wt, parent)]
                mut_d = dms_data[(site, wt, mut)]
                d = {p: mut_d[p] - parent_d[p] for p in phenotypes}
            except KeyError:
                d = null_d
        d["is_RBD"] = (site_to_region[site] == "RBD")
    assert list(d) == phenotypes + ["is_RBD"]
    return d

Now assign phenotypes to pango clades.
We do this both using the actual DMS data and randomizing the DMS data among measured mutations:

In [6]:

def get_pango_dms_df(dms_data_dict):
    """Given dict mapping mutations to DMS data, get data frame of values for Pango clades.

    Parameters
    ----------
    dms_data_dict : dict
        Maps ``(site, wildtype, mutant)`` to a dict of phenotype values; passed to `mut_dms`.

    Returns
    -------
    pandas.DataFrame
        One row per clade in `pango_df` and DMS phenotype. ``phenotype`` is the sum of the
        effects of the clade's mutations relative to the DMS clade; ``phenotype_RBD_only``
        and ``phenotype_nonRBD_only`` are the same sums restricted by region. Mutations
        with no DMS value contribute zero and are listed in the ``..._missing_data`` column.

    Raises
    ------
    AssertionError
        If the result does not contain exactly the clades in `pango_df`, or the RBD and
        non-RBD sums do not add up to the full sum.
    """
    pango_dms_df = (
        pango_df
        # put one mutation in each column
        .explode("mutations_from_dms_clade")
        .rename(columns={"mutations_from_dms_clade": "mutation"})
        # to add multiple columns: https://stackoverflow.com/a/46814360
        .apply(
            lambda cols: pd.concat([cols, pd.Series(mut_dms(cols["mutation"], dms_data_dict))]),
            axis=1,
        )
        .melt(
            id_vars=["clade", "date", "n_child_clades", "mutations_from_start_clade", "mutation", "is_RBD"],
            value_vars=phenotypes,
            var_name="DMS_phenotype",
            value_name="mutation_effect",
        )
        .assign(
            mutations_from_dms_clade=lambda x: x.groupby(["clade", "DMS_phenotype"])["mutation"].transform(
                lambda ms: "; ".join([m for m in ms if not pd.isnull(m)])
            ),
            # mutations that exist but have no DMS value for this phenotype
            mutation_missing=lambda x: x["mutation"].where(
                x["mutation_effect"].isnull() & x["mutation"].notnull(),
                pd.NA,
            ),
            mutations_from_dms_clade_missing_data=lambda x: (
                x.groupby(["clade", "DMS_phenotype"])["mutation_missing"]
                .transform(lambda ms: "; ".join([m for m in ms if not pd.isnull(m)]))
            ),
            # Cast explicitly instead of relying on `fillna` to downcast object columns:
            # without a real bool dtype `~` is a bitwise invert (~True == -2), and
            # `numpy.allclose` below needs numeric rather than object sums.
            mutation_effect=lambda x: x["mutation_effect"].fillna(0).astype(float),
            is_RBD=lambda x: x["is_RBD"].fillna(False).astype(bool),
            mutation_effect_RBD=lambda x: x["mutation_effect"] * x["is_RBD"].astype(int),
            mutation_effect_nonRBD=lambda x: x["mutation_effect"] * (~x["is_RBD"]).astype(int),
        )
        .groupby(
            [
                "clade",
                "date",
                "n_child_clades",
                "mutations_from_start_clade",
                "mutations_from_dms_clade",
                "mutations_from_dms_clade_missing_data",
                "DMS_phenotype",
            ],
            as_index=False,
        )
        .aggregate(
            phenotype=pd.NamedAgg("mutation_effect", "sum"),
            phenotype_RBD_only=pd.NamedAgg("mutation_effect_RBD", "sum"),
            phenotype_nonRBD_only=pd.NamedAgg("mutation_effect_nonRBD", "sum"),
        )
        .rename(
            columns={
                "mutations_from_start_clade": f"mutations_from_{starting_clade}",
                "mutations_from_dms_clade": f"mutations_from_{dms_clade}",
                "mutations_from_dms_clade_missing_data": f"mutations_from_{dms_clade}_missing_data",
            },
        )
        .sort_values(["date", "DMS_phenotype"])
        .reset_index(drop=True)
    )

    # no clade may be lost, and the two regions must partition the full-spike sum
    assert set(pango_df["clade"]) == set(pango_dms_df["clade"])
    assert numpy.allclose(
        pango_dms_df["phenotype"],
        pango_dms_df["phenotype_RBD_only"] + pango_dms_df["phenotype_nonRBD_only"]
    )

    return pango_dms_df

# DMS data keyed by (site, wildtype, mutant), using the measurements as they are
dms_data_dict_actual = (
    dms_summary
    .set_index(["site", "wildtype", "mutant"])
    .loc[:, phenotypes]
    .to_dict("index")
)
pango_dms_df = get_pango_dms_df(dms_data_dict_actual)

# Now get the randomized DMS data mapped to phenotype
pango_dms_dfs_rand = []
for irandom in range(1, n_random + 1):
    # seed with the randomization number so each randomization is reproducible
    rng = numpy.random.default_rng(irandom)
    dms_summary_rand = dms_summary.copy()
    # Randomize the non-null DMS data for each phenotype by permuting the values among the
    # mutations that have a measurement. The values themselves must be permuted: shuffling
    # whole rows (e.g. `set_index(...).sample(frac=1).reset_index()`) carries each mutation
    # along with its value and so leaves the mutation-to-value mapping unchanged.
    for phenotype in phenotypes:
        measured = dms_summary_rand[phenotype].notnull()
        dms_summary_rand.loc[measured, phenotype] = rng.permutation(
            dms_summary_rand.loc[measured, phenotype].to_numpy()
        )
    assert dms_summary_rand.shape == dms_summary.shape
    # randomization must not change which mutations have measurements
    assert (
        (dms_summary_rand[phenotypes].isnull() == dms_summary[phenotypes].isnull()).all().all()
    )
    # map to clades once per randomization, after every phenotype has been shuffled
    dms_data_dict_rand = (
        dms_summary_rand
        .set_index(["site", "wildtype", "mutant"])
        [phenotypes]
        .to_dict(orient="index")
    )
    pango_dms_dfs_rand.append(get_pango_dms_df(dms_data_dict_rand).assign(randomize=irandom))
# all randomizations concatenated
pango_dms_df_rand = pd.concat(pango_dms_dfs_rand)
# average across randomizations: group on every column that identifies a clade / phenotype
# row and take the mean (not the sum) of the phenotype values over the randomizations
cols_to_avg = ["phenotype", "phenotype_RBD_only", "phenotype_nonRBD_only"]
pango_dms_df_rand_avg = (
    pango_dms_df_rand
    .groupby(
        [c for c in pango_dms_df.columns if c not in cols_to_avg],
        as_index=False,
    )
    .aggregate({c: "mean" for c in cols_to_avg})
)

## Plot phenotypes of Pango clades
Plot phenotypes of Pango clades versus their designation dates:

In [13]:
# phenotype column -> label of the spike region it covers
region_cols = {
    "phenotype": "full spike",
    "phenotype_RBD_only": "RBD only",
    "phenotype_nonRBD_only": "non-RBD only",
}

# tidy data frame for plotting: one row per clade, DMS phenotype and spike region
pango_chart_df = pango_dms_df.melt(
    id_vars=[c for c in pango_dms_df if c not in region_cols],
    value_vars=list(region_cols),
    var_name="spike_region",
    value_name="phenotype value",
).assign(
    spike_region=lambda x: x["spike_region"].map(region_cols),
    # count the non-empty entries of the ";"-separated mutation string
    n_mutations=lambda x: x[f"mutations_from_{starting_clade}"].map(
        lambda s: sum(1 for m in s.split(";") if m)
    ),
)

# an unusually large number of mutations suggests a problem in the input JSON
if pango_chart_df["n_mutations"].max() > 12:
    raise ValueError(
        "check high number of mutations to ensure not bug in JSON like this one:\n"
        "https://github.com/corneliusroemer/pango-sequences/issues/6\n\n"
        + str(pango_chart_df.query("n_mutations > 12"))
    )

# Scatter plot of clade phenotype versus designation date, faceted by DMS phenotype (rows)
# and spike region (columns).
pango_chart = (
    alt.Chart(pango_chart_df)
    .encode(
        x=alt.X("date", title="designation date of clade"),
        y=alt.Y("phenotype value", title=None),
        row=alt.Row(
            "DMS_phenotype",
            title=None,
            header=alt.Header(labelFontSize=12, labelFontStyle="bold"),
        ),
        column=alt.Column(
            "spike_region",
            sort=list(region_cols),
            title=None,
            header=alt.Header(labelFontSize=12, labelFontStyle="bold"),
        ),
        # Column names built from clade names (e.g. ones containing "XBB.1.5") include ".",
        # which Vega-Lite treats as nested-field access unless escaped; without escaping
        # those tooltips are empty. The escaped columns hold mutation strings, so their type
        # is given explicitly as nominal, and the title keeps the unescaped name.
        tooltip=[
            alt.Tooltip(c.replace(".", "\\."), type="nominal", title=c) if "." in c else c
            for c in pango_chart_df.columns
        ],
    )
    .mark_circle(opacity=0.5)
    .properties(width=300, height=150)
    .configure_axis(grid=False)
)

pango_chart

In [8]:
pango_chart_df

Unnamed: 0,clade,date,n_child_clades,mutations_from_XBB,mutations_from_XBB.1.5,mutations_from_XBB.1.5_missing_data,DMS_phenotype,spike_region,phenotype value,n_mutations
0,XBB,2022-09-17,606,,V252G; P486S,,ACE2 affinity,full spike,-0.60347,0
1,XBB,2022-09-17,606,,V252G; P486S,,human sera escape,full spike,-0.15530,0
2,XBB,2022-09-17,606,,V252G; P486S,,spike mediated entry,full spike,-0.21618,0
3,XBB.1,2022-10-03,528,G252V,P486S,,ACE2 affinity,full spike,-0.57830,1
4,XBB.1,2022-10-03,528,G252V,P486S,,human sera escape,full spike,-0.22690,1
...,...,...,...,...,...,...,...,...,...,...
5449,FY.4.1.2,2023-09-06,0,Y200C; G252V; Y451H; S486P; S494P,Y200C; Y451H; S494P,,human sera escape,non-RBD only,0.37270,5
5450,FY.4.1.2,2023-09-06,0,Y200C; G252V; Y451H; S486P; S494P,Y200C; Y451H; S494P,,spike mediated entry,non-RBD only,-0.07328,5
5451,EG.6.1.1,2023-09-07,0,L242I; G252V; F456L; A484K; S486P,L242I; F456L; A484K,L242I,ACE2 affinity,non-RBD only,0.00000,5
5452,EG.6.1.1,2023-09-07,0,L242I; G252V; F456L; A484K; S486P,L242I; F456L; A484K,L242I; A484K,human sera escape,non-RBD only,0.00000,5


In [9]:
# Scratch check: splitting an empty string still yields one (empty) element, which is why
# the `n_mutations` calculation drops empty strings before counting.
"".split(";")

['']