# Compare DMS to natural sequence evolution

In [1]:
# this cell is tagged parameters for papermill parameterization
# Every value below is only a `None` placeholder; a real value has to be assigned
# (by papermill or by a later cell) before the parameter is used.
dms_summary_csv = None  # path of the CSV read with `pd.read_csv` into `dms_summary`
pango_consensus_seqs_json = None  # path of the JSON file loaded into `pango_clades`
fitness_csv = None  # NOTE(review): not used in the cells visible here -- confirm where it is consumed
starting_clade = None  # Pango clade at which `build_records` starts walking the clade tree
# NOTE(review): `dms_clade` is used further down in this notebook but is not declared
# in this parameters cell -- confirm whether it should be a papermill parameter too.

In [2]:
import os
# Move the working directory up one level, presumably so the relative paths
# assigned below resolve -- TODO confirm the expected starting directory.
# NOTE(review): not idempotent -- re-running this cell moves up one more level.
os.chdir("../")

# Hard-coded values for the notebook parameters.
# NOTE(review): these assignments overwrite the values from the papermill
# `parameters` cell above (and presumably any values papermill injects after
# it) -- confirm this cell is meant for interactive use only.
dms_summary_csv = "results/summaries/summary.csv"
pango_consensus_seqs_json = "pango-consensus-sequences_summary.json"
fitness_csv = "aa_fitness.csv"
starting_clade = "XBB"
# clade whose mutations serve as the baseline for `mutations_from_dms_clade`;
# this is the only place in the visible cells where `dms_clade` is defined
dms_clade = "XBB.1.5"

In [3]:
import collections
import json
import re

import altair as alt

import numpy

import pandas as pd

## Read Pango clades and mutations

In [4]:
# Parse the Pango consensus-sequence JSON into `pango_clades`, which the
# functions below index by clade name.
with open(pango_consensus_seqs_json) as f:
    pango_clades = json.loads(f.read())

def n_children(c):
    """Count all descendants of Pango clade `c`.

    The count covers the direct children listed under ``"children"`` in
    ``pango_clades[c]`` plus, recursively, every clade below them.
    """
    total = 0
    for child in pango_clades[c]["children"]:
        # one for the child itself, plus everything underneath it
        total += 1 + n_children(child)
    return total

def build_records(c, recs):
    """Append one record for clade `c`, then for each of its descendants.

    `recs` maps column names to lists and is modified in place: the lists
    under ``"clade"``, ``"n_children"``, ``"date"`` and ``"mutations_from_ref"``
    each gain one entry per visited clade. Clades are visited parent first,
    then each child subtree in the order given by ``"children"``.
    """
    recs["clade"].append(c)
    recs["n_children"].append(n_children(c))

    clade_info = pango_clades[c]
    recs["date"].append(clade_info["designationDate"])

    # keep only entries prefixed with "S:" and strip that prefix
    spike_muts = []
    for field in ("aaSubstitutions", "aaDeletions"):
        for mut in clade_info[field]:
            if mut.startswith("S:"):
                spike_muts.append(mut.split(":")[1])
    recs["mutations_from_ref"].append(spike_muts)

    for c_child in clade_info["children"]:
        build_records(c_child, recs)
        
# Walk the clade tree from `starting_clade`, collecting one list per column.
records = collections.defaultdict(list)
build_records(starting_clade, records)

pango_df = pd.DataFrame(records)

# Look up the `mutations_from_ref` lists of the two clades used as baselines.
pango_by_clade = pango_df.set_index("clade")
starting_clade_mutations_from_ref = pango_by_clade.at[starting_clade, "mutations_from_ref"]
dms_clade_mutations_from_ref = pango_by_clade.at[dms_clade, "mutations_from_ref"]

def mutations_from(muts, from_muts):
    """Get the mutations that separate one sequence from another.

    Both arguments list mutations as strings of the form
    ``<parent><site><mutant>`` (for example ``"G252V"``; ``-`` is accepted in
    place of either letter), and both are taken to be relative to the same
    reference.

    Parameters
    ----------
    muts : collection of str
        Mutations of the sequence of interest.
    from_muts : collection of str
        Mutations of the sequence to express `muts` relative to.

    Returns
    -------
    list of str
        One mutation per differing site, ordered by site number:

        - present only in `muts`: returned unchanged;
        - present only in `from_muts`: returned reversed (``"G252V"`` becomes
          ``"V252G"``);
        - different mutations at the same site in both: parent is the mutant
          of the `from_muts` entry, mutant is that of the `muts` entry.

    Raises
    ------
    AssertionError
        If a differing mutation does not match the expected format.
    ValueError
        If more than two differing mutations share a site.
    """
    new_muts = set(muts).symmetric_difference(from_muts)
    # raw string: "\-" and "\d" are invalid escape sequences in a plain literal
    assert all(re.fullmatch(r"[A-Z\-]\d+[A-Z\-]", m) for m in new_muts)

    # group the differing mutations by site number
    new_muts_d = collections.defaultdict(list)
    for m in new_muts:
        new_muts_d[int(m[1: -1])].append(m)

    new_muts_list = []
    for site in sorted(new_muts_d):
        ms = new_muts_d[site]
        if len(ms) == 1:
            m = ms[0]
            if m in muts:
                new_muts_list.append(m)
            else:
                assert m in from_muts
                # only the other sequence is mutated here, so reverse it
                new_muts_list.append(m[-1] + m[1: -1] + m[0])
        else:
            # both sequences are mutated at this site, but differently
            m, from_m = ms
            if m not in muts:
                from_m, m = m, from_m
            assert m in muts and from_m in from_muts
            new_muts_list.append(from_m[-1] + m[1: ])
    return new_muts_list

# Re-express each clade's mutations relative to the starting clade (as a
# "; "-joined string) and relative to the DMS clade (kept as a list), parse
# the dates, and order the clades by date.
pango_df = (
    pango_df
    .assign(
        mutations_from_start_clade=lambda df: df["mutations_from_ref"].map(
            lambda muts: "; ".join(
                mutations_from(muts, starting_clade_mutations_from_ref)
            )
        ),
        mutations_from_dms_clade=lambda df: df["mutations_from_ref"].map(
            lambda muts: mutations_from(muts, dms_clade_mutations_from_ref)
        ),
        date=lambda df: pd.to_datetime(df["date"]),
    )
    .drop(columns=["mutations_from_ref"])
    .sort_values("date")
    .reset_index(drop=True)
)

pango_df

Unnamed: 0,clade,n_children,date,mutations_from_start_clade,mutations_from_dms_clade
0,XBB,606,2022-09-17,,"[V252G, P486S]"
1,XBB.1,528,2022-10-03,G252V,[P486S]
2,XBB.1.1,0,2022-10-15,G252V,[P486S]
3,XBB.2,60,2022-10-15,D253G,"[V252G, D253G, P486S]"
4,XBB.3,5,2022-10-15,,"[V252G, P486S]"
...,...,...,...,...,...
602,HK.11,0,2023-09-04,Q52H; G184S; G252V; F456L; S486P,"[Q52H, G184S, F456L]"
603,XBB.1.16.24,0,2023-09-04,E180V; K182E; G252V; K478R; S486P; P521T; E554K,"[E180V, K182E, K478R, P521T, E554K]"
604,JG.2,0,2023-09-04,Q52H; G75V; G252V; F456L; S486P,"[Q52H, G75V, F456L]"
605,FY.4.1.2,0,2023-09-06,Y200C; G252V; Y451H; S486P; S494P,"[Y200C, Y451H, S494P]"


## Assign DMS phenotypes

First define a function that assigns DMS phenotypes to mutations:

In [5]:
dms_summary = pd.read_csv(dms_summary_csv)

# DMS phenotype columns carried through the rest of the analysis
phenotypes = [
    "human sera escape",
    "spike mediated entry",
    "ACE2 affinity",
]

# dict that maps (site, wt, mutant) to a {phenotype: value} dict
dms_data = dms_summary.set_index(["site", "wildtype", "mutant"])[phenotypes].to_dict(
    orient="index"
)

# per-site lookups: site -> wildtype in DMS, and site -> region in DMS
dms_by_site = dms_summary.set_index("site")
dms_wt = dms_by_site["wildtype"].to_dict()
site_to_region = dms_by_site["region"].to_dict()

def mut_dms(m):
    """Get DMS phenotypes for a mutation.

    Parameters
    ----------
    m : str or null
        Mutation written as ``<parent><site><mutant>`` (for example
        ``"F456L"``), or a null value when there is no mutation.

    Returns
    -------
    dict
        A newly built dict with one key per entry of ``phenotypes`` followed
        by ``"is_RBD"``. The stored ``dms_data`` entries are never modified.

        - `m` is null or its site is not in ``dms_wt``: every value, including
          ``"is_RBD"``, is ``pd.NA``.
        - parent equals the DMS wildtype: the stored effects of the mutation.
        - mutant equals the DMS wildtype: the negated stored effects of the
          reverse mutation.
        - neither equals the DMS wildtype: effect of the mutant minus effect
          of the parent, both taken relative to the DMS wildtype.
        - a required ``dms_data`` entry is missing: phenotypes are ``pd.NA``.

        In the last four cases ``"is_RBD"`` is whether
        ``site_to_region[site]`` equals ``"RBD"``.
    """
    site = None if pd.isnull(m) else int(m[1: -1])
    if site is None or site not in dms_wt:
        d = {k: pd.NA for k in phenotypes}
        d["is_RBD"] = pd.NA
    else:
        parent = m[0]
        mut = m[-1]
        wt = dms_wt[site]
        try:
            if parent == wt:
                # copy the values: returning the stored dict itself would let
                # the "is_RBD" assignment below leak into `dms_data`
                measured = dms_data[(site, parent, mut)]
                d = {p: measured[p] for p in phenotypes}
            elif mut == wt:
                # reversion to the DMS wildtype: negate the forward effect
                measured = dms_data[(site, mut, parent)]
                d = {p: -measured[p] for p in phenotypes}
            else:
                parent_d = dms_data[(site, wt, parent)]
                mut_d = dms_data[(site, wt, mut)]
                d = {p: mut_d[p] - parent_d[p] for p in phenotypes}
        except KeyError:
            # no DMS measurement available for a required mutation
            d = {k: pd.NA for k in phenotypes}
        d["is_RBD"] = (site_to_region[site] == "RBD")
    assert list(d) == phenotypes + ["is_RBD"]
    return d

Now assign DMS phenotypes to Pango clades:

In [12]:
# Build one row per (clade, DMS phenotype) holding the summed effect of all
# mutations that separate the clade from `dms_clade`. Mutations without DMS
# data contribute 0 to the sums and are listed in a separate column.
pango_dms_df = (
    pango_df
    # put one mutation in each row (clades with no mutations get a null row)
    .explode("mutations_from_dms_clade")
    .rename(columns={"mutations_from_dms_clade": "mutation"})
    # to add multiple columns: https://stackoverflow.com/a/46814360
    .apply(
        lambda cols: pd.concat([cols, pd.Series(mut_dms(cols["mutation"]))]),
        axis=1,
    )
    # long format: one row per (clade, mutation, phenotype)
    .melt(
        id_vars=["clade", "date", "n_children", "mutations_from_start_clade", "mutation", "is_RBD"],
        value_vars=phenotypes,
        var_name="DMS_phenotype",
        value_name="mutation_effect",
    )
    .assign(
        # all mutations of the clade, "; "-joined and repeated on each of its rows
        mutations_from_dms_clade=lambda x: x.groupby(["clade", "DMS_phenotype"])["mutation"].transform(
            lambda ms: "; ".join([m for m in ms if not pd.isnull(m)])
        ),
        # the mutation itself where it exists but has no measured effect, else NA;
        # must be computed before the nulls in `mutation_effect` are filled below
        mutation_missing=lambda x: x["mutation"].where(
            x["mutation_effect"].isnull() & x["mutation"].notnull(),
            pd.NA,
        ),
        mutations_from_dms_clade_missing_data=lambda x: (
            x.groupby(["clade", "DMS_phenotype"])["mutation_missing"]
            .transform(lambda ms: "; ".join([m for m in ms if not pd.isnull(m)]))
        ),
        # explicit casts: do not rely on `fillna` silently downcasting object
        # columns; `~` below needs a real boolean column to act as logical NOT
        mutation_effect=lambda x: x["mutation_effect"].fillna(0).astype(float),
        is_RBD=lambda x: x["is_RBD"].fillna(False).astype(bool),
        mutation_effect_RBD=lambda x: x["mutation_effect"] * x["is_RBD"].astype(int),
        mutation_effect_nonRBD=lambda x: x["mutation_effect"] * (~x["is_RBD"]).astype(int),
    )
    # sum the per-mutation effects for each clade and phenotype
    .groupby(
        [
            "clade",
            "date",
            "n_children",
            "mutations_from_start_clade",
            "mutations_from_dms_clade",
            "mutations_from_dms_clade_missing_data",
            "DMS_phenotype",
        ],
        as_index=False,
    )
    .aggregate(
        phenotype=pd.NamedAgg("mutation_effect", "sum"),
        phenotype_RBD_only=pd.NamedAgg("mutation_effect_RBD", "sum"),
        phenotype_nonRBD_only=pd.NamedAgg("mutation_effect_nonRBD", "sum"),
    )
    .rename(
        columns={
            "mutations_from_start_clade": f"mutations_from_{starting_clade}",
            "mutations_from_dms_clade": f"mutations_from_{dms_clade}",
            "mutations_from_dms_clade_missing_data": f"mutations_from_{dms_clade}_missing_data",
        },
    )
    .sort_values(["date", "DMS_phenotype"])
    .reset_index(drop=True)
)

# sanity checks: no clade was lost, and the RBD / non-RBD split adds up
assert set(pango_df["clade"]) == set(pango_dms_df["clade"])
assert numpy.allclose(
    pango_dms_df["phenotype"],
    pango_dms_df["phenotype_RBD_only"] + pango_dms_df["phenotype_nonRBD_only"]
)

# show rows with at least one mutation lacking DMS data; the column name is
# built from `dms_clade` so it matches the rename above for any clade
pango_dms_df[pango_dms_df[f"mutations_from_{dms_clade}_missing_data"] != ""]

Unnamed: 0,clade,date,n_children,mutations_from_XBB,mutations_from_XBB.1.5,mutations_from_XBB.1.5_missing_data,DMS_phenotype,phenotype,phenotype_RBD_only,phenotype_nonRBD_only
108,XBB.1.13,2023-02-10,1,G252V; S490P,P486S; S490P,S490P,ACE2 affinity,-0.5783,-0.5783,0.0
117,XBB.1.13,2023-02-10,1,G252V; S490P,P486S; S490P,S490P,human sera escape,-0.2269,-0.2269,0.0
342,XBB.2.7.1,2023-03-24,0,D253G; Y453F; K478Q,V252G; D253G; Y453F; K478Q; P486S,K478Q,human sera escape,-0.35755,-0.19885,-0.1587
368,XBB.1.5.38,2023-03-27,1,G252V; S486P; I666V,I666V,I666V,human sera escape,0.0,0.0,0.0
390,XBB.2.3.4,2023-04-04,1,D253G; K478Q; S486P; P521S,V252G; D253G; K478Q; P521S,K478Q,human sera escape,-0.10877,0.04993,-0.1587
405,FP.1,2023-04-11,0,-24L; S27-; H69-; V70-; G252V; S486P,-24L; S27-; H69-; V70-,-24L,ACE2 affinity,0.22322,0.0,0.22322
406,FP.1,2023-04-11,0,-24L; S27-; H69-; V70-; G252V; S486P,-24L; S27-; H69-; V70-,-24L,human sera escape,0.10244,0.0,0.10244
407,FP.1,2023-04-11,0,-24L; S27-; H69-; V70-; G252V; S486P,-24L; S27-; H69-; V70-,-24L,spike mediated entry,-0.47663,0.0,-0.47663
434,FT.1,2023-04-22,0,I197V; G252V; S486P,I197V,I197V,ACE2 affinity,0.0,0.0,0.0
452,FT.1,2023-04-22,0,I197V; G252V; S486P,I197V,I197V,human sera escape,0.0,0.0,0.0
