# Summarize results across assays
This notebook summarizes the results across assays.

In [1]:
import functools
import operator
import re

import altair as alt

import pandas as pd

import polyclonal.alphabets
from polyclonal.plot import color_gradient_hex

# Remove Altair's cap on the number of rows a chart's data may contain;
# assigning the return value to `_` keeps it out of the cell output.
_ = alt.data_transformers.disable_max_rows()

The next cell is tagged as `parameters` for `papermill` parameterization:

In [2]:
# Placeholders only: papermill supplies the real values in the injected
# parameters cell that follows.

# analysis settings and the input files they refer to
config = None  # nested dict of filters, alphabet and per-assay settings
input_csvs = None  # dict of CSV paths keyed by "<assay> <condition or antibody>"
site_numbering_map_csv = None  # CSV with the site numbering map

# file paths for the summary (presumably outputs written later in the notebook)
csv_file = None
chart_faceted = None
chart_overlaid = None

In [3]:
# Parameters
config = {
    # thresholds used to filter mutations when reading each assay
    "min_times_seen": 3,
    "min_frac_models": 1,
    # one list entry per character: 20 letters followed by "-"
    "alphabet": list("ACDEFGHIKLMNPQRSTVWY-"),
    "init_floor_escape_at_zero": True,
    "init_site_escape_stat": "mean",
    # antibody sets: each maps antibody identifiers to display names
    "antibody_escape": {
        "monoclonal antibodies": {
            "stat": "escape_median",
            "negative_color": "#0072B2",
            "positive_color": "#E69F00",
            "max_at_least": 1,
            "min_at_least": -1,
            "antibody_list": {
                "REGN10933": "REGN10933",
                "S2M11": "S2M-11",
            },
        },
    },
    # functional-effect assays, keyed by display name
    "func_effects": {
        "spike-mediated infection": {
            "condition": "293T_ACE2_entry",
            "effect_type": "func_effects",
            "positive_color": "#009E73",
            "negative_color": "#CC79A7",
            "max_at_least": 1,
            "min_at_least": 0,
        },
    },
    # any further assays: assay type -> display name -> settings
    "other_assays": {
        "receptor_affinity": {
            "mock receptor affinity": {
                "condition": "pretending_S2M11_is_receptor",
                "stat": "receptor affinity_median",
                "positive_color": "#FF715B",
                "negative_color": "#F3C13A",
                "max_at_least": 1,
                "min_at_least": 0,
            },
        },
    },
}

# input CSVs, keyed by "<assay> <antibody or condition>"
input_csvs = {
    "antibody_escape REGN10933": (
        "results/antibody_escape/averages/REGN10933_mut_effect.csv"
    ),
    "antibody_escape S2M11": (
        "results/antibody_escape/averages/S2M11_mut_effect.csv"
    ),
    "func_effects 293T_ACE2_entry": (
        "results/func_effects/averages/293T_ACE2_entry_func_effects.csv"
    ),
    "receptor_affinity pretending_S2M11_is_receptor": (
        "results/receptor_affinity/averages/pretending_S2M11_is_receptor_mut_effect.csv"
    ),
}

site_numbering_map_csv = "data/site_numbering_map.csv"

# summary file paths
csv_file = "results/summaries/summary.csv"
chart_faceted = "results/summaries/summary_faceted_nolegend.html"
chart_overlaid = "results/summaries/summary_overlaid_nolegend.html"

# Change the kernel's working directory to ../test_example (presumably so the
# relative paths in the parameters resolve there).
# NOTE(review): this looks like a leftover from interactive testing, and the
# import sits outside the imports cell — confirm it belongs in a pipeline run.
import os
os.chdir("../test_example")

Get the `min_times_seen` and `min_frac_models` filters, and the ordered alphabet:

In [4]:
# Thresholds that rows must meet (`times_seen`, `frac_models`) to be kept.
min_times_seen, min_frac_models = config["min_times_seen"], config["min_frac_models"]

# Configured characters after re-ordering by `biochem_order_aas`.
alphabet = polyclonal.alphabets.biochem_order_aas(config["alphabet"])

print(f"Using {min_times_seen=} and {min_frac_models=}")

Using min_times_seen=3 and min_frac_models=1


Read the site numbering map:

In [5]:
# Load the numbering map in one chain: `reference_site` is renamed to `site`,
# then only the columns whose name ends in "site" plus `region` are kept.
site_numbering_map = (
    pd.read_csv(site_numbering_map_csv)
    .rename(columns={"reference_site": "site"})
    .pipe(
        lambda df: df[
            [col for col in df.columns if col.endswith("site")] + ["region"]
        ]
    )
)

Read the escape data:

In [6]:
# Build one wide table of escape values per antibody set: one row per
# mutation, one column per antibody (labelled with its display name).
escape = {}
for antibody_set, antibody_set_d in config["antibody_escape"].items():
    antibody_names = antibody_set_d["antibody_list"]
    # display names become the pivoted column labels, so they must be unique
    assert len(antibody_names) == len(set(antibody_names.values())), (
        f"antibody names not unique for {antibody_set}"
    )
    escape_dfs = []
    for antibody, antibody_name in antibody_names.items():
        # Deliberately not named `csv_file`: that is a papermill parameter
        # set in the parameters cell and must not be overwritten here.
        escape_csv = input_csvs[f"antibody_escape {antibody}"]
        escape_dfs.append(
            pd.read_csv(escape_csv)
            .assign(antibody=antibody_name)
            # the configured statistic is what gets reported as "escape"
            .rename(columns={antibody_set_d["stat"]: "escape"})
        )
    escape[antibody_set] = (
        pd.concat(escape_dfs)
        .query("frac_models >= @min_frac_models")
        .query("times_seen >= @min_times_seen")
        .query("(mutant in @alphabet) and (wildtype in @alphabet)")
        .pivot_table(
            index=["epitope", "site", "wildtype", "mutant"],
            columns="antibody",
            values="escape",
        )
        .reset_index()
        .assign(site_mutant=lambda x: x["site"].astype(str) + x["mutant"])
    )
    # `epitope` is only needed for the pivot; with a single epitope it carries
    # no information and is dropped
    assert escape[antibody_set]["epitope"].nunique() == 1, "can only have 1 epitope"
    escape[antibody_set] = escape[antibody_set].drop(columns="epitope")

Read other properties (functional effects and measurements from other assays):

In [21]:
# Collect every non-escape property as a long-form data frame keyed by its
# display name, with columns `site`, `wildtype`, `mutant`, and the property.
other_props = {}

# Functional effects: `frac_models` is computed here as the fraction of the
# maximum `n_selections`, then the same two filters are applied.
for name, name_d in config["func_effects"].items():
    # Deliberately not named `csv_file`: that is a papermill parameter set in
    # the parameters cell and must not be overwritten here.
    func_effects_csv = input_csvs[f"func_effects {name_d['condition']}"]
    other_props[name] = (
        pd.read_csv(func_effects_csv)
        .rename(columns={"effect": name})
        .assign(frac_models=lambda x: x["n_selections"] / x["n_selections"].max())
        .query("times_seen >= @min_times_seen")
        .query("frac_models >= @min_frac_models")
        [["site", "wildtype", "mutant", name]]
    )

# Other assays: the configured `stat` column is renamed to the display name.
for assay, assay_d in config["other_assays"].items():
    for name, name_d in assay_d.items():
        assert name not in other_props, f"{name} multiply defined"
        # see above: avoid overwriting the `csv_file` parameter
        assay_csv = input_csvs[f"{assay} {name_d['condition']}"]
        other_props[name] = (
            pd.read_csv(assay_csv)
            .rename(columns={name_d["stat"]: name})
            .query("times_seen >= @min_times_seen")
            .query("frac_models >= @min_frac_models")
            [["site", "wildtype", "mutant", name]]
        )

assert not set(other_props).intersection(escape), "multiply defined names"

# add wildtype effects of zero
site_wts = pd.concat([*escape.values(), *other_props.values()])[["site", "wildtype"]].drop_duplicates()
# each site must have exactly one wildtype identity across all data sets
assert len(site_wts) == site_wts["site"].nunique(), (
    "some sites have more than one wildtype identity"
)
for prop in other_props:
    other_props[prop] = (
        pd.concat(
            [
                other_props[prop],
                # one wildtype-to-wildtype row per site, with the property set to 0
                site_wts.assign(
                    mutant=lambda x: x["wildtype"],
                    **{prop: 0},
                )
            ],
            ignore_index=True,
        )
        .assign(site_mutant=lambda x: x["site"].astype(str) + x["mutant"])
        # default (inner) merge: rows whose site is absent from the map are dropped
        .merge(site_numbering_map, on="site", validate="many_to_one")
        .query("(mutant in @alphabet) and (wildtype in @alphabet)")
    )
    assert other_props[prop]["site_mutant"].nunique() == len(other_props[prop]), (
        f"duplicate site/mutant rows for {prop}"
    )

In [16]:
# Display the assembled property tables for a visual check.
# NOTE(review): the saved output has only 4 columns (no `site_mutant` or
# numbering-map columns) and an earlier execution count than the preceding
# cell, so it appears stale — re-run to refresh.
other_props

{'spike-mediated infection':       site wildtype mutant  spike-mediated infection
 7        2        F      L                   0.46800
 25       5        L      C                   0.59830
 32       5        L      M                   0.35050
 35       5        L      R                  -4.39400
 36       5        L      S                  -1.56700
 ...    ...      ...    ...                       ...
 7486  1252        S      R                   0.62180
 7488  1252        S      T                   0.08584
 7489  1252        S      V                  -3.54300
 7491  1252        S      Y                  -0.82370
 7494  1253        *      R                  -0.11020
 
 [1436 rows x 4 columns]}