# Compare simple difference in functional effects across two conditions

Import Python modules.
We use `polyclonal` for the plotting:

In [1]:
import altair as alt

import dms_variants.utils

import pandas as pd

import polyclonal
import polyclonal.plot

This notebook is parameterized by `papermill`.
The next cell is tagged as `parameters` to get the passed parameters.

In [2]:
# this cell is tagged parameters for `papermill` parameterization
# The `None` values below are placeholders only; a later parameters cell assigns
# the real values before any of them are used.
# `site_numbering_map_csv` is read with `pd.read_csv` and must provide
# `reference_site` and `sequential_site` columns.
site_numbering_map_csv = None
# `diffs_csv` and `chart_html` are file paths; presumably outputs written later in
# the notebook -- TODO confirm against the cells that use them.
diffs_csv = None
chart_html = None
# `params` is a dict holding the two conditions, the averaging method, and plot options.
params = None

In [3]:
# Parameters
# Concrete values that replace the `None` placeholders from the cell tagged `parameters`.
params = {
    # Each condition has a name and the list of selections belonging to it; the
    # two names must differ and the two selection lists must not overlap.
    "condition_1": {
        "name": 220210,
        "selections": ["LibA-220210-293T_ACE2-1", "LibA-220210-293T_ACE2-2"],
    },
    "condition_2": {
        "name": 220302,
        "selections": ["LibA-220302-293T_ACE2-1", "LibA-220302-293T_ACE2-2"],
    },
    # how per-selection functional effects are averaged; must be "mean" or "median"
    "avg_method": "median",
    "per_selection_tooltips": True,
    # plotting options; presumably forwarded to the `polyclonal` plotting code -- TODO confirm
    "plot_kwargs": {
        "addtl_slider_stats": {
            # also read as the initial `times_seen` cutoff for the correlation cells
            "times_seen": 3,
            "difference_std": 2,
            "fraction_pairs_w_mutation": 1,
            "min_best_effect": -2,
        },
        "addtl_slider_stats_hide_not_filter": ["min_best_effect"],
        "addtl_slider_stats_as_max": ["difference_std"],
        "heatmap_max_at_least": 1,
        "heatmap_min_at_least": -1,
        "init_floor_at_zero": False,
        "init_site_statistic": "abs_mean",
        "site_zoom_bar_color_col": "region",
        "slider_binding_range_kwargs": {"times_seen": {"step": 1, "min": 1, "max": 25}},
    },
    "title": "Differences in mutation effects on 293T entry across days",
    "legend": "LEGEND\n",
}
# Input/output paths; all are relative, so they resolve against the working directory.
site_numbering_map_csv = "data/site_numbering_map.csv"
diffs_csv = "results/func_effect_diffs/220210_vs_220302_comparison_diffs.csv"
chart_html = "results/func_effect_diffs/220210_vs_220302_comparison_diffs_nolegend.html"

# NOTE(review): this import sits outside the notebook's import cell, and the relative
# `chdir` depends on the directory the kernel was started in. It changes where the
# relative paths above (and the `results/...` paths read later) are resolved. Looks
# like a development convenience for running against the test example -- confirm
# whether it should remain in the parameterized notebook.
import os
os.chdir("../test_example")

Read the input data:

In [13]:
# Load the site numbering map, standardizing the reference-site column name to `site`.
site_numbering_map = pd.read_csv(site_numbering_map_csv)
site_numbering_map = site_numbering_map.rename(columns={"reference_site": "site"})
assert site_numbering_map[["site", "sequential_site"]].notnull().all().all()

# Any other column whose name ends in "site" is an additional site-numbering column.
addtl_site_cols = [
    col for col in site_numbering_map.columns if col != "site" and col.endswith("site")
]

# Names of the two conditions being compared; they must be distinct.
condition_1 = params["condition_1"]["name"]
condition_2 = params["condition_2"]["name"]
assert condition_1 != condition_2, f"{condition_1=}, {condition_2=}"

# Selections assigned to each condition; each list must be non-empty and the two
# lists may not share a selection.
condition_1_selections = params["condition_1"]["selections"]
condition_2_selections = params["condition_2"]["selections"]
assert len(condition_1_selections), params["condition_1"]
assert len(condition_2_selections), params["condition_2"]
if not set(condition_1_selections).isdisjoint(condition_2_selections):
    raise ValueError(
        f"shared selections in {condition_1_selections=} and {condition_2_selections=}"
    )

# Read the per-selection functional effects, labeling each row with its selection
# and condition, using a nullable integer `times_seen`, and building a mutation
# string of the form <wildtype><site><mutant>.
dfs = [
    pd.read_csv(f"results/func_effects/by_selection/{sel}_func_effects.csv").assign(
        selection=sel,
        condition=cond,
        times_seen=lambda df: df["times_seen"].astype("Int64"),
        mutation=lambda df: df["wildtype"] + df["site"].astype(str) + df["mutant"],
    )
    for cond, cond_selections in (
        (condition_1, condition_1_selections),
        (condition_2, condition_2_selections),
    )
    for sel in cond_selections
]

# Stack everything into one tidy data frame with a fresh index.
func_effects = pd.concat(dfs, ignore_index=True)

## Correlations among all selections
Compute the correlations in the mutation effects across all selections:

In [22]:
# Correlations are computed at several minimum `times_seen` cutoffs. The cutoffs
# are derived from the initial `times_seen` slider value in the plot parameters:
try:
    init_times_seen = params["plot_kwargs"]["addtl_slider_stats"]["times_seen"]
except KeyError:
    print("No times seen in params, using a value of 3")
    init_times_seen = 3

# Cutoffs to analyze, de-duplicated while keeping their order. Without this, an
# initial value of 1 gives `[1, 1, 2]`, so every row passing the cutoff of 1 would
# be concatenated twice under the same `min_times_seen` label.
times_seen_cutoffs = list(dict.fromkeys([1, init_times_seen, 2 * init_times_seen]))

# do analysis for each "times_seen" cutoff: stack one filtered copy of the data
# per cutoff, labeled by the cutoff that was applied
func_effects_for_corr = pd.concat(
    [
        func_effects.query("times_seen >= @t", engine="python").assign(min_times_seen=t)
        for t in times_seen_cutoffs
    ]
)

# Pairwise correlations between selections within each cutoff, with the squared
# correlation added and the cutoff turned into a readable facet label.
corrs = (
    dms_variants.utils.tidy_to_corr(
        df=func_effects_for_corr,
        sample_col="selection",
        label_col="mutation",
        value_col="functional_effect",
        group_cols=["min_times_seen"],
    )
    .assign(
        r2=lambda x: x["correlation"] ** 2,
        min_times_seen=lambda x: "min times seen " + x["min_times_seen"].astype(str)
    )
    .rename(columns={"correlation": "r"})
)

# Heatmap of r2 for each pair of selections, with one facet column per cutoff.
corr_chart = (
    alt.Chart(corrs)
    .encode(
        alt.X("selection_1", title=None),
        alt.Y("selection_2", title=None),
        column=alt.Column("min_times_seen", title=None),
        color=alt.Color("r2", scale=alt.Scale(zero=True)),
        tooltip=[
            # numeric columns get a compact format, selection names are shown as-is
            alt.Tooltip(c, format=".3g") if c in {"r2", "r"} else c
            for c in ["selection_1", "selection_2", "r2", "r"]
        ],
    )
    .mark_rect(stroke="black")
    .properties(
        width=alt.Step(15),
        height=alt.Step(15),
        title="Per-selection correlation in mutation functional effects",
    )
    .configure_axis(labelLimit=500)
)

display(corr_chart)

# List which selections belong to which condition, to help read the heatmap.
print(
    f"\nSelections for {condition_1}: {condition_1_selections}\n"
    f"Selections for {condition_2}: {condition_2_selections}\n"
)


Selections for 220210: ['LibA-220210-293T_ACE2-1', 'LibA-220210-293T_ACE2-2']
Selections for 220302: ['LibA-220302-293T_ACE2-1', 'LibA-220302-293T_ACE2-2']



## Average functional effects for each condition
Average the functional effects across the selections for each condition using the specified averaging method (mean or median), then print the correlation between the two conditions' average functional effects at several minimum `times_seen` cutoffs:

In [30]:
# Averaging method applied across the selections of each condition.
avg_method = params["avg_method"]
assert avg_method in {"mean", "median"}, avg_method

# Per-condition average for each mutation. `times_seen` is averaged by dividing
# its sum by the number of rows in the group, and is set to missing for rows
# where the mutant equals the wildtype.
avg_func_effects = (
    func_effects
    .groupby(["condition", "site", "wildtype", "mutant", "mutation"], as_index=False)
    .aggregate(
        effect=pd.NamedAgg("functional_effect", avg_method),
        times_seen=pd.NamedAgg("times_seen", "sum"),
        n_selections=pd.NamedAgg("site", "count"),
    )
    .assign(
        times_seen=lambda x: (x["times_seen"] / x["n_selections"]).where(
            x["mutant"] != x["wildtype"],
            pd.NA,
        )
    )
)

# Cutoffs to analyze, de-duplicated while keeping their order. Without this, an
# initial value of 1 gives `[1, 1, 2]`, so every row passing the cutoff of 1 would
# be concatenated twice under the same `min_times_seen` label.
avg_times_seen_cutoffs = list(
    dict.fromkeys([1, init_times_seen, 2 * init_times_seen])
)

# Stack one filtered copy of the averages per cutoff, labeled by the cutoff applied.
avg_func_effects_for_corr = pd.concat(
    [
        avg_func_effects.query("times_seen >= @t", engine="python").assign(min_times_seen=t)
        for t in avg_times_seen_cutoffs
    ]
)
print("Correlation between average functional effects across conditions:")
display(
    dms_variants.utils.tidy_to_corr(
        df=avg_func_effects_for_corr,
        sample_col="condition",
        label_col="mutation",
        value_col="effect",
        group_cols=["min_times_seen"],
    )
    .assign(
        r2=lambda x: x["correlation"] ** 2,
        min_times_seen=lambda x: "min times seen " + x["min_times_seen"].astype(str)
    )
    .rename(columns={"correlation": "r"})
    # drop each condition's comparison with itself
    .query("condition_1 != condition_2")
    .reset_index(drop=True)
    # keep one row per cutoff; the remaining rows are presumably the same pair in
    # the opposite order -- TODO confirm
    .groupby("min_times_seen")
    .first()
    .round(3)
)

Correlation between average functional effects across conditions:


Unnamed: 0_level_0,condition_1,condition_2,r,r2
min_times_seen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
min times seen 1,220302,220210,0.867,0.752
min times seen 3,220302,220210,0.879,0.773
min times seen 6,220302,220210,0.848,0.718
