# Distribution of functional effects
Get input variables

In [None]:
# this cell tagged as `parameters` for `papermill` parameterization
# Every value here is a `None` placeholder; real values are expected to be
# supplied when the notebook is parameterized.
# CSV of XBB.1.5 functional effects (loaded with `pd.read_csv`)
xbb15_func_effects_csv = None
# CSV of BA.2 functional effects (loaded with `pd.read_csv`)
ba2_func_effects_csv = None
# CSV mapping sites to regions; its `reference_site` and `region` columns are used
site_numbering_map_csv = None
# initial value of the "minimum times seen" slider
init_min_times_seen = None
# initial value of the "minimum number of libraries" slider
init_min_n_libraries = None

In [None]:
import altair as alt

import numpy

import pandas as pd

import scipy.stats

# Lift Altair's max-rows limit on chart data; the return value is assigned to
# `_` so it is not displayed as the cell output.
_ = alt.data_transformers.disable_max_rows()

In [None]:
# Load the per-strain functional effects and the site numbering map.
xbb15_func_effects = pd.read_csv(xbb15_func_effects_csv)
ba2_func_effects = pd.read_csv(ba2_func_effects_csv)
site_numbering_map = pd.read_csv(site_numbering_map_csv)

# Tag each strain's rows with its name, stack the two tables, then attach the
# region of every site. The merge joins on whatever columns the two frames
# share (the numbering map contributes only `site` and `region`).
func_effects = pd.merge(
    pd.concat(
        [
            strain_df.assign(strain=strain_name)
            for strain_name, strain_df in (
                ("XBB.1.5", xbb15_func_effects),
                ("BA.2", ba2_func_effects),
            )
        ]
    ),
    site_numbering_map.rename(columns={"reference_site": "site"})[["site", "region"]],
)

# Plot distribution of functional effects for different domains

Make box plots:

In [None]:
# Build the per-mutation table that feeds the box plots:
#   - drop rows where the "mutant" is the wildtype, and rows whose wildtype is
#     a stop codon (`*`);
#   - label each mutation as <wildtype><site><mutant>;
#   - classify it as stop codon (`*`), deletion (`-`), or substitution;
#   - expose `n_selections` under the name `n_libraries`.
dist_df = (
    func_effects.loc[
        lambda df: (df["wildtype"] != df["mutant"]) & (df["wildtype"] != "*")
    ]
    .rename(columns={"n_selections": "n_libraries"})
    .assign(
        mutation=lambda df: df["wildtype"] + df["site"].astype(str) + df["mutant"],
        mut_type=lambda df: numpy.where(
            df["mutant"] == "*",
            "stop codon",
            numpy.where(df["mutant"] == "-", "deletion", "substitution"),
        ),
        # cast to whole numbers (truncates any fractional part)
        times_seen=lambda df: df["times_seen"].astype(int),
    )
    .loc[
        :,
        [
            "strain",
            "mutation",
            "effect",
            "mut_type",
            "region",
            "times_seen",
            "n_libraries",
        ],
    ]
)

# Interactive parameters (range sliders) that control the box plot below.

# Minimum `times_seen` a mutation needs to be included. The upper end of the
# slider is capped at 10 (or the largest value in the data, if smaller).
times_seen_slider = alt.param(
    value=init_min_times_seen,
    bind=alt.binding_range(
        name="minimum times seen",
        min=1,
        step=0.5,
        max=min(10, dist_df["times_seen"].max()),
    ),
)

# Minimum number of libraries a mutation must have been measured in.
n_libraries_slider = alt.param(
    value=init_min_n_libraries,
    bind=alt.binding_range(
        name="minimum number of libraries",
        min=1,
        step=1,
        max=dist_df["n_libraries"].max(),
    ),
)

# Floor applied to effects before plotting; starts at the smallest observed
# effect, i.e. no clipping.
effect_floor_slider = alt.param(
    value=dist_df["effect"].min(),
    bind=alt.binding_range(
        name="mutation effect floor (clip values < this)",
        min=dist_df["effect"].min(),
        max=0,
    ),
)

# Minimum number of mutations a (region, strain, mutation type) category needs
# for its box to be drawn. The label previously read "mutation number of ...",
# which mis-described this minimum threshold.
n_mutations_slider = alt.param(
    value=1,
    bind=alt.binding_range(
        name="minimum number of mutations to show category",
        min=1,
        max=50,
    ),
)

# Box plots of mutation effects, grouped by mutation type on the x-axis,
# offset and colored by region, with one column per strain.
dist_boxplot = (
    alt.Chart(dist_df)
    # Both sliders are *minimum* thresholds, so both comparisons are inclusive:
    # a mutation seen exactly the minimum number of times is kept.
    .transform_filter(alt.datum["times_seen"] >= times_seen_slider)
    .transform_filter(alt.datum["n_libraries"] >= n_libraries_slider)
    # Clip effects below the floor up to the floor value.
    .transform_calculate(
        effect_floored=alt.expr.max(alt.datum["effect"], effect_floor_slider),
    )
    # Count the mutations remaining in each category after filtering, then
    # hide categories with fewer than the requested number.
    .transform_joinaggregate(
        n_mutations="count()",
        groupby=["region", "strain", "mut_type"],
    )
    .transform_filter(alt.datum["n_mutations"] >= n_mutations_slider)
    .encode(
        x=alt.X(
            "mut_type",
            title=None,
            axis=alt.Axis(labelFontSize=11, labelFontStyle="bold", labelAngle=0),
            scale=alt.Scale(domain=["substitution", "stop codon", "deletion"]),
        ),
        y=alt.Y(
            "effect_floored:Q",
            title="mutation effect on cell entry",
            scale=alt.Scale(nice=False, padding=2),
        ),
        color=alt.Color("region"),
        xOffset=alt.XOffset("region"),
        column=alt.Column(
            "strain",
            title=None,
            sort=["XBB.1.5", "BA.2"],
            header=alt.Header(labelFontSize=13, labelFontStyle="bold", labelPadding=2),
            spacing=2,
        ),
    )
    # Outlier points are not drawn.
    .mark_boxplot(outliers=False, extent=0.75, size=12)
    .configure_axis(grid=False)
    .add_params(
        times_seen_slider,
        n_libraries_slider,
        effect_floor_slider,
        n_mutations_slider,
    )
    .properties(height=160, width=alt.Step(14))
)

# Display the chart as the cell output.
dist_boxplot