# Fit global epistasis models to functional scores for each selection to get mutation functional effects

Import Python modules.
We use `multidms` for the fitting:

In [None]:
import dms_variants.codonvarianttable

import matplotlib.pyplot as plt

import multidms

import numpy

import pandas as pd

This notebook is parameterized by `papermill`.
The next cell is tagged as `parameters` to get the passed parameters.

In [None]:
# this cell is tagged parameters for `papermill` parameterization
# All values are placeholders (`None`) here; the real values are expected to be
# supplied by `papermill` when the notebook is executed.
# name of the selection; used below as the `condition` and as the `multidms` reference
selection = None
# CSV of functional scores that is read with `pandas.read_csv` below
func_scores = None
# output CSV that the mutation functional effects are written to at the end
func_effects = None
# mapping indexed below with the keys "clip_upper", "clip_lower", and
# "collapse_identical_variants"
global_epistasis_params = None
# passed as `nb_workers` when building the `multidms` data object
threads = None

Read and clip functional scores:

In [None]:
# Read the functional scores, label every row with this selection as its condition, and
# classify the variants (presumably this adds the `variant_class` column queried below).
func_scores_df = (
    pd.read_csv(func_scores, na_filter=None)
    .assign(condition=selection)
    .pipe(dms_variants.codonvarianttable.CodonVariantTable.classifyVariants)
)

# Median functional score of the stop-codon variants; null if there are none.
median_stop = func_scores_df.query("variant_class == 'stop'")["func_score"].median()

# Each bound is configured independently as `None` (no clipping), the string
# "median_stop" (clip at the median score of stop-codon variants), or a number.
for bound in ["upper", "lower"]:
    clip = global_epistasis_params[f"clip_{bound}"]
    if clip is None:
        print(f"No clipping on {bound} bound of functional scores")
        continue
    if clip == "median_stop":
        if pd.isnull(median_stop):
            raise ValueError(f"{median_stop=}")
        clip = median_stop
    # Validate with an explicit exception rather than `assert`, because assertions are
    # skipped entirely when Python runs with `-O` and a bad value would slip through.
    if not isinstance(clip, (int, float)):
        raise TypeError(
            f"clip_{bound} must be None, 'median_stop', or a number, not {clip!r}"
        )
    print(f"Clipping {bound} bound of functional scores to {clip}")
    func_scores_df["func_score"] = func_scores_df["func_score"].clip(
        **{bound: clip}
    )

Initialize the data for `multidms`:

In [None]:
# Package the (clipped) functional scores for `multidms`, with this selection acting as
# the reference condition.
data = multidms.MultiDmsData(
    # what is being modeled
    variants_df=func_scores_df,
    reference=selection,
    # how variants and sites are interpreted
    collapse_identical_variants=global_epistasis_params["collapse_identical_variants"],
    alphabet=multidms.AAS_WITHSTOP_WITHGAP,
    letter_suffixed_sites=True,
    # execution options
    nb_workers=threads,
    verbose=True,
)

Now initialize the `multidms` model and fit it:

In [None]:
# initialize with default params, which give sigmoid global epistasis function
model = multidms.MultiDmsModel(data)

# fit the model to `data`; no arguments are passed, so the fit uses the library defaults
model.fit()

Look at the accuracy of the predictions and the global epistasis fit:

In [None]:
# One row, two panels: prediction accuracy on the left, global epistasis fit on the right.
fig, (accuracy_ax, epistasis_ax) = plt.subplots(nrows=1, ncols=2, figsize=[8, 4])

# options shared by both `multidms` plotting calls
shared_plot_kwargs = {"alpha": 0.1, "show": False, "legend": False}
model.plot_epistasis(ax=epistasis_ax, **shared_plot_kwargs)
model.plot_pred_accuracy(ax=accuracy_ax, **shared_plot_kwargs)

epistasis_ax.set_title("Global epistasis fit")
accuracy_ax.set_title("Training set accuracy")

# the plotting calls were made with `show=False`, so display the figure once here
plt.show()

Plot the distribution of the latent-phenotype effects of mutations with a few different cutoffs on `times_seen` (the number of variants in which a mutation is seen):

In [None]:
# One stacked panel per minimum `times_seen` cutoff.
times_seen_cutoffs = [1, 3, 5]
fig, axes = plt.subplots(len(times_seen_cutoffs), 1, figsize=[7, 8])

for panel, times_seen in zip(axes, times_seen_cutoffs):
    # histogram of the β parameters, restricted by the `times_seen` threshold
    model.plot_param_hist(
        "β",
        ax=panel,
        show=False,
        times_seen_threshold=times_seen,
    )
    panel.legend()
    panel.set_title(
        f"Latent-phenotype effects of mutations with times_seen >= {times_seen}"
    )

plt.tight_layout()
plt.show()

Compute the effect of the mutation on the observed phenotype of the functional score, which we simply call the "functional effect" of the mutation:

In [None]:
# The below code is designed to get the prediction of how each mutation affects the
# functional score. It is complex, because `multidms` currently lacks any method for
# accessing the predicted functional scores for anything other than the training set,
# and because the wildtype functional score may not be zero.
# The below code WILL ONLY WORK for sigmoid epistasis and just the reference condition.
#
# Steps:
#   1. take the per-mutation latent effects (β) from the fit model,
#   2. append one "wildtype" row per site with a latent effect of zero,
#   3. push latent phenotypes through the sigmoid to get predicted functional scores,
#   4. subtract the predicted wildtype score to get each mutation's functional effect.
mut_effects = (
    # get mutation effects but add wildtype, which is not included in `multidms` output
    pd.concat(
        [
            (
                model.mutations_df.rename(
                    columns={"β": "latent_phenotype_effect"}
                ).drop(  # fix bizarre name
                    columns="mutation"
                )  # not needed for this code
            ),
            # wildtype rows: at every site the "mutant" is the wildtype identity itself,
            # and its latent effect is zero by definition
            # NOTE(review): columns present only in `mutations_df` (e.g. the times-seen
            # column) are presumably left null for these added rows -- confirm
            pd.DataFrame(
                {
                    "sites": data.site_map.index,
                    "wts": data.site_map[selection],
                    "muts": data.site_map[selection],
                    "latent_phenotype_effect": 0,
                }
            ),
        ],
        ignore_index=True,
    )
    .sort_values(["sites", "muts"])
    .reset_index(drop=True)
    # rename columns to names that are clearer for this analysis
    .rename(
        columns={
            f"times_seen_{selection}": "times_seen",
            "wts": "wildtype",
            "sites": "site",
            "muts": "mutant",
        }
    )
    # Compute the predicted functional score for each mutation. This code is a hack to
    # temporarily fix the fact that `multidms` does not enable prediction of functional
    # scores, and ONLY WORKS for sigmoid epistasis on the reference condition.
    .assign(
        # latent phenotype differs from latent phenotype effect since wildtype != 0
        latent_phenotype=lambda x: x["latent_phenotype_effect"] + model.params["C_ref"],
        # get predicted functional score of each mutant
        # sigmoid: ge_scale / (1 + exp(-latent_phenotype)) + ge_bias
        predicted_functional_score=lambda x: (
            model.params["α"]["ge_scale"] / (1 + numpy.exp(-x["latent_phenotype"]))
            + model.params["α"]["ge_bias"]
        ),
        # the functional effect is predicted functional score minus predicted score for
        # wildtype, which is != 0
        # (the wildtype score is the same sigmoid evaluated at latent phenotype C_ref)
        functional_effect=lambda x: (
            x["predicted_functional_score"]
            - model.params["α"]["ge_scale"] / (1 + numpy.exp(-model.params["C_ref"]))
            - model.params["α"]["ge_bias"]
        ),
    )
    # drop the phenotypes themselves, as we care about the effects of mutations
    .drop(columns=["latent_phenotype", "predicted_functional_score"])
)

# display the resulting table as the cell output
mut_effects

Write the mutational effects to a file:

In [None]:
# Report the destination, then save the table without the row index and with floats
# formatted by "%.4g".
print(f"Writing the mutational effects to {func_effects}")

mut_effects.to_csv(
    path_or_buf=func_effects,
    float_format="%.4g",
    index=False,
)