# Fit polyclonal model
Here we fit [polyclonal](https://jbloomlab.github.io/polyclonal) models to the data.

First, import Python modules:

In [1]:
import pickle

import altair as alt

import pandas as pd

import polyclonal

import yaml

import polyclonal.pdb_utils

In [2]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

In [3]:
import os
os.chdir('../../')

## Read input data

Get parameterized variable from [papermill](https://papermill.readthedocs.io/)

In [4]:
prob_escape_files = {
    "libA": "results/prob_escape/libA_221027_1_AUSAB-13_1_prob_escape.csv",
    "libB": "results/prob_escape/libB_221108_1_AUSAB-13_1_prob_escape.csv",
}

prob_escape = pd.concat(
    [
        pd.read_csv(f, keep_default_na=False, na_values="nan").assign(library=lib)
        for lib, f in prob_escape_files.items()
    ],
    ignore_index=True,
).query("`no-antibody_count` >= no_antibody_count_threshold")

assert prob_escape.notnull().all().all()


In [5]:
display(
    prob_escape.groupby(["library", "antibody_concentration"]).aggregate(
        n_variants=pd.NamedAgg("barcode", "nunique")
    )
)

Unnamed: 0_level_0,Unnamed: 1_level_0,n_variants
library,antibody_concentration,Unnamed: 2_level_1
libA,0.0007,26636
libA,0.001,26636
libA,0.0015,26636
libA,0.0022,26636
libA,0.0033,26636
libA,0.005,26636
libA,0.0075,26636
libB,0.0007,28219
libB,0.001,28219
libB,0.0015,28219


Plot mean probability of escape across all variants with the indicated number of mutations.
Note that this plot weights each variant the same in the means regardless of how many barcode counts it has.
We plot means for both censored (set to between 0 and 1) and uncensored probabilities of escape.
Also, note it uses a symlog scale for the y-axis.
Mouseover points for values:

In [6]:
# NBVAL_IGNORE_OUTPUT
max_aa_subs = 4  # group if >= this many substitutions

mean_prob_escape = (
    prob_escape.assign(
        n_subs=lambda x: (
            x["aa_substitutions_sequential"]
            .str.split()
            .map(len)
            .clip(upper=max_aa_subs)
            .map(lambda n: str(n) if n < max_aa_subs else f">{max_aa_subs - 1}")
        )
    )
    .groupby(["antibody_concentration", "library", "n_subs"], as_index=False)
    .aggregate({"prob_escape": "mean", "prob_escape_uncensored": "mean"})
    .rename(
        columns={
            "prob_escape": "censored to [0, 1]",
            "prob_escape_uncensored": "not censored",
        }
    )
    .melt(
        id_vars=["antibody_concentration", "n_subs", "library"],
        var_name="censored",
        value_name="probability escape",
    )
)

mean_prob_escape_chart = (
    alt.Chart(mean_prob_escape)
    .encode(
        x=alt.X("antibody_concentration"),
        y=alt.Y(
            "probability escape",
            scale=alt.Scale(type="symlog", constant=0.05),
        ),
        column=alt.Column("censored", title=None),
        row=alt.Row("library", title=None),
        color=alt.Color("n_subs", title="n substitutions"),
        tooltip=[
            alt.Tooltip(c, format=".3g") if mean_prob_escape[c].dtype == float else c
            for c in mean_prob_escape.columns
        ],
    )
    .mark_line(point=True, size=0.5)
    .properties(width=200, height=125)
    .configure_axis(grid=False)
)

mean_prob_escape_chart

  for col_name, dtype in df.dtypes.iteritems():


In [13]:
spatial_distances = polyclonal.pdb_utils.inter_residue_distances(
    "scratch_notebooks/221111_model-fitting/4o5n.pdb",
    target_chains=["A", "B"],
)

spatial_distances

Unnamed: 0,site_1,site_2,distance,chain_1,chain_2
0,9,10,1.328212,A,A
1,9,11,3.469929,B,B
2,9,12,6.336130,B,B
3,9,13,9.189821,B,B
4,9,14,8.930696,B,A
...,...,...,...,...,...
260276,497,499,15.936294,B,B
260277,497,500,16.632641,B,B
260278,498,499,23.859705,B,B
260279,498,500,13.285421,B,B


In [15]:
# NBVAL_IGNORE_OUTPUT

reference_sites = pd.read_csv("data/site_map.csv")["reference_site"].tolist()

for library, df in prob_escape.groupby("library"):

    print(f"\n\nFitting for {library}")

    model = polyclonal.Polyclonal(
        n_epitopes=2,
        data_to_fit=df.rename(
            columns={
                "antibody_concentration": "concentration",
                "aa_substitutions_reference": "aa_substitutions",
            }
        ),
        alphabet=polyclonal.AAS_WITHSTOP_WITHGAP,
        sites=reference_sites,
        spatial_distances=spatial_distances,
    )

    # fit model
    opt_res = model.fit(
        logfreq=200,
        reg_activity_weight = 1,
        reg_escape_weight=0.1,
#         reg_uniqueness_weight=0,
#         reg_uniqueness2_weight=1.5,
        reg_spatial_weight=0.0,
        reg_spatial2_weight=0.001,
    )

    # display results
    display(model.activity_wt_barplot())
    display(model.mut_escape_plot(addtl_slider_stats={"times_seen": 3}))



Fitting for libA
# First fitting site-level model.
# Starting optimization of 1006 parameters at Fri Nov 18 15:23:05 2022.
        step    time_sec        loss    fit_loss  reg_escape  reg_spread reg_spatial reg_uniqueness reg_uniqueness2 reg_activity
           0    0.075338  1.6416e+05  1.6415e+05           0           0           0              0               0       11.001
         107      9.6977      850.51      840.37      1.5136           0      3.3514              0          0.1297       5.1434
# Successfully finished at Fri Nov 18 15:23:15 2022.
# Starting optimization of 6474 parameters at Fri Nov 18 15:23:15 2022.
        step    time_sec        loss    fit_loss  reg_escape  reg_spread reg_spatial reg_uniqueness reg_uniqueness2 reg_activity
           0     0.10045      1084.5      1008.5      16.794  1.9523e-32      3.3514              0          50.714       5.1434
          95      10.736       953.1      925.07      13.249      1.6715      7.7084              0      

  for col_name, dtype in df.dtypes.iteritems():


  for col_name, dtype in df.dtypes.iteritems():




Fitting for libB
# First fitting site-level model.
# Starting optimization of 1040 parameters at Fri Nov 18 15:23:33 2022.
        step    time_sec        loss    fit_loss  reg_escape  reg_spread reg_spatial reg_uniqueness reg_uniqueness2 reg_activity
           0    0.071801  1.6944e+05  1.6943e+05           0           0           0              0               0       11.001
         129       11.54      1963.9      1948.6      2.1215           0      9.5476              0        0.067225       3.4965
# Successfully finished at Fri Nov 18 15:23:45 2022.
# Starting optimization of 6626 parameters at Fri Nov 18 15:23:45 2022.
        step    time_sec        loss    fit_loss  reg_escape  reg_spread reg_spatial reg_uniqueness reg_uniqueness2 reg_activity
           0     0.10715      2702.9      2656.4      22.554  5.8782e-32      9.5476              0          10.939       3.4965
         200      22.743      2503.9      2451.2      22.736      1.5107      21.067              0      

  for col_name, dtype in df.dtypes.iteritems():


  for col_name, dtype in df.dtypes.iteritems():


In [12]:
# NBVAL_IGNORE_OUTPUT

reference_sites = pd.read_csv("data/site_map.csv")["reference_site"].tolist()

for library, df in prob_escape.groupby("library"):

    print(f"\n\nFitting for {library}")

    model = polyclonal.Polyclonal(
        n_epitopes=2,
        data_to_fit=df.rename(
            columns={
                "antibody_concentration": "concentration",
                "aa_substitutions_reference": "aa_substitutions",
            }
        ),
        alphabet=polyclonal.AAS_WITHSTOP_WITHGAP,
        sites=reference_sites,
        spatial_distances=spatial_distances,
    )

    # fit model
    opt_res = model.fit(
        logfreq=200,
        reg_activity_weight = 1,
        reg_escape_weight=0.1,
#         reg_uniqueness_weight=0,
#         reg_uniqueness2_weight=1.5,
        reg_spatial_weight=0.0,
        reg_spatial2_weight=0.0001,
    )

    # display results
    display(model.activity_wt_barplot())
    display(model.mut_escape_plot(addtl_slider_stats={"times_seen": 3}))



Fitting for libA
# First fitting site-level model.
# Starting optimization of 1006 parameters at Fri Nov 18 15:17:52 2022.
        step    time_sec        loss    fit_loss  reg_escape  reg_spread reg_spatial reg_uniqueness reg_uniqueness2 reg_activity
           0    0.082452  1.6416e+05  1.6415e+05           0           0           0              0               0       11.001
         151      13.393      838.32      821.33      7.2312           0      3.9371              0         0.79608       5.0265
# Successfully finished at Fri Nov 18 15:18:06 2022.
# Starting optimization of 6474 parameters at Fri Nov 18 15:18:06 2022.
        step    time_sec        loss    fit_loss  reg_escape  reg_spread reg_spatial reg_uniqueness reg_uniqueness2 reg_activity
           0     0.10801      1285.7      991.98      81.558  2.5005e-31      3.9371              0          203.22       5.0265
         200      22.998      936.99      904.45       22.52      2.3335      2.0316              0      

  for col_name, dtype in df.dtypes.iteritems():


  for col_name, dtype in df.dtypes.iteritems():




Fitting for libB
# First fitting site-level model.
# Starting optimization of 1040 parameters at Fri Nov 18 15:18:38 2022.
        step    time_sec        loss    fit_loss  reg_escape  reg_spread reg_spatial reg_uniqueness reg_uniqueness2 reg_activity
           0    0.081087  1.6944e+05  1.6943e+05           0           0           0              0               0       11.001


  exp_minus_phi_e_v = numpy.exp(-phi_e_v)


         200      17.799      1931.3      1911.2      9.2249           0      6.3948              0         0.99001       3.4345
         244      21.193      1931.2      1911.3      9.1649           0      6.3773              0         0.93111       3.4342
# Successfully finished at Fri Nov 18 15:18:59 2022.
# Starting optimization of 6626 parameters at Fri Nov 18 15:18:59 2022.
        step    time_sec        loss    fit_loss  reg_escape  reg_spread reg_spatial reg_uniqueness reg_uniqueness2 reg_activity
           0     0.11093        2786      2628.4      77.128  1.1399e-31      6.3773              0          70.623       3.4342
         200      23.541      2457.9      2395.5      42.516      2.7849        7.66              0          5.7494       3.6664
         304      35.441      2455.1      2391.5      43.372      2.8605      7.9246              0          5.7807       3.6661
# Successfully finished at Fri Nov 18 15:19:35 2022.


  for col_name, dtype in df.dtypes.iteritems():


  for col_name, dtype in df.dtypes.iteritems():


## Fit `polyclonal` model
First, get the fitting related keyword arguments from the configuration passed by `snakemake`:

In [14]:
max_epitopes = antibody_config["max_epitopes"]
print(f"{max_epitopes=}")

fit_kwargs = antibody_config["fit_kwargs"]
print(f"{fit_kwargs=}")

min_epitope_activity_to_include = antibody_config["min_epitope_activity_to_include"]
print(f"{min_epitope_activity_to_include=}")

max_epitopes=1
fit_kwargs={'reg_escape_weight': 0.1, 'reg_spread_weight': 0.25, 'reg_activity_weight': 1.0}
min_epitope_activity_to_include=0.2


Fit a model to all the data, and keep adding epitopes until we either reach the maximum specified or the new epitope has negative activity.
Note that that we fit using the **reference** based-site-numbering scheme, so results are shown with those numbers:Z

In [15]:
models = []

for n_epitopes in range(1, max_epitopes + 1):
    print(f"\nFitting model with {n_epitopes=}")

    # create model
    model = polyclonal.Polyclonal(
        n_epitopes=n_epitopes,
        data_to_fit=prob_escape.rename(
            columns={
                "antibody_concentration": "concentration",
                "aa_substitutions_reference": "aa_substitutions",
            }
        ),
        alphabet=polyclonal.AAS_WITHSTOP_WITHGAP,
        sites=reference_sites,
    )

    # fit model
    opt_res = model.fit(logfreq=200, **fit_kwargs)

    # display activities
    print("Activities of epitopes:")
    display(model.activity_wt_df.round(1))
    print("Max and mean absolute-value escape at each epitope:")
    display(
        model.mut_escape_df.groupby("epitope")
        .aggregate(
            max_escape=pd.NamedAgg("escape", "max"),
            mean_abs_escape=pd.NamedAgg("escape", lambda s: s.abs().mean()),
        )
        .round(1)
    )

    # stop if activity below threshold for any epitope and fit at least one epitope
    if len(models) and any(
        model.activity_wt_df["activity"] <= min_epitope_activity_to_include
    ):
        print(f"Stop fitting, epitope has activity <={min_epitope_activity_to_include}")
        models.append(model)
        model = models[-2]  # get previous model
        break
    else:
        models.append(model)

print(f"\nThe selected model has {len(model.epitopes)} epitopes")


Fitting model with n_epitopes=1
# First fitting site-level model.
# Starting optimization of 520 parameters at Tue Nov 15 10:53:15 2022.
         step     time_sec         loss     fit_loss   reg_escape   reg_spreadreg_similarity reg_activity
            0     0.037375       7254.4       7253.4            0            0            0      0.90499
           56       1.9295       746.97       737.46       6.9633            0            0        2.544
# Successfully finished at Tue Nov 15 10:53:16 2022.
# Starting optimization of 3313 parameters at Tue Nov 15 10:53:17 2022.
         step     time_sec         loss     fit_loss   reg_escape   reg_spreadreg_similarity reg_activity
            0     0.039724       1100.1       1035.8       61.734   1.1225e-31            0        2.544
           70       2.8128       1027.6       1006.5       16.741        1.176            0       3.1024
# Successfully finished at Tue Nov 15 10:53:19 2022.
Activities of epitopes:


Unnamed: 0,epitope,activity
0,1,3.2


Max and mean absolute-value escape at each epitope:


Unnamed: 0_level_0,max_escape,mean_abs_escape
epitope,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.2,0.1



The selected model has 1 epitopes


Epitope activities:

In [16]:
model.activity_wt_barplot()

  for col_name, dtype in df.dtypes.iteritems():


Plot of escape values:

In [17]:
df_to_merge = site_numbering_map.rename(columns={"reference_site": "site"})

plot_kwargs = antibody_config["plot_kwargs"]
if "plot_title" not in plot_kwargs:
    plot_kwargs["plot_title"] = str(antibody)
if "region" in site_numbering_map:
    plot_kwargs["site_zoom_bar_color_col"] = "region"
if "addtl_slider_stats" not in plot_kwargs:
    plot_kwargs["addtl_slider_stats"] = {"times_seen": 1}
elif "times_seen" not in plot_kwargs["addtl_slider_stats"]:
    plot_kwargs["addtl_slider_stats"]["times_seen"] = 1
if "functional effect" in plot_kwargs["addtl_slider_stats"]:
    del plot_kwargs["addtl_slider_stats"]["functional effect"]  # only antibody averages
if any(site_numbering_map["sequential_site"] != site_numbering_map["reference_site"]):
    if "addtl_tooltip_stats" not in plot_kwargs:
        plot_kwargs["addtl_tooltip_stats"] = ["sequential_site"]
    else:
        plot_kwargs["addtl_tooltip_stats"].append("sequential_site")

model.mut_escape_plot(df_to_merge=df_to_merge, **plot_kwargs)

  for col_name, dtype in df.dtypes.iteritems():


In [18]:
# Parameters
old_prob_escape_csv = "scratch_notebooks/221108_pre-config-change_prob-escapes/libB_221108_1_1C04-5G04_1_prob_escape.csv"
# pickle_file = "results/polyclonal_fits/libB_221108_1_1C04-5G04_1.pickle"
n_threads = 2


In [19]:
print(f"\nReading probabilities of escape from {prob_escape_csv}")

old_prob_escape = pd.read_csv(
    old_prob_escape_csv, keep_default_na=False, na_values="nan"
).query("`no-antibody_count` >= no_antibody_count_threshold")
assert old_prob_escape.notnull().all().all()


Reading probabilities of escape from results/prob_escape/libB_221108_1_1C04-5G04_1_prob_escape.csv


In [21]:
display(
    old_prob_escape.groupby("antibody_concentration").aggregate(
        n_variants=pd.NamedAgg("barcode", "nunique")
    )
)

Unnamed: 0_level_0,n_variants
antibody_concentration,Unnamed: 1_level_1
1.37,31045
2.05,31045
3.08,31045
4.62,31045
6.93,31045
10.4,31045


In [27]:
max_aa_subs = 4  # group if >= this many substitutions

mean_prob_escape = (
    prob_escape.assign(
        n_subs=lambda x: (
            x["aa_substitutions_reference"]
            .str.split()
            .map(len)
            .clip(upper=max_aa_subs)
            .map(lambda n: str(n) if n < max_aa_subs else f">{max_aa_subs - 1}")
        )
    )
    .groupby(["antibody_concentration", "n_subs"], as_index=False)
    .aggregate({"prob_escape": "mean", "prob_escape_uncensored": "mean"})
    .rename(
        columns={
            "prob_escape": "censored to [0, 1]",
            "prob_escape_uncensored": "not censored",
        }
    )
    .melt(
        id_vars=["antibody_concentration", "n_subs"],
        var_name="censored",
        value_name="probability escape",
    )
)

mean_prob_escape_chart = (
    alt.Chart(mean_prob_escape)
    .encode(
        x=alt.X("antibody_concentration"),
        y=alt.Y(
            "probability escape",
            scale=alt.Scale(type="symlog", constant=0.05),
        ),
        column=alt.Column("censored", title=None),
        color=alt.Color("n_subs", title="n substitutions"),
        tooltip=[
            alt.Tooltip(c, format=".3g") if mean_prob_escape[c].dtype == float else c
            for c in mean_prob_escape.columns
        ],
    )
    .mark_line(point=True, size=0.5)
    .properties(width=200, height=125)
    .configure_axis(grid=False)
)

mean_prob_escape_chart

  for col_name, dtype in df.dtypes.iteritems():


In [23]:
max_aa_subs = 4  # group if >= this many substitutions

mean_prob_escape = (
    old_prob_escape.assign(
        n_subs=lambda x: (
            x["aa_substitutions_reference"]
            .str.split()
            .map(len)
            .clip(upper=max_aa_subs)
            .map(lambda n: str(n) if n < max_aa_subs else f">{max_aa_subs - 1}")
        )
    )
    .groupby(["antibody_concentration", "n_subs"], as_index=False)
    .aggregate({"prob_escape": "mean", "prob_escape_uncensored": "mean"})
    .rename(
        columns={
            "prob_escape": "censored to [0, 1]",
            "prob_escape_uncensored": "not censored",
        }
    )
    .melt(
        id_vars=["antibody_concentration", "n_subs"],
        var_name="censored",
        value_name="probability escape",
    )
)

mean_prob_escape_chart = (
    alt.Chart(mean_prob_escape)
    .encode(
        x=alt.X("antibody_concentration"),
        y=alt.Y(
            "probability escape",
            scale=alt.Scale(type="symlog", constant=0.05),
        ),
        column=alt.Column("censored", title=None),
        color=alt.Color("n_subs", title="n substitutions"),
        tooltip=[
            alt.Tooltip(c, format=".3g") if mean_prob_escape[c].dtype == float else c
            for c in mean_prob_escape.columns
        ],
    )
    .mark_line(point=True, size=0.5)
    .properties(width=200, height=125)
    .configure_axis(grid=False)
)

mean_prob_escape_chart

  for col_name, dtype in df.dtypes.iteritems():


In [19]:
model = polyclonal.Polyclonal(
    n_epitopes=2,
    data_to_fit=prob_escape.rename(
        columns={
            "antibody_concentration": "concentration",
            "aa_substitutions_reference": "aa_substitutions",
        }
    ).query("concentration > 1.0"),
    alphabet=polyclonal.AAS_WITHSTOP_WITHGAP,
)

# fit model
opt_res = model.fit(
    logfreq=200,
    reg_escape_weight=0.1,
    reg_spread_weight=0.25,
    reg_activity_weight=1.0,
    reg_similarity_weight=0.3,  # regularize epitope similarity
)
model.mut_escape_plot()

# First fitting site-level model.
# Starting optimization of 1040 parameters at Tue Nov 15 10:54:21 2022.
         step     time_sec         loss     fit_loss   reg_escape   reg_spreadreg_similarity reg_activity
            0     0.068292       1193.1       1192.2            0            0            0      0.90499
          186       10.853       683.14       674.04       7.6229            0      0.12071       1.3598
# Successfully finished at Tue Nov 15 10:54:32 2022.
# Starting optimization of 6626 parameters at Tue Nov 15 10:54:32 2022.
         step     time_sec         loss     fit_loss   reg_escape   reg_spreadreg_similarity reg_activity
            0     0.071876       1075.1       994.96       66.197   2.2473e-31       12.567       1.3598
           96       7.3056       983.97       958.38       21.163       1.4684      0.98131       1.9734
# Successfully finished at Tue Nov 15 10:54:39 2022.


  for col_name, dtype in df.dtypes.iteritems():


In [30]:
model = polyclonal.Polyclonal(
    n_epitopes=1,
    data_to_fit=old_prob_escape.rename(
        columns={
            "antibody_concentration": "concentration",
            "aa_substitutions_reference": "aa_substitutions",
        }
    ).query("concentration > 1.0"),
    alphabet=polyclonal.AAS_WITHSTOP_WITHGAP,
)

# fit model
opt_res = model.fit(
    logfreq=200,
    reg_escape_weight=0.1,
    reg_spread_weight=0.25,
    reg_activity_weight=1.0,
#     reg_similarity_weight=0.2,  # regularize epitope similarity
)
model.mut_escape_plot()

# First fitting site-level model.
# Starting optimization of 521 parameters at Fri Nov 11 18:03:27 2022.
         step     time_sec         loss     fit_loss   reg_escape   reg_spreadreg_similarity reg_activity
            0     0.031831         8058       8057.1            0            0            0      0.90499
           62       2.0586       842.18       832.01       7.6178            0            0        2.552
# Successfully finished at Fri Nov 11 18:03:29 2022.
# Starting optimization of 3363 parameters at Fri Nov 11 18:03:29 2022.
         step     time_sec         loss     fit_loss   reg_escape   reg_spreadreg_similarity reg_activity
            0      0.03656       1240.9       1171.9        66.37   1.1674e-31            0        2.552
           62       2.5211       1161.7       1140.8       16.619       1.1369            0       3.1562
# Successfully finished at Fri Nov 11 18:03:32 2022.


  for col_name, dtype in df.dtypes.iteritems():


In [35]:
prob_escape = prob_escape.loc[prob_escape['antibody_concentration'] != 10.40]

display(
    prob_escape.groupby("antibody_concentration").aggregate(
        n_variants=pd.NamedAgg("barcode", "nunique")
    )
)

Unnamed: 0_level_0,n_variants
antibody_concentration,Unnamed: 1_level_1
1.37,28219
2.05,28219
3.08,28219
4.62,28219
6.93,28219


In [36]:
model = polyclonal.Polyclonal(
    n_epitopes=1,
    data_to_fit=prob_escape.rename(
        columns={
            "antibody_concentration": "concentration",
            "aa_substitutions_reference": "aa_substitutions",
        }
    ).query("concentration > 1.0"),
    alphabet=polyclonal.AAS_WITHSTOP_WITHGAP,
)

# fit model
opt_res = model.fit(
    logfreq=200,
    reg_escape_weight=0.1,
    reg_spread_weight=0.25,
    reg_activity_weight=1.0,
#     reg_similarity_weight=0.2,  # regularize epitope similarity
)
model.mut_escape_plot()

# First fitting site-level model.
# Starting optimization of 520 parameters at Fri Nov 11 18:09:16 2022.
         step     time_sec         loss     fit_loss   reg_escape   reg_spreadreg_similarity reg_activity
            0     0.026342       7102.1       7101.2            0            0            0      0.90499
           59       1.5312       744.64       735.05       7.0552            0            0       2.5325
# Successfully finished at Fri Nov 11 18:09:18 2022.
# Starting optimization of 3313 parameters at Fri Nov 11 18:09:18 2022.
         step     time_sec         loss     fit_loss   reg_escape   reg_spreadreg_similarity reg_activity
            0     0.029789       1099.3       1034.4       62.362   1.6394e-31            0       2.5325
           65       2.1113       1025.6         1004       17.253       1.2312            0       3.0914
# Successfully finished at Fri Nov 11 18:09:20 2022.


  for col_name, dtype in df.dtypes.iteritems():


In [40]:
# Parameters
prob_escape_csv = "results/prob_escape/libB_221108_1_3x-1C04_5G04_1_prob_escape.csv"
# pickle_file = "results/polyclonal_fits/libB_221108_1_1C04-5G04_1.pickle"
n_threads = 2


In [41]:
print(f"\nReading probabilities of escape from {prob_escape_csv}")

prob_escape = pd.read_csv(
    prob_escape_csv, keep_default_na=False, na_values="nan"
).query("`no-antibody_count` >= no_antibody_count_threshold")
assert prob_escape.notnull().all().all()


Reading probabilities of escape from results/prob_escape/libB_221108_1_3x-1C04_5G04_1_prob_escape.csv


In [42]:
# get information from config
with open("config.yaml") as f:
    config = yaml.safe_load(f)

antibody = prob_escape["antibody"].unique()
assert len(antibody) == 1, antibody
antibody = antibody[0]

# get site numbering map and the reference sites in order
site_numbering_map = pd.read_csv(config["site_numbering_map"])
reference_sites = site_numbering_map.sort_values("sequential_site")[
    "reference_site"
].tolist()

# get the polyclonal configuration for this antibody
with open(config["polyclonal_config"]) as f:
    polyclonal_config = yaml.safe_load(f)
if antibody not in polyclonal_config:
    raise ValueError(f"`polyclonal_config` lacks configuration for {antibody=}")
antibody_config = polyclonal_config[antibody]

# print names of variables and settings
print(f"{antibody=}")
print(f"{n_threads=}")
print(f"{pickle_file=}")
print(f"{antibody_config=}")

antibody='3x-1C04_5G04'
n_threads=2
pickle_file='results/polyclonal_fits/libB_221108_1_1C04-5G04_1.pickle'
antibody_config={'min_epitope_activity_to_include': 0.2, 'plot_kwargs': {'addtl_slider_stats': {'times_seen': 3, 'functional effect': -1.38}, 'slider_binding_range_kwargs': {'n_models': {'step': 1}, 'times_seen': {'step': 1, 'min': 1, 'max': 25}}, 'heatmap_max_at_least': 2, 'heatmap_min_at_least': -2}, 'max_epitopes': 1, 'fit_kwargs': {'reg_escape_weight': 0.1, 'reg_spread_weight': 0.25, 'reg_activity_weight': 1.0}}


In [43]:
display(
    prob_escape.groupby("antibody_concentration").aggregate(
        n_variants=pd.NamedAgg("barcode", "nunique")
    )
)

Unnamed: 0_level_0,n_variants
antibody_concentration,Unnamed: 1_level_1
0.86,28219
1.29,28219
1.93,28219
2.9,28219
4.35,28219
9.78,28219


In [44]:
prob_escape = prob_escape.loc[prob_escape['antibody_concentration'] != 0.86]

In [45]:
model = polyclonal.Polyclonal(
    n_epitopes=1,
    data_to_fit=prob_escape.rename(
        columns={
            "antibody_concentration": "concentration",
            "aa_substitutions_reference": "aa_substitutions",
        }
    ).query("concentration > 1.0"),
    alphabet=polyclonal.AAS_WITHSTOP_WITHGAP,
)

# fit model
opt_res = model.fit(
    logfreq=200,
    reg_escape_weight=0.1,
    reg_spread_weight=0.25,
    reg_activity_weight=1.0,
#     reg_similarity_weight=0.2,  # regularize epitope similarity
)
model.mut_escape_plot()

# First fitting site-level model.
# Starting optimization of 520 parameters at Fri Nov 11 18:31:58 2022.
         step     time_sec         loss     fit_loss   reg_escape   reg_spreadreg_similarity reg_activity
            0     0.029149         8381       8380.1            0            0            0      0.90499
           32      0.93758       241.03       234.86       2.4133            0            0       3.7505
# Successfully finished at Fri Nov 11 18:31:59 2022.
# Starting optimization of 3313 parameters at Fri Nov 11 18:31:59 2022.
         step     time_sec         loss     fit_loss   reg_escape   reg_spreadreg_similarity reg_activity
            0     0.030339       377.22       343.13       30.339    3.185e-32            0       3.7505
           37       1.2713        348.4        343.2       0.8619     0.072682            0       4.2671
# Successfully finished at Fri Nov 11 18:32:00 2022.


  for col_name, dtype in df.dtypes.iteritems():


In [46]:
# Parameters
old_prob_escape_csv = "scratch_notebooks/221108_pre-config-change_prob-escapes/libB_221108_1_3x-1C04_5G04_1_prob_escape.csv"
# pickle_file = "results/polyclonal_fits/libB_221108_1_1C04-5G04_1.pickle"
n_threads = 2


In [49]:
print(f"\nReading probabilities of escape from {prob_escape_csv}")

old_prob_escape = pd.read_csv(
    old_prob_escape_csv, keep_default_na=False, na_values="nan"
).query("`no-antibody_count` >= no_antibody_count_threshold")
assert old_prob_escape.notnull().all().all()


Reading probabilities of escape from results/prob_escape/libB_221108_1_3x-1C04_5G04_1_prob_escape.csv


In [52]:
old_prob_escape = old_prob_escape.loc[old_prob_escape['antibody_concentration'] != 0.86]

In [53]:
model = polyclonal.Polyclonal(
    n_epitopes=1,
    data_to_fit=old_prob_escape.rename(
        columns={
            "antibody_concentration": "concentration",
            "aa_substitutions_reference": "aa_substitutions",
        }
    ).query("concentration > 1.0"),
    alphabet=polyclonal.AAS_WITHSTOP_WITHGAP,
)

# fit model
opt_res = model.fit(
    logfreq=200,
    reg_escape_weight=0.1,
    reg_spread_weight=0.25,
    reg_activity_weight=1.0,
#     reg_similarity_weight=0.2,  # regularize epitope similarity
)
model.mut_escape_plot()

# First fitting site-level model.
# Starting optimization of 521 parameters at Fri Nov 11 18:34:43 2022.
         step     time_sec         loss     fit_loss   reg_escape   reg_spreadreg_similarity reg_activity
            0     0.029911       9240.2       9239.3            0            0            0      0.90499
           29      0.86552       266.98       260.58       2.6501            0            0       3.7498
# Successfully finished at Fri Nov 11 18:34:44 2022.
# Starting optimization of 3363 parameters at Fri Nov 11 18:34:44 2022.
         step     time_sec         loss     fit_loss   reg_escape   reg_spreadreg_similarity reg_activity
            0     0.033837       420.35       383.63        32.98   3.4309e-32            0       3.7498
           32       1.2075       388.52        383.3      0.84563     0.066883            0       4.3132
# Successfully finished at Fri Nov 11 18:34:45 2022.


  for col_name, dtype in df.dtypes.iteritems():
