In [1]:
import pickle

import altair as alt

import pandas as pd

import polyclonal

import warnings
warnings.filterwarnings('ignore')

from IPython.utils import io

In [2]:
import os

# Work from two directories above the kernel's starting directory so the
# relative "results/..." paths used below resolve from there.
# The starting directory is recorded the first time this cell runs, so
# re-running the cell lands in the same place instead of climbing two more
# levels on every execution.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, "..", ".."))

In [3]:
# set up function for mean prob escape chart to avoid clutter from large block of code

def plot_avg_escape(prob_escape, max_aa_subs=4):
    """Chart mean probability of escape against antibody concentration.

    Variants are grouped by their number of substitutions, taken as the
    count of whitespace-separated tokens in ``aa_substitutions_reference``.
    Variants with ``max_aa_subs`` or more substitutions are pooled into one
    group labelled ``">{max_aa_subs - 1}"``. For every
    (concentration, substitution group) pair the mean of ``prob_escape`` and
    of ``prob_escape_uncensored`` is plotted in its own column facet.

    Parameters
    ----------
    prob_escape : pandas.DataFrame
        Needs the columns ``aa_substitutions_reference`` (strings),
        ``antibody_concentration``, ``prob_escape`` and
        ``prob_escape_uncensored``.
    max_aa_subs : int, optional
        Pool variants with at least this many substitutions. Defaults to 4,
        the value that was previously hard-coded.

    Returns
    -------
    altair.Chart
        Line chart with points, faceted by censored / not censored values.

    Raises
    ------
    ValueError
        If ``max_aa_subs`` is smaller than 1.
    """
    if max_aa_subs < 1:
        raise ValueError(f"max_aa_subs must be >= 1, got {max_aa_subs}")

    pooled_label = f">{max_aa_subs - 1}"

    mean_prob_escape = (
        prob_escape.assign(
            n_subs=lambda x: (
                x["aa_substitutions_reference"]
                .str.split()
                .map(len)
                # counts at or above the cap share a single label
                .map(lambda n: str(n) if n < max_aa_subs else pooled_label)
            )
        )
        .groupby(["antibody_concentration", "n_subs"], as_index=False)
        .aggregate({"prob_escape": "mean", "prob_escape_uncensored": "mean"})
        .rename(
            columns={
                "prob_escape": "censored to [0, 1]",
                "prob_escape_uncensored": "not censored",
            }
        )
        # long format: one row per (concentration, n_subs, censored-or-not)
        .melt(
            id_vars=["antibody_concentration", "n_subs"],
            var_name="censored",
            value_name="probability escape",
        )
    )

    mean_prob_escape_chart = (
        alt.Chart(mean_prob_escape)
        .encode(
            x=alt.X("antibody_concentration"),
            y=alt.Y(
                "probability escape",
                scale=alt.Scale(type="symlog", constant=0.05),
            ),
            column=alt.Column("censored", title=None),
            color=alt.Color("n_subs", title="n substitutions"),
            # float columns get 3 significant digits in the tooltip
            tooltip=[
                alt.Tooltip(c, format=".3g") if mean_prob_escape[c].dtype == float else c
                for c in mean_prob_escape.columns
            ],
        )
        .mark_line(point=True, size=0.5)
        .properties(width=200, height=125)
        .configure_axis(grid=False)
    )

    return mean_prob_escape_chart

In [4]:
def generate_model(
    prob_escape_df,
    n_epitopes=1
):
    """Fit a ``polyclonal.Polyclonal`` model and return its mutation-escape plot.

    Parameters
    ----------
    prob_escape_df : pandas.DataFrame
        Escape-probability data. The columns ``antibody_concentration`` and
        ``aa_substitutions_reference`` are renamed to ``concentration`` and
        ``aa_substitutions`` before being passed as ``data_to_fit``.
    n_epitopes : int, optional
        Number of epitopes passed to ``polyclonal.Polyclonal`` (default 1).

    Returns
    -------
    The object returned by ``model.mut_escape_plot``, with a ``times_seen``
    slider initialised to 3 and ``init_floor_at_zero=False``.
    """
    data_to_fit = prob_escape_df.rename(
        columns={
            "antibody_concentration": "concentration",
            "aa_substitutions_reference": "aa_substitutions",
        }
    )

    model = polyclonal.Polyclonal(
        n_epitopes=n_epitopes,
        data_to_fit=data_to_fit,
        alphabet=polyclonal.AAS_WITHSTOP_WITHGAP,
    )

    # fit model, suppressing output text to avoid clutter in notebook;
    # neither the captured output nor the optimisation result is used
    with io.capture_output():
        model.fit(
            logfreq=200,
            reg_escape_weight=0.1,
        )

    return model.mut_escape_plot(
        addtl_slider_stats={"times_seen": 3},
        init_floor_at_zero=False,
    )

In [5]:
# Read the escape probabilities for antibody 2323; only the literal string
# "nan" is parsed as missing.
prob_escape_2323 = pd.read_csv(
    "results/prob_escape/libA_230221_1_2323_1_prob_escape.csv",
    na_values="nan",
    keep_default_na=False,
).loc[
    # keep only variants with sufficient no-antibody counts
    lambda df: df["no-antibody_count"] >= df["no_antibody_count_threshold"]
]
# every remaining cell must be populated
assert not prob_escape_2323.isnull().any().any()
prob_escape_2323.head()

Unnamed: 0,library,antibody_sample,no-antibody_sample,aa_substitutions_sequential,n_aa_substitutions,barcode,prob_escape,prob_escape_uncensored,antibody_count,no-antibody_count,antibody_neut_standard_count,no-antibody_neut_standard_count,total_no_antibody_count,no_antibody_count_threshold,aa_substitutions_reference,antibody,antibody_concentration
0,libA,230221_1_antibody_2323_0.028125_1,230221_1_no-antibody_control_1,E69K K208L Q216T L263M R318S N464R,6,CAGTGGAATAGTAATA,0.3753,0.3753,37685,1285,6795275,86955,11688674,23,E50K K189L Q197T L244M R299S N445R,2323,0.0281
1,libA,230221_1_antibody_2323_0.028125_1,230221_1_no-antibody_control_1,S73N K208E L263I,3,AACCAGGGGTAAAGAG,0.1841,0.1841,37512,2607,6795275,86955,11688674,23,S54N K189E L244I,2323,0.0281
2,libA,230221_1_antibody_2323_0.028125_1,230221_1_no-antibody_control_1,K102H R111E K208E K297W,4,TTCTCCATTAAAGTAA,0.5495,0.5495,30448,709,6795275,86955,11688674,23,K83H R92E K189E K278W,2323,0.0281
3,libA,230221_1_antibody_2323_0.028125_1,230221_1_no-antibody_control_1,I22D K140S A182Q K208G R288V E409N,6,TTAGTATGCCGATAAA,1.0,1.079,25550,303,6795275,86955,11688674,23,I3D K121S A163Q K189G R269V E390N,2323,0.0281
4,libA,230221_1_antibody_2323_0.028125_1,230221_1_no-antibody_control_1,E69T K208G N235V Q382F T407S Q520G,6,GTCTAGAGTTTGATCT,0.2041,0.2041,22550,1414,6795275,86955,11688674,23,E50T K189G N216V Q363F T388S Q501G,2323,0.0281


In [6]:
# Chart mean escape probability vs. antibody concentration for the 2323 data.
plot_avg_escape(prob_escape_2323)

In [9]:
# Fit a model with the default n_epitopes=1 to the 2323 data and display its
# mutation-escape plot.
generate_model(prob_escape_2323)

In [10]:
# Read the escape probabilities for antibody 2367; only the literal string
# "nan" is parsed as missing.
prob_escape_2367 = pd.read_csv(
    "results/prob_escape/libA_230221_1_2367_1_prob_escape.csv", keep_default_na=False, na_values="nan"
).query(
    "`no-antibody_count` >= no_antibody_count_threshold"
)  # filter for those with sufficient no-antibody counts
# Validate the frame loaded in this cell (the check previously looked at
# prob_escape_2323, so nulls in the 2367 data went unnoticed).
assert prob_escape_2367.notnull().all().all()
plot_avg_escape(prob_escape_2367)

In [11]:
# Fit a model with the default n_epitopes=1 to the 2367 data and display its
# mutation-escape plot.
generate_model(prob_escape_2367)

In [12]:
# Read the escape probabilities for antibody 2462; only the literal string
# "nan" is parsed as missing.
prob_escape_2462 = pd.read_csv(
    "results/prob_escape/libA_230221_1_2462_1_prob_escape.csv",
    na_values="nan",
    keep_default_na=False,
).loc[
    # keep only variants with sufficient no-antibody counts
    lambda df: df["no-antibody_count"] >= df["no_antibody_count_threshold"]
]
# every remaining cell must be populated
assert not prob_escape_2462.isnull().any().any()

plot_avg_escape(prob_escape_2462)

In [13]:
# Fit a model with the default n_epitopes=1 to the 2462 data and display its
# mutation-escape plot.
generate_model(prob_escape_2462)

In [18]:
# Keep only rows measured at concentrations below 0.03, then re-plot.
prob_escape_2462_filtered = prob_escape_2462[
    prob_escape_2462["antibody_concentration"].lt(0.03)
]
plot_avg_escape(prob_escape_2462_filtered)

In [19]:
# Refit on the concentration-filtered 2462 data and display the escape plot.
generate_model(prob_escape_2462_filtered)

In [14]:
# Read the escape probabilities for antibody 2389; only the literal string
# "nan" is parsed as missing.
prob_escape_2389 = pd.read_csv(
    "results/prob_escape/libA_230221_1_2389_1_prob_escape.csv",
    na_values="nan",
    keep_default_na=False,
).loc[
    # keep only variants with sufficient no-antibody counts
    lambda df: df["no-antibody_count"] >= df["no_antibody_count_threshold"]
]
# every remaining cell must be populated
assert not prob_escape_2389.isnull().any().any()

plot_avg_escape(prob_escape_2389)

In [15]:
# Fit a model with the default n_epitopes=1 to the 2389 data and display its
# mutation-escape plot.
generate_model(prob_escape_2389)

In [16]:
# Keep only rows measured at concentrations above 0.015, then re-plot.
prob_escape_2389_filtered = prob_escape_2389[
    prob_escape_2389["antibody_concentration"].gt(0.015)
]
plot_avg_escape(prob_escape_2389_filtered)

In [17]:
# Refit on the concentration-filtered 2389 data and display the escape plot.
generate_model(prob_escape_2389_filtered)

In [6]:
# Read the escape probabilities for antibody 2323; only the literal string
# "nan" is parsed as missing.
# NOTE(review): this rebinds prob_escape_2323 from the same path an earlier
# cell already read — confirm the reload is intentional.
prob_escape_2323 = pd.read_csv(
    "results/prob_escape/libA_230221_1_2323_1_prob_escape.csv",
    na_values="nan",
    keep_default_na=False,
).loc[
    # keep only variants with sufficient no-antibody counts
    lambda df: df["no-antibody_count"] >= df["no_antibody_count_threshold"]
]
# Null check left disabled: assert prob_escape_2323.notnull().all().all()
# (presumably because `antibody_count_threshold` is empty in the rows shown
# below — TODO confirm)
prob_escape_2323.head()

Unnamed: 0,library,antibody_sample,no-antibody_sample,aa_substitutions_sequential,n_aa_substitutions,barcode,prob_escape,prob_escape_uncensored,antibody_count,no-antibody_count,antibody_neut_standard_count,no-antibody_neut_standard_count,no_antibody_count_threshold,antibody_count_threshold,aa_substitutions_reference,retain,antibody,antibody_concentration
0,libA,230221_1_antibody_2323_0.028125_1,230221_1_no-antibody_control_1,E69K K208L Q216T L263M R318S N464R,6,CAGTGGAATAGTAATA,0.3753,0.3753,37685,1285,6795275,86955,23,,E50K K189L Q197T L244M R299S N445R,True,2323,0.0281
1,libA,230221_1_antibody_2323_0.028125_1,230221_1_no-antibody_control_1,S73N K208E L263I,3,AACCAGGGGTAAAGAG,0.1841,0.1841,37512,2607,6795275,86955,23,,S54N K189E L244I,True,2323,0.0281
2,libA,230221_1_antibody_2323_0.028125_1,230221_1_no-antibody_control_1,K102H R111E K208E K297W,4,TTCTCCATTAAAGTAA,0.5495,0.5495,30448,709,6795275,86955,23,,K83H R92E K189E K278W,True,2323,0.0281
3,libA,230221_1_antibody_2323_0.028125_1,230221_1_no-antibody_control_1,I22D K140S A182Q K208G R288V E409N,6,TTAGTATGCCGATAAA,1.0,1.079,25550,303,6795275,86955,23,,I3D K121S A163Q K189G R269V E390N,True,2323,0.0281
4,libA,230221_1_antibody_2323_0.028125_1,230221_1_no-antibody_control_1,E69T K208G N235V Q382F T407S Q520G,6,GTCTAGAGTTTGATCT,0.2041,0.2041,22550,1414,6795275,86955,23,,E50T K189G N216V Q363F T388S Q501G,True,2323,0.0281


In [7]:
# Drop the rows recorded at concentration 0.0125.
prob_escape_2323_filtered = prob_escape_2323[
    prob_escape_2323["antibody_concentration"].ne(0.0125)
]

In [9]:
# Chart mean escape probability for the 2323 data without concentration 0.0125.
plot_avg_escape(prob_escape_2323_filtered)

In [10]:
# Drop the rows recorded at concentration 0.0187.
prob_escape_2323_filtered_2 = prob_escape_2323[
    prob_escape_2323["antibody_concentration"].ne(0.0187)
]

In [11]:
# Drop the rows recorded at either concentration 0.0187 or 0.0125.
prob_escape_2323_filtered_3 = prob_escape_2323.loc[
    lambda df: ~df["antibody_concentration"].isin([0.0187, 0.0125])
]

In [12]:
# Chart mean escape probability for the 2323 data without concentrations
# 0.0187 and 0.0125.
plot_avg_escape(prob_escape_2323_filtered_3)

In [13]:
# Fit on the unfiltered 2323 data, for comparison with the filtered fits that follow.
generate_model(prob_escape_2323)

In [14]:
# Fit on the 2323 data without concentration 0.0125.
generate_model(prob_escape_2323_filtered)

In [15]:
# Fit on the 2323 data without concentration 0.0187.
generate_model(prob_escape_2323_filtered_2)

In [16]:
# Fit on the 2323 data without concentrations 0.0187 and 0.0125.
generate_model(prob_escape_2323_filtered_3)

In [17]:
# Read the escape probabilities for antibody 3857; only the literal string
# "nan" is parsed as missing.
prob_escape_3857 = pd.read_csv(
    "results/prob_escape/libA_230403_1_3857_1_prob_escape.csv",
    na_values="nan",
    keep_default_na=False,
).loc[
    # keep only variants with sufficient no-antibody counts
    lambda df: df["no-antibody_count"] >= df["no_antibody_count_threshold"]
]
# Null check left disabled: assert prob_escape_3857.notnull().all().all()
# (presumably because `antibody_count_threshold` is empty in the rows shown
# below — TODO confirm)
prob_escape_3857.head()

Unnamed: 0,library,antibody_sample,no-antibody_sample,aa_substitutions_sequential,n_aa_substitutions,barcode,prob_escape,prob_escape_uncensored,antibody_count,no-antibody_count,antibody_neut_standard_count,no-antibody_neut_standard_count,no_antibody_count_threshold,antibody_count_threshold,aa_substitutions_reference,retain,antibody,antibody_concentration
0,libA,230403_1_antibody_3857_0.062643_1,230403_1_no-antibody_control_1,K297I,1,ATAACACAAAAAAGTA,0.0,0.0,7290,336332,9979247,21938,24,,K278I,True,3857,0.0626
1,libA,230403_1_antibody_3857_0.062643_1,230403_1_no-antibody_control_1,R111S V366M R402S,3,TATCTACCTAACGAAA,0.0001,0.0001,5026,73990,9979247,21938,24,,R92S V347M R383S,True,3857,0.0626
2,libA,230403_1_antibody_3857_0.062643_1,230403_1_no-antibody_control_1,K208E R280F V366H,3,ACTCACACGAGGAAGA,0.079,0.079,4671,130,9979247,21938,24,,K189E R261F V347H,True,3857,0.0626
3,libA,230403_1_antibody_3857_0.062643_1,230403_1_no-antibody_control_1,L89I L263H Q520R,3,CTCTTTAAAATCCATT,0.0002,0.0002,3735,49036,9979247,21938,24,,L70I L244H Q501R,True,3857,0.0626
4,libA,230403_1_antibody_3857_0.062643_1,230403_1_no-antibody_control_1,D123H K208E,2,AAGCCACAAGGTACTA,0.0016,0.0016,3726,5257,9979247,21938,24,,D104H K189E,True,3857,0.0626


In [18]:
# Chart mean escape probability vs. antibody concentration for the 3857 data.
plot_avg_escape(prob_escape_3857)

In [23]:
# Keep only rows measured at concentrations above 0.01.
prob_escape_3857_filtered = prob_escape_3857[
    prob_escape_3857["antibody_concentration"].gt(0.01)
]

In [24]:
# Chart mean escape probability for the 3857 data at concentrations above 0.01.
plot_avg_escape(prob_escape_3857_filtered)

In [25]:
# Read the escape probabilities for antibody 2343; only the literal string
# "nan" is parsed as missing.
prob_escape_2343 = pd.read_csv(
    "results/prob_escape/libA_230317_1_2343_1_prob_escape.csv",
    na_values="nan",
    keep_default_na=False,
).loc[
    # keep only variants with sufficient no-antibody counts
    lambda df: df["no-antibody_count"] >= df["no_antibody_count_threshold"]
]
# Null check left disabled: assert prob_escape_2343.notnull().all().all()
# (presumably because `antibody_count_threshold` is empty in the rows shown
# below — TODO confirm)
prob_escape_2343.head()

Unnamed: 0,library,antibody_sample,no-antibody_sample,aa_substitutions_sequential,n_aa_substitutions,barcode,prob_escape,prob_escape_uncensored,antibody_count,no-antibody_count,antibody_neut_standard_count,no-antibody_neut_standard_count,no_antibody_count_threshold,antibody_count_threshold,aa_substitutions_reference,retain,antibody,antibody_concentration
0,libA,230317_1_antibody_2343_0.012516_1,230317_1_no-antibody_control_1,P122V S212D Q216K G405D,4,AGACCGGGACTCCTCA,0.2119,0.2119,26503,2218,4456957,79037,25,,P103V S193D Q197K G386D,True,2343,0.0125
1,libA,230317_1_antibody_2343_0.012516_1,230317_1_no-antibody_control_1,I211V S212D L263F V366M,4,GTCGTAGATTGAATGC,0.0786,0.0786,10174,2294,4456957,79037,25,,I192V S193D L244F V347M,True,2343,0.0125
2,libA,230317_1_antibody_2343_0.012516_1,230317_1_no-antibody_control_1,Q63V K154N R318M,3,GTCCCGCATCAGTTTC,0.0021,0.0021,9207,77381,4456957,79037,25,,Q44V K135N R299M,True,2343,0.0125
3,libA,230317_1_antibody_2343_0.012516_1,230317_1_no-antibody_control_1,F467W,1,TAAGATATAGGTAGAA,0.1932,0.1932,6100,560,4456957,79037,25,,F448W,True,2343,0.0125
4,libA,230317_1_antibody_2343_0.012516_1,230317_1_no-antibody_control_1,N27A S164T W253Y L263I N408S T435C,6,TAAAACGGGAATTTAA,0.3499,0.3499,5939,301,4456957,79037,25,,N8A S145T W234Y L244I N389S T416C,True,2343,0.0125


In [26]:
# Keep only rows measured at concentrations above 0.004.
prob_escape_2343_filtered = prob_escape_2343[
    prob_escape_2343["antibody_concentration"].gt(0.004)
]

In [27]:
# Chart mean escape probability for the 2343 data at concentrations above 0.004.
plot_avg_escape(prob_escape_2343_filtered)

In [28]:
# Fit a model with the default n_epitopes=1 to the filtered 2343 data and
# display its mutation-escape plot.
generate_model(prob_escape_2343_filtered)