### Filter antibody escape
This notebook filters the antibody escape data using thresholds defined in the config file so that only high-quality measurements are retained.

In [None]:
import pandas as pd

In [None]:
# Everything this notebook needs is supplied by the Snakemake rule that runs it.
_smk_params = snakemake.params
_smk_in = snakemake.input
_smk_out = snakemake.output

# Filtering thresholds
TIMES_SEEN_AB = _smk_params.times_seen_ab      # lower bound applied to the "times_seen_ab" column
MIN_FUNC_EFFECT = _smk_params.min_func_effect  # lower bound applied to the "effect" column
ESCAPE_STD_DEV = _smk_params.escape_std_dev    # upper bound applied to the "escape_std" column

# CSV files read by this notebook
escape_df_path = _smk_in.escape_df
entry_df_path = _smk_in.entry_df

# CSV files written by this notebook
escape_effects_filtered_path = _smk_out.escape_effects_filtered
escape_effects_filtered_no_effect_cutoff_path = _smk_out.escape_effects_filtered_no_effect_cutoff
escape_effects_filtered_mean_path = _smk_out.escape_effects_filtered_mean
escape_effects_filtered_sum_path = _smk_out.escape_effects_filtered_sum


In [None]:
# Load the antibody-escape table and the cell-entry table.
mab_escape_df = pd.read_csv(escape_df_path)
entry_df = pd.read_csv(entry_df_path)

# Attach the entry measurements to every escape row that shares the same
# (site, wildtype, mutant). It is a left join, so escape rows without an
# entry measurement are kept with missing entry values. Column names that
# occur in both tables get the "_ab" / "_entry" suffix.
merged_df = mab_escape_df.merge(
    entry_df,
    how="left",
    on=["site", "wildtype", "mutant"],
    suffixes=("_ab", "_entry"),
)

# One boolean mask per filtering criterion.
_seen_enough = merged_df["times_seen_ab"] >= TIMES_SEEN_AB
# A missing "effect" compares as False, so such rows fail this criterion.
_entry_ok = merged_df["effect"] >= MIN_FUNC_EFFECT
_low_noise = merged_df["escape_std"] <= ESCAPE_STD_DEV
# Drop stop codons ("*") and deletions ("-").
_not_stop_or_gap = ~merged_df["mutant"].isin(["*", "-"])

# Criteria shared by both output tables.
_passes_quality = _seen_enough & _low_noise & _not_stop_or_gap

# Table 1: all criteria, including the entry-effect cutoff.
filtered_escape = merged_df[_passes_quality & _entry_ok]
filtered_escape.round(3).to_csv(escape_effects_filtered_path, index=False)

# Table 2: the same quality criteria, but without the entry-effect cutoff.
filtered_escape_no_effect_cutoff = merged_df[_passes_quality]
filtered_escape_no_effect_cutoff.round(3).to_csv(
    escape_effects_filtered_no_effect_cutoff_path, index=False
)

In [None]:
def _site_level_escape(df, how, out_col):
    """Collapse per-mutation escape to one row per site.

    Groups ``df`` by "site", aggregates "escape_mean" with ``how`` (for
    example "sum" or "mean"), keeps the first "wildtype" value seen at each
    site, rounds numeric columns to 3 decimals and renames the aggregated
    column to ``out_col``.
    """
    per_site = df.groupby(["site"]).agg({"escape_mean": how, "wildtype": "first"})
    per_site = per_site.reset_index().round(3)
    return per_site.rename(columns={"escape_mean": out_col})


# Per-site summed and averaged escape, computed from the fully filtered table.
mAb_filtered_sum = _site_level_escape(filtered_escape, "sum", "sum_escape")
mAb_filtered_mean = _site_level_escape(filtered_escape, "mean", "mean_escape")

# Write both per-site summaries.
mAb_filtered_sum.to_csv(escape_effects_filtered_sum_path, index=False)
mAb_filtered_mean.to_csv(escape_effects_filtered_mean_path, index=False)