# Create filtered CSVs

In [1]:
# Imports 
import os
import yaml
import warnings
import pandas as pd

File paths for data:

In [2]:
# this cell is tagged as `parameters` for papermill parameterization
# Every value below is a `None` placeholder; real paths must be supplied
# before the later cells run (they open / read / write these paths).

# Inputs
func_effects = None  # CSV of functional effects, read with `pd.read_csv`
func_effects_config = None  # YAML config with the functional-effect filter settings
site_numbering_map = None  # CSV site map; its `reference_site` column is renamed to `site`

antibody_escape_dir = None  # directory prefix for the per-antibody `<antibody>_mut_effect.csv` files
antibody_escape_config = None  # YAML config with the antibody-escape filter settings

# Outputs
filtered_antibody_csv_dir = None  # directory receiving `<antibody>_filtered_mut_effects.csv`
filtered_func_csv_dir = None  # directory created before the filtered functional-effects CSV is saved
filtered_func_effects = None  # path the filtered functional-effects CSV is written to

In [3]:
# # Uncomment to run this notebook interactively
# func_effects = "../results/func_effects/averages/HEK293T_entry_func_effects.csv"
# func_effects_config = "../data/func_effects_config.yml"
# site_numbering_map = "../data/site_numbering_map.csv"

# antibody_escape_dir = "../results/antibody_escape/averages/"
# antibody_escape_config = "../data/antibody_escape_config.yml"

# filtered_antibody_csv_dir = "../results/filtered_antibody_escape_CSVs/"
# filtered_func_csv_dir = "../results/filtered_func_effects_CSV/"
# filtered_func_effects = "../results/filtered_func_effects_CSV/HEK293T_filitered_entry_func_effects.csv"

Get filtered CSV for antibody escape

In [4]:
# Parse the antibody-escape YAML config and report each filtering threshold
with open(antibody_escape_config) as config_handle:
    antibody_config = yaml.safe_load(config_handle)

print("Parameters to filter antibody escape data by:")

# Thresholds stored under the "functional effect" hide-stats section
func_effect_stats = antibody_config["plot_hide_stats_default"]["functional effect"]
# NOTE(review): `min_models` is taken from the functional-effect `times_seen`
# filter, not from a models-specific key -- confirm this is intended.
min_models = func_effect_stats["min_filters"]["times_seen"]
print(f"Minimum models default: {min_models}")
cell_entry_default = func_effect_stats["init"]
print(f"Minimum cell entry: {cell_entry_default}")

# Thresholds stored under the average-escape plot kwargs
escape_plot_kwargs = antibody_config["avg_escape_plot_kwargs_default"]
slider_stats = escape_plot_kwargs["addtl_slider_stats"]
times_seen = slider_stats["times_seen"]
print(f"Minimum times seen: {times_seen}")
std = slider_stats["escape_std"]
print(f"Maximum STDEV: {std}")
escape_metric = escape_plot_kwargs["avg_type"]
print(f"Escape score metric: {escape_metric}")

# Site numbering map, with `reference_site` renamed so it can be merged on `site`
site_map = pd.read_csv(site_numbering_map).rename(columns={"reference_site": "site"})

Parameters to filter antibody escape data by:
Minimum models default: 2
Minimum cell entry: -5
Minimum times seen: 2
Maximum STDEV: 4
Escape score metric: median


In [5]:
def get_filtered_csv(
    antibody_name,
    func_effects_file,
    min_times_seen,
    min_func_score,
    min_models,
    max_std,
    escape_metric,
    site_map,
    filtered_file_dir=None,
    escape_dir=None,
):
    """
    Filter one antibody's escape CSV and optionally write the result.

    Parameters
    ----------
    antibody_name : str
        Antibody whose ``<antibody_name>_mut_effect.csv`` file is read.
    func_effects_file : str
        Path to the functional-effects CSV (needs ``site``, ``wildtype``,
        ``mutant`` and ``effect`` columns).
    min_times_seen : number
        Keep escape rows with ``times_seen >= min_times_seen``.
    min_func_score : number
        Mutations whose functional ``effect`` is below this value (or that
        are missing from the functional-effects file) are flagged as
        ``poor_cell_entry`` and get their escape value set to NaN.
    min_models : number
        Keep escape rows with ``n_models >= min_models``.
    max_std : number
        Keep escape rows with ``escape_std <= max_std``.
    escape_metric : str
        Suffix of the escape column to use; ``escape_<escape_metric>`` is
        renamed to ``escape``.
    site_map : pandas.DataFrame
        Site numbering map merged on ``site``; must provide
        ``natural_sequence_site`` and ``sequential_site``.
    filtered_file_dir : str, optional
        If given, the result is written to
        ``<filtered_file_dir>/<antibody_name>_filtered_mut_effects.csv``.
    escape_dir : str, optional
        Directory holding the escape CSVs. Defaults to the notebook-level
        ``antibody_escape_dir`` parameter.

    Returns
    -------
    pandas.DataFrame
        The filtered escape table (also returned when a file is written).
    """
    if escape_dir is None:
        # Fall back to the notebook-level papermill parameter
        escape_dir = antibody_escape_dir

    # Load data
    escape_df = pd.read_csv(os.path.join(escape_dir, antibody_name + "_mut_effect.csv"))
    func_df = pd.read_csv(func_effects_file)

    # Create mutation column to match antibody df
    # **
    # Pipeline does not filter functional scores
    # prior to filtering antibody selections
    # **
    func_df["site"] = func_df["site"].astype(str)
    func_df["mutation"] = func_df["wildtype"] + func_df["site"] + func_df["mutant"]
    # A set gives constant-time membership tests; NaN labels are dropped so
    # they can never count as a passing mutation
    entry_ok_mutations = set(
        func_df.loc[func_df["effect"] >= min_func_score, "mutation"].dropna()
    )

    # Filter escape df for min times seen, min number of models,
    # no stop codons, and max standard deviation. The `max_std` argument is
    # used here rather than a notebook global.
    escape_col = "escape_" + escape_metric
    keep_rows = (
        (escape_df["times_seen"] >= min_times_seen)
        & (escape_df["n_models"] >= min_models)
        & (escape_df["mutant"] != "*")
        & (escape_df["escape_std"] <= max_std)
    )
    escape_df = (
        escape_df.loc[keep_rows]
        .rename(columns={escape_col: "escape"})
        .reset_index(drop=True)
        # Merge site map
        .merge(site_map, how="left", on=["site"], validate="many_to_one")
    )

    # Natural sequence mutation
    escape_df["natural_sequence_mutation"] = (
        escape_df["wildtype"]
        + escape_df["natural_sequence_site"].astype(str)
        + escape_df["mutant"]
    )

    # Drop all but a few columns; copy so the column assignments below act
    # on an independent frame
    escape_df = escape_df[[
        "site",
        "natural_sequence_site",
        "sequential_site",
        "wildtype",
        "mutant",
        "mutation",
        "natural_sequence_mutation",
        "escape",
    ]].copy()

    # Mark mutations that are below functional cutoff
    # and replace measurements with NaN (vectorized, so it is also safe on
    # an empty frame)
    escape_df["poor_cell_entry"] = ~escape_df["mutation"].isin(entry_ok_mutations)
    escape_df["escape"] = escape_df["escape"].mask(escape_df["poor_cell_entry"])

    # Create a floored escape column
    escape_df["floored_escape"] = escape_df["escape"].clip(lower=0)

    # Write filtered escape to csv
    if filtered_file_dir is not None:
        escape_df.to_csv(
            os.path.join(filtered_file_dir, antibody_name + "_filtered_mut_effects.csv"),
            index=False,
        )

    return escape_df

In [6]:
# Antibodies whose averaged escape CSVs are filtered and written out
antibodies = [
    "RVC20",
    "RVA122",
    "17C7",
    "RVC58",
    "CR4098",
    "CR57",
    "CTB012",
    "RVC68",
]

# Make output dir if it doesn't exist. Done once, before the loop;
# `makedirs` also creates missing parent directories and `exist_ok=True`
# avoids the separate existence check.
os.makedirs(filtered_antibody_csv_dir, exist_ok=True)

for antibody in antibodies:
    # Thresholds come from the antibody-escape config loaded earlier
    get_filtered_csv(
        antibody_name=antibody,
        func_effects_file=func_effects,
        min_times_seen=times_seen,
        min_func_score=cell_entry_default,
        min_models=min_models,
        max_std=std,
        escape_metric=escape_metric,
        site_map=site_map,
        filtered_file_dir=filtered_antibody_csv_dir,
    )

Get filtered CSV for functional effects

In [7]:
# Load config file and extract params
with open(func_effects_config) as f:
    config = yaml.safe_load(f)
print("Parameters to filter functional effects data by:")
min_times_seen = config["avg_func_effect_shifts_default"]["plot_kwargs"]["addtl_slider_stats"]["times_seen"]
print(f"Minimum times seen: {min_times_seen}")

# Read CSV file
func_scores = pd.read_csv(func_effects)

# Merge site map
func_scores = (
    func_scores.merge(
        site_map,
        how="left",
        on=["site"],
        validate="many_to_one",
    )
)

# Natural sequence mutation
func_scores["natural_sequence_mutation"] = (
    func_scores["wildtype"] + func_scores["natural_sequence_site"].astype(str) + func_scores["mutant"]
)

# Keep rows that meet the minimum times seen; wildtype rows
# (wildtype == mutant) are always kept regardless of times seen.
# The bookkeeping columns are then dropped from the output.
func_scores = (
    func_scores.loc[
        (
            (func_scores["times_seen"] >= min_times_seen)
        )
        |
        (func_scores["wildtype"] == func_scores["mutant"])
    ]
    .drop(columns=[
        "effect_std",
        "sequential_wt",
        "reference_wt",
        "times_seen",
        "n_selections",
    ])
    .reset_index(drop=True)
)

# Make output dir if it doesn't exist; `makedirs` also creates missing
# parent directories and `exist_ok=True` avoids the separate existence check
os.makedirs(filtered_func_csv_dir, exist_ok=True)

# Save CSV file
func_scores.to_csv(filtered_func_effects, index=False)

Parameters to filter functional effects data by:
Minimum times seen: 2
