# Filter CSVs

This notebook filters CSV outputs from several analyses with filter settings typically used by the Bloom Lab. 

Import modules and set up output directories:

In [1]:
import os
import pandas as pd

# Create the output directory for the filtered CSVs. `exist_ok=True` makes this
# safe to re-run and avoids the check-then-create race of testing for the
# directory first.
os.makedirs('./results/filtered_csvs/', exist_ok=True)

First we will filter the BF520 Env data. The BF520 Env libraries have a higher mutation rate and fewer targeted mutations. Therefore, we will set the `times_seen` filter to 3, requiring each mutation to be seen in at least three different variant backgrounds. For the antibody selection data, we will apply the same `times_seen` filter and additionally remove any mutations with a functional effect less than -4, which are non-functional and have noisy escape scores. Mutations that have no functional effect measurement passing the `times_seen` filter are removed as well.

In [2]:
# BF520 filters
# Minimum value of the `times_seen` column a mutation needs in order to be kept.
times_seen_filter = 3
# Minimum functional effect a mutation needs to be kept in the antibody escape data.
func_effect_filter = -4

# Filter the functional effect csvs.
# Each filtered table is stored by effect type so that the antibody escape
# filtering below can select the table it needs by name, instead of depending
# on whichever table this loop happened to process last.
bf520_entry_effects_by_type = {}
for effect in ['latent', 'func']:
    TZM_entry_effects = pd.read_csv(f'../HIV_Envelope_BF520_DMS_3BNC117_10-1074/results/func_effects/averages/TZM-bl_entry_{effect}_effects.csv')
    TZM_entry_effects_filtered = TZM_entry_effects.query('times_seen>=@times_seen_filter')
    TZM_entry_effects_filtered.to_csv(f'./results/filtered_csvs/BF520_Env_TZM-bl_entry_{effect}_effects_filtered.csv', index=False)
    bf520_entry_effects_by_type[effect] = TZM_entry_effects_filtered

# The functional effect cutoff for the escape data is taken from the 'func'
# effects table (already filtered on times_seen above).
bf520_func_effects = bf520_entry_effects_by_type['func'][['site', 'wildtype', 'mutant', 'effect']]

# Filter the antibody escape csvs
for antibody in ['3BNC117', '10-1074']:
    average_escape = pd.read_csv(f'../HIV_Envelope_BF520_DMS_3BNC117_10-1074/results/antibody_escape/averages/{antibody}_mut_effect.csv')
    # Left merge keeps every escape row; mutations with no row in the filtered
    # functional effects get a missing `effect`, which fails the `effect>=`
    # comparison below and is therefore dropped as well.
    average_escape = average_escape.merge(
        bf520_func_effects,
        how='left',
        on=['site', 'wildtype', 'mutant'],
    )
    average_escape_filtered = (
        average_escape
        .query('times_seen>=@times_seen_filter')
        .query('effect>=@func_effect_filter')
        # `effect` was only needed for filtering; keep the output columns unchanged.
        .drop(columns=['effect'])
    )
    average_escape_filtered.reset_index(drop=True).to_csv(f'./results/filtered_csvs/BF520_Env_{antibody}_mut_effects_filtered.csv', index=False)

Next we will filter the TRO11 Env data. The TRO11 Env libraries have a lower mutation rate and more targeted mutations. Therefore, we will set the `times_seen` filter to 2 (each mutation must be seen in at least two different variant backgrounds), because most variants have only one mutation and each mutation will be seen in fewer backgrounds. For the antibody selection data, we will again remove any mutations with a functional effect less than -4, which are non-functional and have noisy escape scores.

In [3]:
# TRO11 filters
# Minimum value of the `times_seen` column a mutation needs in order to be kept.
times_seen_filter = 2
# Minimum functional effect a mutation needs to be kept in the antibody escape data.
func_effect_filter = -4

# Filter the functional effect csvs.
# Each filtered table is stored by effect type so that the antibody escape
# filtering below can select the table it needs by name, instead of depending
# on whichever table this loop happened to process last.
tro11_entry_effects_by_type = {}
for effect in ['latent', 'func']:
    TZM_entry_effects = pd.read_csv(f'./results/func_effects/averages/TZM-bl_entry_{effect}_effects.csv')
    TZM_entry_effects_filtered = TZM_entry_effects.query('times_seen>=@times_seen_filter')
    TZM_entry_effects_filtered.to_csv(f'./results/filtered_csvs/TRO11_Env_TZM-bl_entry_{effect}_effects_filtered.csv', index=False)
    tro11_entry_effects_by_type[effect] = TZM_entry_effects_filtered

# The functional effect cutoff for the escape data is taken from the 'func'
# effects table (already filtered on times_seen above).
tro11_func_effects = tro11_entry_effects_by_type['func'][['site', 'wildtype', 'mutant', 'effect']]

# Filter the antibody escape csvs
for antibody in ['3BNC117', '10-1074']:
    average_escape = pd.read_csv(f'./results/antibody_escape/averages/{antibody}_mut_effect.csv')
    # Left merge keeps every escape row; mutations with no row in the filtered
    # functional effects get a missing `effect`, which fails the `effect>=`
    # comparison below and is therefore dropped as well.
    average_escape = average_escape.merge(
        tro11_func_effects,
        how='left',
        on=['site', 'wildtype', 'mutant'],
    )
    average_escape_filtered = (
        average_escape
        .query('times_seen>=@times_seen_filter')
        .query('effect>=@func_effect_filter')
        # `effect` was only needed for filtering; keep the output columns unchanged.
        .drop(columns=['effect'])
    )
    average_escape_filtered.reset_index(drop=True).to_csv(f'./results/filtered_csvs/TRO11_Env_{antibody}_mut_effects_filtered.csv', index=False)